public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-4505] i386: Implement doubleword right shifts by 1 bit using s[ha]r+rcr.
@ 2023-10-09 11:04 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2023-10-09 11:04 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:34d4168eb72d8e74387ad4b2fdf7d2417af561e9

commit r14-4505-g34d4168eb72d8e74387ad4b2fdf7d2417af561e9
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Mon Oct 9 12:02:07 2023 +0100

    i386: Implement doubleword right shifts by 1 bit using s[ha]r+rcr.
    
    This patch tweaks the i386 back-end's ix86_split_ashr and ix86_split_lshr
    functions to implement doubleword right shifts by 1 bit, using a shift
    of the highpart that sets the carry flag followed by a rotate-carry-right
    (RCR) instruction on the lowpart.
    
    Conceptually this is similar to the recent left shift patch, but with two
    complicating factors.  The first is that although the RCR sequence is
    shorter, and is a ~3x performance improvement on AMD, my microbenchmarking
    shows it ~10% slower on Intel.  Hence this patch also introduces a new
    X86_TUNE_USE_RCR tuning parameter.  The second is that I believe this is
    the first time a "rotate-right-through-carry" and a right shift that sets
    the carry flag from the least significant bit have been modelled in GCC RTL
    (on a MODE_CC target).  For this I've used the i386 back-end's UNSPEC_CC_NE
    which seems appropriate.  Finally, rcrsi2 and rcrdi2 are separate
    define_insns so that we can use their generator functions.
    
    For the pair of functions:
    unsigned __int128 foo(unsigned __int128 x) { return x >> 1; }
    __int128 bar(__int128 x) { return x >> 1; }
    
    with -O2 -march=znver4 we previously generated:
    
    foo:    movq    %rdi, %rax
            movq    %rsi, %rdx
            shrdq   $1, %rsi, %rax
            shrq    %rdx
            ret
    bar:    movq    %rdi, %rax
            movq    %rsi, %rdx
            shrdq   $1, %rsi, %rax
            sarq    %rdx
            ret
    
    with this patch we now generate:
    
    foo:    movq    %rsi, %rdx
            movq    %rdi, %rax
            shrq    %rdx
            rcrq    %rax
            ret
    bar:    movq    %rsi, %rdx
            movq    %rdi, %rax
            sarq    %rdx
            rcrq    %rax
            ret
    
    2023-10-09  Roger Sayle  <roger@nextmovesoftware.com>
    
    gcc/ChangeLog
            * config/i386/i386-expand.cc (ix86_split_ashr): Split shifts by
            one into ashr[sd]i3_carry followed by rcr[sd]i2, if TARGET_USE_RCR
            or -Oz.
            (ix86_split_lshr): Likewise, split shifts by one bit into
            lshr[sd]i3_carry followed by rcr[sd]i2, if TARGET_USE_RCR or -Oz.
            * config/i386/i386.h (TARGET_USE_RCR): New backend macro.
            * config/i386/i386.md (rcrsi2): New define_insn for rcrl.
            (rcrdi2): New define_insn for rcrq.
            (<anyshiftrt><mode>3_carry): New define_insn for right shifts that
            set the carry flag from the least significant bit, modelled using
            UNSPEC_CC_NE.
            * config/i386/x86-tune.def (X86_TUNE_USE_RCR): New tuning parameter
            controlling use of rcr 1 vs. shrd, which is significantly faster on
            AMD processors.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/rcr-1.c: New 64-bit test case.
            * gcc.target/i386/rcr-2.c: New 32-bit test case.

Diff:
---
 gcc/config/i386/i386-expand.cc        | 32 +++++++++++++++++++++
 gcc/config/i386/i386.h                |  1 +
 gcc/config/i386/i386.md               | 53 +++++++++++++++++++++++++++++++++++
 gcc/config/i386/x86-tune.def          |  3 ++
 gcc/testsuite/gcc.target/i386/rcr-1.c |  6 ++++
 gcc/testsuite/gcc.target/i386/rcr-2.c |  6 ++++
 6 files changed, 101 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 964b1fdef80..d5083494798 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -6509,6 +6509,22 @@ ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
 	    emit_insn (gen_ashr3 (low[0], low[0],
 				  GEN_INT (count - half_width)));
 	}
+      else if (count == 1
+	       && (TARGET_USE_RCR || optimize_size > 1))
+	{
+	  if (!rtx_equal_p (operands[0], operands[1]))
+	    emit_move_insn (operands[0], operands[1]);
+	  if (mode == DImode)
+	    {
+	      emit_insn (gen_ashrsi3_carry (high[0], high[0]));
+	      emit_insn (gen_rcrsi2 (low[0], low[0]));
+	    }
+	  else
+	    {
+	      emit_insn (gen_ashrdi3_carry (high[0], high[0]));
+	      emit_insn (gen_rcrdi2 (low[0], low[0]));
+	    }
+	}
       else
 	{
 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
@@ -6574,6 +6590,22 @@ ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
 	    emit_insn (gen_lshr3 (low[0], low[0],
 				  GEN_INT (count - half_width)));
 	}
+      else if (count == 1
+	       && (TARGET_USE_RCR || optimize_size > 1))
+	{
+	  if (!rtx_equal_p (operands[0], operands[1]))
+	    emit_move_insn (operands[0], operands[1]);
+	  if (mode == DImode)
+	    {
+	      emit_insn (gen_lshrsi3_carry (high[0], high[0]));
+	      emit_insn (gen_rcrsi2 (low[0], low[0]));
+	    }
+	  else
+	    {
+	      emit_insn (gen_lshrdi3_carry (high[0], high[0]));
+	      emit_insn (gen_rcrdi2 (low[0], low[0]));
+	    }
+	}
       else
 	{
 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index ab7cafa508d..e4c1fc6eef0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -453,6 +453,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_DEST_FALSE_DEP_FOR_GLC \
 	ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
 #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
+#define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b4790807d8b..f390fb5692b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15854,6 +15854,59 @@
  [(parallel [(set (strict_low_part (match_dup 0))
 		  (bswap:HI (match_dup 0)))
 	     (clobber (reg:CC FLAGS_REG))])])
+
+;; Rotations through carry flag
+(define_insn "rcrsi2"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(plus:SI
+	  (lshiftrt:SI (match_operand:SI 1 "register_operand" "0")
+		       (const_int 1))
+	  (ashift:SI (ltu:SI (reg:CCC FLAGS_REG) (const_int 0))
+		     (const_int 31))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "rcr{l}\t%0"
+  [(set_attr "type" "ishift1")
+   (set_attr "memory" "none")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "SI")])
+
+(define_insn "rcrdi2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(plus:DI
+	  (lshiftrt:DI (match_operand:DI 1 "register_operand" "0")
+		       (const_int 1))
+	  (ashift:DI (ltu:DI (reg:CCC FLAGS_REG) (const_int 0))
+		     (const_int 63))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  "rcr{q}\t%0"
+  [(set_attr "type" "ishift1")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "DI")])
+
+;; Versions of sar and shr that set the carry flag.
+(define_insn "<insn><mode>3_carry"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(and:SWI48 (match_operand:SWI48 1 "register_operand" "0")
+				(const_int 1))
+		     (const_int 0)] UNSPEC_CC_NE))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+	(any_shiftrt:SWI48 (match_dup 1) (const_int 1)))]
+  ""
+{
+  if (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
+    return "<shift>{<imodesuffix>}\t%0";
+  return "<shift>{<imodesuffix>}\t{1, %0|%0, 1}";
+}
+  [(set_attr "type" "ishift1")
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (match_test "TARGET_SHIFT1")
+	    (match_test "optimize_function_for_size_p (cfun)"))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
 \f
 ;; Bit set / bit test instructions
 
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4b2c5d59a95..3636a4a95d8 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -717,3 +717,6 @@ DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
 /* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
   modifications on architectures where theses operations are slow.  */
 DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)
+
+/* X86_TUNE_USE_RCR: Controls use of rcr 1 instruction instead of shrd.  */
+DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE)
diff --git a/gcc/testsuite/gcc.target/i386/rcr-1.c b/gcc/testsuite/gcc.target/i386/rcr-1.c
new file mode 100644
index 00000000000..8f369efacf4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/rcr-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-Oz" } */
+unsigned __int128 foo(unsigned __int128 x) { return x >> 1; }
+__int128 bar(__int128 x) { return x >> 1; }
+/* { dg-final { scan-assembler-times "rcrq" 2 } } */
+/* { dg-final { scan-assembler-not "shrdq" } } */
diff --git a/gcc/testsuite/gcc.target/i386/rcr-2.c b/gcc/testsuite/gcc.target/i386/rcr-2.c
new file mode 100644
index 00000000000..c8ed50e7884
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/rcr-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-Oz -mno-stv" } */
+unsigned long long foo(unsigned long long x) { return x >> 1; }
+long long bar(long long x) { return x >> 1; }
+/* { dg-final { scan-assembler-times "rcrl" 2 } } */
+/* { dg-final { scan-assembler-not "shrdl" } } */

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-10-09 11:04 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-09 11:04 [gcc r14-4505] i386: Implement doubleword right shifts by 1 bit using s[ha]r+rcr Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).