public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-1362] Use xchg for DImode double word rotate by 32 bits with -m32 on x86.
@ 2022-06-30 10:03 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2022-06-30 10:03 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:00193676a5a3e7e50e1fa6646bb5abb5a7b2acbb

commit r13-1362-g00193676a5a3e7e50e1fa6646bb5abb5a7b2acbb
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Thu Jun 30 11:00:03 2022 +0100

    Use xchg for DImode double word rotate by 32 bits with -m32 on x86.
    
    This patch was motivated by the investigation of Linus Torvalds' spill
    heavy cryptography kernels in PR 105930.  The <any_rotate>di3 expander
    handles all rotations by an immediate constant for 1..63 bits with the
    exception of 32 bits, which FAILs and is then split by the middle-end.
    This patch makes these 32-bit doubleword rotations consistent with the
    other DImode rotations during reload, which results in reduced register
    pressure, fewer instructions and the use of x86's xchg instruction
    when appropriate.  In theory, xchg can be handled by register renaming,
    but even on micro-architectures where it's implemented by 3 uops (no
    worse than a three-instruction shuffle), avoiding the need to nominate
    a "temporary" register reduces user-visible register pressure (and
    has obvious code size benefits).
    
    The effects are best shown with the new testcase:
    
    unsigned long long bar();
    unsigned long long foo()
    {
      unsigned long long x = bar();
      return (x>>32) | (x<<32);
    }
    
    for which GCC with -m32 -O2 currently generates:
    
            subl    $12, %esp
            call    bar
            addl    $12, %esp
            movl    %eax, %ecx
            movl    %edx, %eax
            movl    %ecx, %edx
            ret
    
    but with this patch now generates:
    
            subl    $12, %esp
            call    bar
            addl    $12, %esp
            xchgl   %edx, %eax
            ret
    
    With this patch, the number of lines of assembly language generated
    for the blake2b kernel (from the attachment to PR105930) decreases
    from 5626 to 5404. Although there's an impressive reduction in
    instruction count, there's no change/reduction in stack frame size.
    
    2022-06-30  Roger Sayle  <roger@nextmovesoftware.com>
                Uroš Bizjak  <ubizjak@gmail.com>
    
    gcc/ChangeLog
            * config/i386/i386.md (swap<mode>): Rename from *swap<mode> to
            provide gen_swapsi.
            (<any_rotate>di3): Handle !TARGET_64BIT rotations by 32 bits
            via new gen_<insn>32di2_doubleword below.
            (<insn>32di2_doubleword): New define_insn_and_split
            that splits after reload as either a pair of move instructions
            or an xchgl (using gen_swapsi).
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/xchg-3.c: New test case.

Diff:
---
 gcc/config/i386/i386.md                | 22 +++++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/xchg-3.c | 12 ++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 125a3b44a6d..04cd2bc173e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2966,7 +2966,7 @@
    (set_attr "memory" "load")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*swap<mode>"
+(define_insn "swap<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "+r")
 	(match_operand:SWI48 1 "register_operand" "+r"))
    (set (match_dup 1)
@@ -13673,6 +13673,8 @@
   else if (const_1_to_31_operand (operands[2], VOIDmode))
     emit_insn (gen_ix86_<insn>di3_doubleword
 		(operands[0], operands[1], operands[2]));
+  else if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 32)
+    emit_insn (gen_<insn>32di2_doubleword (operands[0], operands[1]));
   else
     FAIL;
 
@@ -13845,6 +13847,24 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn_and_split "<insn>32di2_doubleword"
+ [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+       (any_rotate:DI (match_operand:DI 1 "nonimmediate_operand" "0,r,o")
+                      (const_int 32)))]
+ "!TARGET_64BIT"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 3))
+  (set (match_dup 2) (match_dup 1))]
+{
+  split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);
+  if (rtx_equal_p (operands[0], operands[1]))
+    {
+      emit_insn (gen_swapsi (operands[0], operands[2]));
+      DONE;
+    }
+})
+
 (define_mode_attr rorx_immediate_operand
 	[(SI "const_0_to_31_operand")
 	 (DI "const_0_to_63_operand")])
diff --git a/gcc/testsuite/gcc.target/i386/xchg-3.c b/gcc/testsuite/gcc.target/i386/xchg-3.c
new file mode 100644
index 00000000000..eec05f06c97
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/xchg-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2" } */
+
+unsigned long long bar();
+
+unsigned long long foo()
+{
+  unsigned long long x = bar();
+  return (x>>32) | (x<<32);
+}
+
+/*{ dg-final { scan-assembler "xchgl" } } */


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-06-30 10:03 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-30 10:03 [gcc r13-1362] Use xchg for DImode double word rotate by 32 bits with -m32 on x86 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).