public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-1525] [PATCH] PR rtl-optimization/46235: Improved use of bt for bit tests on x86_64.
@ 2021-06-16  8:58 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2021-06-16  8:58 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:3155d51bfd1de8b6c4645dcb2292248a8d7cc3c9

commit r12-1525-g3155d51bfd1de8b6c4645dcb2292248a8d7cc3c9
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Wed Jun 16 09:56:09 2021 +0100

    [PATCH] PR rtl-optimization/46235: Improved use of bt for bit tests on x86_64.
    
    This patch tackles PR46235 to improve the code generated for bit tests
    on x86_64 by making more use of the bt instruction.  Currently, GCC emits
    bt instructions when followed by conditional jumps (thanks to Uros' splitters).
    This patch adds splitters in i386.md, to catch the cases where bt is followed
    by a conditional move (as in the original report), or by a setc/setnc (as in
    comment 5 of the Bugzilla PR).
    
    With this patch, the function in the original PR
    int foo(int a, int x, int y) {
        if (a & (1 << x))
           return a;
       return 1;
    }
    
    which with -O2 on mainline generates:
    foo:    movl    %edi, %eax
            movl    %esi, %ecx
            sarl    %cl, %eax
            testb   $1, %al
            movl    $1, %eax
            cmovne  %edi, %eax
            ret
    
    now generates:
    foo:    btl     %esi, %edi
            movl    $1, %eax
            cmovc   %edi, %eax
            ret
    
    Likewise, IsBitSet1 and IsBitSet2 (from comment 5)
    bool IsBitSet1(unsigned char byte, int index) {
        return (byte & (1<<index)) != 0;
    }
    bool IsBitSet2(unsigned char byte, int index) {
        return (byte >> index) & 1;
    }
    
    Before:
            movzbl  %dil, %eax
            movl    %esi, %ecx
            sarl    %cl, %eax
            andl    $1, %eax
            ret
    
    After:
            movzbl  %dil, %edi
            btl     %esi, %edi
            setc    %al
            ret
    
    According to Agner Fog, SAR/SHR r,cl takes 2 cycles on Skylake,
    whereas BT r,r takes only one, so the performance improvements on
    recent hardware may be more significant than implied by just
    the reduced number of instructions.  I've avoided transforming cases
    (such as btsi_setcsi) where using bt sequences may not be a clear
    win (over sarq/andl).
    
    2021-06-15  Roger Sayle  <roger@nextmovesoftware.com>
    
    gcc/ChangeLog
            PR rtl-optimization/46235
            * config/i386/i386.md: New define_split for bt followed by cmov.
            (*bt<mode>_setcqi): New define_insn_and_split for bt followed by setc.
            (*bt<mode>_setncqi): New define_insn_and_split for bt then setnc.
            (*bt<mode>_setnc<mode>): New define_insn_and_split for bt followed
            by setnc with zero extension.
    
    gcc/testsuite/ChangeLog
            PR rtl-optimization/46235
            * gcc.target/i386/bt-5.c: New test.
            * gcc.target/i386/bt-6.c: New test.
            * gcc.target/i386/bt-7.c: New test.

Diff:
---
 gcc/config/i386/i386.md              | 94 ++++++++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/bt-5.c | 44 +++++++++++++++++
 gcc/testsuite/gcc.target/i386/bt-6.c | 69 ++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/bt-7.c | 69 ++++++++++++++++++++++++++
 4 files changed, 276 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6e4abf32e7c..48532eb7ddf 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12794,6 +12794,100 @@
   operands[0] = shallow_copy_rtx (operands[0]);
   PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
 })
+
+;; Help combine recognize bt followed by cmov
+(define_split
+  [(set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248
+	 (ne
+	  (zero_extract:SWI48
+	   (match_operand:SWI48 1 "register_operand")
+	   (const_int 1)
+	   (zero_extend:SI (match_operand:QI 2 "register_operand")))
+	  (const_int 0))
+	 (match_operand:SWI248 3 "nonimmediate_operand")
+	 (match_operand:SWI248 4 "nonimmediate_operand")))]
+  "TARGET_USE_BT && TARGET_CMOVE
+   && !(MEM_P (operands[3]) && MEM_P (operands[4]))
+   && ix86_pre_reload_split ()"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	 (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+	 (const_int 0)))
+   (set (match_dup 0)
+	(if_then_else:SWI248 (eq (reg:CCC FLAGS_REG) (const_int 0))
+			     (match_dup 3)
+			     (match_dup 4)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+})
+
+;; Help combine recognize bt followed by setc
+(define_insn_and_split "*bt<mode>_setcqi"
+  [(set (subreg:SWI48 (match_operand:QI 0 "register_operand") 0)
+        (zero_extract:SWI48
+         (match_operand:SWI48 1 "register_operand")
+         (const_int 1)
+         (zero_extend:SI (match_operand:QI 2 "register_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 0)
+        (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+})
+
+;; Help combine recognize bt followed by setnc
+(define_insn_and_split "*bt<mode>_setncqi"
+  [(set (match_operand:QI 0 "register_operand")
+	(and:QI
+	 (not:QI
+	  (subreg:QI
+	   (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand")
+			   (match_operand:QI 2 "register_operand")) 0))
+	 (const_int 1)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 0)
+        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+})
+
+(define_insn_and_split "*bt<mode>_setnc<mode>"
+  [(set (match_operand:SWI48 0 "register_operand")
+	(and:SWI48
+	 (not:SWI48
+	  (lshiftrt:SWI48 (match_operand:SWI48 1 "register_operand")
+			  (match_operand:QI 2 "register_operand")))
+	 (const_int 1)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+        (compare:CCC
+         (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+         (const_int 0)))
+   (set (match_dup 3)
+        (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (set (match_dup 0) (zero_extend:SWI48 (match_dup 3)))]
+{
+  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
+  operands[3] = gen_reg_rtx (QImode);
+})
 \f
 ;; Store-flag instructions.
 
diff --git a/gcc/testsuite/gcc.target/i386/bt-5.c b/gcc/testsuite/gcc.target/i386/bt-5.c
new file mode 100644
index 00000000000..73e7ed282d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bt-5.c
@@ -0,0 +1,44 @@
+/* PR rtl-optimization/46235 */
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+int foo (int a, int x, int y)
+{
+  if (a & (1<<x))
+    return a;
+  return 1;
+}
+
+int bar_ww (int a, int x, int y, int z)
+{
+  return (a & (1<<x)) ? y : z;
+}
+
+int bar_lw (long long a, int x, int y, int z)
+{
+  return (a & (1LL<<x)) ? y : z;
+}
+
+long long bar_wl (int a, int x, long long y, long long z)
+{
+  return (a & (1<<x)) ? y : z;
+}
+
+long long bar_ll (long long a, int x, long long y, long long z)
+{
+  return (a & (1LL<<x)) ? y : z;
+}
+
+short bar_ws (int a, int x, short y, short z)
+{
+  return (a & (1<<x)) ? y : z;
+}
+
+short bar_ls (long long a, int x, short y, short z)
+{
+  return (a & (1LL<<x)) ? y : z;
+}
+
+/* { dg-final { scan-assembler-times "bt\[lq\]\[ \t\]" 7 } } */
+/* { dg-final { scan-assembler-not "sar\[lq\]\[ \t\]" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/bt-6.c b/gcc/testsuite/gcc.target/i386/bt-6.c
new file mode 100644
index 00000000000..d4ef80589f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bt-6.c
@@ -0,0 +1,69 @@
+/* PR rtl-optimization/46235 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+unsigned char set1_bb (unsigned char x, int y)
+{
+  return (x & (1<<y)) != 0;
+}
+
+unsigned char set2_bb (unsigned char x, int y)
+{
+  return (x >> y) & 1;
+}
+
+unsigned char set1_wb (int x, int y)
+{
+  return (x & (1<<y)) != 0;
+}
+
+unsigned char set2_wb (int x, int y)
+{
+  return (x >> y) & 1;
+}
+
+unsigned char clr1_bb (unsigned char x, int y)
+{
+  return (x & (1<<y)) == 0;
+}
+
+unsigned char clr2_bb (unsigned char x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+unsigned char clr1_wb (int x, int y)
+{
+  return (x & (1<<y)) == 0;
+}
+
+unsigned char clr2_wb (int x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+int clr1_bw (unsigned char x, int y)
+{
+  return (x & (1<<y)) == 0;
+}
+
+int clr2_bw (unsigned char x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+int clr1_ww (int x, int y)
+{
+  return (x & (1<<y)) == 0;
+}
+
+int clr2_ww (int x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+/* { dg-final { scan-assembler-times "bt\[lq\]\[ \t\]" 12 } } */
+/* { dg-final { scan-assembler-not "sar\[lq\]\[ \t\]" } } */
+/* { dg-final { scan-assembler-not "and\[lq\]\[ \t\]" } } */
+/* { dg-final { scan-assembler-not "not\[lq\]\[ \t\]" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/bt-7.c b/gcc/testsuite/gcc.target/i386/bt-7.c
new file mode 100644
index 00000000000..292d7414c42
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bt-7.c
@@ -0,0 +1,69 @@
+/* PR rtl-optimization/46235 */
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+unsigned char set1_lb (long long x, int y)
+{
+  return (x & (1LL<<y)) != 0;
+}
+
+unsigned char set2_lb (long long x, int y)
+{
+  return (x >> y) & 1;
+}
+
+unsigned char clr1_lb (long long x, int y)
+{
+  return (x & (1LL<<y)) == 0;
+}
+
+unsigned char clr2_lb (long long x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+int clr1_lw (long long x, int y)
+{
+  return (x & (1LL<<y)) == 0;
+}
+
+int clr2_lw (long long x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+long long clr1_bl (unsigned char x, int y)
+{
+  return (x & (1<<y)) == 0;
+}
+
+long long clr2_bl (unsigned char x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+long long clr1_wl (int x, int y)
+{
+  return (x & (1<<y)) == 0;
+}
+
+long long clr2_wl (int x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+long long clr1_ll (long long x, int y)
+{
+  return (x & (1LL<<y)) == 0;
+}
+
+long long clr2_ll (long long x, int y)
+{
+  return !((x >> y) & 1);
+}
+
+/* { dg-final { scan-assembler-times "bt\[lq\]\[ \t\]" 12 } } */
+/* { dg-final { scan-assembler-not "sar\[lq\]\[ \t\]" } } */
+/* { dg-final { scan-assembler-not "and\[lq\]\[ \t\]" } } */
+/* { dg-final { scan-assembler-not "not\[lq\]\[ \t\]" } } */
+


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-06-16  8:58 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-16  8:58 [gcc r12-1525] [PATCH] PR rtl-optimization/46235: Improved use of bt for bit tests on x86_64 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).