public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-2649] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
@ 2021-07-31  7:21 Jakub Jelinek
  0 siblings, 0 replies; only message in thread
From: Jakub Jelinek @ 2021-07-31  7:21 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:91425e2adecd00091d7443104ecb367686e88663

commit r12-2649-g91425e2adecd00091d7443104ecb367686e88663
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Sat Jul 31 09:19:32 2021 +0200

    i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
    
    This patch improves emitted code for the non-TARGET_LZCNT case.
    As __builtin_clz* is UB on a zero argument and for !TARGET_LZCNT
    CLZ_VALUE_DEFINED_AT_ZERO is 0, it is UB even at RTL time and so we
    can take advantage of that and assume the result will be in the range
    0 to 31 or 0 to 63.
    Given that, sign and zero extension of that result are the same and
    are actually already performed by the bsrl or xorl instructions.
    And constant - __builtin_clz* can be simplified into
    bsr + (constant - bitmask), where bitmask is 31 or 63.
    For TARGET_LZCNT, a lot of this is already fine as is (e.g. the sign or
    zero extensions), and other optimizations are IMHO not possible
    (if we have lzcnt, we've lost information on whether it is UB at
    zero or not and so can't transform it into bsr even when that is
    1-2 insns shorter).
    The changes on the 3 testcases between unpatched and patched gcc
    are for -m64:
    pr78103-1.s:
            bsrq    %rdi, %rax
    -       xorq    $63, %rax
    -       cltq
    +       xorl    $63, %eax
    ...
            bsrq    %rdi, %rax
    -       xorq    $63, %rax
    -       cltq
    +       xorl    $63, %eax
    ...
            bsrl    %edi, %eax
            xorl    $31, %eax
    -       cltq
    ...
            bsrl    %edi, %eax
            xorl    $31, %eax
    -       cltq
    pr78103-2.s:
            bsrl    %edi, %edi
    -       movl    $32, %eax
    -       xorl    $31, %edi
    -       subl    %edi, %eax
    +       leal    1(%rdi), %eax
    ...
    -       bsrl    %edi, %edi
    -       movl    $31, %eax
    -       xorl    $31, %edi
    -       subl    %edi, %eax
    +       bsrl    %edi, %eax
    ...
            bsrq    %rdi, %rdi
    -       movl    $64, %eax
    -       xorq    $63, %rdi
    -       subl    %edi, %eax
    +       leal    1(%rdi), %eax
    ...
    -       bsrq    %rdi, %rdi
    -       movl    $63, %eax
    -       xorq    $63, %rdi
    -       subl    %edi, %eax
    +       bsrq    %rdi, %rax
    pr78103-3.s:
            bsrl    %edi, %edi
    -       movl    $32, %eax
    -       xorl    $31, %edi
    -       movslq  %edi, %rdi
    -       subq    %rdi, %rax
    +       leaq    1(%rdi), %rax
    ...
    -       bsrl    %edi, %edi
    -       movl    $31, %eax
    -       xorl    $31, %edi
    -       movslq  %edi, %rdi
    -       subq    %rdi, %rax
    +       bsrl    %edi, %eax
    ...
            bsrq    %rdi, %rdi
    -       movl    $64, %eax
    -       xorq    $63, %rdi
    -       movslq  %edi, %rdi
    -       subq    %rdi, %rax
    +       leaq    1(%rdi), %rax
    ...
    -       bsrq    %rdi, %rdi
    -       movl    $63, %eax
    -       xorq    $63, %rdi
    -       movslq  %edi, %rdi
    -       subq    %rdi, %rax
    +       bsrq    %rdi, %rax
    
    Most of the changes are done with combine splitters, but for
    *bsr_rex64_2 and *bsr_2 I had to use define_insn_and_split, because,
    as mentioned in the PR, the combiner unfortunately doesn't create
    LOG_LINKS between the two insns created by a combine splitter, so
    they can't be combined further with the following instructions.
    
    2021-07-31  Jakub Jelinek  <jakub@redhat.com>
    
            PR target/78103
            * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
            define_insn patterns.
            (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
            Add combine splitters for constant - clz.
            (clz<mode>2): Use a temporary pseudo for bsr result.
    
            * gcc.target/i386/pr78103-1.c: New test.
            * gcc.target/i386/pr78103-2.c: New test.
            * gcc.target/i386/pr78103-3.c: New test.

Diff:
---
 gcc/config/i386/i386.md                   | 210 +++++++++++++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/pr78103-1.c |  28 ++++
 gcc/testsuite/gcc.target/i386/pr78103-2.c |  33 +++++
 gcc/testsuite/gcc.target/i386/pr78103-3.c |  32 +++++
 4 files changed, 298 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 73a495390df..c9787d73262 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14784,6 +14784,18 @@
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "DI")])
 
+(define_insn "bsr_rex64_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(minus:DI (const_int 63)
+		  (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "DI")])
+
 (define_insn "bsr"
   [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
@@ -14798,17 +14810,204 @@
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "SI")])
 
+(define_insn "bsr_1"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(minus:SI (const_int 31)
+		  (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT"
+  "bsr{l}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+(define_insn "bsr_zext_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI
+	    (const_int 31)
+	    (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{l}\t{%1, %k0|%k0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+; As bsr is undefined behavior on zero and for other input
+; values it is in range 0 to 63, we can optimize away sign-extends.
+(define_insn_and_split "*bsr_rex64_2"
+  [(set (match_operand:DI 0 "register_operand")
+	(xor:DI
+	  (sign_extend:DI
+	    (minus:SI
+	      (const_int 63)
+	      (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			 0)))
+	  (const_int 63)))
+    (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ (match_dup 1) (const_int 0)))
+	      (set (match_dup 2)
+		   (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+		   (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[2] = gen_reg_rtx (DImode);
+  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
+})
+
+(define_insn_and_split "*bsr_2"
+  [(set (match_operand:DI 0 "register_operand")
+	(sign_extend:DI
+	  (xor:SI
+	    (minus:SI
+	      (const_int 31)
+	      (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	    (const_int 31))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ (match_dup 1) (const_int 0)))
+	      (set (match_dup 2)
+		   (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+		   (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[2] = gen_reg_rtx (SImode);")
+
+; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
+; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
+; in [0, 63] or [0, 31] range.
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(minus:SI
+	  (match_operand:SI 2 "const_int_operand")
+	  (xor:SI
+	    (minus:SI (const_int 63)
+		      (subreg:SI
+			(clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			0))
+	    (const_int 63))))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  [(set (match_dup 3)
+	(minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0)
+	(plus:SI (match_dup 5) (match_dup 4)))]
+{
+  operands[3] = gen_reg_rtx (DImode);
+  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
+  if (INTVAL (operands[2]) == 63)
+    {
+      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
+      emit_move_insn (operands[0], operands[5]);
+      DONE;
+    }
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
+})
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(minus:SI
+	  (match_operand:SI 2 "const_int_operand")
+	  (xor:SI
+	    (minus:SI (const_int 31)
+		      (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	    (const_int 31))))]
+  "!TARGET_LZCNT && ix86_pre_reload_split ()"
+  [(set (match_dup 3)
+	(minus:SI (const_int 31) (clz:SI (match_dup 1))))
+   (set (match_dup 0)
+	(plus:SI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      emit_insn (gen_bsr_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (SImode);
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(minus:DI
+	  (match_operand:DI 2 "const_int_operand")
+	  (xor:DI
+	    (sign_extend:DI
+	      (minus:SI (const_int 63)
+			(subreg:SI
+			  (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			  0)))
+	    (const_int 63))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
+       == UINTVAL (operands[2]) - 63)"
+  [(set (match_dup 3)
+	(minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0)
+	(plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 63)
+    {
+      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(minus:DI
+	  (match_operand:DI 2 "const_int_operand")
+	  (sign_extend:DI
+	    (xor:SI
+	      (minus:SI (const_int 31)
+			(clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	      (const_int 31)))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
+       == UINTVAL (operands[2]) - 31)"
+  [(set (match_dup 3)
+	(zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
+   (set (match_dup 0)
+	(plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
+})
+
 (define_expand "clz<mode>2"
   [(parallel
      [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
 		     (const_int 0)))
-      (set (match_operand:SWI48 0 "register_operand")
-	   (minus:SWI48
-	     (match_dup 2)
-	     (clz:SWI48 (match_dup 1))))])
+      (set (match_dup 3) (minus:SWI48
+			   (match_dup 2)
+			   (clz:SWI48 (match_dup 1))))])
    (parallel
-     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
+     [(set (match_operand:SWI48 0 "register_operand")
+	   (xor:SWI48 (match_dup 3) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
   ""
 {
@@ -14818,6 +15017,7 @@
       DONE;
     }
   operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
+  operands[3] = gen_reg_rtx (<MODE>mode);
 })
 
 (define_insn_and_split "clz<mode>2_lzcnt"
diff --git a/gcc/testsuite/gcc.target/i386/pr78103-1.c b/gcc/testsuite/gcc.target/i386/pr78103-1.c
new file mode 100644
index 00000000000..95aea694c45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr78103-1.c
@@ -0,0 +1,28 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mcltq\M} } } */
+
+long long
+foo (long long x)
+{
+  return __builtin_clzll (x);
+}
+
+long long
+bar (long long x)
+{
+  return (unsigned int) __builtin_clzll (x);
+}
+
+long long
+baz (int x)
+{
+  return __builtin_clz (x);
+}
+
+long long
+qux (int x)
+{
+  return (unsigned int) __builtin_clz (x);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr78103-2.c b/gcc/testsuite/gcc.target/i386/pr78103-2.c
new file mode 100644
index 00000000000..b3523382926
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr78103-2.c
@@ -0,0 +1,33 @@
+/* PR target/78103 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubl\M} } } */
+/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
+
+unsigned int
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned int
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+#ifdef __x86_64__
+unsigned int
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned int
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}
+#endif
diff --git a/gcc/testsuite/gcc.target/i386/pr78103-3.c b/gcc/testsuite/gcc.target/i386/pr78103-3.c
new file mode 100644
index 00000000000..49a36eccf4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr78103-3.c
@@ -0,0 +1,32 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubq\M} } } */
+/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
+
+unsigned long long
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned long long
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+unsigned long long
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned long long
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-07-31  7:21 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-31  7:21 [gcc r12-2649] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103] Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).