public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
@ 2021-07-28  8:35 Jakub Jelinek
  2021-07-30 10:27 ` Uros Bizjak
  0 siblings, 1 reply; 8+ messages in thread
From: Jakub Jelinek @ 2021-07-28  8:35 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

Hi!

This patch improves emitted code for the non-TARGET_LZCNT case.
As __builtin_clz* is UB on 0 argument and for !TARGET_LZCNT
CLZ_VALUE_DEFINED_AT_ZERO is 0, it is UB even at RTL time and so we
can take advantage of that and assume the result will be 0 to 31 or
0 to 63.
Given that, sign or zero extension of that result are the same and
are actually already performed by bsrl or xorl instructions.
And constant - __builtin_clz* can be simplified into
bsr + constant - bitmask.
For TARGET_LZCNT, a lot of this is already fine as is (e.g. the sign or
zero extensions), and other optimizations are IMHO not possible
(if we have lzcnt, we've lost information on whether it is UB at
zero or not and so can't transform it into bsr even when that is
1-2 insns shorter).
The changes on the 3 testcases between unpatched and patched gcc
are for -m64:
pr78103-1.s:
 	bsrq	%rdi, %rax
-	xorq	$63, %rax
-	cltq
+	xorl	$63, %eax
...
 	bsrq	%rdi, %rax
-	xorq	$63, %rax
-	cltq
+	xorl	$63, %eax
...
 	bsrl	%edi, %eax
 	xorl	$31, %eax
-	cltq
...
 	bsrl	%edi, %eax
 	xorl	$31, %eax
-	cltq
pr78103-2.s:
 	bsrl	%edi, %edi
-	movl	$32, %eax
-	xorl	$31, %edi
-	subl	%edi, %eax
+	leal	1(%rdi), %eax
...
-	bsrl	%edi, %edi
-	movl	$31, %eax
-	xorl	$31, %edi
-	subl	%edi, %eax
+	bsrl	%edi, %eax
...
 	bsrq	%rdi, %rdi
-	movl	$64, %eax
-	xorq	$63, %rdi
-	subl	%edi, %eax
+	leal	1(%rdi), %eax
...
-	bsrq	%rdi, %rdi
-	movl	$63, %eax
-	xorq	$63, %rdi
-	subl	%edi, %eax
+	bsrq	%rdi, %rax
pr78103-3.s:
 	bsrl	%edi, %edi
-	movl	$32, %eax
-	xorl	$31, %edi
-	movslq	%edi, %rdi
-	subq	%rdi, %rax
+	leaq	1(%rdi), %rax
...
-	bsrl	%edi, %edi
-	movl	$31, %eax
-	xorl	$31, %edi
-	movslq	%edi, %rdi
-	subq	%rdi, %rax
+	bsrl	%edi, %eax
...
 	bsrq	%rdi, %rdi
-	movl	$64, %eax
-	xorq	$63, %rdi
-	movslq	%edi, %rdi
-	subq	%rdi, %rax
+	leaq	1(%rdi), %rax
...
-	bsrq	%rdi, %rdi
-	movl	$63, %eax
-	xorq	$63, %rdi
-	movslq	%edi, %rdi
-	subq	%rdi, %rax
+	bsrq	%rdi, %rax

Most of the changes are done with combine splitters, but for
*bsr_rex64_2 and *bsr_2 I had to use define_insn_and_split, because
as mentioned in the PR the combiner unfortunately doesn't create LOG_LINKS
in between the two insns created by combine splitter, so it can't be
combined further with following instructions.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-07-28  Jakub Jelinek  <jakub@redhat.com>

	PR target/78103
	* config/i386/i386.md (*bsr_rex64_1, *bsr_1, *bsr_zext_1): New
	define_insn patterns.
	(*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
	Add combine splitters for constant - clz.
	(clz<mode>2): Use a temporary pseudo for bsr result.

	* gcc.target/i386/pr78103-1.c: New test.
	* gcc.target/i386/pr78103-2.c: New test.
	* gcc.target/i386/pr78103-3.c: New test.

--- gcc/config/i386/i386.md.jj	2021-07-27 09:47:30.311970004 +0200
+++ gcc/config/i386/i386.md	2021-07-27 15:37:59.011394624 +0200
@@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "DI")])
 
+(define_insn "*bsr_rex64_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(minus:DI (const_int 63)
+		  (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "DI")])
+
 (define_insn "bsr"
   [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
@@ -14775,17 +14787,210 @@ (define_insn "bsr"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "SI")])
 
+(define_insn "*bsr_1"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(minus:SI (const_int 31)
+		  (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT"
+  "bsr{l}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+(define_insn "*bsr_zext_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI
+	    (const_int 31)
+	    (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{l}\t{%1, %k0|%k0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+; As bsr is undefined behavior on zero and for other input
+; values it is in range 0 to 63, we can optimize away sign-extends.
+(define_insn_and_split "*bsr_rex64_2"
+  [(set (match_operand:DI 0 "register_operand")
+	(xor:DI
+	  (sign_extend:DI
+	    (minus:SI
+	      (const_int 63)
+	      (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			 0)))
+	  (const_int 63)))
+    (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ (match_dup 1) (const_int 0)))
+	      (set (match_dup 2)
+		   (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+		   (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[2] = gen_reg_rtx (DImode);
+  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
+})
+
+(define_insn_and_split "*bsr_2"
+  [(set (match_operand:DI 0 "register_operand")
+	(sign_extend:DI
+	  (xor:SI
+	    (minus:SI
+	      (const_int 31)
+	      (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	    (const_int 31))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ (match_dup 1) (const_int 0)))
+	      (set (match_dup 2)
+		   (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+		   (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[2] = gen_reg_rtx (SImode);")
+
+; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
+; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
+; in [0, 63] or [0, 31] range.
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(minus:SI
+	  (match_operand:SI 2 "const_int_operand")
+	  (xor:SI
+	    (minus:SI (const_int 63)
+		      (subreg:SI
+			(clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			0))
+	    (const_int 63))))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  [(set (match_dup 3) (minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0) (plus:SI (match_dup 5) (match_dup 4)))]
+{
+  operands[3] = gen_reg_rtx (DImode);
+  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
+  if (INTVAL (operands[2]) == 63)
+    {
+      rtx tem = gen_rtx_CLZ (DImode, operands[1]);
+      tem = gen_rtx_MINUS (DImode, operands[2], tem);
+      tem = gen_rtx_SET (operands[3], tem);
+      emit_insn (tem);
+      emit_move_insn (operands[0], operands[5]);
+      DONE;
+    }
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
+})
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(minus:SI
+	  (match_operand:SI 2 "const_int_operand")
+	  (xor:SI
+	    (minus:SI (const_int 31)
+		      (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	    (const_int 31))))]
+  "!TARGET_LZCNT && ix86_pre_reload_split ()"
+  [(set (match_dup 3) (minus:SI (const_int 31) (clz:SI (match_dup 1))))
+   (set (match_dup 0) (plus:SI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      rtx tem = gen_rtx_CLZ (SImode, operands[1]);
+      tem = gen_rtx_MINUS (SImode, operands[2], tem);
+      tem = gen_rtx_SET (operands[0], tem);
+      emit_insn (tem);
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (SImode);
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(minus:DI
+	  (match_operand:DI 2 "const_int_operand")
+	  (xor:DI
+	    (sign_extend:DI
+	      (minus:SI (const_int 63)
+			(subreg:SI
+			  (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			  0)))
+	    (const_int 63))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
+       == UINTVAL (operands[2]) - 63)"
+  [(set (match_dup 3) (minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0) (plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 63)
+    {
+      rtx tem = gen_rtx_CLZ (DImode, operands[1]);
+      tem = gen_rtx_MINUS (DImode, operands[2], tem);
+      tem = gen_rtx_SET (operands[0], tem);
+      emit_insn (tem);
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(minus:DI
+	  (match_operand:DI 2 "const_int_operand")
+	  (sign_extend:DI
+	    (xor:SI
+	      (minus:SI (const_int 31)
+			(clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	      (const_int 31)))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
+       == UINTVAL (operands[2]) - 31)"
+  [(set (match_dup 3)
+	(zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
+   (set (match_dup 0) (plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      rtx tem = gen_rtx_CLZ (SImode, operands[1]);
+      tem = gen_rtx_MINUS (SImode, operands[2], tem);
+      tem = gen_rtx_ZERO_EXTEND (DImode, tem);
+      tem = gen_rtx_SET (operands[0], tem);
+      emit_insn (tem);
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
+})
+
 (define_expand "clz<mode>2"
   [(parallel
      [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
 		     (const_int 0)))
-      (set (match_operand:SWI48 0 "register_operand")
-	   (minus:SWI48
-	     (match_dup 2)
-	     (clz:SWI48 (match_dup 1))))])
+      (set (match_dup 3) (minus:SWI48
+			   (match_dup 2)
+			   (clz:SWI48 (match_dup 1))))])
    (parallel
-     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
+     [(set (match_operand:SWI48 0 "register_operand")
+	   (xor:SWI48 (match_dup 3) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
   ""
 {
@@ -14795,6 +15000,7 @@ (define_expand "clz<mode>2"
       DONE;
     }
   operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
+  operands[3] = gen_reg_rtx (<MODE>mode);
 })
 
 (define_insn_and_split "clz<mode>2_lzcnt"
--- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj	2021-07-27 10:29:13.278547362 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-1.c	2021-07-27 10:29:13.278547362 +0200
@@ -0,0 +1,28 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mcltq\M} } } */
+
+long long
+foo (long long x)
+{
+  return __builtin_clzll (x);
+}
+
+long long
+bar (long long x)
+{
+  return (unsigned int) __builtin_clzll (x);
+}
+
+long long
+baz (int x)
+{
+  return __builtin_clz (x);
+}
+
+long long
+qux (int x)
+{
+  return (unsigned int) __builtin_clz (x);
+}
--- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj	2021-07-27 10:29:13.278547362 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-2.c	2021-07-27 10:29:13.278547362 +0200
@@ -0,0 +1,33 @@
+/* PR target/78103 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubl\M} } } */
+/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
+
+unsigned int
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned int
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+#ifdef __x86_64__
+unsigned int
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned int
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}
+#endif
--- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj	2021-07-27 15:45:51.421113690 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-3.c	2021-07-27 15:45:35.678323393 +0200
@@ -0,0 +1,32 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubq\M} } } */
+/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
+
+unsigned long long
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned long long
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+unsigned long long
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned long long
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}

	Jakub


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
  2021-07-28  8:35 [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103] Jakub Jelinek
@ 2021-07-30 10:27 ` Uros Bizjak
  2021-07-30 13:26   ` Jakub Jelinek
  0 siblings, 1 reply; 8+ messages in thread
From: Uros Bizjak @ 2021-07-30 10:27 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Wed, Jul 28, 2021 at 10:36 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> This patch improves emitted code for the non-TARGET_LZCNT case.
> As __builtin_clz* is UB on 0 argument and for !TARGET_LZCNT
> CLZ_VALUE_DEFINED_AT_ZERO is 0, it is UB even at RTL time and so we
> can take advantage of that and assume the result will be 0 to 31 or
> 0 to 63.
> Given that, sign or zero extension of that result are the same and
> are actually already performed by bsrl or xorl instructions.
> And constant - __builtin_clz* can be simplified into
> bsr + constant - bitmask.
> For TARGET_LZCNT, a lot of this is already fine as is (e.g. the sign or
> zero extensions), and other optimizations are IMHO not possible
> (if we have lzcnt, we've lost information on whether it is UB at
> zero or not and so can't transform it into bsr even when that is
> 1-2 insns shorter).
> The changes on the 3 testcases between unpatched and patched gcc
> are for -m64:
> pr78103-1.s:
>         bsrq    %rdi, %rax
> -       xorq    $63, %rax
> -       cltq
> +       xorl    $63, %eax
> ...
>         bsrq    %rdi, %rax
> -       xorq    $63, %rax
> -       cltq
> +       xorl    $63, %eax
> ...
>         bsrl    %edi, %eax
>         xorl    $31, %eax
> -       cltq
> ...
>         bsrl    %edi, %eax
>         xorl    $31, %eax
> -       cltq
> pr78103-2.s:
>         bsrl    %edi, %edi
> -       movl    $32, %eax
> -       xorl    $31, %edi
> -       subl    %edi, %eax
> +       leal    1(%rdi), %eax
> ...
> -       bsrl    %edi, %edi
> -       movl    $31, %eax
> -       xorl    $31, %edi
> -       subl    %edi, %eax
> +       bsrl    %edi, %eax
> ...
>         bsrq    %rdi, %rdi
> -       movl    $64, %eax
> -       xorq    $63, %rdi
> -       subl    %edi, %eax
> +       leal    1(%rdi), %eax
> ...
> -       bsrq    %rdi, %rdi
> -       movl    $63, %eax
> -       xorq    $63, %rdi
> -       subl    %edi, %eax
> +       bsrq    %rdi, %rax
> pr78103-3.s:
>         bsrl    %edi, %edi
> -       movl    $32, %eax
> -       xorl    $31, %edi
> -       movslq  %edi, %rdi
> -       subq    %rdi, %rax
> +       leaq    1(%rdi), %rax
> ...
> -       bsrl    %edi, %edi
> -       movl    $31, %eax
> -       xorl    $31, %edi
> -       movslq  %edi, %rdi
> -       subq    %rdi, %rax
> +       bsrl    %edi, %eax
> ...
>         bsrq    %rdi, %rdi
> -       movl    $64, %eax
> -       xorq    $63, %rdi
> -       movslq  %edi, %rdi
> -       subq    %rdi, %rax
> +       leaq    1(%rdi), %rax
> ...
> -       bsrq    %rdi, %rdi
> -       movl    $63, %eax
> -       xorq    $63, %rdi
> -       movslq  %edi, %rdi
> -       subq    %rdi, %rax
> +       bsrq    %rdi, %rax
>
> Most of the changes are done with combine splitters, but for
> *bsr_rex64_2 and *bsr_2 I had to use define_insn_and_split, because
> as mentioned in the PR the combiner unfortunately doesn't create LOG_LINKS
> in between the two insns created by combine splitter, so it can't be
> combined further with following instructions.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-07-28  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/78103
>         * config/i386/i386.md (*bsr_rex64_1, *bsr_1, *bsr_zext_1): New
>         define_insn patterns.
>         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
>         Add combine splitters for constant - clz.
>         (clz<mode>2): Use a temporary pseudo for bsr result.
>
>         * gcc.target/i386/pr78103-1.c: New test.
>         * gcc.target/i386/pr78103-2.c: New test.
>         * gcc.target/i386/pr78103-3.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2021-07-27 09:47:30.311970004 +0200
> +++ gcc/config/i386/i386.md     2021-07-27 15:37:59.011394624 +0200
> @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "DI")])
>
> +(define_insn "*bsr_rex64_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (minus:DI (const_int 63)
> +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{q}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "DI")])
> +
>  (define_insn "bsr"
>    [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> @@ -14775,17 +14787,210 @@ (define_insn "bsr"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "SI")])
>
> +(define_insn "*bsr_1"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +       (minus:SI (const_int 31)
> +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT"
> +  "bsr{l}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +(define_insn "*bsr_zext_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (minus:SI
> +           (const_int 31)
> +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{l}\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +; As bsr is undefined behavior on zero and for other input
> +; values it is in range 0 to 63, we can optimize away sign-extends.
> +(define_insn_and_split "*bsr_rex64_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (xor:DI
> +         (sign_extend:DI
> +           (minus:SI
> +             (const_int 63)
> +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                        0)))
> +         (const_int 63)))
> +    (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +{
> +  operands[2] = gen_reg_rtx (DImode);
> +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> +})
> +
> +(define_insn_and_split "*bsr_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (sign_extend:DI
> +         (xor:SI
> +           (minus:SI
> +             (const_int 31)
> +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +  "operands[2] = gen_reg_rtx (SImode);")
> +
> +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> +; in [0, 63] or [0, 31] range.
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 63)
> +                     (subreg:SI
> +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                       0))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3) (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0) (plus:SI (match_dup 5) (match_dup 4)))]

Please put some space here, e.g.:

[(set (match_dup 3)
        (minus:DI (const_int 63) (clz:DI (match_dup 1))))
 (set (match_dup 0)
        (plus:SI (match_dup 5) (match_dup 4)))]

OK with other patterns changed in a similar way.

> +{
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      rtx tem = gen_rtx_CLZ (DImode, operands[1]);
> +      tem = gen_rtx_MINUS (DImode, operands[2], tem);
> +      tem = gen_rtx_SET (operands[3], tem);
> +      emit_insn (tem);

Can you just name the relevant insn pattern and use

emit_insn (gen_bsr_1)?

This is much more friendly and descriptive to readers. (The combiner
adds clobber automatically, but only after it fails to recognize insn
without clobber. We don't have insn without clobber, so we can help
the combiner a bit.)

Thanks,
Uros.

> +      emit_move_insn (operands[0], operands[5]);
> +      DONE;
> +    }
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 31)
> +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))]
> +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3) (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> +   (set (match_dup 0) (plus:SI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      rtx tem = gen_rtx_CLZ (SImode, operands[1]);
> +      tem = gen_rtx_MINUS (SImode, operands[2], tem);
> +      tem = gen_rtx_SET (operands[0], tem);
> +      emit_insn (tem);
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (SImode);
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (xor:DI
> +           (sign_extend:DI
> +             (minus:SI (const_int 63)
> +                       (subreg:SI
> +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                         0)))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> +       == UINTVAL (operands[2]) - 63)"
> +  [(set (match_dup 3) (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0) (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      rtx tem = gen_rtx_CLZ (DImode, operands[1]);
> +      tem = gen_rtx_MINUS (DImode, operands[2], tem);
> +      tem = gen_rtx_SET (operands[0], tem);
> +      emit_insn (tem);
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (sign_extend:DI
> +           (xor:SI
> +             (minus:SI (const_int 31)
> +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +             (const_int 31)))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> +       == UINTVAL (operands[2]) - 31)"
> +  [(set (match_dup 3)
> +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> +   (set (match_dup 0) (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      rtx tem = gen_rtx_CLZ (SImode, operands[1]);
> +      tem = gen_rtx_MINUS (SImode, operands[2], tem);
> +      tem = gen_rtx_ZERO_EXTEND (DImode, tem);
> +      tem = gen_rtx_SET (operands[0], tem);
> +      emit_insn (tem);
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> +})
> +
>  (define_expand "clz<mode>2"
>    [(parallel
>       [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
>                      (const_int 0)))
> -      (set (match_operand:SWI48 0 "register_operand")
> -          (minus:SWI48
> -            (match_dup 2)
> -            (clz:SWI48 (match_dup 1))))])
> +      (set (match_dup 3) (minus:SWI48
> +                          (match_dup 2)
> +                          (clz:SWI48 (match_dup 1))))])
>     (parallel
> -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> +     [(set (match_operand:SWI48 0 "register_operand")
> +          (xor:SWI48 (match_dup 3) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>    ""
>  {
> @@ -14795,6 +15000,7 @@ (define_expand "clz<mode>2"
>        DONE;
>      }
>    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> +  operands[3] = gen_reg_rtx (<MODE>mode);
>  })
>
>  (define_insn_and_split "clz<mode>2_lzcnt"
> --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-27 10:29:13.278547362 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-27 10:29:13.278547362 +0200
> @@ -0,0 +1,28 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> +
> +long long
> +foo (long long x)
> +{
> +  return __builtin_clzll (x);
> +}
> +
> +long long
> +bar (long long x)
> +{
> +  return (unsigned int) __builtin_clzll (x);
> +}
> +
> +long long
> +baz (int x)
> +{
> +  return __builtin_clz (x);
> +}
> +
> +long long
> +qux (int x)
> +{
> +  return (unsigned int) __builtin_clz (x);
> +}
> --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-27 10:29:13.278547362 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-27 10:29:13.278547362 +0200
> @@ -0,0 +1,33 @@
> +/* PR target/78103 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> +
> +unsigned int
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned int
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +#ifdef __x86_64__
> +unsigned int
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned int
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
> +#endif
> --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-27 15:45:51.421113690 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-27 15:45:35.678323393 +0200
> @@ -0,0 +1,32 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> +
> +unsigned long long
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned long long
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
>
>         Jakub
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
  2021-07-30 10:27 ` Uros Bizjak
@ 2021-07-30 13:26   ` Jakub Jelinek
  2021-07-31 19:38     ` H.J. Lu
  2021-07-31 19:53     ` H.J. Lu
  0 siblings, 2 replies; 8+ messages in thread
From: Jakub Jelinek @ 2021-07-30 13:26 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> Please put some space here, e.g.:
...
> Can you just name the relevant insn pattern and use
> 
> emit_insn (gen_bsr_1)?

Here is the updated patch.  I'll bootstrap/regtest it tonight.

2021-07-30  Jakub Jelinek  <jakub@redhat.com>

	PR target/78103
	* config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
	define_insn patterns.
	(*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
	Add combine splitters for constant - clz.
	(clz<mode>2): Use a temporary pseudo for bsr result.

	* gcc.target/i386/pr78103-1.c: New test.
	* gcc.target/i386/pr78103-2.c: New test.
	* gcc.target/i386/pr78103-3.c: New test.

--- gcc/config/i386/i386.md.jj	2021-07-28 12:05:56.857977764 +0200
+++ gcc/config/i386/i386.md	2021-07-30 15:13:49.994946550 +0200
@@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "DI")])
 
+(define_insn "bsr_rex64_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(minus:DI (const_int 63)
+		  (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "DI")])
+
 (define_insn "bsr"
   [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
@@ -14775,17 +14787,204 @@ (define_insn "bsr"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "SI")])
 
+(define_insn "bsr_1"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(minus:SI (const_int 31)
+		  (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT"
+  "bsr{l}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+(define_insn "bsr_zext_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI
+	    (const_int 31)
+	    (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{l}\t{%1, %k0|%k0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+; As bsr is undefined behavior on zero and for other input
+; values it is in range 0 to 63, we can optimize away sign-extends.
+(define_insn_and_split "*bsr_rex64_2"
+  [(set (match_operand:DI 0 "register_operand")
+	(xor:DI
+	  (sign_extend:DI
+	    (minus:SI
+	      (const_int 63)
+	      (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			 0)))
+	  (const_int 63)))
+    (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ (match_dup 1) (const_int 0)))
+	      (set (match_dup 2)
+		   (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+		   (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[2] = gen_reg_rtx (DImode);
+  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
+})
+
+(define_insn_and_split "*bsr_2"
+  [(set (match_operand:DI 0 "register_operand")
+	(sign_extend:DI
+	  (xor:SI
+	    (minus:SI
+	      (const_int 31)
+	      (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	    (const_int 31))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ (match_dup 1) (const_int 0)))
+	      (set (match_dup 2)
+		   (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+		   (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[2] = gen_reg_rtx (SImode);")
+
+; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
+; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
+; in [0, 63] or [0, 31] range.
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(minus:SI
+	  (match_operand:SI 2 "const_int_operand")
+	  (xor:SI
+	    (minus:SI (const_int 63)
+		      (subreg:SI
+			(clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			0))
+	    (const_int 63))))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  [(set (match_dup 3)
+	(minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0)
+	(plus:SI (match_dup 5) (match_dup 4)))]
+{
+  operands[3] = gen_reg_rtx (DImode);
+  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
+  if (INTVAL (operands[2]) == 63)
+    {
+      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
+      emit_move_insn (operands[0], operands[5]);
+      DONE;
+    }
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
+})
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(minus:SI
+	  (match_operand:SI 2 "const_int_operand")
+	  (xor:SI
+	    (minus:SI (const_int 31)
+		      (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	    (const_int 31))))]
+  "!TARGET_LZCNT && ix86_pre_reload_split ()"
+  [(set (match_dup 3)
+	(minus:SI (const_int 31) (clz:SI (match_dup 1))))
+   (set (match_dup 0)
+	(plus:SI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      emit_insn (gen_bsr_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (SImode);
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(minus:DI
+	  (match_operand:DI 2 "const_int_operand")
+	  (xor:DI
+	    (sign_extend:DI
+	      (minus:SI (const_int 63)
+			(subreg:SI
+			  (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+			  0)))
+	    (const_int 63))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
+       == UINTVAL (operands[2]) - 63)"
+  [(set (match_dup 3)
+	(minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0)
+	(plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 63)
+    {
+      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(minus:DI
+	  (match_operand:DI 2 "const_int_operand")
+	  (sign_extend:DI
+	    (xor:SI
+	      (minus:SI (const_int 31)
+			(clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+	      (const_int 31)))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
+       == UINTVAL (operands[2]) - 31)"
+  [(set (match_dup 3)
+	(zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
+   (set (match_dup 0)
+	(plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
+})
+
 (define_expand "clz<mode>2"
   [(parallel
      [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
 		     (const_int 0)))
-      (set (match_operand:SWI48 0 "register_operand")
-	   (minus:SWI48
-	     (match_dup 2)
-	     (clz:SWI48 (match_dup 1))))])
+      (set (match_dup 3) (minus:SWI48
+			   (match_dup 2)
+			   (clz:SWI48 (match_dup 1))))])
    (parallel
-     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
+     [(set (match_operand:SWI48 0 "register_operand")
+	   (xor:SWI48 (match_dup 3) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
   ""
 {
@@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
       DONE;
     }
   operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
+  operands[3] = gen_reg_rtx (<MODE>mode);
 })
 
 (define_insn_and_split "clz<mode>2_lzcnt"
--- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj	2021-07-30 15:07:26.104139537 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-1.c	2021-07-30 15:07:26.104139537 +0200
@@ -0,0 +1,28 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mcltq\M} } } */
+
+long long
+foo (long long x)
+{
+  return __builtin_clzll (x);
+}
+
+long long
+bar (long long x)
+{
+  return (unsigned int) __builtin_clzll (x);
+}
+
+long long
+baz (int x)
+{
+  return __builtin_clz (x);
+}
+
+long long
+qux (int x)
+{
+  return (unsigned int) __builtin_clz (x);
+}
--- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj	2021-07-30 15:07:26.104139537 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-2.c	2021-07-30 15:07:26.104139537 +0200
@@ -0,0 +1,33 @@
+/* PR target/78103 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubl\M} } } */
+/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
+
+unsigned int
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned int
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+#ifdef __x86_64__
+unsigned int
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned int
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}
+#endif
--- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj	2021-07-30 15:07:26.104139537 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-3.c	2021-07-30 15:07:26.104139537 +0200
@@ -0,0 +1,32 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubq\M} } } */
+/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
+
+unsigned long long
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned long long
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+unsigned long long
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned long long
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}


	Jakub


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
  2021-07-30 13:26   ` Jakub Jelinek
@ 2021-07-31 19:38     ` H.J. Lu
  2021-07-31 19:42       ` H.J. Lu
  2021-07-31 19:53     ` H.J. Lu
  1 sibling, 1 reply; 8+ messages in thread
From: H.J. Lu @ 2021-07-31 19:38 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > Please put some space here, e.g.:
> ...
> > Can you just name the relevant insn pattern and use
> >
> > emit_insn (gen_bsr_1)?
>
> Here is the updated patch.  I'll bootstrap/regtest it tonight.
>
> 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/78103
>         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
>         define_insn patterns.
>         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
>         Add combine splitters for constant - clz.
>         (clz<mode>2): Use a temporary pseudo for bsr result.
>
>         * gcc.target/i386/pr78103-1.c: New test.
>         * gcc.target/i386/pr78103-2.c: New test.
>         * gcc.target/i386/pr78103-3.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "DI")])
>
> +(define_insn "bsr_rex64_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (minus:DI (const_int 63)
> +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{q}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "DI")])
> +
>  (define_insn "bsr"
>    [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> @@ -14775,17 +14787,204 @@ (define_insn "bsr"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "SI")])
>
> +(define_insn "bsr_1"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +       (minus:SI (const_int 31)
> +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT"
> +  "bsr{l}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +(define_insn "bsr_zext_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (minus:SI
> +           (const_int 31)
> +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{l}\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +; As bsr is undefined behavior on zero and for other input
> +; values it is in range 0 to 63, we can optimize away sign-extends.
> +(define_insn_and_split "*bsr_rex64_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (xor:DI
> +         (sign_extend:DI
> +           (minus:SI
> +             (const_int 63)
> +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                        0)))
> +         (const_int 63)))
> +    (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +{
> +  operands[2] = gen_reg_rtx (DImode);
> +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> +})
> +
> +(define_insn_and_split "*bsr_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (sign_extend:DI
> +         (xor:SI
> +           (minus:SI
> +             (const_int 31)
> +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +  "operands[2] = gen_reg_rtx (SImode);")
> +
> +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> +; in [0, 63] or [0, 31] range.
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 63)
> +                     (subreg:SI
> +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                       0))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3)
> +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:SI (match_dup 5) (match_dup 4)))]
> +{
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> +      emit_move_insn (operands[0], operands[5]);
> +      DONE;
> +    }
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 31)
> +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))]
> +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3)
> +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:SI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (SImode);
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (xor:DI
> +           (sign_extend:DI
> +             (minus:SI (const_int 63)
> +                       (subreg:SI
> +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                         0)))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> +       == UINTVAL (operands[2]) - 63)"
> +  [(set (match_dup 3)
> +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (sign_extend:DI
> +           (xor:SI
> +             (minus:SI (const_int 31)
> +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +             (const_int 31)))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> +       == UINTVAL (operands[2]) - 31)"
> +  [(set (match_dup 3)
> +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> +   (set (match_dup 0)
> +       (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> +})
> +
>  (define_expand "clz<mode>2"
>    [(parallel
>       [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
>                      (const_int 0)))
> -      (set (match_operand:SWI48 0 "register_operand")
> -          (minus:SWI48
> -            (match_dup 2)
> -            (clz:SWI48 (match_dup 1))))])
> +      (set (match_dup 3) (minus:SWI48
> +                          (match_dup 2)
> +                          (clz:SWI48 (match_dup 1))))])
>     (parallel
> -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> +     [(set (match_operand:SWI48 0 "register_operand")
> +          (xor:SWI48 (match_dup 3) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>    ""
>  {
> @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
>        DONE;
>      }
>    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> +  operands[3] = gen_reg_rtx (<MODE>mode);
>  })
>
>  (define_insn_and_split "clz<mode>2_lzcnt"
> --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,28 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> +
> +long long
> +foo (long long x)
> +{
> +  return __builtin_clzll (x);
> +}
> +
> +long long
> +bar (long long x)
> +{
> +  return (unsigned int) __builtin_clzll (x);
> +}
> +
> +long long
> +baz (int x)
> +{
> +  return __builtin_clz (x);
> +}
> +
> +long long
> +qux (int x)
> +{
> +  return (unsigned int) __builtin_clz (x);
> +}
> --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,33 @@
> +/* PR target/78103 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
                                                           ^^^^^^^^ It
should also allow incl, like "incl %eax" for -m32.
> +
> +unsigned int
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned int
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +#ifdef __x86_64__
> +unsigned int
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned int
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
> +#endif
> --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,32 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> +
> +unsigned long long
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned long long
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
>
>
>         Jakub
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
  2021-07-31 19:38     ` H.J. Lu
@ 2021-07-31 19:42       ` H.J. Lu
  0 siblings, 0 replies; 8+ messages in thread
From: H.J. Lu @ 2021-07-31 19:42 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On Sat, Jul 31, 2021 at 12:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > > Please put some space here, e.g.:
> > ...
> > > Can you just name the relevant insn pattern and use
> > >
> > > emit_insn (gen_bsr_1)?
> >
> > Here is the updated patch.  I'll bootstrap/regtest it tonight.
> >
> > 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
> >
> >         PR target/78103
> >         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
> >         define_insn patterns.
> >         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
> >         Add combine splitters for constant - clz.
> >         (clz<mode>2): Use a temporary pseudo for bsr result.
> >
> >         * gcc.target/i386/pr78103-1.c: New test.
> >         * gcc.target/i386/pr78103-2.c: New test.
> >         * gcc.target/i386/pr78103-3.c: New test.
> >
> > --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> > +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> > @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
> >     (set_attr "znver1_decode" "vector")
> >     (set_attr "mode" "DI")])
> >
> > +(define_insn "bsr_rex64_1"
> > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > +       (minus:DI (const_int 63)
> > +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT"
> > +  "bsr{q}\t{%1, %0|%0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "DI")])
> > +
> >  (define_insn "bsr"
> >    [(set (reg:CCZ FLAGS_REG)
> >         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> > @@ -14775,17 +14787,204 @@ (define_insn "bsr"
> >     (set_attr "znver1_decode" "vector")
> >     (set_attr "mode" "SI")])
> >
> > +(define_insn "bsr_1"
> > +  [(set (match_operand:SI 0 "register_operand" "=r")
> > +       (minus:SI (const_int 31)
> > +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT"
> > +  "bsr{l}\t{%1, %0|%0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "SI")])
> > +
> > +(define_insn "bsr_zext_1"
> > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > +       (zero_extend:DI
> > +         (minus:SI
> > +           (const_int 31)
> > +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT"
> > +  "bsr{l}\t{%1, %k0|%k0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "SI")])
> > +
> > +; As bsr is undefined behavior on zero and for other input
> > +; values it is in range 0 to 63, we can optimize away sign-extends.
> > +(define_insn_and_split "*bsr_rex64_2"
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (xor:DI
> > +         (sign_extend:DI
> > +           (minus:SI
> > +             (const_int 63)
> > +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                        0)))
> > +         (const_int 63)))
> > +    (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > +             (set (match_dup 2)
> > +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> > +   (parallel [(set (match_dup 0)
> > +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> > +             (clobber (reg:CC FLAGS_REG))])]
> > +{
> > +  operands[2] = gen_reg_rtx (DImode);
> > +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> > +})
> > +
> > +(define_insn_and_split "*bsr_2"
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (sign_extend:DI
> > +         (xor:SI
> > +           (minus:SI
> > +             (const_int 31)
> > +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +           (const_int 31))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > +             (set (match_dup 2)
> > +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> > +   (parallel [(set (match_dup 0)
> > +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> > +             (clobber (reg:CC FLAGS_REG))])]
> > +  "operands[2] = gen_reg_rtx (SImode);")
> > +
> > +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> > +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> > +; in [0, 63] or [0, 31] range.
> > +(define_split
> > +  [(set (match_operand:SI 0 "register_operand")
> > +       (minus:SI
> > +         (match_operand:SI 2 "const_int_operand")
> > +         (xor:SI
> > +           (minus:SI (const_int 63)
> > +                     (subreg:SI
> > +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                       0))
> > +           (const_int 63))))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  [(set (match_dup 3)
> > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:SI (match_dup 5) (match_dup 4)))]
> > +{
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> > +  if (INTVAL (operands[2]) == 63)
> > +    {
> > +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> > +      emit_move_insn (operands[0], operands[5]);
> > +      DONE;
> > +    }
> > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:SI 0 "register_operand")
> > +       (minus:SI
> > +         (match_operand:SI 2 "const_int_operand")
> > +         (xor:SI
> > +           (minus:SI (const_int 31)
> > +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +           (const_int 31))))]
> > +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> > +  [(set (match_dup 3)
> > +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:SI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 31)
> > +    {
> > +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (SImode);
> > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (minus:DI
> > +         (match_operand:DI 2 "const_int_operand")
> > +         (xor:DI
> > +           (sign_extend:DI
> > +             (minus:SI (const_int 63)
> > +                       (subreg:SI
> > +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                         0)))
> > +           (const_int 63))))]
> > +  "!TARGET_LZCNT
> > +   && TARGET_64BIT
> > +   && ix86_pre_reload_split ()
> > +   && ((unsigned HOST_WIDE_INT)
> > +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> > +       == UINTVAL (operands[2]) - 63)"
> > +  [(set (match_dup 3)
> > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 63)
> > +    {
> > +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (minus:DI
> > +         (match_operand:DI 2 "const_int_operand")
> > +         (sign_extend:DI
> > +           (xor:SI
> > +             (minus:SI (const_int 31)
> > +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +             (const_int 31)))))]
> > +  "!TARGET_LZCNT
> > +   && TARGET_64BIT
> > +   && ix86_pre_reload_split ()
> > +   && ((unsigned HOST_WIDE_INT)
> > +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> > +       == UINTVAL (operands[2]) - 31)"
> > +  [(set (match_dup 3)
> > +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> > +   (set (match_dup 0)
> > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 31)
> > +    {
> > +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> > +})
> > +
> >  (define_expand "clz<mode>2"
> >    [(parallel
> >       [(set (reg:CCZ FLAGS_REG)
> >         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> >                      (const_int 0)))
> > -      (set (match_operand:SWI48 0 "register_operand")
> > -          (minus:SWI48
> > -            (match_dup 2)
> > -            (clz:SWI48 (match_dup 1))))])
> > +      (set (match_dup 3) (minus:SWI48
> > +                          (match_dup 2)
> > +                          (clz:SWI48 (match_dup 1))))])
> >     (parallel
> > -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> > +     [(set (match_operand:SWI48 0 "register_operand")
> > +          (xor:SWI48 (match_dup 3) (match_dup 2)))
> >        (clobber (reg:CC FLAGS_REG))])]
> >    ""
> >  {
> > @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
> >        DONE;
> >      }
> >    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> > +  operands[3] = gen_reg_rtx (<MODE>mode);
> >  })
> >
> >  (define_insn_and_split "clz<mode>2_lzcnt"
> > --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,28 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> > +
> > +long long
> > +foo (long long x)
> > +{
> > +  return __builtin_clzll (x);
> > +}
> > +
> > +long long
> > +bar (long long x)
> > +{
> > +  return (unsigned int) __builtin_clzll (x);
> > +}
> > +
> > +long long
> > +baz (int x)
> > +{
> > +  return __builtin_clz (x);
> > +}
> > +
> > +long long
> > +qux (int x)
> > +{
> > +  return (unsigned int) __builtin_clz (x);
> > +}
> > --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,33 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> > +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
>                                                            ^^^^^^^^ It
> should also allow incl, like "incl %eax" for -m32.

Like

diff --git a/gcc/testsuite/gcc.target/i386/pr78103-2.c
b/gcc/testsuite/gcc.target/i386/pr78103-2.c
index b3523382926..30f7f98f60a 100644
--- a/gcc/testsuite/gcc.target/i386/pr78103-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr78103-2.c
@@ -4,7 +4,7 @@
 /* { dg-final { scan-assembler-not {\mmovl\M} } } */
 /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
 /* { dg-final { scan-assembler-not {\msubl\M} } } */
-/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
+/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} } } */

 unsigned int
 foo (unsigned int x)

-- 
H.J.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
  2021-07-30 13:26   ` Jakub Jelinek
  2021-07-31 19:38     ` H.J. Lu
@ 2021-07-31 19:53     ` H.J. Lu
  2021-08-01 17:12       ` [PATCH] i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt H.J. Lu
  1 sibling, 1 reply; 8+ messages in thread
From: H.J. Lu @ 2021-07-31 19:53 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > Please put some space here, e.g.:
> ...
> > Can you just name the relevant insn pattern and use
> >
> > emit_insn (gen_bsr_1)?
>
> Here is the updated patch.  I'll bootstrap/regtest it tonight.
>
> 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/78103
>         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
>         define_insn patterns.
>         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
>         Add combine splitters for constant - clz.
>         (clz<mode>2): Use a temporary pseudo for bsr result.
>
>         * gcc.target/i386/pr78103-1.c: New test.
>         * gcc.target/i386/pr78103-2.c: New test.
>         * gcc.target/i386/pr78103-3.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "DI")])
>
> +(define_insn "bsr_rex64_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (minus:DI (const_int 63)
> +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{q}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "DI")])
> +
>  (define_insn "bsr"
>    [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> @@ -14775,17 +14787,204 @@ (define_insn "bsr"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "SI")])
>
> +(define_insn "bsr_1"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +       (minus:SI (const_int 31)
> +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT"
> +  "bsr{l}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +(define_insn "bsr_zext_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (minus:SI
> +           (const_int 31)
> +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{l}\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +; As bsr is undefined behavior on zero and for other input
> +; values it is in range 0 to 63, we can optimize away sign-extends.
> +(define_insn_and_split "*bsr_rex64_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (xor:DI
> +         (sign_extend:DI
> +           (minus:SI
> +             (const_int 63)
> +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                        0)))
> +         (const_int 63)))
> +    (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +{
> +  operands[2] = gen_reg_rtx (DImode);
> +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> +})
> +
> +(define_insn_and_split "*bsr_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (sign_extend:DI
> +         (xor:SI
> +           (minus:SI
> +             (const_int 31)
> +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +  "operands[2] = gen_reg_rtx (SImode);")
> +
> +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> +; in [0, 63] or [0, 31] range.
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 63)
> +                     (subreg:SI
> +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                       0))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3)
> +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:SI (match_dup 5) (match_dup 4)))]
> +{
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> +      emit_move_insn (operands[0], operands[5]);
> +      DONE;
> +    }
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 31)
> +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))]
> +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3)
> +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:SI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (SImode);
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (xor:DI
> +           (sign_extend:DI
> +             (minus:SI (const_int 63)
> +                       (subreg:SI
> +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                         0)))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> +       == UINTVAL (operands[2]) - 63)"
> +  [(set (match_dup 3)
> +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (sign_extend:DI
> +           (xor:SI
> +             (minus:SI (const_int 31)
> +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +             (const_int 31)))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> +       == UINTVAL (operands[2]) - 31)"
> +  [(set (match_dup 3)
> +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> +   (set (match_dup 0)
> +       (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> +})
> +
>  (define_expand "clz<mode>2"
>    [(parallel
>       [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
>                      (const_int 0)))
> -      (set (match_operand:SWI48 0 "register_operand")
> -          (minus:SWI48
> -            (match_dup 2)
> -            (clz:SWI48 (match_dup 1))))])
> +      (set (match_dup 3) (minus:SWI48
> +                          (match_dup 2)
> +                          (clz:SWI48 (match_dup 1))))])
>     (parallel
> -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> +     [(set (match_operand:SWI48 0 "register_operand")
> +          (xor:SWI48 (match_dup 3) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>    ""
>  {
> @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
>        DONE;
>      }
>    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> +  operands[3] = gen_reg_rtx (<MODE>mode);
>  })
>
>  (define_insn_and_split "clz<mode>2_lzcnt"
> --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,28 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> +
> +long long
> +foo (long long x)
> +{
> +  return __builtin_clzll (x);
> +}
> +
> +long long
> +bar (long long x)
> +{
> +  return (unsigned int) __builtin_clzll (x);
> +}
> +
> +long long
> +baz (int x)
> +{
> +  return __builtin_clz (x);
> +}
> +
> +long long
> +qux (int x)
> +{
> +  return (unsigned int) __builtin_clz (x);
> +}
> --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,33 @@
> +/* PR target/78103 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> +
> +unsigned int
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned int
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +#ifdef __x86_64__
> +unsigned int
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned int
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
> +#endif
> --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,32 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */

It fails for -mx32:

@@ -7,7 +7,7 @@ foo:
 .LFB0:
  .cfi_startproc
  bsrl %edi, %edi
- leaq 1(%rdi), %rax
+ leal 1(%rdi), %eax  << This should also work for -m64.  Why isn't it
used for -m64?
  ret
  .cfi_endproc
 .LFE0:
@@ -30,7 +30,7 @@ baz:
 .LFB2:
  .cfi_startproc
  bsrq %rdi, %rdi
- leaq 1(%rdi), %rax
+ leal 1(%rdi), %eax   << This should also work for -m64.  Why isn't
it used for -m64?
  ret
  .cfi_endproc
 .LFE2:
@@ -42,6 +42,7 @@ qux:
 .LFB3:
  .cfi_startproc
  bsrq %rdi, %rax
+ movl %eax, %eax  << Why is it generated for -mx32?
  ret
  .cfi_endproc
 .LFE3:

> +unsigned long long
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned long long
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
>
>
>         Jakub
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt
  2021-07-31 19:53     ` H.J. Lu
@ 2021-08-01 17:12       ` H.J. Lu
  2021-08-01 19:20         ` Uros Bizjak
  0 siblings, 1 reply; 8+ messages in thread
From: H.J. Lu @ 2021-08-01 17:12 UTC (permalink / raw)
  To: Jakub Jelinek, Uros Bizjak, gcc-patches

On Sat, Jul 31, 2021 at 12:53:44PM -0700, H.J. Lu wrote:
> On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > > Please put some space here, e.g.:
> > ...
> > > Can you just name the relevant insn pattern and use
> > >
> > > emit_insn (gen_bsr_1)?
> >
> > Here is the updated patch.  I'll bootstrap/regtest it tonight.
> >
> > 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
> >
> >         PR target/78103
> >         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
> >         define_insn patterns.
> >         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
> >         Add combine splitters for constant - clz.
> >         (clz<mode>2): Use a temporary pseudo for bsr result.
> >
> >         * gcc.target/i386/pr78103-1.c: New test.
> >         * gcc.target/i386/pr78103-2.c: New test.
> >         * gcc.target/i386/pr78103-3.c: New test.
> >
> > --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> > +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> > @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
> >     (set_attr "znver1_decode" "vector")
> >     (set_attr "mode" "DI")])
> >
> > +(define_insn "bsr_rex64_1"
> > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > +       (minus:DI (const_int 63)
> > +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT"
> > +  "bsr{q}\t{%1, %0|%0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "DI")])
> > +
> >  (define_insn "bsr"
> >    [(set (reg:CCZ FLAGS_REG)
> >         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> > @@ -14775,17 +14787,204 @@ (define_insn "bsr"
> >     (set_attr "znver1_decode" "vector")
> >     (set_attr "mode" "SI")])
> >
> > +(define_insn "bsr_1"
> > +  [(set (match_operand:SI 0 "register_operand" "=r")
> > +       (minus:SI (const_int 31)
> > +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT"
> > +  "bsr{l}\t{%1, %0|%0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "SI")])
> > +
> > +(define_insn "bsr_zext_1"
> > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > +       (zero_extend:DI
> > +         (minus:SI
> > +           (const_int 31)
> > +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT"
> > +  "bsr{l}\t{%1, %k0|%k0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "SI")])
> > +
> > +; As bsr is undefined behavior on zero and for other input
> > +; values it is in range 0 to 63, we can optimize away sign-extends.
> > +(define_insn_and_split "*bsr_rex64_2"
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (xor:DI
> > +         (sign_extend:DI
> > +           (minus:SI
> > +             (const_int 63)
> > +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                        0)))
> > +         (const_int 63)))
> > +    (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > +             (set (match_dup 2)
> > +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> > +   (parallel [(set (match_dup 0)
> > +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> > +             (clobber (reg:CC FLAGS_REG))])]
> > +{
> > +  operands[2] = gen_reg_rtx (DImode);
> > +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> > +})
> > +
> > +(define_insn_and_split "*bsr_2"
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (sign_extend:DI
> > +         (xor:SI
> > +           (minus:SI
> > +             (const_int 31)
> > +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +           (const_int 31))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > +             (set (match_dup 2)
> > +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> > +   (parallel [(set (match_dup 0)
> > +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> > +             (clobber (reg:CC FLAGS_REG))])]
> > +  "operands[2] = gen_reg_rtx (SImode);")
> > +
> > +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> > +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> > +; in [0, 63] or [0, 31] range.
> > +(define_split
> > +  [(set (match_operand:SI 0 "register_operand")
> > +       (minus:SI
> > +         (match_operand:SI 2 "const_int_operand")
> > +         (xor:SI
> > +           (minus:SI (const_int 63)
> > +                     (subreg:SI
> > +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                       0))
> > +           (const_int 63))))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  [(set (match_dup 3)
> > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:SI (match_dup 5) (match_dup 4)))]
> > +{
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> > +  if (INTVAL (operands[2]) == 63)
> > +    {
> > +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> > +      emit_move_insn (operands[0], operands[5]);
> > +      DONE;
> > +    }
> > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:SI 0 "register_operand")
> > +       (minus:SI
> > +         (match_operand:SI 2 "const_int_operand")
> > +         (xor:SI
> > +           (minus:SI (const_int 31)
> > +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +           (const_int 31))))]
> > +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> > +  [(set (match_dup 3)
> > +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:SI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 31)
> > +    {
> > +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (SImode);
> > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (minus:DI
> > +         (match_operand:DI 2 "const_int_operand")
> > +         (xor:DI
> > +           (sign_extend:DI
> > +             (minus:SI (const_int 63)
> > +                       (subreg:SI
> > +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                         0)))
> > +           (const_int 63))))]
> > +  "!TARGET_LZCNT
> > +   && TARGET_64BIT
> > +   && ix86_pre_reload_split ()
> > +   && ((unsigned HOST_WIDE_INT)
> > +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> > +       == UINTVAL (operands[2]) - 63)"
> > +  [(set (match_dup 3)
> > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 63)
> > +    {
> > +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (minus:DI
> > +         (match_operand:DI 2 "const_int_operand")
> > +         (sign_extend:DI
> > +           (xor:SI
> > +             (minus:SI (const_int 31)
> > +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +             (const_int 31)))))]
> > +  "!TARGET_LZCNT
> > +   && TARGET_64BIT
> > +   && ix86_pre_reload_split ()
> > +   && ((unsigned HOST_WIDE_INT)
> > +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> > +       == UINTVAL (operands[2]) - 31)"
> > +  [(set (match_dup 3)
> > +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> > +   (set (match_dup 0)
> > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 31)
> > +    {
> > +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> > +})
> > +
> >  (define_expand "clz<mode>2"
> >    [(parallel
> >       [(set (reg:CCZ FLAGS_REG)
> >         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> >                      (const_int 0)))
> > -      (set (match_operand:SWI48 0 "register_operand")
> > -          (minus:SWI48
> > -            (match_dup 2)
> > -            (clz:SWI48 (match_dup 1))))])
> > +      (set (match_dup 3) (minus:SWI48
> > +                          (match_dup 2)
> > +                          (clz:SWI48 (match_dup 1))))])
> >     (parallel
> > -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> > +     [(set (match_operand:SWI48 0 "register_operand")
> > +          (xor:SWI48 (match_dup 3) (match_dup 2)))
> >        (clobber (reg:CC FLAGS_REG))])]
> >    ""
> >  {
> > @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
> >        DONE;
> >      }
> >    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> > +  operands[3] = gen_reg_rtx (<MODE>mode);
> >  })
> >
> >  (define_insn_and_split "clz<mode>2_lzcnt"
> > --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,28 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> > +
> > +long long
> > +foo (long long x)
> > +{
> > +  return __builtin_clzll (x);
> > +}
> > +
> > +long long
> > +bar (long long x)
> > +{
> > +  return (unsigned int) __builtin_clzll (x);
> > +}
> > +
> > +long long
> > +baz (int x)
> > +{
> > +  return __builtin_clz (x);
> > +}
> > +
> > +long long
> > +qux (int x)
> > +{
> > +  return (unsigned int) __builtin_clz (x);
> > +}
> > --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,33 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> > +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> > +
> > +unsigned int
> > +foo (unsigned int x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> > +}
> > +
> > +unsigned int
> > +bar (unsigned int x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> > +}
> > +
> > +#ifdef __x86_64__
> > +unsigned int
> > +baz (unsigned long long x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> > +}
> > +
> > +unsigned int
> > +qux (unsigned long long x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> > +}
> > +#endif
> > --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,32 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> > +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> 
> It fails for -mx32:
> 
> @@ -7,7 +7,7 @@ foo:
>  .LFB0:
>   .cfi_startproc
>   bsrl %edi, %edi
> - leaq 1(%rdi), %rax
> + leal 1(%rdi), %eax  << This should also work for -m64.  Why isn't it
> used for -m64?
>   ret
>   .cfi_endproc
>  .LFE0:
> @@ -30,7 +30,7 @@ baz:
>  .LFB2:
>   .cfi_startproc
>   bsrq %rdi, %rdi
> - leaq 1(%rdi), %rax
> + leal 1(%rdi), %eax   << This should also work for -m64.  Why isn't
> it used for -m64?
>   ret
>   .cfi_endproc
>  .LFE2:
> @@ -42,6 +42,7 @@ qux:
>  .LFB3:
>   .cfi_startproc
>   bsrq %rdi, %rax
> + movl %eax, %eax  << Why is it generated for -mx32?
>   ret
>   .cfi_endproc
>  .LFE3:
> 

Add a zero_extend patten for bsr_rex64_1 and use it to split SImode
constant - __builtin_clzll to avoid unncessary zero_extend.

OK for master?

H.J.
---
gcc/

	PR target/78103
	* config/i386/i386.md (bsr_rex64_1_zext): New.
	(combine splitter for SImode constant - clzll): Replace
	gen_bsr_rex64_1 with gen_bsr_rex64_1_zext.

gcc/testsuite/

	PR target/78103
	* gcc.target/i386/pr78103-2.c: Also scan incl.
	* gcc.target/i386/pr78103-3.c: Scan leal|addl|incl for x32.  Also
	scan incq.
---
 gcc/config/i386/i386.md                   | 17 ++++++++++++++++-
 gcc/testsuite/gcc.target/i386/pr78103-2.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr78103-3.c |  3 ++-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c9787d73262..0c23ddb8d1f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14796,6 +14796,21 @@ (define_insn "bsr_rex64_1"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "DI")])
 
+(define_insn "bsr_rex64_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI (const_int 63)
+		    (subreg:SI
+		      (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))
+		      0))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "DI")])
+
 (define_insn "bsr"
   [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
@@ -14907,7 +14922,7 @@ (define_split
   operands[5] = lowpart_subreg (SImode, operands[3], DImode);
   if (INTVAL (operands[2]) == 63)
     {
-      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
+      emit_insn (gen_bsr_rex64_1_zext (operands[3], operands[1]));
       emit_move_insn (operands[0], operands[5]);
       DONE;
     }
diff --git a/gcc/testsuite/gcc.target/i386/pr78103-2.c b/gcc/testsuite/gcc.target/i386/pr78103-2.c
index b3523382926..30f7f98f60a 100644
--- a/gcc/testsuite/gcc.target/i386/pr78103-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr78103-2.c
@@ -4,7 +4,7 @@
 /* { dg-final { scan-assembler-not {\mmovl\M} } } */
 /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
 /* { dg-final { scan-assembler-not {\msubl\M} } } */
-/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
+/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} } } */
 
 unsigned int
 foo (unsigned int x)
diff --git a/gcc/testsuite/gcc.target/i386/pr78103-3.c b/gcc/testsuite/gcc.target/i386/pr78103-3.c
index 49a36eccf4d..b8d82312a0e 100644
--- a/gcc/testsuite/gcc.target/i386/pr78103-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr78103-3.c
@@ -5,7 +5,8 @@
 /* { dg-final { scan-assembler-not {\mmovslq\M} } } */
 /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
 /* { dg-final { scan-assembler-not {\msubq\M} } } */
-/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
+/* { dg-final { scan-assembler {\m(leaq|addq|incq)\M} { target { ! x32 } } } } */
+/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} { target x32 } } } */
 
 unsigned long long
 foo (unsigned int x)
-- 
2.31.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt
  2021-08-01 17:12       ` [PATCH] i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt H.J. Lu
@ 2021-08-01 19:20         ` Uros Bizjak
  0 siblings, 0 replies; 8+ messages in thread
From: Uros Bizjak @ 2021-08-01 19:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches

On Sun, Aug 1, 2021 at 7:12 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Jul 31, 2021 at 12:53:44PM -0700, H.J. Lu wrote:
> > On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > > > Please put some space here, e.g.:
> > > ...
> > > > Can you just name the relevant insn pattern and use
> > > >
> > > > emit_insn (gen_bsr_1)?
> > >
> > > Here is the updated patch.  I'll bootstrap/regtest it tonight.
> > >
> > > 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
> > >
> > >         PR target/78103
> > >         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
> > >         define_insn patterns.
> > >         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
> > >         Add combine splitters for constant - clz.
> > >         (clz<mode>2): Use a temporary pseudo for bsr result.
> > >
> > >         * gcc.target/i386/pr78103-1.c: New test.
> > >         * gcc.target/i386/pr78103-2.c: New test.
> > >         * gcc.target/i386/pr78103-3.c: New test.
> > >
> > > --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> > > +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> > > @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
> > >     (set_attr "znver1_decode" "vector")
> > >     (set_attr "mode" "DI")])
> > >
> > > +(define_insn "bsr_rex64_1"
> > > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > > +       (minus:DI (const_int 63)
> > > +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT"
> > > +  "bsr{q}\t{%1, %0|%0, %1}"
> > > +  [(set_attr "type" "alu1")
> > > +   (set_attr "prefix_0f" "1")
> > > +   (set_attr "znver1_decode" "vector")
> > > +   (set_attr "mode" "DI")])
> > > +
> > >  (define_insn "bsr"
> > >    [(set (reg:CCZ FLAGS_REG)
> > >         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> > > @@ -14775,17 +14787,204 @@ (define_insn "bsr"
> > >     (set_attr "znver1_decode" "vector")
> > >     (set_attr "mode" "SI")])
> > >
> > > +(define_insn "bsr_1"
> > > +  [(set (match_operand:SI 0 "register_operand" "=r")
> > > +       (minus:SI (const_int 31)
> > > +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT"
> > > +  "bsr{l}\t{%1, %0|%0, %1}"
> > > +  [(set_attr "type" "alu1")
> > > +   (set_attr "prefix_0f" "1")
> > > +   (set_attr "znver1_decode" "vector")
> > > +   (set_attr "mode" "SI")])
> > > +
> > > +(define_insn "bsr_zext_1"
> > > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > > +       (zero_extend:DI
> > > +         (minus:SI
> > > +           (const_int 31)
> > > +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT"
> > > +  "bsr{l}\t{%1, %k0|%k0, %1}"
> > > +  [(set_attr "type" "alu1")
> > > +   (set_attr "prefix_0f" "1")
> > > +   (set_attr "znver1_decode" "vector")
> > > +   (set_attr "mode" "SI")])
> > > +
> > > +; As bsr is undefined behavior on zero and for other input
> > > +; values it is in range 0 to 63, we can optimize away sign-extends.
> > > +(define_insn_and_split "*bsr_rex64_2"
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (xor:DI
> > > +         (sign_extend:DI
> > > +           (minus:SI
> > > +             (const_int 63)
> > > +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > > +                        0)))
> > > +         (const_int 63)))
> > > +    (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > > +             (set (match_dup 2)
> > > +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> > > +   (parallel [(set (match_dup 0)
> > > +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> > > +             (clobber (reg:CC FLAGS_REG))])]
> > > +{
> > > +  operands[2] = gen_reg_rtx (DImode);
> > > +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> > > +})
> > > +
> > > +(define_insn_and_split "*bsr_2"
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (sign_extend:DI
> > > +         (xor:SI
> > > +           (minus:SI
> > > +             (const_int 31)
> > > +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > > +           (const_int 31))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > > +             (set (match_dup 2)
> > > +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> > > +   (parallel [(set (match_dup 0)
> > > +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> > > +             (clobber (reg:CC FLAGS_REG))])]
> > > +  "operands[2] = gen_reg_rtx (SImode);")
> > > +
> > > +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> > > +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> > > +; in [0, 63] or [0, 31] range.
> > > +(define_split
> > > +  [(set (match_operand:SI 0 "register_operand")
> > > +       (minus:SI
> > > +         (match_operand:SI 2 "const_int_operand")
> > > +         (xor:SI
> > > +           (minus:SI (const_int 63)
> > > +                     (subreg:SI
> > > +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > > +                       0))
> > > +           (const_int 63))))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > > +  [(set (match_dup 3)
> > > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > > +   (set (match_dup 0)
> > > +       (plus:SI (match_dup 5) (match_dup 4)))]
> > > +{
> > > +  operands[3] = gen_reg_rtx (DImode);
> > > +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> > > +  if (INTVAL (operands[2]) == 63)
> > > +    {
> > > +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> > > +      emit_move_insn (operands[0], operands[5]);
> > > +      DONE;
> > > +    }
> > > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> > > +})
> > > +
> > > +(define_split
> > > +  [(set (match_operand:SI 0 "register_operand")
> > > +       (minus:SI
> > > +         (match_operand:SI 2 "const_int_operand")
> > > +         (xor:SI
> > > +           (minus:SI (const_int 31)
> > > +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > > +           (const_int 31))))]
> > > +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> > > +  [(set (match_dup 3)
> > > +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> > > +   (set (match_dup 0)
> > > +       (plus:SI (match_dup 3) (match_dup 4)))]
> > > +{
> > > +  if (INTVAL (operands[2]) == 31)
> > > +    {
> > > +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> > > +      DONE;
> > > +    }
> > > +  operands[3] = gen_reg_rtx (SImode);
> > > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> > > +})
> > > +
> > > +(define_split
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (minus:DI
> > > +         (match_operand:DI 2 "const_int_operand")
> > > +         (xor:DI
> > > +           (sign_extend:DI
> > > +             (minus:SI (const_int 63)
> > > +                       (subreg:SI
> > > +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > > +                         0)))
> > > +           (const_int 63))))]
> > > +  "!TARGET_LZCNT
> > > +   && TARGET_64BIT
> > > +   && ix86_pre_reload_split ()
> > > +   && ((unsigned HOST_WIDE_INT)
> > > +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> > > +       == UINTVAL (operands[2]) - 63)"
> > > +  [(set (match_dup 3)
> > > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > > +   (set (match_dup 0)
> > > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > > +{
> > > +  if (INTVAL (operands[2]) == 63)
> > > +    {
> > > +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> > > +      DONE;
> > > +    }
> > > +  operands[3] = gen_reg_rtx (DImode);
> > > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> > > +})
> > > +
> > > +(define_split
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (minus:DI
> > > +         (match_operand:DI 2 "const_int_operand")
> > > +         (sign_extend:DI
> > > +           (xor:SI
> > > +             (minus:SI (const_int 31)
> > > +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > > +             (const_int 31)))))]
> > > +  "!TARGET_LZCNT
> > > +   && TARGET_64BIT
> > > +   && ix86_pre_reload_split ()
> > > +   && ((unsigned HOST_WIDE_INT)
> > > +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> > > +       == UINTVAL (operands[2]) - 31)"
> > > +  [(set (match_dup 3)
> > > +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> > > +   (set (match_dup 0)
> > > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > > +{
> > > +  if (INTVAL (operands[2]) == 31)
> > > +    {
> > > +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> > > +      DONE;
> > > +    }
> > > +  operands[3] = gen_reg_rtx (DImode);
> > > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> > > +})
> > > +
> > >  (define_expand "clz<mode>2"
> > >    [(parallel
> > >       [(set (reg:CCZ FLAGS_REG)
> > >         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> > >                      (const_int 0)))
> > > -      (set (match_operand:SWI48 0 "register_operand")
> > > -          (minus:SWI48
> > > -            (match_dup 2)
> > > -            (clz:SWI48 (match_dup 1))))])
> > > +      (set (match_dup 3) (minus:SWI48
> > > +                          (match_dup 2)
> > > +                          (clz:SWI48 (match_dup 1))))])
> > >     (parallel
> > > -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> > > +     [(set (match_operand:SWI48 0 "register_operand")
> > > +          (xor:SWI48 (match_dup 3) (match_dup 2)))
> > >        (clobber (reg:CC FLAGS_REG))])]
> > >    ""
> > >  {
> > > @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
> > >        DONE;
> > >      }
> > >    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> > > +  operands[3] = gen_reg_rtx (<MODE>mode);
> > >  })
> > >
> > >  (define_insn_and_split "clz<mode>2_lzcnt"
> > > --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> > > +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> > > @@ -0,0 +1,28 @@
> > > +/* PR target/78103 */
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-O2 -mno-lzcnt" } */
> > > +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> > > +
> > > +long long
> > > +foo (long long x)
> > > +{
> > > +  return __builtin_clzll (x);
> > > +}
> > > +
> > > +long long
> > > +bar (long long x)
> > > +{
> > > +  return (unsigned int) __builtin_clzll (x);
> > > +}
> > > +
> > > +long long
> > > +baz (int x)
> > > +{
> > > +  return __builtin_clz (x);
> > > +}
> > > +
> > > +long long
> > > +qux (int x)
> > > +{
> > > +  return (unsigned int) __builtin_clz (x);
> > > +}
> > > --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> > > +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> > > @@ -0,0 +1,33 @@
> > > +/* PR target/78103 */
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mno-lzcnt" } */
> > > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > > +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> > > +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> > > +
> > > +unsigned int
> > > +foo (unsigned int x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> > > +}
> > > +
> > > +unsigned int
> > > +bar (unsigned int x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> > > +}
> > > +
> > > +#ifdef __x86_64__
> > > +unsigned int
> > > +baz (unsigned long long x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> > > +}
> > > +
> > > +unsigned int
> > > +qux (unsigned long long x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> > > +}
> > > +#endif
> > > --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
> > > +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
> > > @@ -0,0 +1,32 @@
> > > +/* PR target/78103 */
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-O2 -mno-lzcnt" } */
> > > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > > +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> > > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > > +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> > > +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> >
> > It fails for -mx32:
> >
> > @@ -7,7 +7,7 @@ foo:
> >  .LFB0:
> >   .cfi_startproc
> >   bsrl %edi, %edi
> > - leaq 1(%rdi), %rax
> > + leal 1(%rdi), %eax  << This should also work for -m64.  Why isn't it
> > used for -m64?
> >   ret
> >   .cfi_endproc
> >  .LFE0:
> > @@ -30,7 +30,7 @@ baz:
> >  .LFB2:
> >   .cfi_startproc
> >   bsrq %rdi, %rdi
> > - leaq 1(%rdi), %rax
> > + leal 1(%rdi), %eax   << This should also work for -m64.  Why isn't
> > it used for -m64?
> >   ret
> >   .cfi_endproc
> >  .LFE2:
> > @@ -42,6 +42,7 @@ qux:
> >  .LFB3:
> >   .cfi_startproc
> >   bsrq %rdi, %rax
> > + movl %eax, %eax  << Why is it generated for -mx32?
> >   ret
> >   .cfi_endproc
> >  .LFE3:
> >
>
> Add a zero_extend patten for bsr_rex64_1 and use it to split SImode
> constant - __builtin_clzll to avoid unncessary zero_extend.
>
> OK for master?
>
> H.J.
> ---
> gcc/
>
>         PR target/78103
>         * config/i386/i386.md (bsr_rex64_1_zext): New.
>         (combine splitter for SImode constant - clzll): Replace
>         gen_bsr_rex64_1 with gen_bsr_rex64_1_zext.
>
> gcc/testsuite/
>
>         PR target/78103
>         * gcc.target/i386/pr78103-2.c: Also scan incl.
>         * gcc.target/i386/pr78103-3.c: Scan leal|addl|incl for x32.  Also
>         scan incq.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.md                   | 17 ++++++++++++++++-
>  gcc/testsuite/gcc.target/i386/pr78103-2.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr78103-3.c |  3 ++-
>  3 files changed, 19 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index c9787d73262..0c23ddb8d1f 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -14796,6 +14796,21 @@ (define_insn "bsr_rex64_1"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "DI")])
>
> +(define_insn "bsr_rex64_1_zext"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (minus:SI (const_int 63)
> +                   (subreg:SI
> +                     (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))
> +                     0))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{q}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "DI")])
> +
>  (define_insn "bsr"
>    [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> @@ -14907,7 +14922,7 @@ (define_split
>    operands[5] = lowpart_subreg (SImode, operands[3], DImode);
>    if (INTVAL (operands[2]) == 63)
>      {
> -      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> +      emit_insn (gen_bsr_rex64_1_zext (operands[3], operands[1]));
>        emit_move_insn (operands[0], operands[5]);
>        DONE;
>      }
> diff --git a/gcc/testsuite/gcc.target/i386/pr78103-2.c b/gcc/testsuite/gcc.target/i386/pr78103-2.c
> index b3523382926..30f7f98f60a 100644
> --- a/gcc/testsuite/gcc.target/i386/pr78103-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr78103-2.c
> @@ -4,7 +4,7 @@
>  /* { dg-final { scan-assembler-not {\mmovl\M} } } */
>  /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
>  /* { dg-final { scan-assembler-not {\msubl\M} } } */
> -/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> +/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} } } */
>
>  unsigned int
>  foo (unsigned int x)
> diff --git a/gcc/testsuite/gcc.target/i386/pr78103-3.c b/gcc/testsuite/gcc.target/i386/pr78103-3.c
> index 49a36eccf4d..b8d82312a0e 100644
> --- a/gcc/testsuite/gcc.target/i386/pr78103-3.c
> +++ b/gcc/testsuite/gcc.target/i386/pr78103-3.c
> @@ -5,7 +5,8 @@
>  /* { dg-final { scan-assembler-not {\mmovslq\M} } } */
>  /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
>  /* { dg-final { scan-assembler-not {\msubq\M} } } */
> -/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> +/* { dg-final { scan-assembler {\m(leaq|addq|incq)\M} { target { ! x32 } } } } */
> +/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} { target x32 } } } */
>
>  unsigned long long
>  foo (unsigned int x)
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-08-01 19:21 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-28  8:35 [PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103] Jakub Jelinek
2021-07-30 10:27 ` Uros Bizjak
2021-07-30 13:26   ` Jakub Jelinek
2021-07-31 19:38     ` H.J. Lu
2021-07-31 19:42       ` H.J. Lu
2021-07-31 19:53     ` H.J. Lu
2021-08-01 17:12       ` [PATCH] i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt H.J. Lu
2021-08-01 19:20         ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).