public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [x86 PATCH] Optimize double word negation of zero extended values.
@ 2022-05-23 19:40 Roger Sayle
  2022-05-23 20:40 ` Uros Bizjak
  0 siblings, 1 reply; 2+ messages in thread
From: Roger Sayle @ 2022-05-23 19:40 UTC (permalink / raw)
  To: 'GCC Patches'

[-- Attachment #1: Type: text/plain, Size: 2585 bytes --]


It's not uncommon for GCC to convert between a (zero or one) Boolean
value and a (zero or all ones) mask value, possibly of a wider type,
using negation.

Currently on x86_64, the following simple test case:
__int128 foo(unsigned long x) { return -(__int128)x; }

compiles with -O2 to:

        movq    %rdi, %rax
        xorl    %edx, %edx
        negq    %rax
        adcq    $0, %rdx
        negq    %rdx
        ret

with this patch, which adds an additional peephole2 to i386.md,
we instead generate the improved:

        movq    %rdi, %rax
        negq    %rax
        sbbq    %rdx, %rdx
        ret

[and likewise for the (DImode) long long version using -m32.]
A peephole2 is appropriate as the double word negation and the
operation providing the xor are typically only split after combine.

In fact, the new peephole2 sequence:
;; Convert:
;;   xorl %edx, %edx
;;   negl %eax
;;   adcl $0, %edx
;;   negl %edx
;; to:
;;   negl %eax
;;   sbbl %edx, %edx    // *x86_mov<mode>cc_0_m1

is nearly identical to (and placed immediately after) the existing:
;; Convert:
;;   mov %esi, %edx
;;   negl %eax
;;   adcl $0, %edx
;;   negl %edx
;; to:
;;   xorl %edx, %edx
;;   negl %eax
;;   sbbl %esi, %edx


One potential objection/concern is that "sbb? %reg,%reg" may possibly be
incorrectly perceived as a false register dependency on older hardware,
much like "xor? %reg,%reg" may be perceived as a false dependency on
really old hardware.  This doesn't currently appear to be a concern
for the i386 backend's *x86_mov<mode>cc_0_m1 as shown by the following
test code:

int bar(unsigned int x, unsigned int y) {
  return x > y ? -1 : 0;
}

which currently generates a "naked" sbb:
        cmp     esi, edi
        sbb     eax, eax
        ret

If anyone does potentially encounter a stall, it would be easy to add
a splitter or peephole2 controlled by a tuning flag to insert an additional
xor to break the false dependency chain (when not optimizing for size),
but I don't believe this is required on recent microarchitectures.


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}, with
no new failures.  Ok for mainline?


2022-05-23  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* config/i386/i386.md (peephole2): Convert xor;neg;adc;neg,
	i.e. a double word negation of a zero extended operand, to
	neg;sbb.

gcc/testsuite/ChangeLog
	* gcc.target/i386/neg-zext-1.c: New test case for ia32.
	* gcc.target/i386/neg-zext-2.c: New test case for int128.


Thanks in advance,
Roger
--


[-- Attachment #2: patchnz.txt --]
[-- Type: text/plain, Size: 2554 bytes --]

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 792bae1..692f9b6 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -11039,6 +11039,46 @@
      (clobber (reg:CC FLAGS_REG))])]
   "ix86_expand_clear (operands[0]);")
 
+;; Convert:
+;;   xorl %edx, %edx
+;;   negl %eax
+;;   adcl $0, %edx
+;;   negl %edx
+;; to:
+;;   negl %eax
+;;   sbbl %edx, %edx	// *x86_mov<mode>cc_0_m1
+
+(define_peephole2
+  [(parallel
+    [(set (match_operand:SWI48 0 "general_reg_operand") (const_int 0))
+     (clobber (reg:CC FLAGS_REG))])
+   (parallel
+    [(set (reg:CCC FLAGS_REG)
+	  (ne:CCC (match_operand:SWI48 1 "general_reg_operand") (const_int 0)))
+     (set (match_dup 1) (neg:SWI48 (match_dup 1)))])
+   (parallel
+    [(set (match_dup 0)
+	  (plus:SWI48 (plus:SWI48
+			(ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0))
+			(match_dup 0))
+		      (const_int 0)))
+     (clobber (reg:CC FLAGS_REG))])
+   (parallel
+    [(set (match_dup 0)
+	  (neg:SWI48 (match_dup 0)))
+     (clobber (reg:CC FLAGS_REG))])]
+  "REGNO (operands[0]) != REGNO (operands[1])"
+  [(parallel
+    [(set (reg:CCC FLAGS_REG)
+	  (ne:CCC (match_dup 1) (const_int 0)))
+     (set (match_dup 1) (neg:SWI48 (match_dup 1)))])
+   (parallel
+    [(set (match_dup 0)
+	  (if_then_else:SWI48 (ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0))
+			      (const_int -1)
+			      (const_int 0)))
+     (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "*neg<mode>_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(neg:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")))
diff --git a/gcc/testsuite/gcc.target/i386/neg-zext-1.c b/gcc/testsuite/gcc.target/i386/neg-zext-1.c
new file mode 100644
index 0000000..ec91fb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/neg-zext-1.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2" } */
+
+long long foo(unsigned int x) { return -(long long)x; }
+
+/* { dg-final { scan-assembler "sbb" } } */
+/* { dg-final { scan-assembler-not "adc" } } */
diff --git a/gcc/testsuite/gcc.target/i386/neg-zext-2.c b/gcc/testsuite/gcc.target/i386/neg-zext-2.c
new file mode 100644
index 0000000..a6ed077
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/neg-zext-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+__int128 fool(unsigned long x) { return -(__int128)x; }
+
+/* { dg-final { scan-assembler "sbb" } } */
+/* { dg-final { scan-assembler-not "adc" } } */

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [x86 PATCH] Optimize double word negation of zero extended values.
  2022-05-23 19:40 [x86 PATCH] Optimize double word negation of zero extended values Roger Sayle
@ 2022-05-23 20:40 ` Uros Bizjak
  0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2022-05-23 20:40 UTC (permalink / raw)
  To: Roger Sayle; +Cc: GCC Patches

On Mon, May 23, 2022 at 9:40 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> It's not uncommon for GCC to convert between a (zero or one) Boolean
> value and a (zero or all ones) mask value, possibly of a wider type,
> using negation.
>
> Currently on x86_64, the following simple test case:
> __int128 foo(unsigned long x) { return -(__int128)x; }
>
> compiles with -O2 to:
>
>         movq    %rdi, %rax
>         xorl    %edx, %edx
>         negq    %rax
>         adcq    $0, %rdx
>         negq    %rdx
>         ret
>
> with this patch, which adds an additional peephole2 to i386.md,
> we instead generate the improved:
>
>         movq    %rdi, %rax
>         negq    %rax
>         sbbq    %rdx, %rdx
>         ret
>
> [and likewise for the (DImode) long long version using -m32.]
> A peephole2 is appropriate as the double word negation and the
> operation providing the xor are typically only split after combine.
>
> In fact, the new peephole2 sequence:
> ;; Convert:
> ;;   xorl %edx, %edx
> ;;   negl %eax
> ;;   adcl $0, %edx
> ;;   negl %edx
> ;; to:
> ;;   negl %eax
> ;;   sbbl %edx, %edx    // *x86_mov<mode>cc_0_m1
>
> is nearly identical to (and placed immediately after) the existing:
> ;; Convert:
> ;;   mov %esi, %edx
> ;;   negl %eax
> ;;   adcl $0, %edx
> ;;   negl %edx
> ;; to:
> ;;   xorl %edx, %edx
> ;;   negl %eax
> ;;   sbbl %esi, %edx
>
>
> One potential objection/concern is that "sbb? %reg,%reg" may possibly be
> incorrectly perceived as a false register dependency on older hardware,
> much like "xor? %reg,%reg" may be perceived as a false dependency on
> really old hardware.  This doesn't currently appear to be a concern
> for the i386 backend's *x86_mov<mode>cc_0_m1 as shown by the following
> test code:
>
> int bar(unsigned int x, unsigned int y) {
>   return x > y ? -1 : 0;
> }
>
> which currently generates a "naked" sbb:
>         cmp     esi, edi
>         sbb     eax, eax
>         ret
>
> If anyone does potentially encounter a stall, it would be easy to add
> a splitter or peephole2 controlled by a tuning flag to insert an additional
> xor to break the false dependency chain (when not optimizing for size),
> but I don't believe this is required on recent microarchitectures.
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}, with
> no new failures.  Ok for mainline?
>
>
> 2022-05-23  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386.md (peephole2): Convert xor;neg;adc;neg,
>         i.e. a double word negation of a zero extended operand, to
>         neg;sbb.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/neg-zext-1.c: New test case for ia32.
>         * gcc.target/i386/neg-zext-2.c: New test case for int128.

[1] suggests that sbb reg,reg breaks dependency chain only for AMD
targets (bulldozer/zen/bobcat/jaguar) and not for Intel. However, we
use "naked" sbb extensively and nobody complained about it, so I guess
the patch is OK.

[1] https://reviews.llvm.org/D116804

Thanks,
Uros.

>
> Thanks in advance,
> Roger
> --
>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-05-23 20:40 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-23 19:40 [x86 PATCH] Optimize double word negation of zero extended values Roger Sayle
2022-05-23 20:40 ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).