* [x86 PATCH] Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces).
@ 2023-06-27 17:22 Roger Sayle
2023-06-27 20:20 ` Uros Bizjak
0 siblings, 1 reply; 2+ messages in thread
From: Roger Sayle @ 2023-06-27 17:22 UTC (permalink / raw)
To: gcc-patches; +Cc: 'Uros Bizjak'
[-- Attachment #1: Type: text/plain, Size: 3458 bytes --]
This patch fixes some very odd (unanticipated) code generation by
compare_by_pieces with -m32 -mavx, since the recent addition of the
cbranchoi4 pattern. The issue is that cbranchoi4 is available with
TARGET_AVX, but cbranchti4 is currently conditional on TARGET_64BIT,
which results in the odd behaviour (thanks to OPTAB_WIDEN) that with
-m32 -mavx, compare_by_pieces ends up (inefficiently) widening 128-bit
comparisons to 256 bits before performing PTEST.
This patch fixes this by providing a cbranchti4 pattern that's available
with either TARGET_64BIT or TARGET_SSE4_1.
For the test case below (again from PR 104610):
int foo(char *a)
{
static const char t[] = "0123456789012345678901234567890";
return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
}
GCC with -m32 -O2 -mavx currently produces the following bonkers code:
foo: pushl %ebp
movl %esp, %ebp
andl $-32, %esp
subl $64, %esp
movl 8(%ebp), %eax
vmovdqa .LC0, %xmm4
movl $0, 48(%esp)
vmovdqu (%eax), %xmm2
movl $0, 52(%esp)
movl $0, 56(%esp)
movl $0, 60(%esp)
movl $0, 16(%esp)
movl $0, 20(%esp)
movl $0, 24(%esp)
movl $0, 28(%esp)
vmovdqa %xmm2, 32(%esp)
vmovdqa %xmm4, (%esp)
vmovdqa (%esp), %ymm5
vpxor 32(%esp), %ymm5, %ymm0
vptest %ymm0, %ymm0
jne .L2
vmovdqu 16(%eax), %xmm7
movl $0, 48(%esp)
movl $0, 52(%esp)
vmovdqa %xmm7, 32(%esp)
vmovdqa .LC1, %xmm7
movl $0, 56(%esp)
movl $0, 60(%esp)
movl $0, 16(%esp)
movl $0, 20(%esp)
movl $0, 24(%esp)
movl $0, 28(%esp)
vmovdqa %xmm7, (%esp)
vmovdqa (%esp), %ymm1
vpxor 32(%esp), %ymm1, %ymm0
vptest %ymm0, %ymm0
je .L6
.L2: movl $1, %eax
xorl $1, %eax
vzeroupper
leave
ret
.L6: xorl %eax, %eax
xorl $1, %eax
vzeroupper
leave
ret
With this patch, we now generate the (slightly) more sensible code:
foo: vmovdqa .LC0, %xmm0
movl 4(%esp), %eax
vpxor (%eax), %xmm0, %xmm0
vptest %xmm0, %xmm0
jne .L2
vmovdqa .LC1, %xmm0
vpxor 16(%eax), %xmm0, %xmm0
vptest %xmm0, %xmm0
je .L5
.L2: movl $1, %eax
xorl $1, %eax
ret
.L5: xorl %eax, %eax
xorl $1, %eax
ret
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures. Ok for mainline?
2023-06-27 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_branch): Also use ptest
for TImode comparisons on 32-bit architectures.
* config/i386/i386.md (cbranch<mode>4): Change from SDWIM to
SWIM1248x to exclude/avoid TImode being conditional on -m64.
(cbranchti4): New define_expand for TImode, enabled with either
TARGET_64BIT or TARGET_SSE4_1.
* config/i386/predicates.md (ix86_timode_comparison_operator):
New predicate that depends upon TARGET_64BIT.
(ix86_timode_comparison_operand): Likewise.
gcc/testsuite/ChangeLog
* gcc.target/i386/pieces-memcmp-2.c: New test case.
Thanks in advance,
Roger
--
[-- Attachment #2: patchti.txt --]
[-- Type: text/plain, Size: 3976 bytes --]
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 9a8d244..567248d 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2365,6 +2365,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
/* Handle special case - vector comparsion with boolean result, transform
it using ptest instruction. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || (mode == TImode && !TARGET_64BIT)
|| mode == OImode)
{
rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
@@ -2372,7 +2373,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
gcc_assert (code == EQ || code == NE);
- if (mode == OImode)
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
{
op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b50d82b..dcf0ba6 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1352,8 +1352,8 @@
(define_expand "cbranch<mode>4"
[(set (reg:CC FLAGS_REG)
- (compare:CC (match_operand:SDWIM 1 "nonimmediate_operand")
- (match_operand:SDWIM 2 "<general_operand>")))
+ (compare:CC (match_operand:SWIM1248x 1 "nonimmediate_operand")
+ (match_operand:SWIM1248x 2 "<general_operand>")))
(set (pc) (if_then_else
(match_operator 0 "ordered_comparison_operator"
[(reg:CC FLAGS_REG) (const_int 0)])
@@ -1368,6 +1368,22 @@
DONE;
})
+(define_expand "cbranchti4"
+ [(set (reg:CC FLAGS_REG)
+ (compare:CC (match_operand:TI 1 "nonimmediate_operand")
+ (match_operand:TI 2 "ix86_timode_comparison_operand")))
+ (set (pc) (if_then_else
+ (match_operator 0 "ix86_timode_comparison_operator"
+ [(reg:CC FLAGS_REG) (const_int 0)])
+ (label_ref (match_operand 3))
+ (pc)))]
+ "TARGET_64BIT || TARGET_SSE4_1"
+{
+ ix86_expand_branch (GET_CODE (operands[0]),
+ operands[1], operands[2], operands[3]);
+ DONE;
+})
+
(define_expand "cbranchoi4"
[(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:OI 1 "nonimmediate_operand")
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index fb07707..2d50cbf 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1641,6 +1641,19 @@
(match_operand 0 "comparison_operator")
(match_operand 0 "ix86_trivial_fp_comparison_operator")))
+;; Return true if we can perform this comparison on TImode operands.
+(define_predicate "ix86_timode_comparison_operator"
+ (if_then_else (match_test "TARGET_64BIT")
+ (match_operand 0 "ordered_comparison_operator")
+ (match_operand 0 "bt_comparison_operator")))
+
+;; Return true if this is a valid second operand for a TImode comparison.
+(define_predicate "ix86_timode_comparison_operand"
+ (if_then_else (match_test "TARGET_64BIT")
+ (match_operand 0 "x86_64_general_operand")
+ (match_operand 0 "nonimmediate_operand")))
+
+
;; Nearly general operand, but accept any const_double, since we wish
;; to be able to drop them into memory rather than have them get pulled
;; into registers.
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c b/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c
new file mode 100644
index 0000000..6f996fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mavx2" } */
+
+int foo(char *a)
+{
+ static const char t[] = "0123456789012345678901234567890";
+ return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
+}
+
+/* { dg-final { scan-assembler-not "movl\[ \\t]*\\\$0," } } */
+/* { dg-final { scan-assembler-not "vptest\[ \\t]*%ymm" } } */
+/* { dg-final { scan-assembler-times "vptest\[ \\t]*%xmm" 2 } } */
+
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [x86 PATCH] Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces).
2023-06-27 17:22 [x86 PATCH] Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces) Roger Sayle
@ 2023-06-27 20:20 ` Uros Bizjak
0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2023-06-27 20:20 UTC (permalink / raw)
To: Roger Sayle; +Cc: gcc-patches
On Tue, Jun 27, 2023 at 7:22 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch fixes some very odd (unanticipated) code generation by
> compare_by_pieces with -m32 -mavx, since the recent addition of the
> cbranchoi4 pattern. The issue is that cbranchoi4 is available with
> TARGET_AVX, but cbranchti4 is currently conditional on TARGET_64BIT
> which results in the odd behaviour (thanks to OPTAB_WIDEN) that with
> -m32 -mavx, compare_by_pieces ends up (inefficiently) widening 128-bit
> comparisons to 256-bits before performing PTEST.
>
> This patch fixes this by providing a cbranchti4 pattern that's available
> with either TARGET_64BIT or TARGET_SSE4_1.
>
> For the test case below (again from PR 104610):
>
> int foo(char *a)
> {
> static const char t[] = "0123456789012345678901234567890";
> return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
> }
>
> GCC with -m32 -O2 -mavx currently produces the bonkers:
>
> foo: pushl %ebp
> movl %esp, %ebp
> andl $-32, %esp
> subl $64, %esp
> movl 8(%ebp), %eax
> vmovdqa .LC0, %xmm4
> movl $0, 48(%esp)
> vmovdqu (%eax), %xmm2
> movl $0, 52(%esp)
> movl $0, 56(%esp)
> movl $0, 60(%esp)
> movl $0, 16(%esp)
> movl $0, 20(%esp)
> movl $0, 24(%esp)
> movl $0, 28(%esp)
> vmovdqa %xmm2, 32(%esp)
> vmovdqa %xmm4, (%esp)
> vmovdqa (%esp), %ymm5
> vpxor 32(%esp), %ymm5, %ymm0
> vptest %ymm0, %ymm0
> jne .L2
> vmovdqu 16(%eax), %xmm7
> movl $0, 48(%esp)
> movl $0, 52(%esp)
> vmovdqa %xmm7, 32(%esp)
> vmovdqa .LC1, %xmm7
> movl $0, 56(%esp)
> movl $0, 60(%esp)
> movl $0, 16(%esp)
> movl $0, 20(%esp)
> movl $0, 24(%esp)
> movl $0, 28(%esp)
> vmovdqa %xmm7, (%esp)
> vmovdqa (%esp), %ymm1
> vpxor 32(%esp), %ymm1, %ymm0
> vptest %ymm0, %ymm0
> je .L6
> .L2: movl $1, %eax
> xorl $1, %eax
> vzeroupper
> leave
> ret
> .L6: xorl %eax, %eax
> xorl $1, %eax
> vzeroupper
> leave
> ret
>
> with this patch, we now generate the (slightly) more sensible:
>
> foo: vmovdqa .LC0, %xmm0
> movl 4(%esp), %eax
> vpxor (%eax), %xmm0, %xmm0
> vptest %xmm0, %xmm0
> jne .L2
> vmovdqa .LC1, %xmm0
> vpxor 16(%eax), %xmm0, %xmm0
> vptest %xmm0, %xmm0
> je .L5
> .L2: movl $1, %eax
> xorl $1, %eax
> ret
> .L5: xorl %eax, %eax
> xorl $1, %eax
> ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures. Ok for mainline?
>
>
> 2023-06-27 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * config/i386/i386-expand.cc (ix86_expand_branch): Also use ptest
> for TImode comparisons on 32-bit architectures.
> * config/i386/i386.md (cbranch<mode>4): Change from SDWIM to
> SWIM1248x to exclude/avoid TImode being conditional on -m64.
> (cbranchti4): New define_expand for TImode on both TARGET_64BIT
> and/or with TARGET_SSE4_1.
> * config/i386/predicates.md (ix86_timode_comparison_operator):
> New predicate that depends upon TARGET_64BIT.
> (ix86_timode_comparison_operand): Likewise.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/pieces-memcmp-2.c: New test case.
OK with a small fix.
Thanks,
Uros.
+;; Return true if this is a valid second operand for a TImode comparison.
+(define_predicate "ix86_timode_comparison_operand"
+ (if_then_else (match_test "TARGET_64BIT")
+ (match_operand 0 "x86_64_general_operand")
+ (match_operand 0 "nonimmediate_operand")))
+
+
Please remove the duplicate blank line above.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-06-27 20:20 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-27 17:22 [x86 PATCH] Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces) Roger Sayle
2023-06-27 20:20 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).