public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-2159] i386: Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces).
@ 2023-06-28 10:12 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2023-06-28 10:12 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:4afbebcdc5780d28e52b7d65643e462c7c3882ce

commit r14-2159-g4afbebcdc5780d28e52b7d65643e462c7c3882ce
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Wed Jun 28 11:11:34 2023 +0100

    i386: Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces).
    
    This patch fixes some very odd (unanticipated) code generation by
    compare_by_pieces with -m32 -mavx, since the recent addition of the
    cbranchoi4 pattern.  The issue is that cbranchoi4 is available with
    TARGET_AVX, but cbranchti4 is currently conditional on TARGET_64BIT
    which results in the odd behaviour (thanks to OPTAB_WIDEN) that with
    -m32 -mavx, compare_by_pieces ends up (inefficiently) widening 128-bit
    comparisons to 256 bits before performing PTEST.
    
    This patch fixes this by providing a cbranchti4 pattern that's available
    with either TARGET_64BIT or TARGET_SSE4_1.
    
    For the test case below (again from PR 104610):
    
    int foo(char *a)
    {
        static const char t[] = "0123456789012345678901234567890";
        return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
    }
    
    GCC with -m32 -O2 -mavx currently produces the bonkers:
    
    foo:    pushl   %ebp
            movl    %esp, %ebp
            andl    $-32, %esp
            subl    $64, %esp
            movl    8(%ebp), %eax
            vmovdqa .LC0, %xmm4
            movl    $0, 48(%esp)
            vmovdqu (%eax), %xmm2
            movl    $0, 52(%esp)
            movl    $0, 56(%esp)
            movl    $0, 60(%esp)
            movl    $0, 16(%esp)
            movl    $0, 20(%esp)
            movl    $0, 24(%esp)
            movl    $0, 28(%esp)
            vmovdqa %xmm2, 32(%esp)
            vmovdqa %xmm4, (%esp)
            vmovdqa (%esp), %ymm5
            vpxor   32(%esp), %ymm5, %ymm0
            vptest  %ymm0, %ymm0
            jne     .L2
            vmovdqu 16(%eax), %xmm7
            movl    $0, 48(%esp)
            movl    $0, 52(%esp)
            vmovdqa %xmm7, 32(%esp)
            vmovdqa .LC1, %xmm7
            movl    $0, 56(%esp)
            movl    $0, 60(%esp)
            movl    $0, 16(%esp)
            movl    $0, 20(%esp)
            movl    $0, 24(%esp)
            movl    $0, 28(%esp)
            vmovdqa %xmm7, (%esp)
            vmovdqa (%esp), %ymm1
            vpxor   32(%esp), %ymm1, %ymm0
            vptest  %ymm0, %ymm0
            je      .L6
    .L2:    movl    $1, %eax
            xorl    $1, %eax
            vzeroupper
            leave
            ret
    .L6:    xorl    %eax, %eax
            xorl    $1, %eax
            vzeroupper
            leave
            ret
    
    With this patch, we now generate the (slightly) more sensible:
    
    foo:    vmovdqa .LC0, %xmm0
            movl    4(%esp), %eax
            vpxor   (%eax), %xmm0, %xmm0
            vptest  %xmm0, %xmm0
            jne     .L2
            vmovdqa .LC1, %xmm0
            vpxor   16(%eax), %xmm0, %xmm0
            vptest  %xmm0, %xmm0
            je      .L5
    .L2:    movl    $1, %eax
            xorl    $1, %eax
            ret
    .L5:    xorl    %eax, %eax
            xorl    $1, %eax
            ret
    
    2023-06-28  Roger Sayle  <roger@nextmovesoftware.com>
    
    gcc/ChangeLog
            * config/i386/i386-expand.cc (ix86_expand_branch): Also use ptest
            for TImode comparisons on 32-bit architectures.
            * config/i386/i386.md (cbranch<mode>4): Change from SDWIM to
            SWIM1248x to exclude/avoid TImode being conditional on -m64.
            (cbranchti4): New define_expand for TImode, enabled with either
            TARGET_64BIT or TARGET_SSE4_1.
            * config/i386/predicates.md (ix86_timode_comparison_operator):
            New predicate that depends upon TARGET_64BIT.
            (ix86_timode_comparison_operand): Likewise.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/pieces-memcmp-2.c: New test case.

Diff:
---
 gcc/config/i386/i386-expand.cc                  |  3 ++-
 gcc/config/i386/i386.md                         | 20 ++++++++++++++++++--
 gcc/config/i386/predicates.md                   | 12 ++++++++++++
 gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c | 13 +++++++++++++
 4 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 9a8d244f85c..567248d6830 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2365,6 +2365,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
   /* Handle special case - vector comparsion with boolean result, transform
      it using ptest instruction.  */
   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+      || (mode == TImode && !TARGET_64BIT)
       || mode == OImode)
     {
       rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
@@ -2372,7 +2373,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
 
       gcc_assert (code == EQ || code == NE);
 
-      if (mode == OImode)
+      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
 	{
 	  op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
 	  op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 15c031066dd..a82cc353cfd 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1352,8 +1352,8 @@
 
 (define_expand "cbranch<mode>4"
   [(set (reg:CC FLAGS_REG)
-	(compare:CC (match_operand:SDWIM 1 "nonimmediate_operand")
-		    (match_operand:SDWIM 2 "<general_operand>")))
+	(compare:CC (match_operand:SWIM1248x 1 "nonimmediate_operand")
+		    (match_operand:SWIM1248x 2 "<general_operand>")))
    (set (pc) (if_then_else
 	       (match_operator 0 "ordered_comparison_operator"
 		[(reg:CC FLAGS_REG) (const_int 0)])
@@ -1368,6 +1368,22 @@
   DONE;
 })
 
+(define_expand "cbranchti4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:TI 1 "nonimmediate_operand")
+		    (match_operand:TI 2 "ix86_timode_comparison_operand")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "ix86_timode_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  "TARGET_64BIT || TARGET_SSE4_1"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
 (define_expand "cbranchoi4"
   [(set (reg:CC FLAGS_REG)
 	(compare:CC (match_operand:OI 1 "nonimmediate_operand")
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index fb07707dcba..7ddbe01a6f9 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1641,6 +1641,18 @@
                (match_operand 0 "comparison_operator")
                (match_operand 0 "ix86_trivial_fp_comparison_operator")))
 
+;; Return true if we can perform this comparison on TImode operands.
+(define_predicate "ix86_timode_comparison_operator"
+  (if_then_else (match_test "TARGET_64BIT")
+		(match_operand 0 "ordered_comparison_operator")
+		(match_operand 0 "bt_comparison_operator")))
+
+;; Return true if this is a valid second operand for a TImode comparison.
+(define_predicate "ix86_timode_comparison_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+		(match_operand 0 "x86_64_general_operand")
+		(match_operand 0 "nonimmediate_operand")))
+
 ;; Nearly general operand, but accept any const_double, since we wish
 ;; to be able to drop them into memory rather than have them get pulled
 ;; into registers.
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c b/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c
new file mode 100644
index 00000000000..6f996faeced
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mavx2" } */
+
+int foo(char *a)
+{
+    static const char t[] = "0123456789012345678901234567890";
+    return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
+}
+
+/* { dg-final { scan-assembler-not "movl\[ \\t]*\\\$0," } } */
+/* { dg-final { scan-assembler-not "vptest\[ \\t]*%ymm" } } */
+/* { dg-final { scan-assembler-times "vptest\[ \\t]*%xmm" 2 } } */
+

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-06-28 10:12 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-28 10:12 [gcc r14-2159] i386: Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces) Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).