[PATCH] target/98856 - split vpinsrq with new peephole2

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] target/98856 - split vpinsrq with new peephole2
@ 2021-03-08 11:04 Richard Biener
  2021-03-08 11:16 ` Jakub Jelinek
  0 siblings, 1 reply; 3+ messages in thread
From: Richard Biener @ 2021-03-08 11:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: jakub, ubizjak

This reduces the latency of a V2DImode construction from two
GPRs by avoiding the dependence on the GPR->XMM move with the
used vpinsrq instruction and instead allow the two GPR->XMM moves
to be concurrently executed and scheduled, performing the insert
using vpunpcklqdq.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

OK for trunk or do we want to defer this to GCC 12, maybe
unless we can also solve the spilling in PR98856 which would
then fix the performance regression?

2021-03-05  Richard Biener  <rguenther@suse.de>

	PR target/98856
	* config/i386/sse.md (vpinsrq peephole): New peephole2
	splitting vpinsrq to a vmovq and vpunpcklqdq.

	* gcc.target/i386/pr98856.c: New testcase.
	* gcc.target/i386/avx512dq-concatv2di-1.c: Adjust.
	* gcc.target/i386/avx512vl-concatv2di-1.c: Likewise.
---
 gcc/config/i386/sse.md                        | 15 +++++++++++
 .../gcc.target/i386/avx512dq-concatv2di-1.c   |  4 +--
 .../gcc.target/i386/avx512vl-concatv2di-1.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr98856.c       | 25 +++++++++++++++++++
 4 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98856.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ca4372d4164..7c9be80540b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1427,6 +1427,21 @@
   DONE;
 })
 
+;; Further split pinsrq variants of vec_concatv2di to hide the latency
+;; the GPR->XMM transition(s).
+(define_peephole2
+  [(match_scratch:DI 3 "Yv")
+   (set (match_operand:V2DI 0 "sse_reg_operand")
+	(vec_concat:V2DI (match_operand:DI 1 "sse_reg_operand")
+			 (match_operand:DI 2 "nonimmediate_gr_operand")))]
+  "TARGET_64BIT && TARGET_SSE4_1
+   && !optimize_insn_for_size_p ()"
+  [(set (match_dup 3)
+        (match_dup 2))
+   (set (match_dup 0)
+	(vec_concat:V2DI (match_dup 1)
+			 (match_dup 3)))])
+
 ;; Merge movsd/movhpd to movupd for TARGET_SSE_UNALIGNED_LOAD_OPTIMAL targets.
 (define_peephole2
   [(set (match_operand:V2DF 0 "sse_reg_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c
index 82cb402575b..ac652bb1382 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c
@@ -14,7 +14,7 @@ f1 (long long x, long long y)
   asm volatile ("" : "+v" (c));
 }
 
-/* { dg-final { scan-assembler "vpinsrq\[^\n\r]*\\\$1\[^\n\r]*%rsi\[^\n\r]*%xmm16\[^\n\r]*%xmm17" } } */
+/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm16\[^\n\r]*%xmm17" } } */
 
 void
 f2 (long long x, long long *y)
@@ -27,7 +27,7 @@ f2 (long long x, long long *y)
   asm volatile ("" : "+v" (c));
 }
 
-/* { dg-final { scan-assembler "vpinsrq\[^\n\r]*\\\$1\[^\n\r]*%\[re]si\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
+/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
 
 void
 f3 (long long x)
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c
index 8e637071aa2..b8300371a21 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c
@@ -28,7 +28,7 @@ f2 (long long x, long long *y)
   asm volatile ("" : "+v" (c));
 }
 
-/* { dg-final { scan-assembler "vmovhps\[^\n\r]*%\[re]si\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
+/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
 
 void
 f3 (long long x)
diff --git a/gcc/testsuite/gcc.target/i386/pr98856.c b/gcc/testsuite/gcc.target/i386/pr98856.c
new file mode 100644
index 00000000000..1ea24d0f1fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98856.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O3 -march=znver2" } */
+
+typedef __UINT64_TYPE__ uint64_t;
+void poly_double_le2 (unsigned char *out, const unsigned char *in)
+{
+  uint64_t W[2];
+
+  __builtin_memcpy (&W, in, 16);
+  uint64_t carry = (W[1] >> 63) * 135;
+  W[1] = (W[1] << 1) ^ (W[0] >> 63);
+  W[0] = (W[0] << 1) ^ carry;
+  __builtin_memcpy (out, &W[0], 8);
+  __builtin_memcpy (out + 8, &W[1], 8);
+}
+
+/* We should split 
+     vpinsrq $1, %rax, %xmm0, %xmm0
+   to
+     vmovq %rax, %xmm1
+     vpunpcklqdq %xmm0, %xmm1, %xmm0
+   to better hide the latency of the GPR->XMM transitions.  */
+
+/* { dg-final { scan-assembler-not "pinsrq" } } */
+/* { dg-final { scan-assembler-times "punpcklqdq" 1 } } */
-- 
2.26.2

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] target/98856 - split vpinsrq with new peephole2
  2021-03-08 11:04 [PATCH] target/98856 - split vpinsrq with new peephole2 Richard Biener
@ 2021-03-08 11:16 ` Jakub Jelinek
  2021-03-08 12:01   ` Richard Biener
  0 siblings, 1 reply; 3+ messages in thread
From: Jakub Jelinek @ 2021-03-08 11:16 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, ubizjak

On Mon, Mar 08, 2021 at 12:04:22PM +0100, Richard Biener wrote:
> +;; Further split pinsrq variants of vec_concatv2di to hide the latency
> +;; the GPR->XMM transition(s).
> +(define_peephole2
> +  [(match_scratch:DI 3 "Yv")
> +   (set (match_operand:V2DI 0 "sse_reg_operand")
> +	(vec_concat:V2DI (match_operand:DI 1 "sse_reg_operand")
> +			 (match_operand:DI 2 "nonimmediate_gr_operand")))]
> +  "TARGET_64BIT && TARGET_SSE4_1
> +   && !optimize_insn_for_size_p ()"
> +  [(set (match_dup 3)
> +        (match_dup 2))
> +   (set (match_dup 0)
> +	(vec_concat:V2DI (match_dup 1)
> +			 (match_dup 3)))])

Do we really want to do it for all vpinsrqs and not just those where
operands[1] is set from a nonimmediate_gr_operand a few instructions
earlier (or perhaps e.g. other insertions from GPRs)?
I mean, whether this is a win should depend on the latency of the
operands[1] setter if it is not too far from the vec_concat, if it has low
latency, this will only grow code without benefit, if it has high latency
it indeed could perform the GRP -> XMM move concurrently with the previous
operation.
Hardcoding the operands[1] setter in the peephole2 would mean we couldn't
match some small number of unrelated insns in between, but perhaps the
peephole2 condition could just call a function that walks the IL backward a
little and checks where the setter is and what latency it has?

	Jakub

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] target/98856 - split vpinsrq with new peephole2
  2021-03-08 11:16 ` Jakub Jelinek
@ 2021-03-08 12:01   ` Richard Biener
  0 siblings, 0 replies; 3+ messages in thread
From: Richard Biener @ 2021-03-08 12:01 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches, ubizjak

On Mon, 8 Mar 2021, Jakub Jelinek wrote:

> On Mon, Mar 08, 2021 at 12:04:22PM +0100, Richard Biener wrote:
> > +;; Further split pinsrq variants of vec_concatv2di to hide the latency
> > +;; the GPR->XMM transition(s).
> > +(define_peephole2
> > +  [(match_scratch:DI 3 "Yv")
> > +   (set (match_operand:V2DI 0 "sse_reg_operand")
> > +	(vec_concat:V2DI (match_operand:DI 1 "sse_reg_operand")
> > +			 (match_operand:DI 2 "nonimmediate_gr_operand")))]
> > +  "TARGET_64BIT && TARGET_SSE4_1
> > +   && !optimize_insn_for_size_p ()"
> > +  [(set (match_dup 3)
> > +        (match_dup 2))
> > +   (set (match_dup 0)
> > +	(vec_concat:V2DI (match_dup 1)
> > +			 (match_dup 3)))])
> 
> Do we really want to do it for all vpinsrqs and not just those where
> operands[1] is set from a nonimmediate_gr_operand a few instructions
> earlier (or perhaps e.g. other insertions from GPRs)?
> I mean, whether this is a win should depend on the latency of the
> operands[1] setter if it is not too far from the vec_concat, if it has low
> latency, this will only grow code without benefit, if it has high latency
> it indeed could perform the GRP -> XMM move concurrently with the previous
> operation.

In principle even a single high-latency op (GPR or MEM) is worth splitting
if scheduling can move it enough.  What "enough" is of course depends
on the CPU and its OOO issue capabilities.  It's not exactly clear
what the uops are but I suspect it's MEM load or GPR->xmm move, which
would mean the dependency on the first slow op should be mitigated by
OOO issueing as well.

OK, so I threw the following at IACA:

.L3:
        movl $111, %ebx
        .byte 0x64, 0x67, 0x90
#        movq    %rdi, %xmm0
#       vpinsrq $1, %rax, %xmm0, %xmm0
#        movq    %rdi, %xmm1
#       vpinsrq $1, %rax, %xmm1, %xmm1
        movq    %rdi, %xmm0
        movq    %rax, %xmm3
        vpunpcklqdq %xmm3, %xmm0, %xmm0
        movq    %rdi, %xmm1
        movq    %rax, %xmm4
        vpunpcklqdq %xmm4, %xmm1, %xmm1
        vpaddq  %xmm0, %xmm2, %xmm2
        vpaddq  %xmm1, %xmm2, %xmm2
        addl    $1, %edx
        cmpl    %edx, %eax
        movl $222, %ebx
        .byte 0x64, 0x67, 0x90
        jne     .L3

and the commented vs. the uncommented sequences make no difference in its
analysis of the loop throughput (but there's no work to be issued
between the slow moves and the unpack) - IACA also doesn't seem to
OOO execute here (the first vpaddq is only issued after the 2nd unpck).

Going back to the Botan case there's no speed difference between the
following which has the spilling removed and the vpinsr removed:

        vmovdqu (%rsi), %xmm4
        movq    8(%rsi), %rdx
        shrq    $63, %rdx
        imulq   $135, %rdx, %rdi
        movq    (%rsi), %rdx
        vmovq   %rdi, %xmm0
        vpsllq  $1, %xmm4, %xmm1
        shrq    $63, %rdx
        vmovq   %rdx, %xmm5
        vpunpcklqdq %xmm0, %xmm5, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rax)

to the variant with just the spilling removed:

        vmovdqu (%rsi), %xmm4
        movq    8(%rsi), %rdx
        shrq    $63, %rdx
        imulq   $135, %rdx, %rdi
        movq    (%rsi), %rdx
        vmovq   %rdi, %xmm0
        vpsllq  $1, %xmm4, %xmm1
        shrq    $63, %rdx
        vpinsrq $1, %rdx, %xmm0, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rax)

so it looks I was barking at the wrong tree here and OOO issueing
works fine here (the patch might still make a difference on in-order
micro architectures or in situations not tested).

Thus I'll drop it for now.

That said, the spilling issue is way harder to address I fear.  For
reference, this was the original:

        vmovdqu (%rsi), %xmm4
        vmovdqa %xmm4, 16(%rsp)
        movq    24(%rsp), %rdx
        vmovdqa 16(%rsp), %xmm5
        shrq    $63, %rdx
        imulq   $135, %rdx, %rdi
        movq    16(%rsp), %rdx
        vmovq   %rdi, %xmm0
        vpsllq  $1, %xmm5, %xmm1
        shrq    $63, %rdx
        vpinsrq $1, %rdx, %xmm0, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rax)
        jmp     .L53

and the important part is to do the loads to %rdx/%rdi via
(%rsi) and not the spill slot.  Performing more "clever"
reloads from %xmm4 like via

        vmovdqu (%rsi), %xmm4
        vmovhlps %xmm4, %xmm4, %xmm5
        movq     %xmm5, %rdx
        vmovdqa 16(%rsp), %xmm5
        shrq    $63, %rdx
        imulq   $135, %rdx, %rdi
        movq    %xmm4, %rdx
        vmovq   %rdi, %xmm0
        vpsllq  $1, %xmm4, %xmm1
        shrq    $63, %rdx
        vpinsrq $1, %rdx, %xmm0, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rax)
        jmp     .L53

only improves things marginally.

Richard.

> Hardcoding the operands[1] setter in the peephole2 would mean we couldn't
> match some small number of unrelated insns in between, but perhaps the
> peephole2 condition could just call a function that walks the IL backward a
> little and checks where the setter is and what latency it has?
> 
> 	Jakub
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-03-08 12:01 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-08 11:04 [PATCH] target/98856 - split vpinsrq with new peephole2 Richard Biener
2021-03-08 11:16 ` Jakub Jelinek
2021-03-08 12:01   ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).