* [x86 PATCH] More use of m{32,64}bcst addressing modes with ternlog.
@ 2024-06-12 20:20 Roger Sayle
2024-06-13 0:35 ` Hongtao Liu
0 siblings, 1 reply; 2+ messages in thread
From: Roger Sayle @ 2024-06-12 20:20 UTC (permalink / raw)
To: gcc-patches; +Cc: 'Hongtao Liu', 'Uros Bizjak'
[-- Attachment #1: Type: text/plain, Size: 3889 bytes --]
This patch makes more use of m32bcst and m64bcst addressing modes in
ix86_expand_ternlog. Previously, the i386 backend would only consider
using a m32bcst if the inner mode of the vector was 32-bits, or using
m64bcst if the inner mode was 64-bits. For ternlog (and other logic
operations) this is a strange restriction, as how the same constant
is materialized is dependent upon the mode it is used/operated on.
Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't
use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202}
which has the same bit pattern would. This can be optimized by (re)checking
whether a CONST_VECTOR can be broadcast from memory after casting it
to VxSI (or for m64bcst to VxDI) where x has the appropriate vector size.
Taking the test case from pr115407:
__attribute__((__vector_size__(64))) char v;
void foo() {
v = v | v << 7;
}
Compiled with -O2 -mcmodel=large -mavx512bw
GCC 14 generates a 64-byte (512-bit) load from the constant pool:
foo: movabsq $v, %rax // 10
movabsq $.LC0, %rdx // 10
vpsllw $7, (%rax), %zmm1 // 7
vmovdqa64 (%rax), %zmm0 // 6
vpternlogd $248, (%rdx), %zmm1, %zmm0 // 7
vmovdqa64 %zmm0, (%rax) // 6
vzeroupper // 3
ret // 1
.LC0: .byte -128 // 64 = 114 bytes
.byte -128
;; repeated another 62 times
mainline currently materializes the constant with two instructions (movl and vpbroadcastd), using an interunit broadcast:
foo: movabsq $v, %rdx // 10
movl $-2139062144, %eax // 5
vmovdqa64 (%rdx), %zmm2 // 6
vpbroadcastd %eax, %zmm0 // 6
vpsllw $7, %zmm2, %zmm1 // 7
vpternlogd $236, %zmm0, %zmm2, %zmm1 // 7
vmovdqa64 %zmm1, (%rdx) // 6
vzeroupper // 3
ret // 1 = 51 bytes
With this patch, we now generate a broadcast addressing mode:
foo: movabsq $v, %rax // 10
movabsq $.LC1, %rdx // 10
vmovdqa64 (%rax), %zmm1 // 6
vpsllw $7, %zmm1, %zmm0 // 7
vpternlogd $236, (%rdx){1to16}, %zmm1, %zmm0 // 7
vmovdqa64 %zmm0, (%rax) // 6
vzeroupper // 3
ret // 1 = 50 total
Without -mcmodel=large, the benefit is two instructions:
foo: vmovdqa64 v(%rip), %zmm1 // 10
vpsllw $7, %zmm1, %zmm0 // 7
vpternlogd $236, .LC2(%rip){1to16}, %zmm1, %zmm0 // 11
vmovdqa64 %zmm0, v(%rip) // 10
vzeroupper // 3
ret // 1 = 42 total
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures. Ok for mainline?
2024-06-12 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing
logic operation in a different vector mode if that enables use of
a 32-bit or 64-bit broadcast addressing mode.
gcc/testsuite/ChangeLog
* gcc.target/i386/pr115407.c: New test case.
Thanks in advance,
Roger
--
[-- Attachment #2: patchtl9.txt --]
[-- Type: text/plain, Size: 2540 bytes --]
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 312329e..a4379b8 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26041,6 +26041,69 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
tmp2 = ix86_gen_bcst_mem (mode, op2);
if (!tmp2)
{
+ machine_mode bcst32_mode = mode;
+ machine_mode bcst64_mode = mode;
+ switch (mode)
+ {
+ case V1TImode:
+ case V4SImode:
+ case V4SFmode:
+ case V8HImode:
+ case V16QImode:
+ bcst32_mode = V4SImode;
+ bcst64_mode = V2DImode;
+ break;
+
+ case V2TImode:
+ case V8SImode:
+ case V8SFmode:
+ case V16HImode:
+ case V32QImode:
+ bcst32_mode = V8SImode;
+ bcst64_mode = V4DImode;
+ break;
+
+ case V4TImode:
+ case V16SImode:
+ case V16SFmode:
+ case V32HImode:
+ case V64QImode:
+ bcst32_mode = V16SImode;
+ bcst64_mode = V8DImode;
+ break;
+
+ default:
+ break;
+ }
+
+ if (bcst32_mode != mode)
+ {
+ tmp2 = gen_lowpart (bcst32_mode, op2);
+ if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
+ {
+ tmp2 = ix86_expand_ternlog (bcst32_mode,
+ gen_lowpart (bcst32_mode, tmp0),
+ gen_lowpart (bcst32_mode, tmp1),
+ tmp2, idx, NULL_RTX);
+ emit_move_insn (target, gen_lowpart (mode, tmp2));
+ return target;
+ }
+ }
+
+ if (bcst64_mode != mode)
+ {
+ tmp2 = gen_lowpart (bcst64_mode, op2);
+ if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
+ {
+ tmp2 = ix86_expand_ternlog (bcst64_mode,
+ gen_lowpart (bcst64_mode, tmp0),
+ gen_lowpart (bcst64_mode, tmp1),
+ tmp2, idx, NULL_RTX);
+ emit_move_insn (target, gen_lowpart (mode, tmp2));
+ return target;
+ }
+ }
+
tmp2 = force_const_mem (mode, op2);
rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
tmp2 = validize_mem (tmp2);
diff --git a/gcc/testsuite/gcc.target/i386/pr115407.c b/gcc/testsuite/gcc.target/i386/pr115407.c
new file mode 100644
index 0000000..b6cb7a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115407.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mcmodel=large -mavx512bw" } */
+__attribute__((__vector_size__(64))) char v;
+
+void foo() {
+ v = v | v << 7;
+}
+
+/* { dg-final { scan-assembler "vpternlog.*1to16" } } */
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [x86 PATCH] More use of m{32,64}bcst addressing modes with ternlog.
2024-06-12 20:20 [x86 PATCH] More use of m{32,64}bcst addressing modes with ternlog Roger Sayle
@ 2024-06-13 0:35 ` Hongtao Liu
0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2024-06-13 0:35 UTC (permalink / raw)
To: Roger Sayle; +Cc: gcc-patches, Uros Bizjak
On Thu, Jun 13, 2024 at 4:20 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch makes more use of m32bcst and m64bcst addressing modes in
> ix86_expand_ternlog. Previously, the i386 backend would only consider
> using a m32bcst if the inner mode of the vector was 32-bits, or using
> m64bcst if the inner mode was 64-bits. For ternlog (and other logic
> operations) this is a strange restriction, as how the same constant
> is materialized is dependent upon the mode it is used/operated on.
> Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't
> use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202}
> which has the same bit pattern would. This can be optimized by (re)checking
> whether a CONST_VECTOR can be broadcast from memory after casting it
> to VxSI (or for m64bcst to VxDI) where x has the appropriate vector size.
>
>
> Taking the test case from pr115407:
>
> __attribute__((__vector_size__(64))) char v;
> void foo() {
> v = v | v << 7;
> }
>
> Compiled with -O2 -mcmodel=large -mavx512bw
> GCC 14 generates a 64-byte (512-bit) load from the constant pool:
>
> foo: movabsq $v, %rax // 10
> movabsq $.LC0, %rdx // 10
> vpsllw $7, (%rax), %zmm1 // 7
> vmovdqa64 (%rax), %zmm0 // 6
> vpternlogd $248, (%rdx), %zmm1, %zmm0 // 7
> vmovdqa64 %zmm0, (%rax) // 6
> vzeroupper // 3
> ret // 1
> .LC0: .byte -128 // 64 = 114 bytes
> .byte -128
> ;; repeated another 62 times
>
> mainline currently generates two instructions, using interunit broadcast:
>
> foo: movabsq $v, %rdx // 10
> movl $-2139062144, %eax // 5
> vmovdqa64 (%rdx), %zmm2 // 6
> vpbroadcastd %eax, %zmm0 // 6
> vpsllw $7, %zmm2, %zmm1 // 7
> vpternlogd $236, %zmm0, %zmm2, %zmm1 // 7
> vmovdqa64 %zmm1, (%rdx) // 6
> vzeroupper // 3
> ret // 1 = 51 bytes
>
> With this patch, we now generate a broadcast addressing mode:
>
> foo: movabsq $v, %rax // 10
> movabsq $.LC1, %rdx // 10
> vmovdqa64 (%rax), %zmm1 // 6
> vpsllw $7, %zmm1, %zmm0 // 7
> vpternlogd $236, (%rdx){1to16}, %zmm1, %zmm0 // 7
> vmovdqa64 %zmm0, (%rax) // 6
> vzeroupper // 3
> ret // 1 = 50 total
>
> Without -mcmodel=large, the benefit is two instructions:
>
> foo: vmovdqa64 v(%rip), %zmm1 // 10
> vpsllw $7, %zmm1, %zmm0 // 7
> vpternlogd $236, .LC2(%rip){1to16}, %zmm1, %zmm0 // 11
> vmovdqa64 %zmm0, v(%rip) // 10
> vzeroupper // 3
> ret // 1 = 42 total
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures. Ok for mainline?
Ok.
>
>
> 2024-06-12 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing
> logic operation in a different vector mode if that enables use of
> a 32-bit or 64-bit broadcast addressing mode.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/pr115407.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2024-06-13 0:35 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-12 20:20 [x86 PATCH] More use of m{32,64}bcst addressing modes with ternlog Roger Sayle
2024-06-13 0:35 ` Hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).