public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> @ 2021-09-05 4:32 ` pinskia at gcc dot gnu.org 2021-09-08 10:13 ` crazylht at gmail dot com ` (6 subsequent siblings) 7 siblings, 0 replies; 8+ messages in thread From: pinskia at gcc dot gnu.org @ 2021-09-05 4:32 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 Andrew Pinski <pinskia at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Severity|normal |enhancement ^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> 2021-09-05 4:32 ` [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element pinskia at gcc dot gnu.org @ 2021-09-08 10:13 ` crazylht at gmail dot com 2021-09-09 1:33 ` cvs-commit at gcc dot gnu.org ` (5 subsequent siblings) 7 siblings, 0 replies; 8+ messages in thread From: crazylht at gmail dot com @ 2021-09-08 10:13 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 --- Comment #6 from Hongtao.liu <crazylht at gmail dot com> --- For elements located above 128bits, it seems always better(?) to use valign{d,q} diff --git a/origin.s b/after.s index 9a7dfee..9a23f7e 100644 --- a/origin.s +++ b/after.s @@ -6,7 +6,7 @@ foo_v8sf_4: .LFB0: .cfi_startproc - vextractf128 $0x1, %ymm0, %xmm0 + valignd $4, %ymm0, %ymm0, %ymm0 ret .cfi_endproc .LFE0: @@ -17,8 +17,7 @@ foo_v8sf_4: foo_v8sf_7: .LFB1: .cfi_startproc - vextractf128 $0x1, %ymm0, %xmm0 - vshufps $255, %xmm0, %xmm0, %xmm0 + valignd $7, %ymm0, %ymm0, %ymm0 ret .cfi_endproc .LFE1: @@ -29,8 +28,8 @@ foo_v8sf_7: foo_v8si_4: .LFB2: .cfi_startproc - vextracti128 $0x1, %ymm0, %xmm0 - vmovd %xmm0, %eax + valignd $4, %ymm0, %ymm0, %ymm1 + vmovd %xmm1, %eax ret .cfi_endproc .LFE2: @@ -41,8 +40,8 @@ foo_v8si_4: foo_v8si_7: .LFB3: .cfi_startproc - vextracti128 $0x1, %ymm0, %xmm0 - vpextrd $3, %xmm0, %eax + valignd $7, %ymm0, %ymm0, %ymm1 + vmovd %xmm1, %eax ret .cfi_endproc .LFE3: @@ -53,7 +52,7 @@ foo_v8si_7: foo_v16sf_8: .LFB4: .cfi_startproc - vextractf32x8 $0x1, %zmm0, %ymm0 + valignd $8, %zmm0, %zmm0, %zmm0 ret .cfi_endproc .LFE4: @@ -64,9 +63,7 @@ foo_v16sf_8: foo_v16sf_15: .LFB5: .cfi_startproc - vextractf32x8 $0x1, %zmm0, %ymm0 - vextractf128 $0x1, %ymm0, %xmm0 - vshufps $255, %xmm0, %xmm0, %xmm0 + valignd $15, %zmm0, %zmm0, %zmm0 ret .cfi_endproc .LFE5: @@ 
-77,8 +74,8 @@ foo_v16sf_15: foo_v16si_8: .LFB6: .cfi_startproc - vextracti32x8 $0x1, %zmm0, %ymm0 - vmovd %xmm0, %eax + valignd $8, %zmm0, %zmm0, %zmm1 + vmovd %xmm1, %eax ret .cfi_endproc .LFE6: @@ -89,9 +86,8 @@ foo_v16si_8: foo_v16si_15: .LFB7: .cfi_startproc - vextracti32x8 $0x1, %zmm0, %ymm0 - vextracti128 $0x1, %ymm0, %xmm0 - vpextrd $3, %xmm0, %eax + valignd $15, %zmm0, %zmm0, %zmm1 + vmovd %xmm1, %eax ret .cfi_endproc .LFE7: @@ -102,7 +98,7 @@ foo_v16si_15: foo_v4df_2: .LFB8: .cfi_startproc - vextractf64x2 $0x1, %ymm0, %xmm0 + valignq $2, %ymm0, %ymm0, %ymm0 ret .cfi_endproc .LFE8: @@ -113,8 +109,7 @@ foo_v4df_2: foo_v4df_3: .LFB9: .cfi_startproc - vextractf64x2 $0x1, %ymm0, %xmm0 - vunpckhpd %xmm0, %xmm0, %xmm0 + valignq $3, %ymm0, %ymm0, %ymm0 ret .cfi_endproc .LFE9: @@ -125,8 +120,8 @@ foo_v4df_3: foo_v4di_2: .LFB10: .cfi_startproc - vextracti64x2 $0x1, %ymm0, %xmm0 - vmovq %xmm0, %rax + valignq $2, %ymm0, %ymm0, %ymm1 + vmovq %xmm1, %rax ret .cfi_endproc .LFE10: @@ -137,8 +132,8 @@ foo_v4di_2: foo_v4di_3: .LFB11: .cfi_startproc - vextracti64x2 $0x1, %ymm0, %xmm0 - vpextrq $1, %xmm0, %rax + valignq $3, %ymm0, %ymm0, %ymm1 + vmovq %xmm1, %rax ret .cfi_endproc .LFE11: @@ -149,7 +144,7 @@ foo_v4di_3: foo_v8df_4: .LFB12: .cfi_startproc - vextractf64x4 $0x1, %zmm0, %ymm0 + valignq $4, %zmm0, %zmm0, %zmm0 ret .cfi_endproc .LFE12: @@ -160,9 +155,7 @@ foo_v8df_4: foo_v8df_7: .LFB13: .cfi_startproc - vextractf64x4 $0x1, %zmm0, %ymm0 - vextractf64x2 $0x1, %ymm0, %xmm0 - vunpckhpd %xmm0, %xmm0, %xmm0 + valignq $7, %zmm0, %zmm0, %zmm0 ret .cfi_endproc .LFE13: @@ -173,8 +166,8 @@ foo_v8df_7: foo_v8di_4: .LFB14: .cfi_startproc - vextracti64x4 $0x1, %zmm0, %ymm0 - vmovq %xmm0, %rax + valignq $4, %zmm0, %zmm0, %zmm1 + vmovq %xmm1, %rax ret .cfi_endproc .LFE14: @@ -185,12 +178,11 @@ foo_v8di_4: foo_v8di_7: .LFB15: .cfi_startproc - vextracti64x4 $0x1, %zmm0, %ymm0 - vextracti64x2 $0x1, %ymm0, %xmm0 - vpextrq $1, %xmm0, %rax + valignq $7, %zmm0, %zmm0, %zmm1 + vmovq 
%xmm1, %rax ret .cfi_endproc .LFE15: .size foo_v8di_7, .-foo_v8di_7 - .ident "GCC: (GNU) 12.0.0 20210907 (experimental)" + .ident "GCC: (GNU) 12.0.0 20210908 (experimental)" .section .note.GNU-stack,"",@progbits ^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> 2021-09-05 4:32 ` [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element pinskia at gcc dot gnu.org 2021-09-08 10:13 ` crazylht at gmail dot com @ 2021-09-09 1:33 ` cvs-commit at gcc dot gnu.org 2021-09-09 1:35 ` crazylht at gmail dot com ` (4 subsequent siblings) 7 siblings, 0 replies; 8+ messages in thread From: cvs-commit at gcc dot gnu.org @ 2021-09-09 1:33 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 --- Comment #7 from CVS Commits <cvs-commit at gcc dot gnu.org> --- The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>: https://gcc.gnu.org/g:60eec23b5eda0f350e572586eee738eab0804a74 commit r12-3425-g60eec23b5eda0f350e572586eee738eab0804a74 Author: liuhongt <hongtao.liu@intel.com> Date: Wed Sep 8 16:19:37 2021 +0800 Optimize vec_extract for 256/512-bit vector when index exceeds the lower 128 bits. - vextracti32x8 $0x1, %zmm0, %ymm0 - vmovd %xmm0, %eax + valignd $8, %zmm0, %zmm0, %zmm1 + vmovd %xmm1, %eax - vextracti32x8 $0x1, %zmm0, %ymm0 - vextracti128 $0x1, %ymm0, %xmm0 - vpextrd $3, %xmm0, %eax + valignd $15, %zmm0, %zmm0, %zmm1 + vmovd %xmm1, %eax - vextractf64x2 $0x1, %ymm0, %xmm0 + valignq $2, %ymm0, %ymm0, %ymm0 - vextractf64x4 $0x1, %zmm0, %ymm0 - vextractf64x2 $0x1, %ymm0, %xmm0 - vunpckhpd %xmm0, %xmm0, %xmm0 + valignq $7, %zmm0, %zmm0, %zmm0 gcc/ChangeLog: PR target/91103 * config/i386/sse.md (*vec_extract<mode><ssescalarmodelower>_valign): New define_insn. gcc/testsuite/ChangeLog: PR target/91103 * gcc.target/i386/pr91103-1.c: New test. * gcc.target/i386/pr91103-2.c: New test. ^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> ` (2 preceding siblings ...) 2021-09-09 1:33 ` cvs-commit at gcc dot gnu.org @ 2021-09-09 1:35 ` crazylht at gmail dot com 2021-09-11 7:54 ` peter at cordes dot ca ` (3 subsequent siblings) 7 siblings, 0 replies; 8+ messages in thread From: crazylht at gmail dot com @ 2021-09-09 1:35 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 --- Comment #8 from Hongtao.liu <crazylht at gmail dot com> --- Fixed in GCC12. ^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> ` (3 preceding siblings ...) 2021-09-09 1:35 ` crazylht at gmail dot com @ 2021-09-11 7:54 ` peter at cordes dot ca 2021-09-13 1:16 ` crazylht at gmail dot com ` (2 subsequent siblings) 7 siblings, 0 replies; 8+ messages in thread From: peter at cordes dot ca @ 2021-09-11 7:54 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 --- Comment #9 from Peter Cordes <peter at cordes dot ca> --- Thanks for implementing my idea :) (In reply to Hongtao.liu from comment #6) > For elements located above 128bits, it seems always better(?) to use > valign{d,q} TL:DR: I think we should still use vextracti* / vextractf* when that can get the job done in a single instruction, especially when the VEX-encoded vextracti/f128 can save a byte of code size for v[4]. Extracts are simpler shuffles that might have better throughput on some future CPUs, especially the upcoming Zen4, so even without code-size savings we should use them when possible. Tiger Lake has a 256-bit shuffle unit on port 1 that supports some common shuffles (like vpshufb); a future Intel might add 256->128-bit extracts to that. It might also save a tiny bit of power, allowing on-average higher turbo clocks. --- On current CPUs with AVX-512, valignd is about equal to a single vextract, and better than multiple instruction. It doesn't really have downsides on current Intel, since I think Intel has continued to not have int/FP bypass delays for shuffles. We don't know yet what AMD's Zen4 implementation of AVX-512 will look like. If it's like Zen1 was AVX2 (i.e. if it decodes 512-bit instructions other than insert/extract into at least 2x 256-bit uops) a lane-crossing shuffle like valignd probably costs more than 2 uops. (vpermq is more than 2 uops on Piledriver/Zen1). 
But a 128-bit extract will probably cost just one uop. (And especially an extract of the high 256 might be very cheap and low latency, like vextracti128 on Zen1, so we might prefer vextracti64x4 for v[8].) So this change is good, but using a vextracti64x2 or vextracti64x4 could be a useful peephole optimization when byte_offset % 16 == 0. Or of course vextracti128 when possible (x/ymm0..15, not 16..31 which are only accessible with an EVEX-encoded instruction). vextractf-whatever allows an FP shuffle on FP data in case some future CPU cares about that for shuffles. An extract is a simpler shuffle that might have better throughput on some future CPU even with full-width execution units. Some future Intel CPU might add support for vextract uops to the extra shuffle unit on port 1. (Which is available when no 512-bit uops are in flight.) Currently (Ice Lake / Tiger Lake) it can only run some common shuffles like vpshufb ymm, but not including any vextract or valign. Of course port 1 vector ALUs are shut down when 512-bit uops are in flight, but could be relevant for __m256 vectors on these hypothetical future CPUs. When we can get the job done with a single vextract-something, we should use that instead of valignd. Otherwise use valignd. We already check the index for low-128 special cases to use vunpckhqdq vs. vpshufd (or vpsrldq) or similar FP shuffles. ----- On current Intel, with clean YMM/ZMM uppers (known by the CPU hardware to be zero), an extract that only writes a 128-bit register will keep them clean (even if it reads a ZMM), not needing a VZEROUPPER. Since VZEROUPPER is only needed for dirty y/zmm0..15, not with dirty zmm16..31, so a function like float foo(float *p) { some vector stuff that can use high zmm regs; return scalar that happens to be from the middle of a vector; } could vextract into XMM0, but would need vzeroupper if it used valignd into ZMM0. 
(Also related https://stackoverflow.com/questions/58568514/does-skylake-need-vzeroupper-for-turbo-clocks-to-recover-after-a-512-bit-instruc re reading a ZMM at all and turbo clock). --- Having known zeros outside the low 128 bits (from writing an xmm instead of rotating a zmm) is unlikely to matter, although for FP stuff copying fewer elements that might be subnormal could happen to be an advantage, maybe saving an FP assist for denormal. We're unlikely to be able to take advantage of it to save instructions/uops (like OR instead of blend). But it's not worse to use a single extract instruction instead of a single valignd. ^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> ` (4 preceding siblings ...) 2021-09-11 7:54 ` peter at cordes dot ca @ 2021-09-13 1:16 ` crazylht at gmail dot com 2021-09-15 8:38 ` cvs-commit at gcc dot gnu.org 2023-07-12 11:20 ` rguenth at gcc dot gnu.org 7 siblings, 0 replies; 8+ messages in thread From: crazylht at gmail dot com @ 2021-09-13 1:16 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 --- Comment #10 from Hongtao.liu <crazylht at gmail dot com> --- (In reply to Peter Cordes from comment #9) > Thanks for implementing my idea :) > > (In reply to Hongtao.liu from comment #6) > > For elements located above 128bits, it seems always better(?) to use > > valign{d,q} > > TL:DR: > I think we should still use vextracti* / vextractf* when that can get the > job done in a single instruction, especially when the VEX-encoded > vextracti/f128 can save a byte of code size for v[4]. > > Extracts are simpler shuffles that might have better throughput on some > future CPUs, especially the upcoming Zen4, so even without code-size savings > we should use them when possible. Tiger Lake has a 256-bit shuffle unit on > port 1 that supports some common shuffles (like vpshufb); a future Intel > might add 256->128-bit extracts to that. > > It might also save a tiny bit of power, allowing on-average higher turbo > clocks. > > --- > > On current CPUs with AVX-512, valignd is about equal to a single vextract, Yes, they're equal but consider the below comments, I think it's reasonable to use vextract instead of valign for byte_offset % 16 == 0. > and better than multiple instruction. It doesn't really have downsides on > current Intel, since I think Intel has continued to not have int/FP bypass > delays for shuffles. > > We don't know yet what AMD's Zen4 implementation of AVX-512 will look like.
> If it's like Zen1 was AVX2 (i.e. if it decodes 512-bit instructions other > than insert/extract into at least 2x 256-bit uops) a lane-crossing shuffle > like valignd probably costs more than 2 uops. (vpermq is more than 2 uops > on Piledriver/Zen1). But a 128-bit extract will probably cost just one uop. > (And especially an extract of the high 256 might be very cheap and low > latency, like vextracti128 on Zen1, so we might prefer vextracti64x4 for > v[8].) > > So this change is good, but using a vextracti64x2 or vextracti64x4 could be > a useful peephole optimization when byte_offset % 16 == 0. Or of course > vextracti128 when possible (x/ymm0..15, not 16..31 which are only accessible > with an EVEX-encoded instruction). > > vextractf-whatever allows an FP shuffle on FP data in case some future CPU > cares about that for shuffles. > > An extract is a simpler shuffle that might have better throughput on some > future CPU even with full-width execution units. Some future Intel CPU > might add support for vextract uops to the extra shuffle unit on port 1. > (Which is available when no 512-bit uops are in flight.) Currently (Ice > Lake / Tiger Lake) it can only run some common shuffles like vpshufb ymm, > but not including any vextract or valign. Of course port 1 vector ALUs are > shut down when 512-bit uops are in flight, but could be relevant for __m256 > vectors on these hypothetical future CPUs. > > When we can get the job done with a single vextract-something, we should use > that instead of valignd. Otherwise use valignd. > > We already check the index for low-128 special cases to use vunpckhqdq vs. > vpshufd (or vpsrldq) or similar FP shuffles. > > ----- > > On current Intel, with clean YMM/ZMM uppers (known by the CPU hardware to be > zero), an extract that only writes a 128-bit register will keep them clean > (even if it reads a ZMM), not needing a VZEROUPPER. 
Since VZEROUPPER is > only needed for dirty y/zmm0..15, not with dirty zmm16..31, so a function > like > > float foo(float *p) { > some vector stuff that can use high zmm regs; > return scalar that happens to be from the middle of a vector; > } > > could vextract into XMM0, but would need vzeroupper if it used valignd into > ZMM0. > > (Also related > https://stackoverflow.com/questions/58568514/does-skylake-need-vzeroupper- > for-turbo-clocks-to-recover-after-a-512-bit-instruc re reading a ZMM at all > and turbo clock). > > --- > > Having known zeros outside the low 128 bits (from writing an xmm instead of > rotating a zmm) is unlikely to matter, although for FP stuff copying fewer > elements that might be subnormal could happen to be an advantage, maybe > saving an FP assist for denormal. We're unlikely to be able to take > advantage of it to save instructions/uops (like OR instead of blend). But > it's not worse to use a single extract instruction instead of a single > valignd. ^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> ` (5 preceding siblings ...) 2021-09-13 1:16 ` crazylht at gmail dot com @ 2021-09-15 8:38 ` cvs-commit at gcc dot gnu.org 2023-07-12 11:20 ` rguenth at gcc dot gnu.org 7 siblings, 0 replies; 8+ messages in thread From: cvs-commit at gcc dot gnu.org @ 2021-09-15 8:38 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 --- Comment #11 from CVS Commits <cvs-commit at gcc dot gnu.org> --- The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>: https://gcc.gnu.org/g:243e0a5b1942879bc005bf150a744e69a4fcdc87 commit r12-3542-g243e0a5b1942879bc005bf150a744e69a4fcdc87 Author: liuhongt <hongtao.liu@intel.com> Date: Mon Sep 13 10:27:51 2021 +0800 Output vextract{i,f}{32x4,64x2} for (vec_select:(reg:Vmode) idx) when byte_offset of idx % 16 == 0. 2020-09-13 Hongtao Liu <hongtao.liu@intel.com> Peter Cordes <peter@cordes.ca> gcc/ChangeLog: PR target/91103 * config/i386/sse.md (extract_suf): Add V8SF/V8SI/V4DF/V4DI. (*vec_extract<mode><ssescalarmodelower>_valign): Output vextract{i,f}{32x4,64x2} instruction when byte_offset % 16 == 0. gcc/testsuite/ChangeLog: PR target/91103 * gcc.target/i386/pr91103-1.c: Add extract tests. * gcc.target/i386/pr91103-2.c: Ditto. ^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> ` (6 preceding siblings ...) 2021-09-15 8:38 ` cvs-commit at gcc dot gnu.org @ 2023-07-12 11:20 ` rguenth at gcc dot gnu.org 7 siblings, 0 replies; 8+ messages in thread From: rguenth at gcc dot gnu.org @ 2023-07-12 11:20 UTC (permalink / raw) To: gcc-bugs https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91103 Richard Biener <rguenth at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Status|NEW |RESOLVED Target Milestone|--- |12.0 Resolution|--- |FIXED --- Comment #12 from Richard Biener <rguenth at gcc dot gnu.org> --- Fixed. ^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2023-07-12 11:20 UTC | newest] Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <bug-91103-4@http.gcc.gnu.org/bugzilla/> 2021-09-05 4:32 ` [Bug target/91103] AVX512 vector element extract uses more than 1 shuffle instruction; VALIGND can grab any element pinskia at gcc dot gnu.org 2021-09-08 10:13 ` crazylht at gmail dot com 2021-09-09 1:33 ` cvs-commit at gcc dot gnu.org 2021-09-09 1:35 ` crazylht at gmail dot com 2021-09-11 7:54 ` peter at cordes dot ca 2021-09-13 1:16 ` crazylht at gmail dot com 2021-09-15 8:38 ` cvs-commit at gcc dot gnu.org 2023-07-12 11:20 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).