public inbox for gcc-cvs@sourceware.org help / color / mirror / Atom feed
From: Haochen Jiang <jianghc@gcc.gnu.org> To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-96] Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm Date: Thu, 20 Apr 2023 01:30:52 +0000 (GMT) [thread overview] Message-ID: <20230420013052.EB43D3858D33@sourceware.org> (raw) https://gcc.gnu.org/g:c2dac2e5fbbcdda013aa7b0609d579abec8120ec commit r14-96-gc2dac2e5fbbcdda013aa7b0609d579abec8120ec Author: Hu, Lin1 <lin1.hu@intel.com> Date: Mon Jan 16 11:23:09 2023 +0800 Optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk. We can optimize them to vblend, vmovaps when there's no cross-lane. gcc/ChangeLog: * config/i386/sse.md: Modify insn vperm{i,f} and vshuf{i,f}. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test. * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-1.c: New test. * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto. 
Diff: --- gcc/config/i386/sse.md | 36 ++++++++++-- .../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +- gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++++++++++++++++ gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c | 68 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c | 63 ++++++++++++++++++++ 8 files changed, 218 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5dca8dd1e27..b0d9c025fbe 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18438,6 +18438,8 @@ mask = INTVAL (operands[3]) / 2; mask |= (INTVAL (operands[5]) - 4) / 2 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !<mask_applied>) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -18596,6 +18598,9 @@ mask |= (INTVAL (operands[7]) - 8) / 4 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !<mask_applied>) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -25664,7 +25669,28 @@ (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPERMTI))] "TARGET_AVX2" - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + { + int mask = INTVAL (operands[3]); + if ((mask & 0xbb) == 16) + { + if (rtx_equal_p (operands[0], operands[1])) + return ""; + else + return "vmovaps\t{%1, %0|%0, %1}"; + } + if ((mask & 0xbb) == 50) + { + if (rtx_equal_p (operands[0], operands[2])) + return ""; + else + return "vmovaps\t{%2, %0|%0, %2}"; + } + if ((mask & 0xbb) == 18) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 48) + return "vblendps\t{$240, 
%2, %1, %0|%0, %1, %2, 240}"; + return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + } [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -26227,9 +26253,11 @@ && avx_vperm2f128_parallel (operands[3], <MODE>mode)" { int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1; - if (mask == 0x12) - return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; - if (mask == 0x20) + if ((mask & 0xbb) == 0x12) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 0x30) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + if ((mask & 0xbb) == 0x20) return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; operands[3] = GEN_INT (mask); return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c index 6c2fb2f184a..02aecf4edce 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f32x4 (x, x, 2); + x = _mm256_shuffle_f32x4 (x, x, 3); x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c index 1191b400134..563ded5d9df 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f64x2 (x, x, 2); + x = _mm256_shuffle_f64x2 (x, x, 3); x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c index ef9a441e7a5..e89c4140d37 100644 --- 
a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_i32x4 (x, x, 2); + x = _mm256_shuffle_i32x4 (x, x, 3); x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2); x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c index 0bd117e85d4..8e8e47eda38 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_i64x2 (x, x, 2); + x = _mm256_shuffle_i64x2 (x, x, 3); x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2); x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c new file mode 100644 index 00000000000..1ee00b6b4a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-times "vmovaps" 1 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */ + +#include<x86intrin.h> + +/* Vpermi128/Vpermf128 */ +__m256i +perm0 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 50); +} + +__m256i +perm1 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 18); +} + +__m256i +perm2 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 48); +} + +/* vshuf{i,f}{32x4,64x2} ymm .*/ +__m256i +shuff0 (__m256i a, __m256i b) +{ + return _mm256_shuffle_i32x4(a, b, 2); +} + +__m256 +shuff1 (__m256 a, __m256 b) +{ + return _mm256_shuffle_f32x4(a, b, 2); +} + +__m256i +shuff2 (__m256i a, __m256i b) +{ + 
return _mm256_shuffle_i64x2(a, b, 2); +} + +__m256d +shuff3 (__m256d a, __m256d b) +{ + return _mm256_shuffle_f64x2(a, b, 2); +} diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c new file mode 100644 index 00000000000..9775072b97a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c @@ -0,0 +1,68 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vblendps" } } */ +/* { dg-final { scan-assembler-not "vperm2i128" } } */ +/* { dg-final { scan-assembler-not "vperm2f128" } } */ + +#include<x86intrin.h> + +__m256i +perm0 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 16); +} + +__m256d +perm1 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 16); +} + +__m256 +perm2 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 16); +} + +__m256i +perm3 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 16); +} + +__m256i +perm4 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 20); +} + +__m256d +perm5 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 20); +} + +__m256i +perm6 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 80); +} + +__m256d +perm7 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 80); +} + +__m256i +perm8 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 84); +} + +__m256d +perm9 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 84); +} diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c new file mode 100644 index 00000000000..a330b14caca --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-times 
"vmov..." 3 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */ +/* { dg-final { scan-assembler-not "vperm2f128" } } */ + +#include<x86intrin.h> + +/* Vpermf128 */ +__m256 +perm0 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 50); +} + +__m256 +perm1 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 18); +} + +__m256 +perm2 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 48); +} + +__m256i +perm3 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 50); +} + +__m256i +perm4 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 18); +} + +__m256i +perm5 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 48); +} + +__m256d +perm6 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 50); +} + +__m256d +perm7 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 18); +} + +__m256d +perm8 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 48); +}
reply other threads:[~2023-04-20 1:30 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20230420013052.EB43D3858D33@sourceware.org \ --to=jianghc@gcc.gnu.org \ --cc=gcc-cvs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).