From: "Kewen.Lin" <linkw@linux.ibm.com>
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Segher Boessenkool <segher@kernel.crashing.org>,
Bill Schmidt <wschmidt@linux.ibm.com>,
David Edelsohn <dje.gcc@gmail.com>
Subject: [PATCH] rs6000: Use direct move for char/short vector CTOR [PR96933]
Date: Tue, 8 Sep 2020 15:55:07 +0800 [thread overview]
Message-ID: <6ed4ed60-1bdb-4e9d-adfb-c08f5941576d@linux.ibm.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 928 bytes --]
Hi,
This patch is to make vector CTOR with char/short leverage direct
move instructions when they are available. With one constructed
test case, it can speed up 145% for char and 190% for short on P9.
Tested SPEC2017 x264_r at -Ofast on P9, it gets 1.61% speedup
(but based on unexpected SLP see PR96789).
Bootstrapped/regtested on powerpc64{,le}-linux-gnu P8 and
powerpc64le-linux-gnu P9.
Is it ok for trunk?
BR,
Kewen
------------
gcc/ChangeLog:
PR target/96933
* config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
instructions for vector construction with char/short types.
* config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
(p8_mtvsrd_v16qidi2): Likewise.
gcc/testsuite/ChangeLog:
PR target/96933
* gcc.target/powerpc/pr96933-1.c: New test.
* gcc.target/powerpc/pr96933-2.c: New test.
* gcc.target/powerpc/pr96933-3.c: New test.
* gcc.target/powerpc/pr96933.h: New test.
[-- Attachment #2: vec_CTOR.diff --]
[-- Type: text/plain, Size: 13514 bytes --]
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index ca5b71ecdd3..39d7e2e9451 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6411,11 +6411,11 @@ rs6000_expand_vector_init (rtx target, rtx vals)
{
machine_mode mode = GET_MODE (target);
machine_mode inner_mode = GET_MODE_INNER (mode);
- int n_elts = GET_MODE_NUNITS (mode);
+ unsigned int n_elts = GET_MODE_NUNITS (mode);
int n_var = 0, one_var = -1;
bool all_same = true, all_const_zero = true;
rtx x, mem;
- int i;
+ unsigned int i;
for (i = 0; i < n_elts; ++i)
{
@@ -6681,6 +6681,207 @@ rs6000_expand_vector_init (rtx target, rtx vals)
return;
}
+ if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
+ {
+ rtx op[16];
+ /* Force the values into word_mode registers. */
+ for (i = 0; i < n_elts; i++)
+ {
+ rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+ if (TARGET_POWERPC64)
+ {
+ op[i] = gen_reg_rtx (DImode);
+ emit_insn (gen_zero_extendqidi2 (op[i], tmp));
+ }
+ else
+ {
+ op[i] = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendqisi2 (op[i], tmp));
+ }
+ }
+
+ rtx vr_qi[16];
+ rtx vr_hi[8];
+ rtx vr_si[4];
+ rtx vr_di[2];
+
+ rtx (*merge_v16qi) (rtx, rtx, rtx) = NULL;
+ rtx (*merge_v8hi) (rtx, rtx, rtx) = NULL;
+ rtx (*merge_v4si) (rtx, rtx, rtx) = NULL;
+ rtx perm_idx;
+
+ /* Set up some common gen routines and values according to endianness. */
+ if (BYTES_BIG_ENDIAN)
+ {
+ if (mode == V16QImode)
+ {
+ merge_v16qi = gen_altivec_vmrghb;
+ merge_v8hi = gen_altivec_vmrglh;
+ }
+ else
+ merge_v8hi = gen_altivec_vmrghh;
+
+ merge_v4si = gen_altivec_vmrglw;
+ perm_idx = GEN_INT (3);
+ }
+ else
+ {
+ if (mode == V16QImode)
+ {
+ merge_v16qi = gen_altivec_vmrglb;
+ merge_v8hi = gen_altivec_vmrghh;
+ }
+ else
+ merge_v8hi = gen_altivec_vmrglh;
+
+ merge_v4si = gen_altivec_vmrghw;
+ perm_idx = GEN_INT (0);
+ }
+
+ if (TARGET_DIRECT_MOVE_128)
+ {
+ /* Take unsigned char big endianness as example for below comments,
+ the input values are: c1, c2, c3, c4, ..., c15, c16. */
+
+ rtx vr_mrg1[8];
+ /* Move to VSX register with vec_concat, each has 2 values.
+ eg: vr_mrg1[0] = { ... c1, ... c2 };
+ vr_mrg1[1] = { ... c3, ... c4 };
+ ... */
+ for (i = 0; i < n_elts / 2; i++)
+ {
+ rtx tmp = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vsx_concat_v2di (tmp, op[i * 2], op[i * 2 + 1]));
+ vr_mrg1[i] = gen_reg_rtx (V16QImode);
+ emit_move_insn (vr_mrg1[i], gen_lowpart (V16QImode, tmp));
+ }
+
+ /* Obtain the control vector for further merging. */
+ rtx vr_ctrl = gen_reg_rtx (V16QImode);
+ if (mode == V16QImode)
+ {
+ rtx val;
+ if (BYTES_BIG_ENDIAN)
+ val = gen_int_mode (0x070f171f, SImode);
+ else
+ val = gen_int_mode (0x18100800, SImode);
+ rtvec v = gen_rtvec (4, val, val, val, val);
+ rtx tmp = gen_reg_rtx (V4SImode);
+ emit_insn (gen_vec_initv4sisi (tmp,
+ gen_rtx_PARALLEL (V4SImode, v)));
+ emit_move_insn (vr_ctrl, gen_lowpart (V16QImode, tmp));
+ }
+ else
+ {
+ rtx val;
+ if (BYTES_BIG_ENDIAN)
+ val = gen_int_mode (0x06070e0f16171e1f, DImode);
+ else
+ val = gen_int_mode (0x1918111009080100, DImode);
+ rtvec v = gen_rtvec (2, val, val);
+ rtx tmp = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vec_initv2didi (tmp,
+ gen_rtx_PARALLEL (V2DImode, v)));
+ emit_move_insn (vr_ctrl, gen_lowpart (V16QImode, tmp));
+ }
+
+ rtx vr_mrg2[4];
+ /* Merge vectors with 2 values into vectors with 4 values.
+ eg: vr_mrg1[0] = { ... c1, ... c2 };
+ vr_mrg1[1] = { ... c3, ... c4 };
+ ...
+ =>
+ vr_mrg2[0] = { ... c1, c2, c3, c4 };
+ ... */
+ for (i = 0; i < n_elts / 4; i++)
+ {
+ rtx tmp = gen_reg_rtx (V16QImode);
+ emit_insn (gen_altivec_vperm_v16qi (tmp, vr_mrg1[i * 2],
+ vr_mrg1[i * 2 + 1],
+ gen_lowpart (V16QImode,
+ vr_ctrl)));
+ if (mode == V16QImode)
+ {
+ vr_mrg2[i] = gen_reg_rtx (V4SImode);
+ emit_move_insn (vr_mrg2[i], gen_lowpart (V4SImode, tmp));
+ }
+ else
+ {
+ vr_di[i] = gen_reg_rtx (V2DImode);
+ emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+ }
+ }
+
+ /* Merge vectors with 4 values into vectors with 8 values.
+ eg: vr_mrg2[0] = { ... c1, c2, c3, c4 };
+ vr_mrg2[1] = { ... c5, c6, c7, c8 };
+ ...
+ =>
+ vr_di[0] = { ... c1, c2, c3, c4, c5, c6, c7, c8 };
+ ... */
+ if (mode == V16QImode)
+ {
+ for (i = 0; i < 2; i++)
+ {
+ rtx tmp = gen_reg_rtx (V4SImode);
+ emit_insn (merge_v4si (tmp, vr_mrg2[2 * i],
+ vr_mrg2[2 * i + 1]));
+ vr_di[i] = gen_reg_rtx (V2DImode);
+ emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+ }
+ }
+ }
+ else
+ {
+ /* Move to VSX register. */
+ for (i = 0; i < n_elts; i++)
+ {
+ vr_qi[i] = gen_reg_rtx (V16QImode);
+ if (TARGET_POWERPC64)
+ emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
+ else
+ emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
+ }
+
+ /* Merge/move into vector short. */
+ for (i = 0; i < 8; i++)
+ {
+ rtx tmp = vr_qi[i];
+ if (mode == V16QImode)
+ {
+ tmp = gen_reg_rtx (V16QImode);
+ emit_insn (merge_v16qi (tmp, vr_qi[2 * i], vr_qi[2 * i + 1]));
+ }
+ vr_hi[i] = gen_reg_rtx (V8HImode);
+ emit_move_insn (vr_hi[i], gen_lowpart (V8HImode, tmp));
+ }
+
+ /* Merge vector short into vector int. */
+ for (i = 0; i < 4; i++)
+ {
+ rtx tmp = gen_reg_rtx (V8HImode);
+ emit_insn (merge_v8hi (tmp, vr_hi[2 * i], vr_hi[2 * i + 1]));
+ vr_si[i] = gen_reg_rtx (V4SImode);
+ emit_move_insn (vr_si[i], gen_lowpart (V4SImode, tmp));
+ }
+
+ /* Merge vector int into vector long long. */
+ for (i = 0; i < 2; i++)
+ {
+ rtx tmp = gen_reg_rtx (V4SImode);
+ emit_insn (merge_v4si (tmp, vr_si[2 * i], vr_si[2 * i + 1]));
+ vr_di[i] = gen_reg_rtx (V2DImode);
+ emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+ }
+ }
+
+ rtx res = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vsx_xxpermdi_v2di (res, vr_di[0], vr_di[1], perm_idx));
+ emit_insn (gen_rtx_SET (target, gen_lowpart (mode, res)));
+
+ return;
+ }
+
/* Construct the vector in memory one field at a time
and load the whole vector. */
mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 43b620ae1c0..4d6bd193a10 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8713,6 +8713,22 @@ (define_insn "p8_mtvsrwz"
"mtvsrwz %x0,%1"
[(set_attr "type" "mftgpr")])
+(define_insn "p8_mtvsrwz_v16qisi2"
+ [(set (match_operand:V16QI 0 "register_operand" "=wa")
+ (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
+ UNSPEC_P8V_MTVSRWZ))]
+ "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+ "mtvsrwz %x0,%1"
+ [(set_attr "type" "mftgpr")])
+
+(define_insn "p8_mtvsrd_v16qidi2"
+ [(set (match_operand:V16QI 0 "register_operand" "=wa")
+ (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
+ UNSPEC_P8V_MTVSRD))]
+ "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+ "mtvsrd %x0,%1"
+ [(set_attr "type" "mftgpr")])
+
(define_insn_and_split "reload_fpr_from_gpr<mode>"
[(set (match_operand:FMOVE64X 0 "register_operand" "=d")
(unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-1.c b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
new file mode 100644
index 00000000000..eaa4ca5cd5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short type values whether use 128bit
+ direct move instructions mtvsrdd on Power9 or later, rather than transfering
+ with memory store/load with stb/sth and vector load. */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 24 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-2.c b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
new file mode 100644
index 00000000000..e8d57d67aaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target powerpc_p8vector_ok } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short type values whether use direct
+ move instructions like mtvsrd/mtvsrwz on Power8, rather than transfering
+ with memory store/load with stb/sth and vector load. */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 48 {target lp64 } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 48 {target {! lp64 } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-3.c b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
new file mode 100644
index 00000000000..fd0c6926c5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short run successfully. */
+
+#include <stdlib.h>
+#include "pr96933.h"
+
+int
+main ()
+{
+ unsigned char uc[16];
+ signed char sc[16];
+
+ for (int i = 0; i < 16; i++)
+ {
+ uc[i] = (unsigned char) (i * 2 + 1);
+ sc[i] = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+ }
+
+ vector unsigned char ucv
+ = test_uchar (uc[0], uc[1], uc[2], uc[3], uc[4], uc[5], uc[6], uc[7], uc[8],
+ uc[9], uc[10], uc[11], uc[12], uc[13], uc[14], uc[15]);
+ vector signed char scv
+ = test_schar (sc[0], sc[1], sc[2], sc[3], sc[4], sc[5], sc[6], sc[7], sc[8],
+ sc[9], sc[10], sc[11], sc[12], sc[13], sc[14], sc[15]);
+
+ for (int i = 0; i < 16; i++)
+ {
+ unsigned char uexp = (unsigned char) (i * 2 + 1);
+ signed char sexp = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+ if (ucv[i] != uexp)
+ abort ();
+ if (scv[i] != sexp)
+ abort ();
+ }
+
+ unsigned short us[8];
+ signed short ss[8];
+ for (int i = 0; i < 8; i++)
+ {
+ us[i] = (unsigned short) (i * 2 + 1);
+ ss[i] = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+ }
+
+ vector unsigned short usv
+ = test_ushort (us[0], us[1], us[2], us[3], us[4], us[5], us[6], us[7]);
+ vector signed short ssv
+ = test_sshort (ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7]);
+
+ for (int i = 0; i < 8; i++)
+ {
+ unsigned short uexp = (unsigned short) (i * 2 + 1);
+ signed short sexp = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+ if (usv[i] != uexp)
+ abort ();
+ if (ssv[i] != sexp)
+ abort ();
+ }
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933.h b/gcc/testsuite/gcc.target/powerpc/pr96933.h
new file mode 100644
index 00000000000..4bc2b941e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933.h
@@ -0,0 +1,50 @@
+/* Source file for pr96933-*.c testing, this mainly contains 4
+ functions as below:
+
+ - test_uchar // vector unsigned char
+ - test_schar // vector signed char
+ - test_ushort // vector unsigned short
+ - test_sshort // vector signed short
+*/
+
+__attribute__ ((noipa)) vector unsigned char
+test_uchar (unsigned char c1, unsigned char c2, unsigned char c3,
+ unsigned char c4, unsigned char c5, unsigned char c6,
+ unsigned char c7, unsigned char c8, unsigned char c9,
+ unsigned char c10, unsigned char c11, unsigned char c12,
+ unsigned char c13, unsigned char c14, unsigned char c15,
+ unsigned char c16)
+{
+ vector unsigned char v
+ = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+ return v;
+}
+
+__attribute__ ((noipa)) vector signed char
+test_schar (signed char c1, signed char c2, signed char c3, signed char c4,
+ signed char c5, signed char c6, signed char c7, signed char c8,
+ signed char c9, signed char c10, signed char c11, signed char c12,
+ signed char c13, signed char c14, signed char c15, signed char c16)
+{
+ vector signed char v
+ = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+ return v;
+}
+
+__attribute__ ((noipa)) vector unsigned short
+test_ushort (unsigned short c1, unsigned short c2, unsigned short c3,
+ unsigned short c4, unsigned short c5, unsigned short c6,
+ unsigned short c7, unsigned short c8)
+{
+ vector unsigned short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+ return v;
+}
+
+__attribute__ ((noipa)) vector signed short
+test_sshort (signed short c1, signed short c2, signed short c3,
+ signed short c4, signed short c5, signed short c6,
+ signed short c7, signed short c8)
+{
+ vector signed short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+ return v;
+}
next reply other threads:[~2020-09-08 7:55 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-09-08 7:55 Kewen.Lin [this message]
2020-09-10 3:19 ` [PATCH v2] " Kewen.Lin
2020-10-13 6:59 ` PING^1 " Kewen.Lin
2020-11-02 9:11 ` PING^2 " Kewen.Lin
2020-11-02 13:44 ` David Edelsohn
2020-11-03 7:25 ` [PATCH v3] " Kewen.Lin
2020-11-03 16:45 ` Segher Boessenkool
2020-11-05 8:27 ` Kewen.Lin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6ed4ed60-1bdb-4e9d-adfb-c08f5941576d@linux.ibm.com \
--to=linkw@linux.ibm.com \
--cc=dje.gcc@gmail.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=segher@kernel.crashing.org \
--cc=wschmidt@linux.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).