* [PATCH 1/4] Add support for floating point vector pair built-in functions
2023-11-10 23:02 [PATCH 0/4] Add vector pair builtins to PowerPC Michael Meissner
@ 2023-11-10 23:09 ` Michael Meissner
2023-11-10 23:11 ` [PATCH 2/4] Add support for integer vector pair built-ins Michael Meissner
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Michael Meissner @ 2023-11-10 23:09 UTC (permalink / raw)
To: Michael Meissner, gcc-patches, Segher Boessenkool, Kewen.Lin,
David Edelsohn, Peter Bergner
This patch adds a series of built-in functions to allow users to write code to
do a number of simple operations where the loop is done using the __vector_pair
type. The __vector_pair type is an opaque type. These built-in functions keep
the two 128-bit vectors within the __vector_pair together, and split the
operation after register allocation.
This patch provides vector pair operations for 32-bit floating point and 64-bit
floating point.
I have built and tested these patches on:
* A little endian power10 server using --with-cpu=power10
* A little endian power9 server using --with-cpu=power9
* A big endian power9 server using --with-cpu=power9.
Can I check this patch into the master branch?
2023-11-09 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_*): Add vector
pair built-in functions for float.
(__builtin_vpair_f64_*): Add vector pair built-in functions for double.
* config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
declaration.
(split_binary_vector_pair): Likewise.
(split_fma_vector_pair): Likewise.
* config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
for vector pair built-in functions.
(split_binary_vector_pair): Likewise.
(split_fma_vector_pair): Likewise.
* config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
* config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
* config/rs6000/vector-pair.md: New file.
* doc/extend.texi (PowerPC Vector Pair Built-in Functions Available on
ISA 3.1): Document the floating point and general vector pair built-in functions.
gcc/testsuite/
* gcc.target/powerpc/vector-pair-1.c: New test.
* gcc.target/powerpc/vector-pair-2.c: New test.
* gcc.target/powerpc/vector-pair-3.c: New test.
* gcc.target/powerpc/vector-pair-4.c: New test.
---
gcc/config/rs6000/rs6000-builtins.def | 52 +++
gcc/config/rs6000/rs6000-protos.h | 5 +
gcc/config/rs6000/rs6000.cc | 74 ++++
gcc/config/rs6000/rs6000.md | 1 +
gcc/config/rs6000/t-rs6000 | 1 +
gcc/config/rs6000/vector-pair.md | 329 ++++++++++++++++++
gcc/doc/extend.texi | 46 +++
.../gcc.target/powerpc/vector-pair-1.c | 135 +++++++
.../gcc.target/powerpc/vector-pair-2.c | 134 +++++++
.../gcc.target/powerpc/vector-pair-3.c | 60 ++++
.../gcc.target/powerpc/vector-pair-4.c | 60 ++++
11 files changed, 897 insertions(+)
create mode 100644 gcc/config/rs6000/vector-pair.md
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index ce40600e803..89b248b50ef 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4131,3 +4131,55 @@
void __builtin_vsx_stxvp (v256, unsigned long, const v256 *);
STXVP nothing {mma,pair}
+
+;; vector pair built-in functions for 8 32-bit float values
+
+ v256 __builtin_vpair_f32_abs (v256);
+ VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+ v256 __builtin_vpair_f32_add (v256, v256);
+ VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_fma (v256, v256, v256);
+ VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+ v256 __builtin_vpair_f32_max (v256, v256);
+ VPAIR_F32_MAX vpair_smax_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_min (v256, v256);
+ VPAIR_F32_MIN vpair_smin_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_mul (v256, v256);
+ VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_neg (v256);
+ VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+ v256 __builtin_vpair_f32_sub (v256, v256);
+ VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+;; vector pair built-in functions for 4 64-bit double values
+
+ v256 __builtin_vpair_f64_abs (v256);
+ VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+ v256 __builtin_vpair_f64_add (v256, v256);
+ VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_fma (v256, v256, v256);
+ VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+ v256 __builtin_vpair_f64_max (v256, v256);
+ VPAIR_F64_MAX vpair_smax_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_min (v256, v256);
+ VPAIR_F64_MIN vpair_smin_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_mul (v256, v256);
+ VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_neg (v256);
+ VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+ v256 __builtin_vpair_f64_sub (v256, v256);
+ VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..bbd899d7562 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
extern void output_toc (FILE *, rtx, int, machine_mode);
extern void rs6000_fatal_bad_address (rtx);
extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+ rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+ rtx (*)(rtx, rtx, rtx, rtx));
extern void rs6000_split_multireg_move (rtx, rtx);
extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index db60d3ca960..99352400197 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27396,6 +27396,80 @@ rs6000_split_logical (rtx operands[3],
return;
}
+/* Split a unary vector pair insn into two separate vector insns. */
+
+void
+split_unary_vector_pair (machine_mode mode, /* vector mode. */
+ rtx operands[], /* dest, src. */
+ rtx (*func)(rtx, rtx)) /* create insn. */
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ machine_mode orig_mode = GET_MODE (op0);
+
+ rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+ rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+ rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+ rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+ emit_insn (func (reg0_vector0, reg1_vector0));
+ emit_insn (func (reg0_vector1, reg1_vector1));
+ return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns. */
+
+void
+split_binary_vector_pair (machine_mode mode, /* vector mode. */
+ rtx operands[], /* dest, src1, src2. */
+ rtx (*func)(rtx, rtx, rtx)) /* create insn. */
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ machine_mode orig_mode = GET_MODE (op0);
+
+ rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+ rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+ rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+ rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+ rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+ rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+ emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+ emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+ return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+ insns. */
+
+void
+split_fma_vector_pair (machine_mode mode, /* vector mode. */
+ rtx operands[], /* dest, src1, src2, src3. */
+ rtx (*func)(rtx, rtx, rtx, rtx)) /* create insn. */
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ rtx op3 = operands[3];
+ machine_mode orig_mode = GET_MODE (op0);
+
+ rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+ rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+ rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+ rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+ rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+ rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+ rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+ rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+ emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+ emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+ return;
+}
+
/* Emit instructions to move SRC to DST. Called by splitters for
multi-register moves. It will emit at most one instruction for
each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index dcf1f3526f5..5a17adc1bc3 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15767,6 +15767,7 @@ (define_insn "hashchk"
(include "vsx.md")
(include "altivec.md")
(include "mma.md")
+(include "vector-pair.md")
(include "dfp.md")
(include "crypto.md")
(include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1d..5fc89499795 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
$(srcdir)/config/rs6000/vsx.md \
$(srcdir)/config/rs6000/altivec.md \
$(srcdir)/config/rs6000/mma.md \
+ $(srcdir)/config/rs6000/vector-pair.md \
$(srcdir)/config/rs6000/crypto.md \
$(srcdir)/config/rs6000/htm.md \
$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 00000000000..2dcac6a31e2
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,329 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;; Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers. Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations. The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+(define_c_enum "unspec"
+ [UNSPEC_VPAIR_V4DF
+ UNSPEC_VPAIR_V8SF
+ ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs "abs")
+ (minus "sub")
+ (mult "mul")
+ (neg "neg")
+ (plus "add")
+ (smin "smin")
+ (smax "smax")
+ (xor "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+ UNSPEC_VPAIR_V8SF])
+
+;; Map VP_* to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF "V2DF")
+ (UNSPEC_VPAIR_V8SF "V4SF")])
+
+;; Map VP_* to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF "v4df")
+ (UNSPEC_VPAIR_V8SF "v8sf")])
+
+;; Map VP_* to a lower case name to identify the vector after the vector pair
+;; has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF "v2df")
+ (UNSPEC_VPAIR_V8SF "v4sf")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(VP_FP_UNARY:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_<vp_insn><vp_vmode>2);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_vsx_nabs<vp_vmode>2);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Vector pair floating point binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(VP_FP_BINARY:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_<vp_insn><vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_fma<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (unspec:OO
+ [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_fms<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nfma<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (unspec:OO
+ [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nfms<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(plus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (match_dup 3))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(minus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (unspec:OO
+ [(neg:OO
+ (match_dup 3))]
+ VP_FP))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(plus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (match_dup 3))]
+ VP_FP))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(minus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (unspec:OO
+ [(neg:OO
+ (match_dup 3))]
+ VP_FP))]
+ VP_FP))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 7cdfdf8c83b..a830ad06b90 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -15038,6 +15038,7 @@ instructions, but allow the compiler to schedule those calls.
* NDS32 Built-in Functions::
* Nvidia PTX Built-in Functions::
* Basic PowerPC Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
* PowerPC AltiVec/VSX Built-in Functions::
* PowerPC Hardware Transactional Memory Built-in Functions::
* PowerPC Atomic Memory Operation Functions::
@@ -21368,6 +21369,51 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
@end smallexample
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using the type
+@code{__vector_pair} to hold two 128-bit vectors on processors that
+support ISA 3.1 (power10). The @code{__vector_pair} type and the
+vector pair built-in functions require the MMA instruction set
+(@option{-mmma}) to be enabled, which is on by default for
+@option{-mcpu=power10}.
+
+By default, @code{__vector_pair} types are loaded into vectors with a
+single load vector pair instruction. The processing for the built-in
+function is done as two separate vector instructions on each of the
+two 128-bit vectors stored in the vector pair. The
+@code{__vector_pair} type is usually stored with a single vector pair
+store instruction.
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
@node PowerPC Hardware Transactional Memory Built-in Functions
@subsection PowerPC Hardware Transactional Memory Built-in Functions
GCC provides two interfaces for accessing the Hardware Transactional
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
new file mode 100644
index 00000000000..e74840cebc0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
@@ -0,0 +1,135 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+ vector pairs with 4 double elements. */
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvadddp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvsubdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmuldp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_mul (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmindp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_min (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmaxdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_max (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvnegdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvabsdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 xvnabsdp, 1 stxvp. */
+ __vector_pair ab = __builtin_vpair_f64_abs (*x);
+ *dest = __builtin_vpair_f64_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,q}dp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,q}dp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f64_neg (*z);
+ *dest = __builtin_vpair_f64_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,q}dp, 1 stxvp. */
+ __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,q}dp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f64_neg (*z);
+ __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mxvabsdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmindp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
new file mode 100644
index 00000000000..2facb727053
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
@@ -0,0 +1,134 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+ vector pairs with 8 float elements. */
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvaddsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvsubsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmulsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_mul (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmaxsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_max (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvminsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvnegsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvabssp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 xvnabssp, 1 stxvp. */
+ __vector_pair ab = __builtin_vpair_f32_abs (*x);
+ *dest = __builtin_vpair_f32_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,q}sp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,q}sp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f32_neg (*z);
+ *dest = __builtin_vpair_f32_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,q}sp, 1 stxvp. */
+ __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,q}sp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f32_neg (*z);
+ __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvminsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
new file mode 100644
index 00000000000..65bfc44f85d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector built-in code combines multiply, add/subtract, and
+ negate operations to the appropriate fused multiply-add instruction for
+ vector pairs with 4 double elements. */
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ *dest = __builtin_vpair_f64_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ *dest = __builtin_vpair_f64_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f64_add (m, *z);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f64_sub (m, *z);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
new file mode 100644
index 00000000000..b62871be1fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector builtin code combines multiply, add/subtract, and
+ negate operations to the appropriate fused multiply-add instruction for
+ vector pairs with 8 float elements. */
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 ldxvp, 2 xvmadd{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ *dest = __builtin_vpair_f32_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 ldxvp, 2 xvmsub{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ *dest = __builtin_vpair_f32_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 ldxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f32_add (m, *z);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 ldxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f32_sub (m, *z);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */
--
2.41.0
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meissner@linux.ibm.com
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 2/4] Add support for integer vector pair built-ins
2023-11-10 23:02 [PATCH 0/4] Add vector pair builtins to PowerPC Michael Meissner
2023-11-10 23:09 ` [PATCH 1/4] Add support for floating point vector pair built-in functions Michael Meissner
@ 2023-11-10 23:11 ` Michael Meissner
2023-11-10 23:12 ` [PATCH 3/4] Add support for initializing and extracting from vector pairs Michael Meissner
2023-11-10 23:13 ` [PATCH 4/4] Add support for doing a horizontal add on vector pair elements Michael Meissner
3 siblings, 0 replies; 5+ messages in thread
From: Michael Meissner @ 2023-11-10 23:11 UTC (permalink / raw)
To: Michael Meissner, gcc-patches, Segher Boessenkool, Kewen.Lin,
David Edelsohn, Peter Bergner
This patch adds a series of built-in functions to allow users to write code to
do a number of simple operations where the loop is done using the __vector_pair
type. The __vector_pair type is an opaque type. These built-in functions keep
the two 128-bit vectors within the __vector_pair together, and split the
operation after register allocation.
This patch provides vector pair operations for 8, 16, 32, and 64-bit integers.
I have built and tested these patches on:
* A little endian power10 server using --with-cpu=power10
* A little endian power9 server using --with-cpu=power9
* A big endian power9 server using --with-cpu=power9.
Can I check this patch into the master branch after the preceding patch is
checked in?
2023-11-09 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/rs6000-builtins.def (__builtin_vpair_i8*): Add built-in
functions for integer vector pairs.
(__builtin_vpair_i16*): Likewise.
(__builtin_vpair_i32*): Likewise.
(__builtin_vpair_i64*): Likewise.
* config/rs6000/vector-pair.md (UNSPEC_VPAIR_V32QI): New unspec.
(UNSPEC_VPAIR_V16HI): Likewise.
(UNSPEC_VPAIR_V8SI): Likewise.
(UNSPEC_VPAIR_V4DI): Likewise.
(VP_INT_BINARY): New iterator for integer vector pair.
(vp_insn): Add support for integer vector pairs.
(vp_ireg): New code attribute for integer vector pairs.
(vp_ipredicate): Likewise.
(VP_INT): New int iterator for integer vector pairs.
(VP_VEC_MODE): Likewise.
(vp_pmode): Likewise.
(vp_vmode): Likewise.
(vp_neg_reg): New int iterator for integer vector pairs.
(vpair_neg_<vp_pmode>): Add integer vector pair support insns.
(vpair_not_<vp_pmode>2): Likewise.
(vpair_<vp_insn>_<vp_pmode>3): Likewise.
(*vpair_andc_<vp_pmode>): Likewise.
(*vpair_iorc_<vp_pmode>): Likewise.
(vpair_nand_<vp_pmode>_1): Likewise.
(vpair_nand_<vp_pmode>_2): Likewise.
(vpair_nor_<vp_pmode>_1): Likewise.
(vpair_nor_<vp_pmode>_2): Likewise.
* doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
integer vector pair built-in functions.
gcc/testsuite/
* gcc.target/powerpc/vector-pair-5.c: New test.
* gcc.target/powerpc/vector-pair-6.c: New test.
* gcc.target/powerpc/vector-pair-7.c: New test.
* gcc.target/powerpc/vector-pair-8.c: New test.
---
gcc/config/rs6000/rs6000-builtins.def | 144 +++++++++
gcc/config/rs6000/vector-pair.md | 280 +++++++++++++++++-
gcc/doc/extend.texi | 72 +++++
.../gcc.target/powerpc/vector-pair-5.c | 193 ++++++++++++
.../gcc.target/powerpc/vector-pair-6.c | 193 ++++++++++++
.../gcc.target/powerpc/vector-pair-7.c | 193 ++++++++++++
.../gcc.target/powerpc/vector-pair-8.c | 194 ++++++++++++
7 files changed, 1266 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-5.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-6.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-7.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-8.c
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index 89b248b50ef..3b2db39c1ab 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4183,3 +4183,147 @@
v256 __builtin_vpair_f64_sub (v256, v256);
VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
+
+;; vector pair built-in functions for 32 8-bit unsigned char or
+;; signed char values
+
+ v256 __builtin_vpair_i8_add (v256, v256);
+ VPAIR_I8_ADD vpair_add_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8_and (v256, v256);
+ VPAIR_I8_AND vpair_and_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8_ior (v256, v256);
+ VPAIR_I8_IOR vpair_ior_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8_max (v256, v256);
+ VPAIR_I8_MAX vpair_smax_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8_min (v256, v256);
+ VPAIR_I8_MIN vpair_smin_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8_neg (v256);
+ VPAIR_I8_NEG vpair_neg_v32qi2 {mma,pair}
+
+ v256 __builtin_vpair_i8_not (v256);
+ VPAIR_I8_NOT vpair_not_v32qi2 {mma,pair}
+
+ v256 __builtin_vpair_i8_sub (v256, v256);
+ VPAIR_I8_SUB vpair_sub_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8_xor (v256, v256);
+ VPAIR_I8_XOR vpair_xor_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8u_max (v256, v256);
+ VPAIR_I8U_MAX vpair_umax_v32qi3 {mma,pair}
+
+ v256 __builtin_vpair_i8u_min (v256, v256);
+ VPAIR_I8U_MIN vpair_umin_v32qi3 {mma,pair}
+
+;; vector pair built-in functions for 16 16-bit unsigned short or
+;; signed short values
+
+ v256 __builtin_vpair_i16_add (v256, v256);
+ VPAIR_I16_ADD vpair_add_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16_and (v256, v256);
+ VPAIR_I16_AND vpair_and_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16_ior (v256, v256);
+ VPAIR_I16_IOR vpair_ior_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16_max (v256, v256);
+ VPAIR_I16_MAX vpair_smax_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16_min (v256, v256);
+ VPAIR_I16_MIN vpair_smin_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16_neg (v256);
+ VPAIR_I16_NEG vpair_neg_v16hi2 {mma,pair}
+
+ v256 __builtin_vpair_i16_not (v256);
+ VPAIR_I16_NOT vpair_not_v16hi2 {mma,pair}
+
+ v256 __builtin_vpair_i16_sub (v256, v256);
+ VPAIR_I16_SUB vpair_sub_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16_xor (v256, v256);
+ VPAIR_I16_XOR vpair_xor_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16u_max (v256, v256);
+ VPAIR_I16U_MAX vpair_umax_v16hi3 {mma,pair}
+
+ v256 __builtin_vpair_i16u_min (v256, v256);
+ VPAIR_I16U_MIN vpair_umin_v16hi3 {mma,pair}
+
+;; vector pair built-in functions for 8 32-bit unsigned int or
+;; signed int values
+
+ v256 __builtin_vpair_i32_add (v256, v256);
+ VPAIR_I32_ADD vpair_add_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32_and (v256, v256);
+ VPAIR_I32_AND vpair_and_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32_ior (v256, v256);
+ VPAIR_I32_IOR vpair_ior_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32_max (v256, v256);
+ VPAIR_I32_MAX vpair_smax_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32_min (v256, v256);
+ VPAIR_I32_MIN vpair_smin_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32_neg (v256);
+ VPAIR_I32_NEG vpair_neg_v8si2 {mma,pair}
+
+ v256 __builtin_vpair_i32_not (v256);
+ VPAIR_I32_NOT vpair_not_v8si2 {mma,pair}
+
+ v256 __builtin_vpair_i32_sub (v256, v256);
+ VPAIR_I32_SUB vpair_sub_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32_xor (v256, v256);
+ VPAIR_I32_XOR vpair_xor_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32u_max (v256, v256);
+ VPAIR_I32U_MAX vpair_umax_v8si3 {mma,pair}
+
+ v256 __builtin_vpair_i32u_min (v256, v256);
+ VPAIR_I32U_MIN vpair_umin_v8si3 {mma,pair}
+
+;; vector pair built-in functions for 4 64-bit unsigned long long or
+;; signed long long values
+
+ v256 __builtin_vpair_i64_add (v256, v256);
+ VPAIR_I64_ADD vpair_add_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64_and (v256, v256);
+ VPAIR_I64_AND vpair_and_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64_ior (v256, v256);
+ VPAIR_I64_IOR vpair_ior_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64_max (v256, v256);
+ VPAIR_I64_MAX vpair_smax_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64_min (v256, v256);
+ VPAIR_I64_MIN vpair_smin_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64_neg (v256);
+ VPAIR_I64_NEG vpair_neg_v4di2 {mma,pair}
+
+ v256 __builtin_vpair_i64_not (v256);
+ VPAIR_I64_NOT vpair_not_v4di2 {mma,pair}
+
+ v256 __builtin_vpair_i64_sub (v256, v256);
+ VPAIR_I64_SUB vpair_sub_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64_xor (v256, v256);
+ VPAIR_I64_XOR vpair_xor_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64u_max (v256, v256);
+ VPAIR_I64U_MAX vpair_umax_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64u_min (v256, v256);
+ VPAIR_I64U_MIN vpair_umin_v4di3 {mma,pair}
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
index 2dcac6a31e2..cd14430f47a 100644
--- a/gcc/config/rs6000/vector-pair.md
+++ b/gcc/config/rs6000/vector-pair.md
@@ -29,38 +29,102 @@
(define_c_enum "unspec"
[UNSPEC_VPAIR_V4DF
UNSPEC_VPAIR_V8SF
+ UNSPEC_VPAIR_V32QI
+ UNSPEC_VPAIR_V16HI
+ UNSPEC_VPAIR_V8SI
+ UNSPEC_VPAIR_V4DI
])
;; Iterator doing unary/binary arithmetic on vector pairs
(define_code_iterator VP_FP_UNARY [abs neg])
(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+(define_code_iterator VP_INT_BINARY [and ior minus plus smax smin umax umin xor])
+
;; Return the insn name from the VP_* code iterator
(define_code_attr vp_insn [(abs "abs")
+ (and "and")
+ (ior "ior")
(minus "sub")
(mult "mul")
+ (not "one_cmpl")
(neg "neg")
(plus "add")
(smin "smin")
(smax "smax")
+ (umin "umin")
+ (umax "umax")
(xor "xor")])
+;; Return the register constraint ("v" or "wa") for the integer code iterator
+;; used. For arithmetic operations, we need to use "v" in order to use the
+;; Altivec instruction. For logical operations, we can use wa.
+(define_code_attr vp_ireg [(and "wa")
+ (ior "wa")
+ (minus "v")
+ (not "wa")
+ (neg "v")
+ (plus "v")
+ (smax "v")
+ (smin "v")
+ (umax "v")
+ (umin "v")
+ (xor "wa")])
+
+;; Return the register predicate for the integer code iterator used
+(define_code_attr vp_ipredicate [(and "vsx_register_operand")
+ (ior "vsx_register_operand")
+ (minus "altivec_register_operand")
+ (not "vsx_register_operand")
+ (neg "altivec_register_operand")
+ (plus "altivec_register_operand")
+ (smax "altivec_register_operand")
+ (smin "altivec_register_operand")
+ (umax "altivec_register_operand")
+ (umin "altivec_register_operand")
+ (xor "vsx_register_operand")])
+
;; Iterator for creating the unspecs for vector pair built-ins
(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
UNSPEC_VPAIR_V8SF])
+(define_int_iterator VP_INT [UNSPEC_VPAIR_V4DI
+ UNSPEC_VPAIR_V8SI
+ UNSPEC_VPAIR_V16HI
+ UNSPEC_VPAIR_V32QI])
+
;; Map VP_* to vector mode of the arguments after they are split
(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF "V2DF")
- (UNSPEC_VPAIR_V8SF "V4SF")])
+ (UNSPEC_VPAIR_V8SF "V4SF")
+ (UNSPEC_VPAIR_V32QI "V16QI")
+ (UNSPEC_VPAIR_V16HI "V8HI")
+ (UNSPEC_VPAIR_V8SI "V4SI")
+ (UNSPEC_VPAIR_V4DI "V2DI")])
;; Map VP_* to a lower case name to identify the vector pair.
(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF "v4df")
- (UNSPEC_VPAIR_V8SF "v8sf")])
+ (UNSPEC_VPAIR_V8SF "v8sf")
+ (UNSPEC_VPAIR_V32QI "v32qi")
+ (UNSPEC_VPAIR_V16HI "v16hi")
+ (UNSPEC_VPAIR_V8SI "v8si")
+ (UNSPEC_VPAIR_V4DI "v4di")])
;; Map VP_* to a lower case name to identify the vector after the vector pair
;; has been split.
(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF "v2df")
- (UNSPEC_VPAIR_V8SF "v4sf")])
+ (UNSPEC_VPAIR_V8SF "v4sf")
+ (UNSPEC_VPAIR_V32QI "v16qi")
+ (UNSPEC_VPAIR_V16HI "v8hi")
+ (UNSPEC_VPAIR_V8SI "v4si")
+ (UNSPEC_VPAIR_V4DI "v2di")])
+
+;; Map VP_INT to constraints used for the negate scratch register. For vectors
+;; of QI and HI, we need to change -a into 0 - a since we don't have a negate
+;; operation. We do have a vnegw/vnegd operation for SI and DI modes.
+(define_int_attr vp_neg_reg [(UNSPEC_VPAIR_V32QI "&v")
+ (UNSPEC_VPAIR_V16HI "&v")
+ (UNSPEC_VPAIR_V8SI "X")
+ (UNSPEC_VPAIR_V4DI "X")])
\f
;; Vector pair floating point unary operations
@@ -327,3 +391,213 @@ (define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
{
}
[(set_attr "length" "8")])
+
+\f
+;; Vector pair integer negate support.
+(define_insn_and_split "vpair_neg_<vp_pmode>2"
+ [(set (match_operand:OO 0 "altivec_register_operand" "=v")
+ (unspec:OO [(neg:OO
+ (match_operand:OO 1 "altivec_register_operand" "v"))]
+ VP_INT))
+ (clobber (match_scratch:<VP_VEC_MODE> 2 "=<vp_neg_reg>"))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 2) (match_dup 3))
+ (set (match_dup 4) (minus:<VP_VEC_MODE> (match_dup 2)
+ (match_dup 5)))
+ (set (match_dup 6) (minus:<VP_VEC_MODE> (match_dup 2)
+ (match_dup 7)))]
+{
+ unsigned reg0 = reg_or_subregno (operands[0]);
+ unsigned reg1 = reg_or_subregno (operands[1]);
+ machine_mode vmode = <VP_VEC_MODE>mode;
+
+ operands[3] = CONST0_RTX (vmode);
+
+ operands[4] = gen_rtx_REG (vmode, reg0);
+ operands[5] = gen_rtx_REG (vmode, reg1);
+
+ operands[6] = gen_rtx_REG (vmode, reg0 + 1);
+ operands[7] = gen_rtx_REG (vmode, reg1 + 1);
+
+ /* If the vector integer size is 32 or 64 bits, we can use the vneg{w,d}
+ instructions. */
+ if (vmode == V4SImode)
+ {
+ emit_insn (gen_negv4si2 (operands[4], operands[5]));
+ emit_insn (gen_negv4si2 (operands[6], operands[7]));
+ DONE;
+ }
+ else if (vmode == V2DImode)
+ {
+ emit_insn (gen_negv2di2 (operands[4], operands[5]));
+ emit_insn (gen_negv2di2 (operands[6], operands[7]));
+ DONE;
+ }
+}
+ [(set_attr "length" "8")])
+
+;; Vector pair integer not support.
+(define_insn_and_split "vpair_not_<vp_pmode>2"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(not:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_one_cmpl<vp_vmode>2);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Vector pair integer binary operations.
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+ [(set (match_operand:OO 0 "<vp_ipredicate>" "=<vp_ireg>")
+ (unspec:OO [(VP_INT_BINARY:OO
+ (match_operand:OO 1 "<vp_ipredicate>" "<vp_ireg>")
+ (match_operand:OO 2 "<vp_ipredicate>" "<vp_ireg>"))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_<vp_insn><vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair a & ~b
+(define_insn_and_split "*vpair_andc_<vp_pmode>"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(and:OO
+ (unspec:OO
+ [(not:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_INT)
+ (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_andc<vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair a | ~b
+(define_insn_and_split "*vpair_iorc_<vp_pmode>"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(ior:OO
+ (unspec:OO
+ [(not:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_INT)
+ (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_orc<vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair ~(a & b) or ((~a) | (~b))
+(define_insn_and_split "*vpair_nand_<vp_pmode>_1"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO
+ [(not:OO
+ (unspec:OO [(and:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_INT))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nand<vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "*vpair_nand_<vp_pmode>_2"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO
+ [(ior:OO
+ (unspec:OO
+ [(not:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_INT)
+ (unspec:OO
+ [(not:OO
+ (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_INT))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nand<vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair ~(a | b) or ((~a) & (~b))
+(define_insn_and_split "*vpair_nor_<vp_pmode>_1"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO
+ [(not:OO
+ (unspec:OO [(ior:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_INT))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nor<vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "*vpair_nor_<vp_pmode>_2"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO
+ [(ior:OO
+ (unspec:OO
+ [(not:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_INT)
+ (unspec:OO
+ [(not:OO (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_INT))]
+ VP_INT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nor<vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index a830ad06b90..ff7918c7a58 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -21414,6 +21414,78 @@ __vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
@end smallexample
+The following built-in functions operate on pairs of
+@code{vector long long} or @code{vector unsigned long long} values:
+
+@smallexample
+__vector_pair __builtin_vpair_i64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64_ior (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64_neg (__vector_pair);
+__vector_pair __builtin_vpair_i64_not (__vector_pair);
+__vector_pair __builtin_vpair_i64_sub (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64_xor (__vector_pair, __vector_pair);
+
+__vector_pair __builtin_vpair_i64u_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64u_min (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector int} or @code{vector unsigned int} values:
+
+@smallexample
+__vector_pair __builtin_vpair_i32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_ior (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_neg (__vector_pair);
+__vector_pair __builtin_vpair_i32_not (__vector_pair);
+__vector_pair __builtin_vpair_i32_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_sub (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_xor (__vector_pair, __vector_pair);
+
+__vector_pair __builtin_vpair_i32u_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32u_min (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector short} or @code{vector unsigned short} values:
+
+@smallexample
+__vector_pair __builtin_vpair_i16_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16_ior (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16_neg (__vector_pair);
+__vector_pair __builtin_vpair_i16_not (__vector_pair);
+__vector_pair __builtin_vpair_i16_sub (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16_xor (__vector_pair, __vector_pair);
+
+__vector_pair __builtin_vpair_i16u_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16u_min (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector signed char} or @code{vector unsigned char} values:
+
+@smallexample
+__vector_pair __builtin_vpair_i8_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8_ior (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8_neg (__vector_pair);
+__vector_pair __builtin_vpair_i8_not (__vector_pair);
+__vector_pair __builtin_vpair_i8_sub (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8_xor (__vector_pair, __vector_pair);
+
+__vector_pair __builtin_vpair_i8u_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8u_min (__vector_pair, __vector_pair);
+@end smallexample
+
@node PowerPC Hardware Transactional Memory Built-in Functions
@subsection PowerPC Hardware Transactional Memory Built-in Functions
GCC provides two interfaces for accessing the Hardware Transactional
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-5.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-5.c
new file mode 100644
index 00000000000..924919cae1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-5.c
@@ -0,0 +1,193 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+ vector pairs with 4 64-bit integer elements. */
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vaddudm, 1 stxvp. */
+ *dest = __builtin_vpair_i64_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vsubudm, 1 stxvp. */
+ *dest = __builtin_vpair_i64_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxland, 1 stxvp. */
+ *dest = __builtin_vpair_i64_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlor, 1 stxvp. */
+ *dest = __builtin_vpair_i64_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+ *dest = __builtin_vpair_i64_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxsd, 1 stxvp. */
+ *dest = __builtin_vpair_i64_max (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminsd, 1 stxvp. */
+ *dest = __builtin_vpair_i64_min (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxud, 1 stxvp. */
+ *dest = __builtin_vpair_i64u_max (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminud, 1 stxvp. */
+ *dest = __builtin_vpair_i64u_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 vnegd, 1 stxvp. */
+ *dest = __builtin_vpair_i64_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ *dest = __builtin_vpair_i64_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i64_not (*y);
+ *dest = __builtin_vpair_i64_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i64_not (*x);
+ *dest = __builtin_vpair_i64_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i64_not (*y);
+ *dest = __builtin_vpair_i64_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i64_not (*x);
+ *dest = __builtin_vpair_i64_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i64_and (*x, *y);
+ *dest = __builtin_vpair_i64_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair nx = __builtin_vpair_i64_not (*x);
+ __vector_pair ny = __builtin_vpair_i64_not (*y);
+ *dest = __builtin_vpair_i64_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i64_ior (*x, *y);
+ *dest = __builtin_vpair_i64_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 34 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvaddudm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxud\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminud\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvnegd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsubudm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-6.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-6.c
new file mode 100644
index 00000000000..f22949c1f95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-6.c
@@ -0,0 +1,193 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+ vector pairs with 8 32-bit integer elements. */
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vadduwm, 1 stxvp. */
+ *dest = __builtin_vpair_i32_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vsubuwm, 1 stxvp. */
+ *dest = __builtin_vpair_i32_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxland, 1 stxvp. */
+ *dest = __builtin_vpair_i32_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlor, 1 stxvp. */
+ *dest = __builtin_vpair_i32_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+ *dest = __builtin_vpair_i32_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxsw, 1 stxvp. */
+ *dest = __builtin_vpair_i32_max (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminsw, 1 stxvp. */
+ *dest = __builtin_vpair_i32_min (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxuw, 1 stxvp. */
+ *dest = __builtin_vpair_i32u_max (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminuw, 1 stxvp. */
+ *dest = __builtin_vpair_i32u_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 vnegw, 1 stxvp. */
+ *dest = __builtin_vpair_i32_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ *dest = __builtin_vpair_i32_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i32_not (*y);
+ *dest = __builtin_vpair_i32_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i32_not (*x);
+ *dest = __builtin_vpair_i32_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i32_not (*y);
+ *dest = __builtin_vpair_i32_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i32_not (*x);
+ *dest = __builtin_vpair_i32_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i32_and (*x, *y);
+ *dest = __builtin_vpair_i32_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair nx = __builtin_vpair_i32_not (*x);
+ __vector_pair ny = __builtin_vpair_i32_not (*y);
+ *dest = __builtin_vpair_i32_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i32_ior (*x, *y);
+ *dest = __builtin_vpair_i32_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 34 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvadduwm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxuw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminuw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvnegw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuwm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-7.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-7.c
new file mode 100644
index 00000000000..71452f59284
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-7.c
@@ -0,0 +1,193 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+ vector pairs with 16 16-bit integer elements. */
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vadduhm, 1 stxvp. */
+ *dest = __builtin_vpair_i16_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vsubuhm, 1 stxvp. */
+ *dest = __builtin_vpair_i16_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxland, 1 stxvp. */
+ *dest = __builtin_vpair_i16_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlor, 1 stxvp. */
+ *dest = __builtin_vpair_i16_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+ *dest = __builtin_vpair_i16_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxsh, 1 stxvp. */
+ *dest = __builtin_vpair_i16_max (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminsh, 1 stxvp. */
+ *dest = __builtin_vpair_i16_min (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxuh, 1 stxvp. */
+ *dest = __builtin_vpair_i16u_max (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminuh, 1 stxvp. */
+ *dest = __builtin_vpair_i16u_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 1 xxspltib, 2 vsubuhm, 1 stxvp. */
+ *dest = __builtin_vpair_i16_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ *dest = __builtin_vpair_i16_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i16_not (*y);
+ *dest = __builtin_vpair_i16_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i16_not (*x);
+ *dest = __builtin_vpair_i16_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i16_not (*y);
+ *dest = __builtin_vpair_i16_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i16_not (*x);
+ *dest = __builtin_vpair_i16_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i16_and (*x, *y);
+ *dest = __builtin_vpair_i16_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair nx = __builtin_vpair_i16_not (*x);
+ __vector_pair ny = __builtin_vpair_i16_not (*y);
+ *dest = __builtin_vpair_i16_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i16_ior (*x, *y);
+ *dest = __builtin_vpair_i16_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 34 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvadduhm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxuh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminuh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuhm\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-8.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-8.c
new file mode 100644
index 00000000000..8db9056d4cc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-8.c
@@ -0,0 +1,194 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+ vector pairs with 32 8-bit integer elements. */
+
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vaddubm, 1 stxvp. */
+ *dest = __builtin_vpair_i8_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vsububm, 1 stxvp. */
+ *dest = __builtin_vpair_i8_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxland, 1 stxvp. */
+ *dest = __builtin_vpair_i8_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlor, 1 stxvp. */
+ *dest = __builtin_vpair_i8_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+ *dest = __builtin_vpair_i8_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxsb, 1 stxvp. */
+ *dest = __builtin_vpair_i8_max (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminsb, 1 stxvp. */
+ *dest = __builtin_vpair_i8_min (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vmaxub, 1 stxvp. */
+ *dest = __builtin_vpair_i8u_max (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 vminub, 1 stxvp. */
+ *dest = __builtin_vpair_i8u_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 1 xxspltib, 2 vsububm, 1 stxvp. */
+ *dest = __builtin_vpair_i8_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ *dest = __builtin_vpair_i8_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i8_not (*y);
+ *dest = __builtin_vpair_i8_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i8_not (*x);
+ *dest = __builtin_vpair_i8_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i8_not (*y);
+ *dest = __builtin_vpair_i8_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_i8_not (*x);
+ *dest = __builtin_vpair_i8_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i8_and (*x, *y);
+ *dest = __builtin_vpair_i8_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+ __vector_pair nx = __builtin_vpair_i8_not (*x);
+ __vector_pair ny = __builtin_vpair_i8_not (*y);
+ *dest = __builtin_vpair_i8_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+ __vector_pair a = __builtin_vpair_i8_ior (*x, *y);
+ *dest = __builtin_vpair_i8_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 34 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvaddubm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxub\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminub\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsububm\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 1 } } */
--
2.41.0
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meissner@linux.ibm.com
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 3/4] Add support for initializing and extracting from vector pairs
2023-11-10 23:02 [PATCH 0/4] Add vector pair builtins to PowerPC Michael Meissner
2023-11-10 23:09 ` [PATCH 1/4] Add support for floating point vector pair built-in functions Michael Meissner
2023-11-10 23:11 ` [PATCH 2/4] Add support for integer vector pair built-ins Michael Meissner
@ 2023-11-10 23:12 ` Michael Meissner
2023-11-10 23:13 ` [PATCH 4/4] Add support for doing a horizontal add on vector pair elements Michael Meissner
3 siblings, 0 replies; 5+ messages in thread
From: Michael Meissner @ 2023-11-10 23:12 UTC (permalink / raw)
To: Michael Meissner, gcc-patches, Segher Boessenkool, Kewen.Lin,
David Edelsohn, Peter Bergner
This patch adds a series of built-in functions to allow users to write code to
do a number of simple operations where the loop is done using the __vector_pair
type. The __vector_pair type is an opaque type. These built-in functions keep
the two 128-bit vectors within the __vector_pair together, and split the
operation after register allocation.
This patch provides vector pair operations for loading up a vector pair with all
0's, duplicated (splat) from a scalar type, or combining two vectors in a vector
pair. This patch also provides vector pair builtins to extract one vector
element of a vector pair.
I have built and tested these patches on:
* A little endian power10 server using --with-cpu=power10
* A little endian power9 server using --with-cpu=power9
* A big endian power9 server using --with-cpu=power9.
Can I check this patch into the master branch after the preceding patches have
been checked in?
2023-11-09 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/predicates.md (mma_assemble_input_operand): Allow any
16-byte vector, not just V16QImode.
* config/rs6000/rs6000-builtins.def (__builtin_vpair_zero): New vector
pair initialization built-in functions.
(__builtin_vpair_*_assemble): Likewise.
(__builtin_vpair_*_splat): Likewise.
(__builtin_vpair_*_extract_vector): New vector pair extraction built-in
functions.
* config/rs6000/vector-pair.md (UNSPEC_VPAIR_V32QI): New unspec.
(UNSPEC_VPAIR_V16HI): Likewise.
(UNSPEC_VPAIR_V8SI): Likewise.
(UNSPEC_VPAIR_V4DI): Likewise.
(VP_INT_BINARY): New iterator for integer vector pair.
(vp_insn): Add support for integer vector pairs.
(vp_ireg): New code attribute for integer vector pairs.
(vp_ipredicate): Likewise.
(VP_INT): New int iterator for integer vector pairs.
(VP_VEC_MODE): Likewise.
(vp_pmode): Likewise.
(vp_vmode): Likewise.
(vp_neg_reg): New int iterator for integer vector pairs.
(vpair_neg_<vp_pmode>): Add integer vector pair support insns.
(vpair_not_<vp_pmode>2): Likewise.
(vpair_<vp_insn>_<vp_pmode>3): Likewise.
(vpair_andc_<vp_pmode): Likewise.
(vpair_iorc_<vp_pmode>): Likewise.
(vpair_nand_<vp_pmode>_1): Likewise.
(vpair_nand_<vp_pmode>_2): Likewise.
(vpair_nor_<vp_pmode>_1): Likewise.
(vpair_nor_<vp_pmode>_2): Likewise.
* doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
integer vector pair built-in functions.
gcc/testsuite/
* gcc.target/powerpc/vector-pair-5.c: New test.
* gcc.target/powerpc/vector-pair-6.c: New test.
* gcc.target/powerpc/vector-pair-7.c: New test.
* gcc.target/powerpc/vector-pair-8.c: New test.
---
gcc/config/rs6000/predicates.md | 2 +-
gcc/config/rs6000/rs6000-builtins.def | 95 +++++++++
gcc/config/rs6000/vector-pair.md | 185 ++++++++++++++++++
gcc/doc/extend.texi | 44 +++++
.../gcc.target/powerpc/vector-pair-10.c | 86 ++++++++
.../gcc.target/powerpc/vector-pair-11.c | 84 ++++++++
.../gcc.target/powerpc/vector-pair-12.c | 156 +++++++++++++++
.../gcc.target/powerpc/vector-pair-13.c | 139 +++++++++++++
.../gcc.target/powerpc/vector-pair-14.c | 141 +++++++++++++
.../gcc.target/powerpc/vector-pair-15.c | 139 +++++++++++++
.../gcc.target/powerpc/vector-pair-9.c | 13 ++
11 files changed, 1083 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-10.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-11.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-12.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-13.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-14.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-15.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-9.c
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c4..922a77716c4 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -1301,7 +1301,7 @@ (define_predicate "splat_input_operand"
;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
(define_special_predicate "mma_assemble_input_operand"
- (match_test "(mode == V16QImode
+ (match_test "(VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16
&& (vsx_register_operand (op, mode)
|| (MEM_P (op)
&& (indexed_or_indirect_address (XEXP (op, 0), mode)
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index 3b2db39c1ab..fbd416ceb87 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4132,6 +4132,11 @@
void __builtin_vsx_stxvp (v256, unsigned long, const v256 *);
STXVP nothing {mma,pair}
+;; General vector pair built-in functions
+
+ v256 __builtin_vpair_zero ();
+ VPAIR_ZERO vpair_zero {mma}
+
;; vector pair built-in functions for 8 32-bit float values
v256 __builtin_vpair_f32_abs (v256);
@@ -4140,6 +4145,12 @@
v256 __builtin_vpair_f32_add (v256, v256);
VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+ v256 __builtin_vpair_f32_assemble (vf, vf);
+ VPAIR_F32_ASSEMBLE vpair_assemble_v8sf {mma,pair}
+
+ vf __builtin_vpair_f32_extract_vector (v256, const int<1>);
+ VPAIR_F32_EXTRACT_VECTOR vpair_extract_vector_v8sf {mma,pair}
+
v256 __builtin_vpair_f32_fma (v256, v256, v256);
VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
@@ -4155,6 +4166,9 @@
v256 __builtin_vpair_f32_neg (v256);
VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+ v256 __builtin_vpair_f32_splat (float);
+ VPAIR_F32_SPLAT vpair_splat_v8sf {mma,pair}
+
v256 __builtin_vpair_f32_sub (v256, v256);
VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
@@ -4166,6 +4180,12 @@
v256 __builtin_vpair_f64_add (v256, v256);
VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+v256 __builtin_vpair_f64_assemble (vd, vd);
+ VPAIR_F64_ASSEMBLE vpair_assemble_v4df {mma,pair}
+
+ vd __builtin_vpair_f64_extract_vector (v256, const int<1>);
+ VPAIR_F64_EXTRACT_VECTOR vpair_extract_vector_v4df {mma,pair}
+
v256 __builtin_vpair_f64_fma (v256, v256, v256);
VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
@@ -4181,6 +4201,9 @@
v256 __builtin_vpair_f64_neg (v256);
VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+ v256 __builtin_vpair_f64_splat (double);
+ VPAIR_F64_SPLAT vpair_splat_v4df {mma,pair}
+
v256 __builtin_vpair_f64_sub (v256, v256);
VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
@@ -4193,6 +4216,12 @@
v256 __builtin_vpair_i8_and (v256, v256);
VPAIR_I8_AND vpair_and_v32qi3 {mma,pair}
+ v256 __builtin_vpair_i8_assemble (vsc, vsc);
+ VPAIR_I8_ASSEMBLE vpair_assemble_v32qi {mma,pair}
+
+ vsc __builtin_vpair_i8_extract_vector (v256, const int<1>);
+ VPAIR_I8_EXTRACT_VECTOR vpair_extract_vector_v32qi {mma,pair}
+
v256 __builtin_vpair_i8_ior (v256, v256);
VPAIR_I8_IOR vpair_ior_v32qi3 {mma,pair}
@@ -4208,18 +4237,30 @@
v256 __builtin_vpair_i8_not (v256);
VPAIR_I8_NOT vpair_not_v32qi2 {mma,pair}
+ v256 __builtin_vpair_i8_splat (signed char);
+ VPAIR_I8_SPLAT vpair_splat_v32qi {mma,pair}
+
v256 __builtin_vpair_i8_sub (v256, v256);
VPAIR_I8_SUB vpair_sub_v32qi3 {mma,pair}
v256 __builtin_vpair_i8_xor (v256, v256);
VPAIR_I8_XOR vpair_xor_v32qi3 {mma,pair}
+ v256 __builtin_vpair_i8u_assemble (vuc, vuc);
+ VPAIR_I8U_ASSEMBLE vpair_assemble_v32qi {mma,pair}
+
+ vuc __builtin_vpair_i8u_extract_vector (v256, const int<1>);
+ VPAIR_I8U_EXTRACT_VECTOR vpair_extract_vector_v32qi {mma,pair}
+
v256 __builtin_vpair_i8u_max (v256, v256);
VPAIR_I8U_MAX vpair_umax_v32qi3 {mma,pair}
v256 __builtin_vpair_i8u_min (v256, v256);
VPAIR_I8U_MIN vpair_umin_v32qi3 {mma,pair}
+ v256 __builtin_vpair_i8u_splat (unsigned char);
+ VPAIR_I8U_SPLAT vpair_splat_v32qi {mma,pair}
+
;; vector pair built-in functions for 16 16-bit unsigned short or
;; signed short values
@@ -4229,6 +4270,12 @@
v256 __builtin_vpair_i16_and (v256, v256);
VPAIR_I16_AND vpair_and_v16hi3 {mma,pair}
+ v256 __builtin_vpair_i16_assemble (vss, vss);
+ VPAIR_I16_ASSEMBLE vpair_assemble_v16hi {mma,pair}
+
+ vss __builtin_vpair_i16_extract_vector (v256, const int<1>);
+ VPAIR_I16_EXTRACT_VECTOR vpair_extract_vector_v16hi {mma,pair}
+
v256 __builtin_vpair_i16_ior (v256, v256);
VPAIR_I16_IOR vpair_ior_v16hi3 {mma,pair}
@@ -4244,18 +4291,30 @@
v256 __builtin_vpair_i16_not (v256);
VPAIR_I16_NOT vpair_not_v16hi2 {mma,pair}
+ v256 __builtin_vpair_i16_splat (short);
+ VPAIR_I16_SPLAT vpair_splat_v16hi {mma,pair}
+
v256 __builtin_vpair_i16_sub (v256, v256);
VPAIR_I16_SUB vpair_sub_v16hi3 {mma,pair}
v256 __builtin_vpair_i16_xor (v256, v256);
VPAIR_I16_XOR vpair_xor_v16hi3 {mma,pair}
+ v256 __builtin_vpair_i16u_assemble (vus, vus);
+ VPAIR_I16U_ASSEMBLE vpair_assemble_v16hi {mma,pair}
+
+ vus __builtin_vpair_i16u_extract_vector (v256, const int<1>);
+ VPAIR_I16U_EXTRACT_VECTOR vpair_extract_vector_v16hi {mma,pair}
+
v256 __builtin_vpair_i16u_max (v256, v256);
VPAIR_I16U_MAX vpair_umax_v16hi3 {mma,pair}
v256 __builtin_vpair_i16u_min (v256, v256);
VPAIR_I16U_MIN vpair_umin_v16hi3 {mma,pair}
+ v256 __builtin_vpair_i16u_splat (unsigned short);
+ VPAIR_I16U_SPLAT vpair_splat_v16hi {mma,pair}
+
;; vector pair built-in functions for 8 32-bit unsigned int or
;; signed int values
@@ -4265,6 +4324,12 @@
v256 __builtin_vpair_i32_and (v256, v256);
VPAIR_I32_AND vpair_and_v8si3 {mma,pair}
+ v256 __builtin_vpair_i32_assemble (vsi, vsi);
+ VPAIR_I32_ASSEMBLE vpair_assemble_v8si {mma,pair}
+
+ vsi __builtin_vpair_i32_extract_vector (v256, const int<1>);
+ VPAIR_I32_EXTRACT_VECTOR vpair_extract_vector_v8si {mma,pair}
+
v256 __builtin_vpair_i32_ior (v256, v256);
VPAIR_I32_IOR vpair_ior_v8si3 {mma,pair}
@@ -4280,18 +4345,30 @@
v256 __builtin_vpair_i32_not (v256);
VPAIR_I32_NOT vpair_not_v8si2 {mma,pair}
+ v256 __builtin_vpair_i32_splat (int);
+ VPAIR_I32_SPLAT vpair_splat_v8si {mma,pair}
+
v256 __builtin_vpair_i32_sub (v256, v256);
VPAIR_I32_SUB vpair_sub_v8si3 {mma,pair}
v256 __builtin_vpair_i32_xor (v256, v256);
VPAIR_I32_XOR vpair_xor_v8si3 {mma,pair}
+ v256 __builtin_vpair_i32u_assemble (vui, vui);
+ VPAIR_I32U_ASSEMBLE vpair_assemble_v8si {mma,pair}
+
+ vui __builtin_vpair_i32u_extract_vector (v256, const int<1>);
+ VPAIR_I32U_EXTRACT_VECTOR vpair_extract_vector_v8si {mma,pair}
+
v256 __builtin_vpair_i32u_max (v256, v256);
VPAIR_I32U_MAX vpair_umax_v8si3 {mma,pair}
v256 __builtin_vpair_i32u_min (v256, v256);
VPAIR_I32U_MIN vpair_umin_v8si3 {mma,pair}
+ v256 __builtin_vpair_i32u_splat (unsigned int);
+ VPAIR_I32U_SPLAT vpair_splat_v8si {mma,pair}
+
;; vector pair built-in functions for 4 64-bit unsigned long long or
;; signed long long values
@@ -4301,6 +4378,12 @@
v256 __builtin_vpair_i64_and (v256, v256);
VPAIR_I64_AND vpair_and_v4di3 {mma,pair}
+ v256 __builtin_vpair_i64_assemble (vsll, vsll);
+ VPAIR_I64_ASSEMBLE vpair_assemble_v4di {mma,pair}
+
+ vsll __builtin_vpair_i64_extract_vector (v256, const int<1>);
+ VPAIR_I64_EXTRACT_VECTOR vpair_extract_vector_v4di {mma,pair}
+
v256 __builtin_vpair_i64_ior (v256, v256);
VPAIR_I64_IOR vpair_ior_v4di3 {mma,pair}
@@ -4316,14 +4399,26 @@
v256 __builtin_vpair_i64_not (v256);
VPAIR_I64_NOT vpair_not_v4di2 {mma,pair}
+ v256 __builtin_vpair_i64_splat (long long);
+ VPAIR_I64_SPLAT vpair_splat_v4di {mma,pair}
+
v256 __builtin_vpair_i64_sub (v256, v256);
VPAIR_I64_SUB vpair_sub_v4di3 {mma,pair}
v256 __builtin_vpair_i64_xor (v256, v256);
VPAIR_I64_XOR vpair_xor_v4di3 {mma,pair}
+ v256 __builtin_vpair_i64u_assemble (vull, vull);
+ VPAIR_I64U_ASSEMBLE vpair_assemble_v4di {mma,pair}
+
+ vull __builtin_vpair_i64u_extract_vector (v256, const int<1>);
+ VPAIR_I64U_EXTRACT_VECTOR vpair_extract_vector_v4di {mma,pair}
+
v256 __builtin_vpair_i64u_max (v256, v256);
VPAIR_I64U_MAX vpair_umax_v4di3 {mma,pair}
v256 __builtin_vpair_i64u_min (v256, v256);
VPAIR_I64U_MIN vpair_umin_v4di3 {mma,pair}
+
+ v256 __builtin_vpair_i64u_splat (unsigned long long);
+ VPAIR_I64U_SPLAT vpair_splat_v4di {mma,pair}
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
index cd14430f47a..f6d0b2a39fc 100644
--- a/gcc/config/rs6000/vector-pair.md
+++ b/gcc/config/rs6000/vector-pair.md
@@ -33,6 +33,8 @@ (define_c_enum "unspec"
UNSPEC_VPAIR_V16HI
UNSPEC_VPAIR_V8SI
UNSPEC_VPAIR_V4DI
+ UNSPEC_VPAIR_ZERO
+ UNSPEC_VPAIR_SPLAT
])
;; Iterator doing unary/binary arithmetic on vector pairs
@@ -93,6 +95,13 @@ (define_int_iterator VP_INT [UNSPEC_VPAIR_V4DI
UNSPEC_VPAIR_V16HI
UNSPEC_VPAIR_V32QI])
+(define_int_iterator VP_ALL [UNSPEC_VPAIR_V4DF
+ UNSPEC_VPAIR_V8SF
+ UNSPEC_VPAIR_V4DI
+ UNSPEC_VPAIR_V8SI
+ UNSPEC_VPAIR_V16HI
+ UNSPEC_VPAIR_V32QI])
+
;; Map VP_* to vector mode of the arguments after they are split
(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF "V2DF")
(UNSPEC_VPAIR_V8SF "V4SF")
@@ -126,6 +135,182 @@ (define_int_attr vp_neg_reg [(UNSPEC_VPAIR_V32QI "&v")
(UNSPEC_VPAIR_V8SI "X")
(UNSPEC_VPAIR_V4DI "X")])
+;; Modes of the vector element to splat to vector pair
+(define_mode_iterator VP_SPLAT [DF SF DI SI HI QI])
+
+;; Modes of the vector to splat to vector pair
+(define_mode_iterator VP_SPLAT_VEC [V2DF V4SF V2DI V4SI V8HI V16QI])
+
+;; MAP VP_SPLAT and VP_SPLAT_VEC to the mode of the vector pair operation
+(define_mode_attr vp_splat_pmode [(DF "v4df")
+ (V2DF "v4df")
+ (SF "v8sf")
+ (V4SF "v8sf")
+ (DI "v4di")
+ (V2DI "v4di")
+ (SI "v8si")
+ (V4SI "v8si")
+ (HI "v16hi")
+ (V8HI "v16hi")
+ (QI "v32qi")
+ (V16QI "v32qi")])
+
+;; MAP VP_SPLAT to the mode of the vector containing the element
+(define_mode_attr VP_SPLAT_VMODE [(DF "V2DF")
+ (SF "V4SF")
+ (DI "V2DI")
+ (SI "V4SI")
+ (HI "V8HI")
+ (QI "V16QI")])
+
+;; Initialize a vector pair to 0
+(define_insn_and_split "vpair_zero"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(const_int 0)] UNSPEC_VPAIR_ZERO))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 1) (match_dup 3))
+ (set (match_dup 2) (match_dup 3))]
+{
+ rtx op0 = operands[0];
+ unsigned offset_hi = (WORDS_BIG_ENDIAN) ? 0 : 16;
+ unsigned offset_lo = (WORDS_BIG_ENDIAN) ? 16 : 0;
+
+ operands[1] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_hi);
+ operands[2] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_lo);
+ operands[3] = CONST0_RTX (V2DImode);
+}
+ [(set_attr "length" "8")])
+
+;; Assemble a vector pair from two vectors. Unlike
+;; __builtin_mma_assemble_pair, this function produces a vector pair output
+;; directly and it takes all of the vector types.
+;;
+;; We cannot update the two output registers atomically, so mark the output as
+;; an early clobber so we don't accidentally clobber the input operands.
+
+(define_insn_and_split "vpair_assemble_<vp_pmode>"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=&wa")
+ (unspec:OO
+ [(match_operand:<VP_VEC_MODE> 1 "mma_assemble_input_operand" "mwa")
+ (match_operand:<VP_VEC_MODE> 2 "mma_assemble_input_operand" "mwa")]
+ VP_ALL))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ rtx src = gen_rtx_UNSPEC (OOmode,
+ gen_rtvec (2, operands[1], operands[2]),
+ UNSPEC_VSX_ASSEMBLE);
+ rs6000_split_multireg_move (operands[0], src);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Extract one of the two 128-bit vectors from a vector pair.
+(define_insn_and_split "vpair_extract_vector_<vp_pmode>"
+ [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa")
+ (unspec:<VP_VEC_MODE>
+ [(match_operand:OO 1 "vsx_register_operand" "wa")
+ (match_operand 2 "const_0_to_1_operand" "n")]
+ VP_ALL))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 3))]
+{
+ machine_mode vmode = <VP_VEC_MODE>mode;
+ unsigned reg_num = UINTVAL (operands[2]);
+ if (!WORDS_BIG_ENDIAN)
+ reg_num = 1 - reg_num;
+
+ operands[3] = simplify_gen_subreg (vmode, operands[1], OOmode, reg_num * 16);
+})
+
+;; Optimize extracting a 128-bit vector from a vector pair in memory.
+(define_insn_and_split "*vpair_extract_vector_<vp_pmode>_mem"
+ [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa")
+ (unspec:<VP_VEC_MODE>
+ [(match_operand:OO 1 "memory_operand" "o")
+ (match_operand 2 "const_0_to_1_operand" "n")]
+ VP_ALL))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 3))]
+{
+ operands[3] = adjust_address (operands[1], <VP_VEC_MODE>mode,
+ 16 * INTVAL (operands[2]));
+}
+ [(set_attr "type" "vecload")])
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<vp_splat_pmode>"
+ [(use (match_operand:OO 0 "vsx_register_operand"))
+ (use (match_operand:VP_SPLAT 1 "input_operand"))]
+ "TARGET_MMA"
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ machine_mode element_mode = <MODE>mode;
+ machine_mode vector_mode = <VP_SPLAT_VMODE>mode;
+
+ if (op1 == CONST0_RTX (element_mode))
+ {
+ emit_insn (gen_vpair_zero (op0));
+ DONE;
+ }
+
+ rtx vec = gen_reg_rtx (vector_mode);
+ unsigned num_elements = GET_MODE_NUNITS (vector_mode);
+ rtvec elements = rtvec_alloc (num_elements);
+ for (size_t i = 0; i < num_elements; i++)
+ RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+ rs6000_expand_vector_init (vec, gen_rtx_PARALLEL (vector_mode, elements));
+ emit_insn (gen_vpair_splat_<vp_splat_pmode>_internal (op0, vec));
+ DONE;
+})
+
+;; Inner splat support. Operand1 is the vector splat created above. Allow
+;; operand 1 to overlap with the output registers to eliminate one move
+;; instruction.
+(define_insn_and_split "vpair_splat_<vp_splat_pmode>_internal"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(match_operand:VP_SPLAT_VEC 1 "vsx_register_operand" "0,wa")]
+ UNSPEC_VPAIR_SPLAT))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op0_vector0 = simplify_gen_subreg (<MODE>mode, op0, OOmode, 0);
+ rtx op0_vector1 = simplify_gen_subreg (<MODE>mode, op0, OOmode, 16);
+
+ /* Check if the input is one of the output registers. */
+ if (rtx_equal_p (op0_vector0, op1))
+ emit_move_insn (op0_vector1, op1);
+
+ else if (rtx_equal_p (op0_vector1, op1))
+ emit_move_insn (op0_vector0, op1);
+
+ else
+ {
+ emit_move_insn (op0_vector0, op1);
+ emit_move_insn (op0_vector1, op1);
+ }
+
+ DONE;
+}
+ [(set_attr "length" "*,8")
+ (set_attr "type" "vecmove")])
+
\f
;; Vector pair floating point unary operations
(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index ff7918c7a58..600e2c393db 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -21386,17 +21386,27 @@ two 128-bit vectors stored in the vector pair. The
@code{__vector_pair} type is usually stored with a single vector pair
store instruction.
+The following built-in functions are independent of the type of the
+underlying vector:
+
+@smallexample
+__vector_pair __builtin_vpair_zero ();
+@end smallexample
+
The following built-in functions operate on pairs of
@code{vector float} values:
@smallexample
__vector_pair __builtin_vpair_f32_abs (__vector_pair);
__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_assemble (vector float, vector float);
+vector float __builtin_vpair_f32_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_splat (float);
__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
@end smallexample
@@ -21406,11 +21416,14 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_f64_abs (__vector_pair);
__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_assemble (vector double, vector double);
+vector double __builtin_vpair_f64_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f64_neg (__vector_pair);
__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_splat (double);
__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
@end smallexample
@@ -21420,16 +21433,24 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_i64_add (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64_assemble (vector long long,
+ vector long long);
+vector long long __builtin_vpair_i64_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_i64_ior (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64_min (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64_neg (__vector_pair);
__vector_pair __builtin_vpair_i64_not (__vector_pair);
+__vector_pair __builtin_vpair_i64_splat (long long);
__vector_pair __builtin_vpair_i64_sub (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64_xor (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64u_assemble (vector unsigned long long,
+ vector unsigned long long);
+vector unsigned long long __builtin_vpair_i64u_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_i64u_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64u_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i64u_splat (unsigned long long);
@end smallexample
The following built-in functions operate on pairs of
@@ -21438,16 +21459,23 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_i32_add (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i32_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_assemble (vector int, vector int);
+vector int __builtin_vpair_i32_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_i32_ior (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i32_neg (__vector_pair);
__vector_pair __builtin_vpair_i32_not (__vector_pair);
__vector_pair __builtin_vpair_i32_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32_splat (int);
__vector_pair __builtin_vpair_i32_sub (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i32_xor (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32u_assemble (vector unsigned int,
+ vector unsigned int);
+vector unsigned int __builtin_vpair_i32u_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_i32u_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i32u_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i32u_splat (unsigned int);
@end smallexample
The following built-in functions operate on pairs of
@@ -21456,6 +21484,10 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_i16_add (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i16_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16_assemble (vector short,
+ vector short);
+__vector_pair __builtin_vpair_i16_splat (short);
+vector short __builtin_vpair_i16_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_i16_ior (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i16_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i16_min (__vector_pair, __vector_pair);
@@ -21464,6 +21496,10 @@ __vector_pair __builtin_vpair_i16_not (__vector_pair);
__vector_pair __builtin_vpair_i16_sub (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i16_xor (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i16u_assemble (vector unsigned short,
+ vector unsigned short);
+vector unsigned short __builtin_vpair_i16u_extract_vector (__vector_pair, int);
+__vector_pair __builtin_vpair_i16u_splat (unsigned short);
__vector_pair __builtin_vpair_i16u_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i16u_min (__vector_pair, __vector_pair);
@end smallexample
@@ -21474,6 +21510,10 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_i8_add (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i8_and (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8_assemble (vector signed char,
+ vector signed char);
+vector signed char __builtin_vpair_i8_extract_vector (__vector_pair, int);
+__vector_pair __builtin_vpair_i8_splat (signed char);
__vector_pair __builtin_vpair_i8_ior (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i8_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i8_min (__vector_pair, __vector_pair);
@@ -21482,8 +21522,12 @@ __vector_pair __builtin_vpair_i8_not (__vector_pair);
__vector_pair __builtin_vpair_i8_sub (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i8_xor (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8u_assemble (vector unsigned char,
+ vector unsigned char);
+vector unsigned char __builtin_vpair_i8u_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_i8u_max (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i8u_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_i8u_splat (unsigned char);
@end smallexample
@node PowerPC Hardware Transactional Memory Built-in Functions
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-10.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-10.c
new file mode 100644
index 00000000000..df1c4019245
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-10.c
@@ -0,0 +1,86 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test the vector pair built-in functions for creation and extraction of
+ vector pair operations using 32-bit floats. */
+
+void
+test_f32_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_f32_splat (0.0f);
+}
+
+void
+test_f32_splat_1 (__vector_pair *p)
+{
+ /* 1 xxspltiw, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_f32_splat (1.0f);
+}
+
+void
+test_f32_splat_var (__vector_pair *p,
+ float f)
+{
+ /* 1 xscvdpspn, 1 xxspltw, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_f32_splat (f);
+}
+
+void
+test_f32_splat_mem (__vector_pair *p,
+ float *q)
+{
+ /* 1 lxvwsx, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_f32_splat (*q);
+}
+
+void
+test_f32_assemble (__vector_pair *p,
+ vector float v1,
+ vector float v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_f32_assemble (v1, v2);
+}
+
+vector float
+test_f32_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_f32_extract_vector (vp, 0);
+}
+
+vector float
+test_f32_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_f32_extract_vector (vp, 1);
+}
+
+vector float
+test_f32_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_f32_extract_vector (p[1], 0);
+}
+
+vector float
+test_f32_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_f32_extract_vector (p[2], 1);
+}
+
+/* { dg-final { scan-assembler-times {\mlxv\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxvwsx\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 5 } } */
+/* { dg-final { scan-assembler-times {\mxscvdpspn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltiw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxspltw\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-11.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-11.c
new file mode 100644
index 00000000000..397d7f60f45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-11.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test the vector pair built-in functions for creation and extraction of
+ vector pair operations using 64-bit doubles. */
+
+void
+test_f64_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib. */
+ *p = __builtin_vpair_f64_splat (0.0);
+}
+
+void
+test_f64_splat_1 (__vector_pair *p)
+{
+ /* 1 xxspltidp, 1 xxlor. */
+ *p = __builtin_vpair_f64_splat (1.0);
+}
+
+void
+test_f64_splat_var (__vector_pair *p,
+ double d)
+{
+ /* 1 xxpermdi, 1 xxlor. */
+ *p = __builtin_vpair_f64_splat (d);
+}
+
+void
+test_f64_splat_mem (__vector_pair *p,
+ double *q)
+{
+ /* 1 lxvdsx, 1 xxlor. */
+ *p = __builtin_vpair_f64_splat (*q);
+}
+
+void
+test_f64_assemble (__vector_pair *p,
+ vector double v1,
+ vector double v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_f64_assemble (v1, v2);
+}
+
+vector double
+test_f64_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_f64_extract_vector (vp, 0);
+}
+
+vector double
+test_f64_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_f64_extract_vector (vp, 1);
+}
+
+vector double
+test_f64_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_f64_extract_vector (p[1], 0);
+}
+
+vector double
+test_f64_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_f64_extract_vector (p[2], 1);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvdsx\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 5 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltidp\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-12.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-12.c
new file mode 100644
index 00000000000..0990dfe28d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-12.c
@@ -0,0 +1,156 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test the vector pair built-in functions for creation and extraction of
+ vector pair operations using 64-bit integers. */
+
+void
+test_i64_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i64_splat (0);
+}
+
+void
+test_i64_splat_1 (__vector_pair *p)
+{
+ /* 1 xxspltib, 1 vextsb2d, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64_splat (1);
+}
+
+void
+test_i64_splat_var (__vector_pair *p,
+ long long ll)
+{
+ /* 1 mtvsrdd, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64_splat (ll);
+}
+
+void
+test_i64_splat_mem (__vector_pair *p,
+ long long *q)
+{
+ /* 1 lxvdsx, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64_splat (*q);
+}
+
+void
+test_i64_assemble (__vector_pair *p,
+ vector long long v1,
+ vector long long v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64_assemble (v1, v2);
+}
+
+vector long long
+test_i64_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i64_extract_vector (vp, 0);
+}
+
+vector long long
+test_i64_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i64_extract_vector (vp, 1);
+}
+
+vector long long
+test_i64_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i64_extract_vector (p[1], 0);
+}
+
+vector long long
+test_i64_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i64_extract_vector (p[2], 1);
+}
+
+void
+test_i64u_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i64u_splat (0);
+}
+
+void
+test_i64u_splat_1 (__vector_pair *p)
+{
+ /* 1 xxspltib, 1 vextsb2d, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64u_splat (1);
+}
+
+void
+test_i64u_splat_var (__vector_pair *p,
+ unsigned long long ull)
+{
+ /* 1 mtvsrdd, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64u_splat (ull);
+}
+
+void
+test_i64u_splat_mem (__vector_pair *p,
+ unsigned long long *q)
+{
+ /* 1 lxvdsx, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64u_splat (*q);
+}
+
+void
+test_i64u_assemble (__vector_pair *p,
+ vector unsigned long long v1,
+ vector unsigned long long v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i64u_assemble (v1, v2);
+}
+
+vector unsigned long long
+test_i64u_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i64u_extract_vector (vp, 0);
+}
+
+vector unsigned long long
+test_i64u_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i64u_extract_vector (vp, 1);
+}
+
+vector unsigned long long
+test_i64u_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i64u_extract_vector (p[1], 0);
+}
+
+vector unsigned long long
+test_i64u_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i64u_extract_vector (p[2], 1);
+}
+
+/* { dg-final { scan-assembler-times {\mlxv\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlxvdsx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 10 } } */
+/* { dg-final { scan-assembler-times {\mvextsb2d\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 6 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-13.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-13.c
new file mode 100644
index 00000000000..8174f6b1cc3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-13.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test the vector pair built-in functions for creation and extraction of
+ vector pair operations using 32-bit integers. */
+
+void
+test_i32_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i32_splat (0);
+}
+
+void
+test_i32_splat_1 (__vector_pair *p)
+{
+ /* 1 vspltisw, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i32_splat (1);
+}
+
+void
+test_i32_splat_mem (__vector_pair *p,
+ int *q)
+{
+ /* 1 lxvwsx, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i32_splat (*q);
+}
+
+void
+test_i32_assemble (__vector_pair *p,
+ vector int v1,
+ vector int v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i32_assemble (v1, v2);
+}
+
+vector int
+test_i32_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i32_extract_vector (vp, 0);
+}
+
+vector int
+test_i32_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i32_extract_vector (vp, 1);
+}
+
+vector int
+test_i32_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i32_extract_vector (p[1], 0);
+}
+
+vector int
+test_i32_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i32_extract_vector (p[2], 1);
+}
+
+void
+test_i32u_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i32u_splat (0);
+}
+
+void
+test_i32u_splat_1 (__vector_pair *p)
+{
+ /* 1 vspltisw, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i32u_splat (1);
+}
+
+void
+test_i32u_splat_mem (__vector_pair *p,
+ unsigned int *q)
+{
+ /* 1 lxvwsx, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i32u_splat (*q);
+}
+
+void
+test_i32u_assemble (__vector_pair *p,
+ vector unsigned int v1,
+ vector unsigned int v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i32u_assemble (v1, v2);
+}
+
+vector unsigned int
+test_i32u_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i32u_extract_vector (vp, 0);
+}
+
+vector unsigned int
+test_i32u_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i32u_extract_vector (vp, 1);
+}
+
+vector unsigned int
+test_i32u_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i32u_extract_vector (p[1], 0);
+}
+
+vector unsigned int
+test_i32u_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i32u_extract_vector (p[2], 1);
+}
+
+/* { dg-final { scan-assembler-times {\mlxv\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlxvwsx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 8 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 4 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-14.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-14.c
new file mode 100644
index 00000000000..fe63df795d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-14.c
@@ -0,0 +1,141 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test the vector pair built-in functions for creation and extraction of
+ vector pair operations using 16-bit integers. */
+
+void
+test_i16_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i16_splat (0);
+}
+
+void
+test_i16_splat_1 (__vector_pair *p)
+{
+ /* 1 vspltish, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i16_splat (1);
+}
+
+void
+test_i16_splat_mem (__vector_pair *p,
+ short *q)
+{
+ /* 1 lxsihzx, 1 vsplth, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i16_splat (*q);
+}
+
+void
+test_i16_assemble (__vector_pair *p,
+ vector short v1,
+ vector short v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i16_assemble (v1, v2);
+}
+
+vector short
+test_i16_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i16_extract_vector (vp, 0);
+}
+
+vector short
+test_i16_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i16_extract_vector (vp, 1);
+}
+
+vector short
+test_i16_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i16_extract_vector (p[1], 0);
+}
+
+vector short
+test_i16_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i16_extract_vector (p[2], 1);
+}
+
+void
+test_i16u_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i16u_splat (0);
+}
+
+void
+test_i16u_splat_1 (__vector_pair *p)
+{
+ /* 1 vspltish, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i16u_splat (1);
+}
+
+void
+test_i16u_splat_mem (__vector_pair *p,
+ unsigned short *q)
+{
+ /* 1 lxsihzx, 1 vsplth, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i16u_splat (*q);
+}
+
+void
+test_i16u_assemble (__vector_pair *p,
+ vector unsigned short v1,
+ vector unsigned short v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i16u_assemble (v1, v2);
+}
+
+vector unsigned short
+test_i16u_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i16u_extract_vector (vp, 0);
+}
+
+vector unsigned short
+test_i16u_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i16u_extract_vector (vp, 1);
+}
+
+vector unsigned short
+test_i16u_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i16u_extract_vector (p[1], 0);
+}
+
+vector unsigned short
+test_i16u_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i16u_extract_vector (p[2], 1);
+}
+
+/* { dg-final { scan-assembler-times {\mlxsihzx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 8 } } */
+/* { dg-final { scan-assembler-times {\mvsplth\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvspltish\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 4 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-15.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-15.c
new file mode 100644
index 00000000000..bd494327af6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-15.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test the vector pair built-in functions for creation and extraction of
+ vector pair operations using 8-bit integers. */
+
+void
+test_i8_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i8_splat (0);
+}
+
+void
+test_i8_splat_1 (__vector_pair *p)
+{
+ /* 1 vspltisb, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i8_splat (1);
+}
+
+void
+test_i8_splat_mem (__vector_pair *p,
+ signed char *q)
+{
+ /* 1 lxsibzx, 1 vspltb, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i8_splat (*q);
+}
+
+void
+test_i8_assemble (__vector_pair *p,
+ vector signed char v1,
+ vector signed char v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i8_assemble (v1, v2);
+}
+
+vector signed char
+test_i8_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i8_extract_vector (vp, 0);
+}
+
+vector signed char
+test_i8_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i8_extract_vector (vp, 1);
+}
+
+vector signed char
+test_i8_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i8_extract_vector (p[1], 0);
+}
+
+vector signed char
+test_i8_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i8_extract_vector (p[2], 1);
+}
+
+void
+test_i8u_splat_0 (__vector_pair *p)
+{
+ /* 2 xxspltib, 1 stxvp. */
+ *p = __builtin_vpair_i8u_splat (0);
+}
+
+void
+test_i8u_splat_1 (__vector_pair *p)
+{
+ /* 1 vspltisb, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i8u_splat (1);
+}
+
+void
+test_i8u_splat_mem (__vector_pair *p,
+ unsigned char *q)
+{
+ /* 1 lxsibzx, 1 vspltb, 1 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i8u_splat (*q);
+}
+
+void
+test_i8u_assemble (__vector_pair *p,
+ vector unsigned char v1,
+ vector unsigned char v2)
+{
+ /* 2 xxlor, 1 stxvp. */
+ *p = __builtin_vpair_i8u_assemble (v1, v2);
+}
+
+vector unsigned char
+test_i8u_extract_0_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i8u_extract_vector (vp, 0);
+}
+
+vector unsigned char
+test_i8u_extract_1_reg (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xxlor. */
+ __vector_pair vp = *p;
+ __asm__ (" # extract in register %x0" : "+wa" (vp));
+ return __builtin_vpair_i8u_extract_vector (vp, 1);
+}
+
+vector unsigned char
+test_i8u_extract_0_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i8u_extract_vector (p[1], 0);
+}
+
+vector unsigned char
+test_i8u_extract_1_mem (__vector_pair *p)
+{
+ /* 1 lxv. */
+ return __builtin_vpair_i8u_extract_vector (p[2], 1);
+}
+
+/* { dg-final { scan-assembler-times {\mlxsibzx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 8 } } */
+/* { dg-final { scan-assembler-times {\mvspltb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 6 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-9.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-9.c
new file mode 100644
index 00000000000..95504a5afd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-9.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+void
+test_zero (__vector_pair *p)
+{
+ /* 2 xxspltib. */
+ *p = __builtin_vpair_zero ();
+}
+
+/* { dg-final { scan-assembler-times {\mstxvp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 2 } } */
--
2.41.0
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meissner@linux.ibm.com
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 4/4] Add support for doing a horizontal add on vector pair elements.
2023-11-10 23:02 [PATCH 0/4] Add vector pair builtins to PowerPC Michael Meissner
` (2 preceding siblings ...)
2023-11-10 23:12 ` [PATCH 3/4] Add support for initializing and extracting from vector pairs Michael Meissner
@ 2023-11-10 23:13 ` Michael Meissner
3 siblings, 0 replies; 5+ messages in thread
From: Michael Meissner @ 2023-11-10 23:13 UTC (permalink / raw)
To: Michael Meissner, gcc-patches, Segher Boessenkool, Kewen.Lin,
David Edelsohn, Peter Bergner
This patch adds a series of built-in functions to allow users to write code to
do a number of simple operations where the loop is done using the __vector_pair
type. The __vector_pair type is an opaque type. These built-in functions keep
the two 128-bit vectors within the __vector_pair together, and split the
operation after register allocation.
This patch provides vector pair built-in functions to do a horizontal add on
vector pair elements. Only floating point and 64-bit horizontal adds are
provided in this patch.
I have built and tested these patches on:
* A little endian power10 server using --with-cpu=power10
* A little endian power9 server using --with-cpu=power9
* A big endian power9 server using --with-cpu=power9.
Can I check this patch into the master branch after the preceding patches have
been checked in?
2023-11-08 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_add_elements):
New built-in function.
(__builtin_vpair_f64_add_elements): Likewise.
(__builtin_vpair_i64_add_elements): Likewise.
(__builtin_vpair_i64u_add_elements): Likewise.
* config/rs6000/vector-pair.md (UNSPEC_VPAIR_REDUCE_PLUS_F32): New
unspec.
(UNSPEC_VPAIR_REDUCE_PLUS_F64): Likewise.
(UNSPEC_VPAIR_REDUCE_PLUS_I64): Likewise.
(vpair_reduc_plus_scale_v8sf): New insn.
(vpair_reduc_plus_scale_v4df): Likewise.
(vpair_reduc_plus_scale_v4di): Likewise.
* doc/extend.texi (__builtin_vpair_f32_add_elements): Document.
(__builtin_vpair_f64_add_elements): Likewise.
(__builtin_vpair_i64_add_elements): Likewise.
gcc/testsuite/
* gcc.target/powerpc/vector-pair-16.c: New test.
---
gcc/config/rs6000/rs6000-builtins.def | 12 +++
gcc/config/rs6000/vector-pair.md | 93 +++++++++++++++++++
gcc/doc/extend.texi | 3 +
.../gcc.target/powerpc/vector-pair-16.c | 45 +++++++++
4 files changed, 153 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-16.c
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index fbd416ceb87..b9a16c01420 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4145,6 +4145,9 @@
v256 __builtin_vpair_f32_add (v256, v256);
VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+ float __builtin_vpair_f32_add_elements (v256);
+ VPAIR_F32_ADD_ELEMENTS vpair_reduc_plus_scale_v8sf {mma,pair}
+
v256 __builtin_vpair_f32_assemble (vf, vf);
VPAIR_F32_ASSEMBLE vpair_assemble_v8sf {mma,pair}
@@ -4180,6 +4183,9 @@
v256 __builtin_vpair_f64_add (v256, v256);
VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+ double __builtin_vpair_f64_add_elements (v256);
+ VPAIR_F64_ADD_ELEMENTS vpair_reduc_plus_scale_v4df {mma,pair}
+
v256 __builtin_vpair_f64_assemble (vd, vd);
VPAIR_F64_ASSEMBLE vpair_assemble_v4df {mma,pair}
@@ -4375,6 +4381,9 @@ v256 __builtin_vpair_f64_assemble (vd, vd);
v256 __builtin_vpair_i64_add (v256, v256);
VPAIR_I64_ADD vpair_add_v4di3 {mma,pair}
+ long long __builtin_vpair_i64_add_elements (v256);
+ VPAIR_I64_ADD_ELEMENTS vpair_reduc_plus_scale_v4di {mma,pair,no32bit}
+
v256 __builtin_vpair_i64_and (v256, v256);
VPAIR_I64_AND vpair_and_v4di3 {mma,pair}
@@ -4408,6 +4417,9 @@ v256 __builtin_vpair_f64_assemble (vd, vd);
v256 __builtin_vpair_i64_xor (v256, v256);
VPAIR_I64_XOR vpair_xor_v4di3 {mma,pair}
+ unsigned long long __builtin_vpair_i64u_add_elements (v256);
+ VPAIR_I64U_ADD_ELEMENTS vpair_reduc_plus_scale_v4di {mma,pair,no32bit}
+
v256 __builtin_vpair_i64u_assemble (vull, vull);
VPAIR_I64U_ASSEMBLE vpair_assemble_v4di {mma,pair}
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
index f6d0b2a39fc..b5e9330e71f 100644
--- a/gcc/config/rs6000/vector-pair.md
+++ b/gcc/config/rs6000/vector-pair.md
@@ -35,6 +35,9 @@ (define_c_enum "unspec"
UNSPEC_VPAIR_V4DI
UNSPEC_VPAIR_ZERO
UNSPEC_VPAIR_SPLAT
+ UNSPEC_VPAIR_REDUCE_PLUS_F32
+ UNSPEC_VPAIR_REDUCE_PLUS_F64
+ UNSPEC_VPAIR_REDUCE_PLUS_I64
])
;; Iterator doing unary/binary arithmetic on vector pairs
@@ -577,6 +580,66 @@ (define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
}
[(set_attr "length" "8")])
+\f
+;; Add all elements in a pair of V4SF vectors.
+(define_insn_and_split "vpair_reduc_plus_scale_v8sf"
+ [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+ (unspec:SF [(match_operand:OO 1 "vsx_register_operand" "v")]
+ UNSPEC_VPAIR_REDUCE_PLUS_F32))
+ (clobber (match_scratch:V4SF 2 "=&v"))
+ (clobber (match_scratch:V4SF 3 "=&v"))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(pc)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx tmp1 = operands[2];
+ rtx tmp2 = operands[3];
+ unsigned r = reg_or_subregno (op1);
+ rtx op1_hi = gen_rtx_REG (V4SFmode, r);
+ rtx op1_lo = gen_rtx_REG (V4SFmode, r + 1);
+
+ emit_insn (gen_addv4sf3 (tmp1, op1_hi, op1_lo));
+ emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (8)));
+ emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+ emit_insn (gen_altivec_vsldoi_v4sf (tmp1, tmp2, tmp2, GEN_INT (4)));
+ emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+ emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+ DONE;
+}
+ [(set_attr "length" "24")])
+
+;; Add all elements in a pair of V2DF vectors
+(define_insn_and_split "vpair_reduc_plus_scale_v4df"
+ [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+ (unspec:DF [(match_operand:OO 1 "vsx_register_operand" "wa")]
+ UNSPEC_VPAIR_REDUCE_PLUS_F64))
+ (clobber (match_scratch:DF 2 "=&wa"))
+ (clobber (match_scratch:V2DF 3 "=&wa"))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 3)
+ (plus:V2DF (match_dup 4)
+ (match_dup 5)))
+ (set (match_dup 2)
+ (vec_select:DF (match_dup 3)
+ (parallel [(match_dup 6)])))
+ (set (match_dup 0)
+ (plus:DF (match_dup 7)
+ (match_dup 2)))]
+{
+ unsigned reg1 = reg_or_subregno (operands[1]);
+ unsigned reg3 = reg_or_subregno (operands[3]);
+
+ operands[4] = gen_rtx_REG (V2DFmode, reg1);
+ operands[5] = gen_rtx_REG (V2DFmode, reg1 + 1);
+ operands[6] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+ operands[7] = gen_rtx_REG (DFmode, reg3);
+})
+
\f
;; Vector pair integer negate support.
(define_insn_and_split "vpair_neg_<vp_pmode>2"
@@ -786,3 +849,33 @@ (define_insn_and_split "*vpair_nor_<vp_pmode>_2"
DONE;
}
[(set_attr "length" "8")])
+
+;; Add all elements in a pair of V2DI vectors
+(define_insn_and_split "vpair_reduc_plus_scale_v4di"
+ [(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
+ (unspec:DI [(match_operand:OO 1 "altivec_register_operand" "v")]
+ UNSPEC_VPAIR_REDUCE_PLUS_I64))
+ (clobber (match_scratch:V2DI 2 "=&v"))
+ (clobber (match_scratch:DI 3 "=&r"))]
+ "TARGET_MMA && TARGET_POWERPC64"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 2)
+ (plus:V2DI (match_dup 4)
+ (match_dup 5)))
+ (set (match_dup 3)
+ (vec_select:DI (match_dup 2)
+ (parallel [(const_int 0)])))
+ (set (match_dup 0)
+ (vec_select:DI (match_dup 2)
+ (parallel [(const_int 1)])))
+ (set (match_dup 0)
+ (plus:DI (match_dup 0)
+ (match_dup 3)))]
+{
+ unsigned reg1 = reg_or_subregno (operands[1]);
+
+ operands[4] = gen_rtx_REG (V2DImode, reg1);
+ operands[5] = gen_rtx_REG (V2DImode, reg1 + 1);
+}
+ [(set_attr "length" "16")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 600e2c393db..0e6e74b8087 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -21399,6 +21399,7 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_f32_abs (__vector_pair);
__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+float __builtin_vpair_f32_add_elements (__vector_pair);
__vector_pair __builtin_vpair_f32_assemble (vector float, vector float);
vector float __builtin_vpair_f32_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
@@ -21416,6 +21417,7 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_f64_abs (__vector_pair);
__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+double __builtin_vpair_f64_add_elements (__vector_pair);
__vector_pair __builtin_vpair_f64_assemble (vector double, vector double);
vector double __builtin_vpair_f64_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
@@ -21432,6 +21434,7 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_i64_add (__vector_pair, __vector_pair);
+long long __builtin_vpair_i64_add_elements (__vector_pair);
__vector_pair __builtin_vpair_i64_and (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64_assemble (vector long long,
vector long long);
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-16.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-16.c
new file mode 100644
index 00000000000..a8c206c4093
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-16.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test vector pair built-in functions to do a horizontal add of the
+ elements. */
+
+float
+f32_add_elements (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xvaddsp, 2 vsldoi, 2 xvaddsp, 1 xscvspdp. */
+ return __builtin_vpair_f32_add_elements (*p);
+}
+
+double
+f64_add_elements (__vector_pair *p)
+{
+ /* 1 lxvp, 1 xvadddp, 1 xxpermdi, 1 fadd/xsadddp. */
+ return __builtin_vpair_f64_add_elements (*p);
+}
+
+long long
+i64_add_elements (__vector_pair *p)
+{
+ /* 1 lxvp, 1 vaddudm, 1 mfvsrld, 1 mfvsrd, 1 add. */
+ return __builtin_vpair_i64_add_elements (*p);
+}
+
+unsigned long long
+i64u_add_elements (__vector_pair *p)
+{
+ /* 1 lxvp, 1 vaddudm, 1 mfvsrld, 1 mfvsrd, 1 add. */
+ return __builtin_vpair_i64u_add_elements (*p);
+}
+
+/* { dg-final { scan-assembler-times {\mfadd\M|\mxsadddp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mmfvsrd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mmfvsrld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvaddudm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsldoi\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */
--
2.41.0
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meissner@linux.ibm.com
^ permalink raw reply [flat|nested] 5+ messages in thread