public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/meissner/heads/work141-vpair)] Add support for floating point vector pair built-in functions.
@ 2023-10-26  6:45 Michael Meissner
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-10-26  6:45 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:7e60fc9594caea1380cfdd378059e236ea0febe8

commit 7e60fc9594caea1380cfdd378059e236ea0febe8
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Thu Oct 26 02:44:40 2023 -0400

    Add support for floating point vector pair built-in functions.
    
    2023-10-26  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (mma_assemble_input_operand): Allow other
            16-byte vectors and not just V16QImode.
            * config/rs6000/rs6000-builtins.def (__builtin_vpair_zero): New
            built-in to clear vector pair.
            (__builtin_vpair_f32_*): Add vector pair built-in functions for float.
            (__builtin_vpair_f64_*): Add vector pair built-in functions for double.
            * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
            declaration.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
            for vector pair built-in functions.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
            * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
            * config/rs6000/vector-pair.md: New file.
            * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
            floating point and general vector pair built-in functions.

Diff:
---
 gcc/config/rs6000/predicates.md       |   2 +-
 gcc/config/rs6000/rs6000-builtins.def |  59 +++++
 gcc/config/rs6000/rs6000-protos.h     |   5 +
 gcc/config/rs6000/rs6000.cc           |  74 ++++++
 gcc/config/rs6000/rs6000.md           |   1 +
 gcc/config/rs6000/t-rs6000            |   1 +
 gcc/config/rs6000/vector-pair.md      | 433 ++++++++++++++++++++++++++++++++++
 gcc/doc/extend.texi                   |  49 ++++
 8 files changed, 623 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c42..922a77716c41 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -1301,7 +1301,7 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16
 		&& (vsx_register_operand (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index b661a2268432..79793d09565e 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4137,3 +4137,62 @@
 
   void __builtin_vsx_stxvp_internal (v256 *, v256);
     STXVP_INTERNAL stxvp_internal {mma}
+
+;; Vector pair built-in functions
+
+  v256 __builtin_vpair_zero ();
+    VPAIR_ZERO vpair_zero {mma}
+
+  v256 __builtin_vpair_f32_splat (float);
+    VPAIR_F32_SPLAT vpair_splat_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_abs (v256);
+    VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_add (v256, v256);
+    VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_fma (v256, v256, v256);
+    VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+  v256 __builtin_vpair_f32_mul (v256, v256);
+    VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_neg (v256);
+    VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_max (v256, v256);
+    VPAIR_F32_SMAX vpair_smax_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_min (v256, v256);
+    VPAIR_F32_SMIN vpair_smin_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_sub (v256, v256);
+    VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f64_splat (double);
+    VPAIR_F64_SPLAT vpair_splat_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_abs (v256);
+    VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_add (v256, v256);
+    VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_fma (v256, v256, v256);
+    VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+  v256 __builtin_vpair_f64_mul (v256, v256);
+    VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_neg (v256);
+    VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_max (v256, v256);
+    VPAIR_F64_SMAX vpair_smax_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_min (v256, v256);
+    VPAIR_F64_SMIN vpair_smin_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_sub (v256, v256);
+    VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f5..bbd899d75620 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 8f06b37171a3..0f466f1f7c29 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27393,6 +27393,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two insns, one per 16-byte vector.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* mode of each half.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two insns, one per 16-byte vector.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* mode of each half.  */
+			 rtx operands[],		/* dest, src1, src2.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two insns, one per
+   16-byte vector.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* mode of each half.  */
+		       rtx operands[],			/* dest, src1-src3.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 2a1b5ecfaee2..da51029aa1ba 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15759,6 +15759,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1de..5fc89499795d 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
 	$(srcdir)/config/rs6000/vsx.md \
 	$(srcdir)/config/rs6000/altivec.md \
 	$(srcdir)/config/rs6000/mma.md \
+	$(srcdir)/config/rs6000/vector-pair.md \
 	$(srcdir)/config/rs6000/crypto.md \
 	$(srcdir)/config/rs6000/htm.md \
 	$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 000000000000..5eff41685fc6
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,433 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers.  Most of the operations use vector pair instructions to load
+;; and possibly store registers, but are split after register allocation
+;; into 2 separate vector operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+(define_c_enum "unspec"
+  [UNSPEC_VPAIR_ZERO
+   UNSPEC_VPAIR_SPLAT
+   UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY  [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs      "abs")
+			   (minus    "sub")
+			   (mult     "mul")
+			   (neg      "neg")
+			   (plus     "add")
+			   (smin     "smin")
+			   (smax     "smax")
+			   (xor      "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+			    UNSPEC_VPAIR_V8SF])
+
+;; Map VP_* to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF  "V2DF")
+			      (UNSPEC_VPAIR_V8SF  "V4SF")])
+
+;; Map VP_* to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF  "v4df")
+			   (UNSPEC_VPAIR_V8SF  "v8sf")])
+
+;; Map VP_* to a lower case name to identify the vector after the vector pair
+;; has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF  "v2df")
+			   (UNSPEC_VPAIR_V8SF  "v4sf")])
+
+;; Modes of the vector element to splat to vector pair
+(define_mode_iterator VP_SPLAT [DF SF])
+
+;; Modes of the vector to splat to vector pair
+(define_mode_iterator VP_SPLAT_VEC [V2DF V4SF])
+
+;; Map VP_SPLAT and VP_SPLAT_VEC to the mode of the vector pair in the assemble
+;; operation
+(define_mode_attr vp_splat_pmode [(DF   "v4df")
+				  (V2DF "v4df")
+				  (SF   "v8sf")
+				  (V4SF "v8sf")])
+
+;; Map VP_SPLAT to the mode of the vector containing the element
+(define_mode_attr VP_SPLAT_VMODE [(DF "V2DF")
+				  (SF "V4SF")])
+
+;; Initialize a vector pair to 0
+(define_insn_and_split "vpair_zero"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(const_int 0)] UNSPEC_VPAIR_ZERO))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 1) (match_dup 3))
+   (set (match_dup 2) (match_dup 3))]
+{
+  rtx op0 = operands[0];
+  unsigned offset_hi = (WORDS_BIG_ENDIAN) ? 0 : 16;
+  unsigned offset_lo = (WORDS_BIG_ENDIAN) ? 16 : 0;
+
+  operands[1] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_hi);
+  operands[2] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_lo);
+  operands[3] = CONST0_RTX (V2DImode);
+}
+  [(set_attr "length" "8")])
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<vp_splat_pmode>"
+  [(use (match_operand:OO 0 "vsx_register_operand"))
+   (use (match_operand:VP_SPLAT 1 "input_operand"))]
+  "TARGET_MMA"
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode element_mode = <MODE>mode;
+  machine_mode vector_mode = <VP_SPLAT_VMODE>mode;
+
+  if (op1 == CONST0_RTX (element_mode))
+    {
+      emit_insn (gen_vpair_zero (op0));
+      DONE;
+    }
+
+  rtx vec = gen_reg_rtx (vector_mode);
+  unsigned num_elements = GET_MODE_NUNITS (vector_mode);
+  rtvec elements = rtvec_alloc (num_elements);
+  for (size_t i = 0; i < num_elements; i++)
+    RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+  rs6000_expand_vector_init (vec, gen_rtx_PARALLEL (vector_mode, elements));
+  emit_insn (gen_vpair_splat_<vp_splat_pmode>_internal (op0, vec));
+  DONE;
+})
+
+;; Inner splat support.  Operand 1 is the vector splat created above.  Allow
+;; operand 1 to overlap with the output registers to eliminate one move
+;; instruction.
+(define_insn_and_split "vpair_splat_<vp_splat_pmode>_internal"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(match_operand:VP_SPLAT_VEC 1 "vsx_register_operand" "0,wa")]
+	 UNSPEC_VPAIR_SPLAT))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op0_vector0 = simplify_gen_subreg (<MODE>mode, op0, OOmode, 0);
+  rtx op0_vector1 = simplify_gen_subreg (<MODE>mode, op0, OOmode, 16);
+
+  /* Check if the input is one of the output registers.  */
+  if (rtx_equal_p (op0_vector0, op1))
+    emit_move_insn (op0_vector1, op1);
+
+  else if (rtx_equal_p (op0_vector1, op1))
+    emit_move_insn (op0_vector0, op1);
+
+  else
+    {
+      emit_move_insn (op0_vector0, op1);
+      emit_move_insn (op0_vector1, op1);
+    }
+
+  DONE;
+}
+  [(set_attr "length" "*,8")
+   (set_attr "type" "vecmove")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_<vp_insn><vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value (split into vsx_nabs insns)
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_vsx_nabs<vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+			    gen_<vp_insn><vp_vmode>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VP_FP))]
+	   VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(plus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(minus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (unspec:OO
+	    [(neg:OO
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(plus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(minus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (unspec:OO
+	       [(neg:OO
+		 (match_dup 3))]
+	       VP_FP))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index bf941e6b93a1..33a2926f1bac 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14979,6 +14979,7 @@ instructions, but allow the compiler to schedule those calls.
 * NDS32 Built-in Functions::
 * Nvidia PTX Built-in Functions::
 * Basic PowerPC Built-in Functions::
 * PowerPC AltiVec/VSX Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
@@ -21300,6 +21301,54 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
 
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using
+@code{__vector_pair} to hold two vectors.  The load vector pair and
+store vector pair instructions are used to load the values into
+registers and store the values.  The operation itself is split into
+two separate vector instructions.  To use the vector pair built-in
+functions, you need to have MMA support enabled (@option{-mmma}, which
+is enabled by default with @option{-mcpu=power10}).
+
+The following built-in functions are independent of the type of the
+underlying vector:
+
+@smallexample
+__vector_pair __builtin_vpair_zero ();
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f32_splat (float);
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f64_splat (double);
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
 GCC provides two interfaces for accessing the Hardware Transactional

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [gcc(refs/users/meissner/heads/work141-vpair)] Add support for floating point vector pair built-in functions.
@ 2023-11-02  4:18 Michael Meissner
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-11-02  4:18 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:3975093b755112ea652583f0ad4c9e36bd986a1a

commit 3975093b755112ea652583f0ad4c9e36bd986a1a
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Thu Nov 2 00:15:04 2023 -0400

    Add support for floating point vector pair built-in functions.
    
    This patch adds a series of built-in functions to allow users to write code to
    do a number of simple operations where the loop is done using the __vector_pair
    type.  The __vector_pair type is an opaque type.  These built-in functions keep
    the two 128-bit vectors within the __vector_pair together, and split the
    operation after register allocation.
    
    This patch provides vector pair operations for 32-bit floating point and 64-bit
    floating point.
    
    2023-11-02  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_*): Add vector
            pair built-in functions for float.
            (__builtin_vpair_f64_*): Add vector pair built-in functions for double.
            * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
            declaration.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
            for vector pair built-in functions.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
            * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
            * config/rs6000/vector-pair.md: New file.
            * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
            floating point and general vector pair built-in functions.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vector-pair-1.c: New test.
            * gcc.target/powerpc/vector-pair-2.c: New test.
            * gcc.target/powerpc/vector-pair-3.c: New test.
            * gcc.target/powerpc/vector-pair-4.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000-builtins.def            |  52 ++++
 gcc/config/rs6000/rs6000-protos.h                |   5 +
 gcc/config/rs6000/rs6000.cc                      |  74 +++++
 gcc/config/rs6000/rs6000.md                      |   1 +
 gcc/config/rs6000/t-rs6000                       |   1 +
 gcc/config/rs6000/vector-pair.md                 | 329 +++++++++++++++++++++++
 gcc/doc/extend.texi                              |  40 +++
 gcc/testsuite/gcc.target/powerpc/vector-pair-1.c | 135 ++++++++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-2.c | 134 +++++++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-3.c |  60 +++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-4.c |  60 +++++
 11 files changed, 891 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index b661a2268432..24b9547c602e 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4137,3 +4137,55 @@
 
   void __builtin_vsx_stxvp_internal (v256 *, v256);
     STXVP_INTERNAL stxvp_internal {mma}
+
+;; vector pair built-in functions for 8 32-bit float values
+
+  v256 __builtin_vpair_f32_abs (v256);
+    VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_add (v256, v256);
+    VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_fma (v256, v256, v256);
+    VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+  v256 __builtin_vpair_f32_max (v256, v256);
+    VPAIR_F32_MAX vpair_smax_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_min (v256, v256);
+    VPAIR_F32_MIN vpair_smin_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_mul (v256, v256);
+    VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_neg (v256);
+    VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_sub (v256, v256);
+    VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+;; vector pair built-in functions for 4 64-bit double values
+
+  v256 __builtin_vpair_f64_abs (v256);
+    VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_add (v256, v256);
+    VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_fma (v256, v256, v256);
+    VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+  v256 __builtin_vpair_f64_max (v256, v256);
+    VPAIR_F64_MAX vpair_smax_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_min (v256, v256);
+    VPAIR_F64_MIN vpair_smin_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_mul (v256, v256);
+    VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_neg (v256);
+    VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_sub (v256, v256);
+    VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f5..bbd899d75620 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 8f06b37171a3..0f466f1f7c29 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27393,6 +27393,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two insns, one per 16-byte vector.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two insns, one per 16-byte vector.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src1, src2.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns, one for each 16-byte half of the pair.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* vector mode.  */
+		       rtx operands[],			/* dest, 3 srcs.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 2a1b5ecfaee2..da51029aa1ba 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15759,6 +15759,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1de..5fc89499795d 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
 	$(srcdir)/config/rs6000/vsx.md \
 	$(srcdir)/config/rs6000/altivec.md \
 	$(srcdir)/config/rs6000/mma.md \
+	$(srcdir)/config/rs6000/vector-pair.md \
 	$(srcdir)/config/rs6000/crypto.md \
 	$(srcdir)/config/rs6000/htm.md \
 	$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 000000000000..2dcac6a31e22
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,329 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers.  Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+(define_c_enum "unspec"
+  [UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY  [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs      "abs")
+			   (minus    "sub")
+			   (mult     "mul")
+			   (neg      "neg")
+			   (plus     "add")
+			   (smin     "smin")
+			   (smax     "smax")
+			   (xor      "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+			    UNSPEC_VPAIR_V8SF])
+
+;; Map VP_* to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF  "V2DF")
+			      (UNSPEC_VPAIR_V8SF  "V4SF")])
+
+;; Map VP_* to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF  "v4df")
+			   (UNSPEC_VPAIR_V8SF  "v8sf")])
+
+;; Map VP_* to a lower case name to identify the vector after the vector pair
+;; has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF  "v2df")
+			   (UNSPEC_VPAIR_V8SF  "v4sf")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_<vp_insn><vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value into a pair of xvnabs insns
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_vsx_nabs<vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+			    gen_<vp_insn><vp_vmode>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VP_FP))]
+	   VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(plus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(minus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (unspec:OO
+	    [(neg:OO
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(plus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(minus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (unspec:OO
+	       [(neg:OO
+		 (match_dup 3))]
+	       VP_FP))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index bf941e6b93a1..3ec4b7da6939 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14979,6 +14979,7 @@ instructions, but allow the compiler to schedule those calls.
 * NDS32 Built-in Functions::
 * Nvidia PTX Built-in Functions::
 * Basic PowerPC Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
 * PowerPC AltiVec/VSX Built-in Functions::
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
@@ -21300,6 +21301,45 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
 
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using
+@code{__vector_pair} to hold two vectors.  The load vector pair and
+store vector pair instructions are used to load the values into
+registers and store the values.  The operation itself is split into
+two separate vector instructions.  To use the vector pair built-in
+functions, you need to have MMA support enabled (@option{-mmma}, which
+is enabled by default with @option{-mcpu=power10}).
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
 GCC provides two interfaces for accessing the Hardware Transactional
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
new file mode 100644
index 000000000000..e74840cebc0f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
@@ -0,0 +1,135 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+   vector pairs with 4 double elements.  */
+
+void
+test_add (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvadddp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvsubdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+	       __vector_pair *x,
+	       __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmuldp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_mul (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmindp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_min (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmaxdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_max (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+	     __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnegdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+	  __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvabsdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+		   __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnabsdp, 1 stxvp.  */
+  __vector_pair ab = __builtin_vpair_f64_abs (*x);
+  *dest = __builtin_vpair_f64_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f64_neg (*z);
+  *dest = __builtin_vpair_f64_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f64_neg (*z);
+  __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}       12 } } */
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxdp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvmindp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}      2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
new file mode 100644
index 000000000000..2facb7270537
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
@@ -0,0 +1,134 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+   vector pairs with 8 float elements.  */
+
+void
+test_add (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvaddsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvsubsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+	       __vector_pair *x,
+	       __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmulsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_mul (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmaxsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_max (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvminsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+	     __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnegsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+	  __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvabssp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+		   __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnabssp, 1 stxvp.  */
+  __vector_pair ab = __builtin_vpair_f32_abs (*x);
+  *dest = __builtin_vpair_f32_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f32_neg (*z);
+  *dest = __builtin_vpair_f32_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f32_neg (*z);
+  __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}       25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}      12 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvminsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}  2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
new file mode 100644
index 000000000000..65bfc44f85d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector built-in code combines multiply, add/subtract, and
+   negate operations to the appropriate fused multiply-add instruction for
+   vector pairs with 4 double elements.  */
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_add (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_sub (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}        4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
new file mode 100644
index 000000000000..b62871be1fdf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector built-in code combines multiply, add/subtract, and
+   negate operations to the appropriate fused multiply-add instruction for
+   vector pairs with 8 float elements.  */
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_add (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_sub (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}        4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}   2 } } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [gcc(refs/users/meissner/heads/work141-vpair)] Add support for floating point vector pair built-in functions.
@ 2023-11-01 17:45 Michael Meissner
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-11-01 17:45 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:9f2dd1dd4758868bf06a2496baccc6f6a2dd7faa

commit 9f2dd1dd4758868bf06a2496baccc6f6a2dd7faa
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Wed Nov 1 11:15:26 2023 -0400

    Add support for floating point vector pair built-in functions.
    
    This patch adds a series of built-in functions to allow users to write code to
    do a number of simple operations where the loop is done using the __vector_pair
    type.  The __vector_pair type is an opaque type.  These built-in functions keep
    the two 128-bit vectors within the __vector_pair together, and split the
    operation after register allocation.
    
    This patch provides vector pair operations for 32-bit floating point and 64-bit
    floating point.
    
    2023-11-01  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_*): Add vector
            pair built-in functions for float.
            (__builtin_vpair_f64_*): Add vector pair built-in functions for double.
            * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
            declaration.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
            for vector pair built-in functions.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
            * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
            * config/rs6000/vector-pair.md: New file.
            * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
            floating point and general vector pair built-in functions.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vector-pair-1.c: New test.
            * gcc.target/powerpc/vector-pair-2.c: New test.
            * gcc.target/powerpc/vector-pair-3.c: New test.
            * gcc.target/powerpc/vector-pair-4.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000-builtins.def            |  52 ++++
 gcc/config/rs6000/rs6000-protos.h                |   5 +
 gcc/config/rs6000/rs6000.cc                      |  74 +++++
 gcc/config/rs6000/rs6000.md                      |   1 +
 gcc/config/rs6000/t-rs6000                       |   1 +
 gcc/config/rs6000/vector-pair.md                 | 329 +++++++++++++++++++++++
 gcc/doc/extend.texi                              |  40 +++
 gcc/testsuite/gcc.target/powerpc/vector-pair-1.c | 135 ++++++++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-2.c | 134 +++++++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-3.c |  60 +++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-4.c |  60 +++++
 11 files changed, 891 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index b661a2268432..24b9547c602e 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4137,3 +4137,55 @@
 
   void __builtin_vsx_stxvp_internal (v256 *, v256);
     STXVP_INTERNAL stxvp_internal {mma}
+
+;; vector pair built-in functions for 8 32-bit float values
+
+  v256 __builtin_vpair_f32_abs (v256);
+    VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_add (v256, v256);
+    VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_fma (v256, v256, v256);
+    VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+  v256 __builtin_vpair_f32_max (v256, v256);
+    VPAIR_F32_MAX vpair_smax_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_min (v256, v256);
+    VPAIR_F32_MIN vpair_smin_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_mul (v256, v256);
+    VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_neg (v256);
+    VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_sub (v256, v256);
+    VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+;; vector pair built-in functions for 4 64-bit double values
+
+  v256 __builtin_vpair_f64_abs (v256);
+    VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_add (v256, v256);
+    VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_fma (v256, v256, v256);
+    VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+  v256 __builtin_vpair_f64_max (v256, v256);
+    VPAIR_F64_MAX vpair_smax_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_min (v256, v256);
+    VPAIR_F64_MIN vpair_smin_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_mul (v256, v256);
+    VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_neg (v256);
+    VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_sub (v256, v256);
+    VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f5..bbd899d75620 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 8f06b37171a3..0f466f1f7c29 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27393,6 +27393,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two insns, one per 16-byte half.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* mode of each half.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two insns, one per 16-byte half.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* mode of each half.  */
+			 rtx operands[],		/* dest, src1, src2.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two vector insns, one
+   for each 16-byte half of the pair.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* mode of each half.  */
+		       rtx operands[],			/* dest, 3 sources.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 2a1b5ecfaee2..da51029aa1ba 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15759,6 +15759,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1de..5fc89499795d 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
 	$(srcdir)/config/rs6000/vsx.md \
 	$(srcdir)/config/rs6000/altivec.md \
 	$(srcdir)/config/rs6000/mma.md \
+	$(srcdir)/config/rs6000/vector-pair.md \
 	$(srcdir)/config/rs6000/crypto.md \
 	$(srcdir)/config/rs6000/htm.md \
 	$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 000000000000..2dcac6a31e22
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,329 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers.  Most of the patterns rely on vector pair instructions to load
+;; and possibly store the registers, but split the operation itself after
+;; register allocation into 2 separate vector operations.  The second
+;; scheduler pass can then interleave other instructions between these pairs
+;; of instructions if possible.
+
+(define_c_enum "unspec"
+  [UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY  [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs      "abs")
+			   (minus    "sub")
+			   (mult     "mul")
+			   (neg      "neg")
+			   (plus     "add")
+			   (smin     "smin")
+			   (smax     "smax")
+			   (xor      "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+			    UNSPEC_VPAIR_V8SF])
+
+;; Map VP_* to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF  "V2DF")
+			      (UNSPEC_VPAIR_V8SF  "V4SF")])
+
+;; Map VP_* to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF  "v4df")
+			   (UNSPEC_VPAIR_V8SF  "v8sf")])
+
+;; Map VP_* to a lower case name to identify the vector after the vector pair
+;; has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF  "v2df")
+			   (UNSPEC_VPAIR_V8SF  "v4sf")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_<vp_insn><vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value into a single nabs per half
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_vsx_nabs<vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+			    gen_<vp_insn><vp_vmode>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VP_FP))]
+	   VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(plus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(minus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (unspec:OO
+	    [(neg:OO
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(plus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(minus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (unspec:OO
+	       [(neg:OO
+		 (match_dup 3))]
+	       VP_FP))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index bf941e6b93a1..3ec4b7da6939 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14979,6 +14979,7 @@ instructions, but allow the compiler to schedule those calls.
 * NDS32 Built-in Functions::
 * Nvidia PTX Built-in Functions::
 * Basic PowerPC Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
 * PowerPC AltiVec/VSX Built-in Functions::
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
@@ -21300,6 +21301,45 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
 
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using
+@code{__vector_pair} to hold two vectors.  The load vector pair and
+store vector pair instructions are used to load the values into
+registers and store the values.  The operation itself is split into
+two separate vector instructions.  To use the vector pair built-in
+functions, you need to have MMA support enabled (@option{-mmma}, which
+is enabled by default with @option{-mcpu=power10}).
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
 GCC provides two interfaces for accessing the Hardware Transactional
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
new file mode 100644
index 000000000000..7e1c1347c403
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
@@ -0,0 +1,135 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+   vector pairs with 4 double elements.  */
+
+void
+test_add (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvadddp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvsubdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+	       __vector_pair *x,
+	       __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmuldp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_mul (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmindp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_min (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmaxdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_max (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+	     __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnegdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+	  __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvabsdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+		   __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnabsdp, 1 stxvp.  */
+  __vector_pair ab = __builtin_vpair_f64_abs (*x);
+  *dest = __builtin_vpair_f64_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f64_neg (*z);
+  *dest = __builtin_vpair_f64_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f64_neg (*z);
+  __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}       12 } } */
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxdp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvmindp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}      2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
new file mode 100644
index 000000000000..2facb7270537
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
@@ -0,0 +1,134 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+   vector pairs with 8 float elements.  */
+
+void
+test_add (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvaddsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvsubsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+	       __vector_pair *x,
+	       __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmulsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_mul (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmaxsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_max (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvminsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+	     __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnegsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+	  __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvabssp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+		   __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnabssp, 1 stxvp.  */
+  __vector_pair ab = __builtin_vpair_f32_abs (*x);
+  *dest = __builtin_vpair_f32_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f32_neg (*z);
+  *dest = __builtin_vpair_f32_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f32_neg (*z);
+  __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}       25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}      12 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvminsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}  2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
new file mode 100644
index 000000000000..65bfc44f85d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector builtin code combines multiply, add/subtract, and
+   negate operations to the appropriate fused multiply-add instruction for
+   vector pairs with 4 double elements.  */
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_add (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_sub (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}        4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
new file mode 100644
index 000000000000..b62871be1fdf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector builtin code combines multiply, add/subtract, and
+   negate operations to the appropriate fused multiply-add instruction for
+   vector pairs with 8 float elements.  */
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_add (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_sub (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}        4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}   2 } } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [gcc(refs/users/meissner/heads/work141-vpair)] Add support for floating point vector pair built-in functions.
@ 2023-10-27  4:20 Michael Meissner
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-10-27  4:20 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:1119b8a64a39c275926c2920026374aaf1805974

commit 1119b8a64a39c275926c2920026374aaf1805974
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Fri Oct 27 00:20:02 2023 -0400

    Add support for floating point vector pair built-in functions.
    
    2023-10-26  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (mma_assemble_input_operand): Allow other
            16-byte vectors and not just V16QImode.
            * config/rs6000/rs6000-builtins.def (__builtin_vpair_zero): New
            built-in to clear vector pair.
            (__builtin_vpair_f32_*): Add vector pair built-in functions for float.
            (__builtin_vpair_f64_*): Add vector pair built-in functions for double.
            * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
            declaration.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
            for vector pair built-in functions.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
            * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
            * config/rs6000/vector-pair.md: New file.
            * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
            floating point and general vector pair built-in functions.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vector-pair-1.c: New test.
            * gcc.target/powerpc/vector-pair-2.c: New test.
            * gcc.target/powerpc/vector-pair-3.c: New test.
            * gcc.target/powerpc/vector-pair-4.c: New test.

Diff:
---
 gcc/config/rs6000/predicates.md                  |   2 +-
 gcc/config/rs6000/rs6000-builtins.def            |  63 ++++
 gcc/config/rs6000/rs6000-protos.h                |   5 +
 gcc/config/rs6000/rs6000.cc                      |  74 ++++
 gcc/config/rs6000/rs6000.md                      |   1 +
 gcc/config/rs6000/t-rs6000                       |   1 +
 gcc/config/rs6000/vector-pair.md                 | 433 +++++++++++++++++++++++
 gcc/doc/extend.texi                              |  49 +++
 gcc/testsuite/gcc.target/powerpc/vector-pair-1.c | 174 +++++++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-2.c | 175 +++++++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-3.c |  60 ++++
 gcc/testsuite/gcc.target/powerpc/vector-pair-4.c |  60 ++++
 12 files changed, 1096 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c42..922a77716c41 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -1301,7 +1301,7 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16
 		&& (vsx_register_operand (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index b661a2268432..77cd832d6266 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4137,3 +4137,66 @@
 
   void __builtin_vsx_stxvp_internal (v256 *, v256);
     STXVP_INTERNAL stxvp_internal {mma}
+
+;; General vector pair built-in functions
+
+  v256 __builtin_vpair_zero ();
+    VPAIR_ZERO vpair_zero {mma}
+
+;; vector pair built-in functions for 8 32-bit float values
+
+  v256 __builtin_vpair_f32_abs (v256);
+    VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_add (v256, v256);
+    VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_fma (v256, v256, v256);
+    VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+  v256 __builtin_vpair_f32_max (v256, v256);
+    VPAIR_F32_MAX vpair_smax_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_min (v256, v256);
+    VPAIR_F32_MIN vpair_smin_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_mul (v256, v256);
+    VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_neg (v256);
+    VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_splat (float);
+    VPAIR_F32_SPLAT vpair_splat_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_sub (v256, v256);
+    VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+;; vector pair built-in functions for 4 64-bit double values
+
+  v256 __builtin_vpair_f64_abs (v256);
+    VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_add (v256, v256);
+    VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_fma (v256, v256, v256);
+    VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+  v256 __builtin_vpair_f64_max (v256, v256);
+    VPAIR_F64_MAX vpair_smax_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_min (v256, v256);
+    VPAIR_F64_MIN vpair_smin_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_mul (v256, v256);
+    VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_neg (v256);
+    VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_splat (double);
+    VPAIR_F64_SPLAT vpair_splat_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_sub (v256, v256);
+    VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f5..bbd899d75620 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 8f06b37171a3..0f466f1f7c29 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27393,6 +27393,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two separate vector insns.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* mode of halves.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* mode of halves.  */
+			 rtx operands[],		/* dest, 2 sources.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* mode of halves.  */
+		       rtx operands[],			/* dest, 3 sources.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* insn generator.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 2a1b5ecfaee2..da51029aa1ba 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15759,6 +15759,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1de..5fc89499795d 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
 	$(srcdir)/config/rs6000/vsx.md \
 	$(srcdir)/config/rs6000/altivec.md \
 	$(srcdir)/config/rs6000/mma.md \
+	$(srcdir)/config/rs6000/vector-pair.md \
 	$(srcdir)/config/rs6000/crypto.md \
 	$(srcdir)/config/rs6000/htm.md \
 	$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 000000000000..5eff41685fc6
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,433 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers.  Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+(define_c_enum "unspec"
+  [UNSPEC_VPAIR_ZERO
+   UNSPEC_VPAIR_SPLAT
+   UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY  [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs      "abs")
+			   (minus    "sub")
+			   (mult     "mul")
+			   (neg      "neg")
+			   (plus     "add")
+			   (smin     "smin")
+			   (smax     "smax")
+			   (xor      "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+			    UNSPEC_VPAIR_V8SF])
+
+;; Map VP_* to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF  "V2DF")
+			      (UNSPEC_VPAIR_V8SF  "V4SF")])
+
+;; Map VP_* to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF  "v4df")
+			   (UNSPEC_VPAIR_V8SF  "v8sf")])
+
+;; Map VP_* to a lower case name to identify the vector after the vector pair
+;; has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF  "v2df")
+			   (UNSPEC_VPAIR_V8SF  "v4sf")])
+
+;; Modes of the vector element to splat to vector pair
+(define_mode_iterator VP_SPLAT [DF SF])
+
+;; Modes of the vector to splat to vector pair
+(define_mode_iterator VP_SPLAT_VEC [V2DF V4SF])
+
+;; MAP VP_SPLAT and VP_SPLAT_VEC to the mode of the vector pair in the assemble
+;; operation
+(define_mode_attr vp_splat_pmode [(DF   "v4df")
+				  (V2DF "v4df")
+				  (SF   "v8sf")
+				  (V4SF "v8sf")])
+
+;; MAP VP_SPLAT to the mode of the vector containing the element
+(define_mode_attr VP_SPLAT_VMODE [(DF "V2DF")
+				  (SF "V4SF")])
+
+;; Initialize a vector pair to 0
+(define_insn_and_split "vpair_zero"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(const_int 0)] UNSPEC_VPAIR_ZERO))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 1) (match_dup 3))
+   (set (match_dup 2) (match_dup 3))]
+{
+  rtx op0 = operands[0];
+  unsigned offset_hi = (WORDS_BIG_ENDIAN) ? 0 : 16;
+  unsigned offset_lo = (WORDS_BIG_ENDIAN) ? 16 : 0;
+
+  operands[1] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_hi);
+  operands[2] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_lo);
+  operands[3] = CONST0_RTX (V2DImode);
+}
+  [(set_attr "length" "8")])
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<vp_splat_pmode>"
+  [(use (match_operand:OO 0 "vsx_register_operand"))
+   (use (match_operand:VP_SPLAT 1 "input_operand"))]
+  "TARGET_MMA"
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode element_mode = <MODE>mode;
+  machine_mode vector_mode = <VP_SPLAT_VMODE>mode;
+
+  if (op1 == CONST0_RTX (element_mode))
+    {
+      emit_insn (gen_vpair_zero (op0));
+      DONE;
+    }
+
+  rtx vec = gen_reg_rtx (vector_mode);
+  unsigned num_elements = GET_MODE_NUNITS (vector_mode);
+  rtvec elements = rtvec_alloc (num_elements);
+  for (size_t i = 0; i < num_elements; i++)
+    RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+  rs6000_expand_vector_init (vec, gen_rtx_PARALLEL (vector_mode, elements));
+  emit_insn (gen_vpair_splat_<vp_splat_pmode>_internal (op0, vec));
+  DONE;
+})
+
+;; Inner splat support.  Operand1 is the vector splat created above.  Allow
+;; operand 1 to overlap with the output registers to eliminate one move
+;; instruction.
+(define_insn_and_split "vpair_splat_<vp_splat_pmode>_internal"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(match_operand:VP_SPLAT_VEC 1 "vsx_register_operand" "0,wa")]
+	 UNSPEC_VPAIR_SPLAT))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op0_vector0 = simplify_gen_subreg (<MODE>mode, op0, OOmode, 0);
+  rtx op0_vector1 = simplify_gen_subreg (<MODE>mode, op0, OOmode, 16);
+
+  /* Check if the input is one of the output registers.  */
+  if (rtx_equal_p (op0_vector0, op1))
+    emit_move_insn (op0_vector1, op1);
+
+  else if (rtx_equal_p (op0_vector1, op1))
+    emit_move_insn (op0_vector0, op1);
+
+  else
+    {
+      emit_move_insn (op0_vector0, op1);
+      emit_move_insn (op0_vector1, op1);
+    }
+
+  DONE;
+}
+  [(set_attr "length" "*,8")
+   (set_attr "type" "vecmove")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_<vp_insn><vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -abs (x) into one VSX nabs insn per 16-byte half.
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_vsx_nabs<vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+			    gen_<vp_insn><vp_vmode>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VP_FP))]
+	   VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(plus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(minus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (unspec:OO
+	    [(neg:OO
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(plus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(minus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (unspec:OO
+	       [(neg:OO
+		 (match_dup 3))]
+	       VP_FP))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index bf941e6b93a1..8c3cf6b15d35 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14979,6 +14979,7 @@ instructions, but allow the compiler to schedule those calls.
 * NDS32 Built-in Functions::
 * Nvidia PTX Built-in Functions::
 * Basic PowerPC Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
 * PowerPC AltiVec/VSX Built-in Functions::
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
@@ -21300,6 +21301,54 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
 
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using
+@code{__vector_pair} to hold two vectors.  The load vector pair and
+store vector pair instructions are used to load the values into
+registers and store the values.  The operation itself is split into
+two separate vector instructions.  To use the vector pair built-in
+functions, you need to have MMA support enabled (@option{-mmma}, which
+is enabled by default with @option{-mcpu=power10}).
+
+The following built-in functions are independent of the type of the
+underlying vector:
+
+@smallexample
+__vector_pair __builtin_vpair_zero ();
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_splat (float);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_splat (double);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
 GCC provides two interfaces for accessing the Hardware Transactional
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
new file mode 100644
index 000000000000..66f986b181b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
@@ -0,0 +1,174 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+   vector pairs with 4 double elements.  */
+
+void
+test_add (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvadddp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvsubdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+	       __vector_pair *x,
+	       __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmuldp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_mul (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmindp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_min (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmaxdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_max (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+	     __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnegdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+	  __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvabsdp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+		   __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnabsdp, 1 stxvp.  */
+  __vector_pair ab = __builtin_vpair_f64_abs (*x);
+  *dest = __builtin_vpair_f64_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f64_neg (*z);
+  *dest = __builtin_vpair_f64_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f64_neg (*z);
+  __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_splat_arg (__vector_pair *dest, double x)
+{
+  /* 1 xxpermdi, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_splat (x);
+}
+
+void
+test_splat_mem (__vector_pair *dest, double *p)
+{
+  /* 1 lxvdsx, 1 xxlor, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_splat (*p);
+}
+
+void
+test_splat_const_0 (__vector_pair *dest)
+{
+  /* 2 xxspltib, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_splat (0.0);
+}
+
+void
+test_splat_const_1 (__vector_pair *dest)
+{
+  /* 1 xxspltidp, 1 xxlor, 1 stxvp.  */
+  *dest = __builtin_vpair_f64_splat (1.0);
+}
+
+void
+test_zero (__vector_pair *dest)
+{
+  /* 2 xxspltib, 1 stxvp.  */
+  *dest = __builtin_vpair_zero ();
+}
+
+/* { dg-final { scan-assembler-times {\mlxvdsx\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M}       25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}      17 } } */
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmindp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M}    1 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxspltidp\M}   1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
new file mode 100644
index 000000000000..9b9bfc4a6f76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
@@ -0,0 +1,175 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+   vector pairs with 8 float elements.  */
+
+void
+test_add (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvaddsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvsubsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+	       __vector_pair *x,
+	       __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmulsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_mul (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvmaxsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_max (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y)
+{
+  /* 2 lxvp, 2 xvminsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+	     __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnegsp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+	  __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvabssp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+		   __vector_pair *x)
+{
+  /* 1 lxvp, 2 xvnabssp, 1 stxvp.  */
+  __vector_pair ab = __builtin_vpair_f32_abs (*x);
+  *dest = __builtin_vpair_f32_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f32_neg (*z);
+  *dest = __builtin_vpair_f32_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair n = __builtin_vpair_f32_neg (*z);
+  __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_splat_arg (__vector_pair *dest, float x)
+{
+  /* 1 xscvdpspn, 1 xxspltw, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_splat (x);
+}
+
+void
+test_splat_mem (__vector_pair *dest, float *p)
+{
+  /* 1 lxvwsx, 1 xxlor, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_splat (*p);
+}
+
+void
+test_splat_const_0 (__vector_pair *dest)
+{
+  /* 2 xxspltib, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_splat (0.0f);
+}
+
+void
+test_splat_const_1 (__vector_pair *dest)
+{
+  /* 1 xxspltiw, 1 xxlor, 1 stxvp.  */
+  *dest = __builtin_vpair_f32_splat (1.0f);
+}
+
+void
+test_zero (__vector_pair *dest)
+{
+  /* 2 xxspltib, 1 stxvp.  */
+  *dest = __builtin_vpair_zero ();
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}      25 } } */
+/* { dg-final { scan-assembler-times {\mlxvwsx\M}     1 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}     17 } } */
+/* { dg-final { scan-assembler-times {\mxscvdpspn\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxsp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvminsp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxspltiw\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mxxspltw\M}    1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
new file mode 100644
index 000000000000..65bfc44f85d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector builtin code combines multiply, add/subtract, and
+   negate operations to the appropriate fused multiply-add instruction for
+   vector pairs with 4 double elements.  */
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_add (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_sub (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}        4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
new file mode 100644
index 000000000000..b62871be1fdf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector builtin code combines multiply, add/subtract, and
+   negate operations to the appropriate fused multiply-add instruction for
+   vector pairs with 8 float elements.  */
+
+void
+test_fma (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+	  __vector_pair *x,
+	  __vector_pair *y,
+	  __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_add (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+	   __vector_pair *x,
+	   __vector_pair *y,
+	   __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp.  */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_sub (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}        12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}        4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}   2 } } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [gcc(refs/users/meissner/heads/work141-vpair)] Add support for floating point vector pair built-in functions.
@ 2023-10-25 19:02 Michael Meissner
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-10-25 19:02 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:62eebf39fb5ff43a3cbada96bdfbbb3569a8e4ff

commit 62eebf39fb5ff43a3cbada96bdfbbb3569a8e4ff
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Wed Oct 25 15:01:50 2023 -0400

    Add support for floating point vector pair built-in functions.
    
    2023-10-25  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (mma_assemble_input_operand): Allow other
            16-byte vectors and not just V16QImode.
            * config/rs6000/rs6000-builtins.def (__builtin_vpair_zero): New
            built-in to clear vector pair.
            (__builtin_vpair_f32_*): Add vector pair built-in functions for float.
            (__builtin_vpair_f64_*): Add vector pair built-in functions for double.
            * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
            declaration.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
            for vector pair built-in functions.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
            * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
            * config/rs6000/vector-pair.md: New file.
            * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
            vector pair built-in functions.

Diff:
---
 gcc/config/rs6000/predicates.md       |   2 +-
 gcc/config/rs6000/rs6000-builtins.def |  71 ++++++
 gcc/config/rs6000/rs6000-protos.h     |   5 +
 gcc/config/rs6000/rs6000.cc           |  74 ++++++
 gcc/config/rs6000/rs6000.md           |   1 +
 gcc/config/rs6000/t-rs6000            |   1 +
 gcc/config/rs6000/vector-pair.md      | 456 ++++++++++++++++++++++++++++++++++
 gcc/doc/extend.texi                   |  53 ++++
 8 files changed, 662 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c42..922a77716c41 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -1301,7 +1301,7 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16
 		&& (vsx_register_operand (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index b661a2268432..6e2865a5a628 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4137,3 +4137,74 @@
 
   void __builtin_vsx_stxvp_internal (v256 *, v256);
     STXVP_INTERNAL stxvp_internal {mma}
+
+;; Vector pair built-in functions
+
+  v256 __builtin_vpair_zero ();
+    VPAIR_ZERO vpair_zero {mma}
+
+  vf __builtin_vpair_f32_extract_vector (v256, const int<1>);
+    VPAIR_F32_EXTRACT_VECTOR vpair_extract_vector_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_assemble (vf, vf);
+    VPAIR_F32_ASSEMBLE vpair_assemble_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_splat (float);
+    VPAIR_F32_SPLAT vpair_splat_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_abs (v256);
+    VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_add (v256, v256);
+    VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_fma (v256, v256, v256);
+    VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+  v256 __builtin_vpair_f32_mul (v256, v256);
+    VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_neg (v256);
+    VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_smax (v256, v256);
+    VPAIR_F32_SMAX vpair_smax_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_smin (v256, v256);
+    VPAIR_F32_SMIN vpair_smin_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_sub (v256, v256);
+    VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+  vd __builtin_vpair_f64_extract_vector (v256, const int<1>);
+    VPAIR_F64_EXTRACT_VECTOR vpair_extract_vector_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_assemble (vd, vd);
+    VPAIR_F64_ASSEMBLE vpair_assemble_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_splat (double);
+    VPAIR_F64_SPLAT vpair_splat_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_abs (v256);
+    VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_add (v256, v256);
+    VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_fma (v256, v256, v256);
+    VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+  v256 __builtin_vpair_f64_mul (v256, v256);
+    VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_neg (v256);
+    VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_smax (v256, v256);
+    VPAIR_F64_SMAX vpair_smax_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_smin (v256, v256);
+    VPAIR_F64_SMIN vpair_smin_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_sub (v256, v256);
+    VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f5..bbd899d75620 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 8f06b37171a3..0f466f1f7c29 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27393,6 +27393,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two separate vector insns.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src1, src2.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns, one for the subreg at byte offset 0 and one at byte offset 16.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* vector mode.  */
+		       rtx operands[],			/* dest, src1-src3.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 2a1b5ecfaee2..da51029aa1ba 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15759,6 +15759,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1de..5fc89499795d 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
 	$(srcdir)/config/rs6000/vsx.md \
 	$(srcdir)/config/rs6000/altivec.md \
 	$(srcdir)/config/rs6000/mma.md \
+	$(srcdir)/config/rs6000/vector-pair.md \
 	$(srcdir)/config/rs6000/crypto.md \
 	$(srcdir)/config/rs6000/htm.md \
 	$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 000000000000..4301a86e4040
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,456 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers.  Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+(define_c_enum "unspec"
+  [UNSPEC_VPAIR_ZERO
+   UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY  [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs      "abs")
+			   (minus    "sub")
+			   (mult     "mul")
+			   (neg      "neg")
+			   (plus     "add")
+			   (smin     "smin")
+			   (smax     "smax")
+			   (xor      "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+			    UNSPEC_VPAIR_V8SF])
+
+(define_int_iterator VP_ALL [UNSPEC_VPAIR_V4DF
+			     UNSPEC_VPAIR_V8SF])
+
+;; Map VP_{INT,FP,ALL} to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF  "V2DF")
+			      (UNSPEC_VPAIR_V8SF  "V4SF")])
+
+;; Map VP_{INT,FP,ALL} to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF  "v4df")
+			   (UNSPEC_VPAIR_V8SF  "v8sf")])
+
+;; Map VP_{INT,FP,ALL} to a lower case name to identify the vector after the
+;; vector pair has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF  "v2df")
+			   (UNSPEC_VPAIR_V8SF  "v4sf")])
+
+;; Modes of the vector element to splat to vector pair
+(define_mode_iterator VP_SPLAT [DF SF])
+
+;; MAP VP_SPLAT to the mode of the vector pair in the assemble operation
+(define_mode_attr vp_splat_pmode [(DF "v4df")
+				  (SF "v8sf")])
+
+;; MAP VP_SPLAT to the mode of the vector containing the element
+(define_mode_attr VP_SPLAT_VMODE [(DF "V2DF")
+				  (SF "V4SF")])
+
+;; Initialize a vector pair to 0
+(define_insn_and_split "vpair_zero"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(const_int 0)] UNSPEC_VPAIR_ZERO))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 1) (match_dup 3))
+   (set (match_dup 2) (match_dup 3))]
+{
+  rtx op0 = operands[0];
+  unsigned offset_hi = (WORDS_BIG_ENDIAN) ? 0 : 16;
+  unsigned offset_lo = (WORDS_BIG_ENDIAN) ? 16 : 0;
+
+  operands[1] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_hi);
+  operands[2] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_lo);
+  operands[3] = CONST0_RTX (V2DImode);
+}
+  [(set_attr "length" "8")])
+
+;; Assemble a vector pair from two vectors.  Unlike
+;; __builtin_mma_assemble_pair, this function produces a vector pair output
+;; directly and it takes all of the vector types.
+;;
+;; We cannot update the two output registers atomically, so mark the output as
+;; an early clobber so we don't accidentally clobber the input operands.
+
+(define_insn_and_split "vpair_assemble_<vp_pmode>"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=&wa")
+	(unspec:OO
+	 [(match_operand:<VP_VEC_MODE> 1 "mma_assemble_input_operand" "mwa")
+	  (match_operand:<VP_VEC_MODE> 2 "mma_assemble_input_operand" "mwa")]
+	 VP_ALL))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx src = gen_rtx_UNSPEC (OOmode,
+			    gen_rtvec (2, operands[1], operands[2]),
+			    UNSPEC_VSX_ASSEMBLE);
+  rs6000_split_multireg_move (operands[0], src);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Extract one of the two 128-bit vectors from a vector pair.
+(define_insn_and_split "vpair_extract_vector_<vp_pmode>"
+  [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa")
+	(unspec:<VP_VEC_MODE>
+	 [(match_operand:OO 1 "vsx_register_operand" "wa")
+	  (match_operand 2 "const_0_to_1_operand" "n")]
+	 VP_ALL))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 3))]
+{
+  machine_mode vmode = <VP_VEC_MODE>mode;
+  unsigned reg_num = UINTVAL (operands[2]);
+  if (!WORDS_BIG_ENDIAN)
+    reg_num = 1 - reg_num;
+  /* Operand 1 is the OOmode pair; operand 0 is only the 128-bit result.  */
+  operands[3] = simplify_gen_subreg (vmode, operands[1], OOmode, reg_num * 16);
+})
+
+;; Optimize extracting an 128-bit vector from a vector pair in memory.
+(define_insn_and_split "*vpair_extract_vector_<vp_pmode>_mem"
+  [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa")
+	(unspec:<VP_VEC_MODE>
+	 [(match_operand:OO 1 "memory_operand" "o")
+	  (match_operand 2 "const_0_to_1_operand" "n")]
+	 VP_ALL))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 3))]
+{
+  operands[3] = adjust_address (operands[1], <VP_VEC_MODE>mode,
+				16 * INTVAL (operands[2]));
+}
+  [(set_attr "type" "vecload")])
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<vp_splat_pmode>"
+  [(use (match_operand:OO 0 "vsx_register_operand"))
+   (use (match_operand:VP_SPLAT 1 "input_operand"))]
+  "TARGET_MMA"
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode element_mode = <MODE>mode;
+  machine_mode vector_mode = <VP_SPLAT_VMODE>mode;
+
+  if (op1 == CONST0_RTX (element_mode))
+    {
+      emit_insn (gen_vpair_zero (op0));
+      DONE;
+    }
+
+  rtx vec = gen_reg_rtx (vector_mode);
+  unsigned num_elements = GET_MODE_NUNITS (vector_mode);
+  rtvec elements = rtvec_alloc (num_elements);
+  for (unsigned i = 0; i < num_elements; i++)
+    RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+  rs6000_expand_vector_init (vec, gen_rtx_PARALLEL (vector_mode, elements));
+  emit_insn (gen_vpair_assemble_<vp_splat_pmode> (op0, vec, vec));
+  DONE;
+})
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_<vp_insn><vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_vsx_nabs<vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+			    gen_<vp_insn><vp_vmode>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VP_FP))]
+	   VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(plus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(minus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (unspec:OO
+	    [(neg:OO
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(plus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(minus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (unspec:OO
+	       [(neg:OO
+		 (match_dup 3))]
+	       VP_FP))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index bf941e6b93a1..c1d6293deb96 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14979,6 +14979,7 @@ instructions, but allow the compiler to schedule those calls.
 * NDS32 Built-in Functions::
 * Nvidia PTX Built-in Functions::
 * Basic PowerPC Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
 * PowerPC AltiVec/VSX Built-in Functions::
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
@@ -21300,6 +21301,58 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
 
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using
+@code{__vector_pair} to hold two vectors.  The load vector pair and
+store vector pair instructions are used to load the values into
+registers and store the values.  The operation itself is split into
+two separate vector instructions.  To use the vector pair built-in
+functions, you need to have MMA support enabled (@option{-mmma}, which
+is enabled by default with @option{-mcpu=power10}).
+
+The following built-in functions are independent of the type of the
+underlying vector:
+
+@smallexample
+__vector_pair __builtin_vpair_zero ();
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+vector float __builtin_vpair_f32_extract_vector (__vector_pair, int);
+__vector_pair __builtin_vpair_f32_assemble (vector float, vector float);
+__vector_pair __builtin_vpair_f32_splat (float);
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_smax (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_smin (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+vector double __builtin_vpair_f64_extract_vector (__vector_pair, int);
+__vector_pair __builtin_vpair_f64_assemble (vector double, vector double);
+__vector_pair __builtin_vpair_f64_splat (double);
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_smax (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_smin (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
 GCC provides two interfaces for accessing the Hardware Transactional

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [gcc(refs/users/meissner/heads/work141-vpair)] Add support for floating point vector pair built-in functions.
@ 2023-10-25 17:52 Michael Meissner
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-10-25 17:52 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:9d8afc93af52bf423f0ff303c525214cf5f8a4b7

commit 9d8afc93af52bf423f0ff303c525214cf5f8a4b7
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Wed Oct 25 13:51:33 2023 -0400

    Add support for floating point vector pair built-in functions.
    
    2023-10-25  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (mma_assemble_input_operand): Allow other
            16-byte vectors and not just V16QImode.
            * config/rs6000/rs6000-builtins.def (__builtin_vpair_zero): New
            built-in to clear vector pair.
            (__builtin_vpair_f32_*): Add vector pair built-in functions for float.
            (__builtin_vpair_f64_*): Add vector pair built-in functions for double.
            * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
            declaration.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
            for vector pair built-in functions.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
            * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
            * config/rs6000/vector-pair.md: New file.
            * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
            vector pair built-in functions.

Diff:
---
 gcc/config/rs6000/predicates.md       |   2 +-
 gcc/config/rs6000/rs6000-builtins.def |  71 ++++++
 gcc/config/rs6000/rs6000-protos.h     |   5 +
 gcc/config/rs6000/rs6000.cc           |  74 ++++++
 gcc/config/rs6000/rs6000.md           |   1 +
 gcc/config/rs6000/t-rs6000            |   1 +
 gcc/config/rs6000/vector-pair.md      | 456 ++++++++++++++++++++++++++++++++++
 gcc/doc/extend.texi                   |  53 ++++
 8 files changed, 662 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c42..922a77716c41 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -1301,7 +1301,7 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16
 		&& (vsx_register_operand (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index b661a2268432..d4674544c07e 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4137,3 +4137,74 @@
 
   void __builtin_vsx_stxvp_internal (v256 *, v256);
     STXVP_INTERNAL stxvp_internal {mma}
+
+;; Vector pair built-in functions
+
+  v256 __builtin_vpair_zero ();
+    VPAIR_ZERO vpair_zero {mma}
+
+  vf __builtin_vpair_f32_extract_vector (v256, const int<1>);
+    VPAIR_F32_GET_VECTOR vpair_extract_vector_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_assemble (vf, vf);
+    VPAIR_F32_ASSEMBLE vpair_assemble_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_splat (float);
+    VPAIR_F32_SPLAT vpair_splat_v8sf {mma,pair}
+
+  v256 __builtin_vpair_f32_abs (v256);
+    VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_add (v256, v256);
+    VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_fma (v256, v256, v256);
+    VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+  v256 __builtin_vpair_f32_mul (v256, v256);
+    VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_neg (v256);
+    VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+  v256 __builtin_vpair_f32_smax (v256, v256);
+    VPAIR_F32_SMAX vpair_smax_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_smin (v256, v256);
+    VPAIR_F32_SMIN vpair_smin_v8sf3 {mma,pair}
+
+  v256 __builtin_vpair_f32_sub (v256, v256);
+    VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+  vd __builtin_vpair_f64_extract_vector (v256, const int<1>);
+    VPAIR_F64_GET_VECTOR vpair_extract_vector_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_assemble (vd, vd);
+    VPAIR_F64_ASSEMBLE vpair_assemble_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_splat (double);
+    VPAIR_F64_SPLAT vpair_splat_v4df {mma,pair}
+
+  v256 __builtin_vpair_f64_abs (v256);
+    VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_add (v256, v256);
+    VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_fma (v256, v256, v256);
+    VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+  v256 __builtin_vpair_f64_mul (v256, v256);
+    VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_neg (v256);
+    VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+  v256 __builtin_vpair_f64_smax (v256, v256);
+    VPAIR_F64_SMAX vpair_smax_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_smin (v256, v256);
+    VPAIR_F64_SMIN vpair_smin_v4df3 {mma,pair}
+
+  v256 __builtin_vpair_f64_sub (v256, v256);
+    VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f5..bbd899d75620 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 8f06b37171a3..0f466f1f7c29 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27393,6 +27393,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two separate vector insns.  FUNC
+   generates the insn for one MODE piece of the destination and source.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* create insn.  */
+{
+  rtx dest_pair = operands[0];
+  rtx src_pair = operands[1];
+  machine_mode pair_mode = GET_MODE (dest_pair);
+
+  /* Handle the pieces at byte offset 0 first, then those at offset 16.  */
+  for (unsigned offset = 0; offset < 32; offset += 16)
+    {
+      rtx dest = simplify_gen_subreg (mode, dest_pair, pair_mode, offset);
+      rtx src = simplify_gen_subreg (mode, src_pair, pair_mode, offset);
+      emit_insn (func (dest, src));
+    }
+}
+
+/* Split a binary vector pair insn into two separate vector insns.  FUNC
+   generates the insn for one MODE piece of the destination and sources.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* vector mode.  */
+			  rtx operands[],		/* dest, 2 srcs.  */
+			  rtx (*func)(rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx dest_pair = operands[0];
+  rtx src1_pair = operands[1];
+  rtx src2_pair = operands[2];
+  machine_mode pair_mode = GET_MODE (dest_pair);
+
+  /* Handle the pieces at byte offset 0 first, then those at offset 16.  */
+  for (unsigned offset = 0; offset < 32; offset += 16)
+    {
+      rtx dest = simplify_gen_subreg (mode, dest_pair, pair_mode, offset);
+      rtx src1 = simplify_gen_subreg (mode, src1_pair, pair_mode, offset);
+      rtx src2 = simplify_gen_subreg (mode, src2_pair, pair_mode, offset);
+
+      emit_insn (func (dest, src1, src2));
+    }
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns.  OPERANDS holds the destination vector pair followed by the three
+   source vector pairs; FUNC generates the insn that operates on one MODE
+   piece of each pair.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* vector mode.  */
+		       rtx operands[],			/* dest, 3 srcs.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx dest_pair = operands[0];
+  rtx src1_pair = operands[1];
+  rtx src2_pair = operands[2];
+  rtx src3_pair = operands[3];
+  machine_mode pair_mode = GET_MODE (dest_pair);
+
+  /* Emit the insn for the pieces at byte offset 0 first, then the insn for
+     the pieces at byte offset 16.  */
+  for (unsigned offset = 0; offset < 32; offset += 16)
+    {
+      rtx dest = simplify_gen_subreg (mode, dest_pair, pair_mode, offset);
+      rtx src1 = simplify_gen_subreg (mode, src1_pair, pair_mode, offset);
+      rtx src2 = simplify_gen_subreg (mode, src2_pair, pair_mode, offset);
+      rtx src3 = simplify_gen_subreg (mode, src3_pair, pair_mode, offset);
+
+      emit_insn (func (dest, src1, src2, src3));
+    }
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 2a1b5ecfaee2..da51029aa1ba 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15759,6 +15759,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1de..5fc89499795d 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
 	$(srcdir)/config/rs6000/vsx.md \
 	$(srcdir)/config/rs6000/altivec.md \
 	$(srcdir)/config/rs6000/mma.md \
+	$(srcdir)/config/rs6000/vector-pair.md \
 	$(srcdir)/config/rs6000/crypto.md \
 	$(srcdir)/config/rs6000/htm.md \
 	$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 000000000000..4301a86e4040
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,456 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers.  Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+(define_c_enum "unspec"
+  [UNSPEC_VPAIR_ZERO
+   UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY  [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs      "abs")
+			   (minus    "sub")
+			   (mult     "mul")
+			   (neg      "neg")
+			   (plus     "add")
+			   (smin     "smin")
+			   (smax     "smax")
+			   (xor      "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+			    UNSPEC_VPAIR_V8SF])
+
+(define_int_iterator VP_ALL [UNSPEC_VPAIR_V4DF
+			     UNSPEC_VPAIR_V8SF])
+
+;; Map VP_{INT,FP,ALL} to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF  "V2DF")
+			      (UNSPEC_VPAIR_V8SF  "V4SF")])
+
+;; Map VP_{INT,FP,ALL} to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF  "v4df")
+			   (UNSPEC_VPAIR_V8SF  "v8sf")])
+
+;; Map VP_{INT,FP,ALL} to a lower case name to identify the vector after the
+;; vector pair has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF  "v2df")
+			   (UNSPEC_VPAIR_V8SF  "v4sf")])
+
+;; Modes of the vector element to splat to a vector pair
+(define_mode_iterator VP_SPLAT [DF SF])
+
+;; MAP VP_SPLAT to the mode of the vector pair in the assemble operation
+(define_mode_attr vp_splat_pmode [(DF "v4df")
+				  (SF "v8sf")])
+
+;; MAP VP_SPLAT to the mode of the vector containing the element
+(define_mode_attr VP_SPLAT_VMODE [(DF "V2DF")
+				  (SF "V4SF")])
+
+;; Initialize a vector pair to 0
+(define_insn_and_split "vpair_zero"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(const_int 0)] UNSPEC_VPAIR_ZERO))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 1) (match_dup 3))
+   (set (match_dup 2) (match_dup 3))]
+{
+  rtx op0 = operands[0];
+  unsigned offset_hi = (WORDS_BIG_ENDIAN) ? 0 : 16;
+  unsigned offset_lo = (WORDS_BIG_ENDIAN) ? 16 : 0;
+
+  operands[1] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_hi);
+  operands[2] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_lo);
+  operands[3] = CONST0_RTX (V2DImode);
+}
+  [(set_attr "length" "8")])
+
+;; Assemble a vector pair from two vectors.  Unlike
+;; __builtin_mma_assemble_pair, this function produces a vector pair output
+;; directly and it takes all of the vector types.
+;;
+;; We cannot update the two output registers atomically, so mark the output as
+;; an early clobber so we don't accidentally clobber the input operands.
+
+(define_insn_and_split "vpair_assemble_<vp_pmode>"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=&wa")
+	(unspec:OO
+	 [(match_operand:<VP_VEC_MODE> 1 "mma_assemble_input_operand" "mwa")
+	  (match_operand:<VP_VEC_MODE> 2 "mma_assemble_input_operand" "mwa")]
+	 VP_ALL))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx src = gen_rtx_UNSPEC (OOmode,
+			    gen_rtvec (2, operands[1], operands[2]),
+			    UNSPEC_VSX_ASSEMBLE);
+  rs6000_split_multireg_move (operands[0], src);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Extract one of the two 128-bit vectors from a vector pair in registers.
+(define_insn_and_split "vpair_extract_vector_<vp_pmode>"
+  [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa")
+	(unspec:<VP_VEC_MODE>
+	 [(match_operand:OO 1 "vsx_register_operand" "wa")
+	  (match_operand 2 "const_0_to_1_operand" "n")]
+	 VP_ALL))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 3))]
+{
+  machine_mode vmode = <VP_VEC_MODE>mode;
+  unsigned reg_num = UINTVAL (operands[2]);
+  if (!WORDS_BIG_ENDIAN)
+    reg_num = 1 - reg_num;
+  /* Select the requested 16-byte piece of the source pair (operand 1).  */
+  operands[3] = simplify_gen_subreg (vmode, operands[1], OOmode, reg_num * 16);
+})
+
+;; Optimize extracting a 128-bit vector from a vector pair in memory.
+(define_insn_and_split "*vpair_extract_vector_<vp_pmode>_mem"
+  [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa")
+	(unspec:<VP_VEC_MODE>
+	 [(match_operand:OO 1 "memory_operand" "o")
+	  (match_operand 2 "const_0_to_1_operand" "n")]
+	 VP_ALL))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 3))]
+{
+  operands[3] = adjust_address (operands[1], <VP_VEC_MODE>mode,
+				16 * INTVAL (operands[2]));
+}
+  [(set_attr "type" "vecload")])
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<vp_splat_pmode>"
+  [(use (match_operand:OO 0 "vsx_register_operand"))
+   (use (match_operand:VP_SPLAT 1 "input_operand"))]
+  "TARGET_MMA"
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode element_mode = <MODE>mode;
+  machine_mode vector_mode = <VP_SPLAT_VMODE>mode;
+
+  if (op1 == CONST0_RTX (element_mode))
+    {
+      emit_insn (gen_vpair_zero (op0));
+      DONE;
+    }
+
+  rtx vec = gen_reg_rtx (vector_mode);
+  unsigned num_elements = GET_MODE_NUNITS (vector_mode);
+  rtvec elements = rtvec_alloc (num_elements);
+  for (unsigned i = 0; i < num_elements; i++)
+    RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+  rs6000_expand_vector_init (vec, gen_rtx_PARALLEL (vector_mode, elements));
+  emit_insn (gen_vpair_assemble_<vp_splat_pmode> (op0, vec, vec));
+  DONE;
+})
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_<vp_insn><vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value into one nabs per vector.
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+			   gen_vsx_nabs<vp_vmode>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VP_FP_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+			    gen_<vp_insn><vp_vmode>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_fms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfma<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VP_FP))]
+	   VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+			 gen_nfms<vp_vmode>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(plus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(minus:OO
+	   (unspec:OO
+	    [(mult:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	    VP_FP)
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(fma:OO
+	   (match_dup 1)
+	   (match_dup 2)
+	   (unspec:OO
+	    [(neg:OO
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(plus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (match_dup 3))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(minus:OO
+	      (unspec:OO
+	       [(mult:OO
+		 (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+		 (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+	       VP_FP)
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VP_FP))]
+	 VP_FP))]
+  "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_dup 1)
+	      (match_dup 2)
+	      (unspec:OO
+	       [(neg:OO
+		 (match_dup 3))]
+	       VP_FP))]
+	    VP_FP))]
+	 VP_FP))]
+{
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index bf941e6b93a1..c1d6293deb96 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14979,6 +14979,7 @@ instructions, but allow the compiler to schedule those calls.
 * NDS32 Built-in Functions::
 * Nvidia PTX Built-in Functions::
 * Basic PowerPC Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
 * PowerPC AltiVec/VSX Built-in Functions::
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
@@ -21300,6 +21301,58 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
 
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using
+@code{__vector_pair} to hold two vectors.  The load vector pair and
+store vector pair instructions are used to load the values into
+registers and store the values.  The operation itself is split into
+two separate vector instructions.  To use the vector pair built-in
+functions, you need to have MMA support enabled (@option{-mmma}, which
+is enabled by default with @option{-mcpu=power10}).
+
+The following built-in functions are independent of the type of the
+underlying vector:
+
+@smallexample
+__vector_pair __builtin_vpair_zero ();
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+vector float __builtin_vpair_f32_extract_vector (__vector_pair, int);
+__vector_pair __builtin_vpair_f32_assemble (vector float, vector float);
+__vector_pair __builtin_vpair_f32_splat (float);
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_smax (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_smin (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+vector double __builtin_vpair_f64_extract_vector (__vector_pair, int);
+__vector_pair __builtin_vpair_f64_assemble (vector double, vector double);
+__vector_pair __builtin_vpair_f64_splat (double);
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_smax (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_smin (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
 GCC provides two interfaces for accessing the Hardware Transactional

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-11-02  4:18 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-26  6:45 [gcc(refs/users/meissner/heads/work141-vpair)] Add support for floating point vector pair built-in functions Michael Meissner
  -- strict thread matches above, loose matches on Subject: below --
2023-11-02  4:18 Michael Meissner
2023-11-01 17:45 Michael Meissner
2023-10-27  4:20 Michael Meissner
2023-10-25 19:02 Michael Meissner
2023-10-25 17:52 Michael Meissner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).