* [gcc(refs/users/meissner/heads/work129-vpair)] Add fp built-in overload support.
From: Michael Meissner @ 2023-08-01  3:36 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ff9141eac886e540968dc6a000d5820481bfcf6f

commit ff9141eac886e540968dc6a000d5820481bfcf6f
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Mon Jul 31 23:35:47 2023 -0400

    Add fp built-in overload support.
    
    2023-07-31  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/rs6000-builtin.cc (fold_builtin_overload_fp): New helper
            function for floating point overloaded built-in functions.
            (rs6000_gimple_fold_builtin): Add support for floating point overloaded
            built-in functions that map directly to gimple.
            * config/rs6000/rs6000-builtins.def (__builtin_*_f32_scalar): New
            built-in functions for overloaded floating point support.
            (__builtin_*_f32_vector): Likewise.
            (__builtin_*_f32_vpair): Likewise.
            (__builtin_*_f64_scalar): Likewise.
            (__builtin_*_f64_vector): Likewise.
            (__builtin_*_f64_vpair): Likewise.
            * config/rs6000/rs6000-overload.def (__builtin_*_f32): Likewise.
            (__builtin_*_f64): Likewise.
            * config/rs6000/rs6000.md (vector-pair.md): Include.
            * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
            * config/rs6000/vector-pair.md: New file.
            * doc/extend.texi (PowerPC Built-in functions): Document the floating
            point overloaded built-in functions.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/fp-overload-f32-scalar.c: New test.
            * gcc.target/powerpc/fp-overload-f32-vector.c: Likewise.
            * gcc.target/powerpc/fp-overload-f32-vpair.c: Likewise.
            * gcc.target/powerpc/fp-overload-f64-scalar.c: Likewise.
            * gcc.target/powerpc/fp-overload-f64-vector.c: Likewise.
            * gcc.target/powerpc/fp-overload-f64-vpair.c: Likewise.
            * gcc.target/powerpc/fp-overload.h: New include file for fp overloaded
            built-in functions.
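
    As a usage sketch (illustrative only, assuming the definitions added
    by this patch), the same built-in name is resolved according to the
    argument types:

        float
        add_scalar (float a, float b)
        {
          return __builtin_add_f32 (a, b);    /* folded to a + b.  */
        }

        vector float
        add_vector (vector float a, vector float b)
        {
          return __builtin_add_f32 (a, b);    /* per-element a + b.  */
        }

    With __vector_pair arguments the same name instead expands through
    the new vpair_addv8sf3 pattern in vector-pair.md.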

Diff:
---
 gcc/config/rs6000/rs6000-builtin.cc                | 105 ++++++
 gcc/config/rs6000/rs6000-builtins.def              | 173 +++++++++
 gcc/config/rs6000/rs6000-overload.def              | 178 +++++++++
 gcc/config/rs6000/rs6000.md                        |   1 +
 gcc/config/rs6000/t-rs6000                         |   1 +
 gcc/config/rs6000/vector-pair.md                   | 417 +++++++++++++++++++++
 gcc/doc/extend.texi                                | 107 ++++++
 .../gcc.target/powerpc/fp-overload-f32-scalar.c    |  21 ++
 .../gcc.target/powerpc/fp-overload-f32-vector.c    |  23 ++
 .../gcc.target/powerpc/fp-overload-f32-vpair.c     |  23 ++
 .../gcc.target/powerpc/fp-overload-f64-scalar.c    |  22 ++
 .../gcc.target/powerpc/fp-overload-f64-vector.c    |  22 ++
 .../gcc.target/powerpc/fp-overload-f64-vpair.c     |  22 ++
 gcc/testsuite/gcc.target/powerpc/fp-overload.h     |  85 +++++
 14 files changed, 1200 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc
index 82cc3a19447..e32d9175a0c 100644
--- a/gcc/config/rs6000/rs6000-builtin.cc
+++ b/gcc/config/rs6000/rs6000-builtin.cc
@@ -1261,6 +1261,49 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi,
   return true;
 }
 
+/* Helper function to fold the overloaded fp functions for the scalar and
+   vector types that support the operation directly.  */
+
+static void
+fold_builtin_overload_fp (gimple_stmt_iterator *gsi,
+			  gimple *stmt,
+			  enum tree_code code,
+			  int nargs)
+{
+  location_t loc = gimple_location (stmt);
+  tree lhs = gimple_call_lhs (stmt);
+  tree t;
+
+  if (nargs == 1)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      t = build1 (code, TREE_TYPE (lhs), arg0);
+    }
+
+  else if (nargs == 2)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      tree arg1 = gimple_call_arg (stmt, 1);
+      t = build2 (code, TREE_TYPE (lhs), arg0, arg1);
+    }
+
+  else if (nargs == 3)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      tree arg1 = gimple_call_arg (stmt, 1);
+      tree arg2 = gimple_call_arg (stmt, 2);
+      t = build3 (code, TREE_TYPE (lhs), arg0, arg1, arg2);
+    }
+
+  else
+    gcc_unreachable ();
+
+  gimple *g = gimple_build_assign (lhs, t);
+  gimple_set_location (g, loc);
+  gsi_replace (gsi, g, true);
+  return;
+}
+
 /* Fold a machine-dependent built-in in GIMPLE.  (For folding into
    a constant, use rs6000_fold_builtin.)  */
 bool
@@ -2233,6 +2276,68 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	return true;
       }
 
+    case RS6000_BIF_ABS_F32_SCALAR:
+    case RS6000_BIF_ABS_F32_VECTOR:
+    case RS6000_BIF_ABS_F64_SCALAR:
+    case RS6000_BIF_ABS_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, ABS_EXPR, 1);
+      return true;
+
+    case RS6000_BIF_ADD_F32_SCALAR:
+    case RS6000_BIF_ADD_F32_VECTOR:
+    case RS6000_BIF_ADD_F64_SCALAR:
+    case RS6000_BIF_ADD_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, PLUS_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_MULT_F32_SCALAR:
+    case RS6000_BIF_MULT_F32_VECTOR:
+    case RS6000_BIF_MULT_F64_SCALAR:
+    case RS6000_BIF_MULT_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MULT_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_NEG_F32_SCALAR:
+    case RS6000_BIF_NEG_F32_VECTOR:
+    case RS6000_BIF_NEG_F64_SCALAR:
+    case RS6000_BIF_NEG_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, NEGATE_EXPR, 1);
+      return true;
+
+    case RS6000_BIF_REDUCE_F32_SCALAR:
+    case RS6000_BIF_REDUCE_F64_SCALAR:
+      {
+	location_t loc = gimple_location (stmt);
+	lhs = gimple_call_lhs (stmt);
+	arg0 = gimple_call_arg (stmt, 0);
+	g = gimple_build_assign (lhs, arg0);
+	gimple_set_location (g, loc);
+	gsi_replace (gsi, g, true);
+	return true;
+      }
+
+    case RS6000_BIF_SMAX_F32_SCALAR:
+    case RS6000_BIF_SMAX_F32_VECTOR:
+    case RS6000_BIF_SMAX_F64_SCALAR:
+    case RS6000_BIF_SMAX_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MAX_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_SMIN_F32_SCALAR:
+    case RS6000_BIF_SMIN_F32_VECTOR:
+    case RS6000_BIF_SMIN_F64_SCALAR:
+    case RS6000_BIF_SMIN_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MIN_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_SUB_F32_SCALAR:
+    case RS6000_BIF_SUB_F32_VECTOR:
+    case RS6000_BIF_SUB_F64_SCALAR:
+    case RS6000_BIF_SUB_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MINUS_EXPR, 2);
+      return true;
+
     default:
       if (TARGET_DEBUG_BUILTIN)
 	fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
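
Schematically, the folding added above turns a call such as

    _5 = __builtin_add_f64_vector (a_1(D), b_2(D));

into a plain GIMPLE assignment on the same operands:

    _5 = a_1(D) + b_2(D);

(the SSA names are illustrative).  The *_VPAIR variants are deliberately
not folded here; they remain calls that expand to the vpair_* insn
patterns added in vector-pair.md below.
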
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index 35c4cdf74c5..acc76adca12 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4116,3 +4116,176 @@
 
   void __builtin_vsx_stxvp (v256, unsigned long, const v256 *);
     STXVP nothing {mma,pair}
+
+; Built-in functions for overloaded floating point operations.  The scalar
+; and 128-bit vector variants are folded into direct GIMPLE operations.
+; The 256-bit variants are kept as vector pair insns that are split into
+; separate operations after register allocation.
+
+  float __builtin_abs_f32_scalar (float);
+    ABS_F32_SCALAR nothing {}
+  vf __builtin_abs_f32_vector (vf);
+    ABS_F32_VECTOR nothing {}
+  v256 __builtin_abs_f32_vpair (v256);
+    ABS_F32_VPAIR vpair_absv8sf2 {mma}
+
+  double __builtin_abs_f64_scalar (double);
+    ABS_F64_SCALAR nothing {}
+  vd __builtin_abs_f64_vector (vd);
+    ABS_F64_VECTOR nothing {}
+  v256 __builtin_abs_f64_vpair (v256);
+    ABS_F64_VPAIR vpair_absv4df2 {mma}
+
+  float __builtin_add_f32_scalar (float, float);
+    ADD_F32_SCALAR nothing {}
+  vf __builtin_add_f32_vector (vf, vf);
+    ADD_F32_VECTOR nothing {}
+  v256 __builtin_add_f32_vpair (v256, v256);
+    ADD_F32_VPAIR vpair_addv8sf3 {mma}
+
+  double __builtin_add_f64_scalar (double, double);
+    ADD_F64_SCALAR nothing {}
+  vd __builtin_add_f64_vector (vd, vd);
+    ADD_F64_VECTOR nothing {}
+  v256 __builtin_add_f64_vpair (v256, v256);
+    ADD_F64_VPAIR vpair_addv4df3 {mma}
+
+  float __builtin_copysign_f32_scalar (float, float);
+    COPYSIGN_F32_SCALAR copysignsf3_fcpsgn {}
+  vf __builtin_copysign_f32_vector (vf, vf);
+    COPYSIGN_F32_VECTOR vsx_copysignv4sf3 {}
+  v256 __builtin_copysign_f32_vpair (v256, v256);
+    COPYSIGN_F32_VPAIR vpair_copysignv8sf3 {mma}
+
+  double __builtin_copysign_f64_scalar (double, double);
+    COPYSIGN_F64_SCALAR copysigndf3_fcpsgn {}
+  vd __builtin_copysign_f64_vector (vd, vd);
+    COPYSIGN_F64_VECTOR vsx_copysignv2df3 {}
+  v256 __builtin_copysign_f64_vpair (v256, v256);
+    COPYSIGN_F64_VPAIR vpair_copysignv4df3 {mma}
+
+  float __builtin_div_f32_scalar (float, float);
+    DIV_F32_SCALAR divsf3 {}
+  vf __builtin_div_f32_vector (vf, vf);
+    DIV_F32_VECTOR divv4sf3 {}
+  v256 __builtin_div_f32_vpair (v256, v256);
+    DIV_F32_VPAIR vpair_divv8sf3 {mma}
+
+  double __builtin_div_f64_scalar (double, double);
+    DIV_F64_SCALAR divdf3 {}
+  vd __builtin_div_f64_vector (vd, vd);
+    DIV_F64_VECTOR divv2df3 {}
+  v256 __builtin_div_f64_vpair (v256, v256);
+    DIV_F64_VPAIR vpair_divv4df3 {mma}
+
+  float __builtin_fma_f32_scalar (float, float, float);
+    FMA_F32_SCALAR fmasf4 {}
+  vf __builtin_fma_f32_vector (vf, vf, vf);
+    FMA_F32_VECTOR fmav4sf4 {}
+  v256 __builtin_fma_f32_vpair (v256, v256, v256);
+    FMA_F32_VPAIR vpair_fmav8sf4 {mma}
+
+  double __builtin_fma_f64_scalar (double, double, double);
+    FMA_F64_SCALAR fmadf4 {}
+  vd __builtin_fma_f64_vector (vd, vd, vd);
+    FMA_F64_VECTOR fmav2df4 {}
+  v256 __builtin_fma_f64_vpair (v256, v256, v256);
+    FMA_F64_VPAIR vpair_fmav4df4 {mma}
+
+  float __builtin_mult_f32_scalar (float, float);
+    MULT_F32_SCALAR nothing {}
+  vf __builtin_mult_f32_vector (vf, vf);
+    MULT_F32_VECTOR nothing {}
+  v256 __builtin_mult_f32_vpair (v256, v256);
+    MULT_F32_VPAIR vpair_mulv8sf3 {mma}
+
+  double __builtin_mult_f64_scalar (double, double);
+    MULT_F64_SCALAR nothing {}
+  vd __builtin_mult_f64_vector (vd, vd);
+    MULT_F64_VECTOR nothing {}
+  v256 __builtin_mult_f64_vpair (v256, v256);
+    MULT_F64_VPAIR vpair_mulv4df3 {mma}
+
+  float __builtin_neg_f32_scalar (float);
+    NEG_F32_SCALAR nothing {}
+  vf __builtin_neg_f32_vector (vf);
+    NEG_F32_VECTOR nothing {}
+  v256 __builtin_neg_f32_vpair (v256);
+    NEG_F32_VPAIR vpair_negv8sf2 {mma}
+
+  double __builtin_neg_f64_scalar (double);
+    NEG_F64_SCALAR nothing {}
+  vd __builtin_neg_f64_vector (vd);
+    NEG_F64_VECTOR nothing {}
+  v256 __builtin_neg_f64_vpair (v256);
+    NEG_F64_VPAIR vpair_negv4df2 {mma}
+
+  float __builtin_reduce_f32_scalar (float);
+    REDUCE_F32_SCALAR nothing {}
+  float __builtin_reduce_f32_vector (vf);
+    REDUCE_F32_VECTOR reduce_v4sf {}
+  float __builtin_reduce_f32_vpair (v256);
+    REDUCE_F32_VPAIR reduce_v8sf {mma,pair}
+
+  double __builtin_reduce_f64_scalar (double);
+    REDUCE_F64_SCALAR nothing {}
+  double __builtin_reduce_f64_vector (vd);
+    REDUCE_F64_VECTOR reduce_v2df {}
+  double __builtin_reduce_f64_vpair (v256);
+    REDUCE_F64_VPAIR reduce_v4df {mma,pair}
+
+  float __builtin_smax_f32_scalar (float, float);
+    SMAX_F32_SCALAR nothing {}
+  vf __builtin_smax_f32_vector (vf, vf);
+    SMAX_F32_VECTOR nothing {}
+  v256 __builtin_smax_f32_vpair (v256, v256);
+    SMAX_F32_VPAIR vpair_smaxv8sf3 {mma}
+
+  double __builtin_smax_f64_scalar (double, double);
+    SMAX_F64_SCALAR nothing {}
+  vd __builtin_smax_f64_vector (vd, vd);
+    SMAX_F64_VECTOR nothing {}
+  v256 __builtin_smax_f64_vpair (v256, v256);
+    SMAX_F64_VPAIR vpair_smaxv4df3 {mma}
+
+  float __builtin_smin_f32_scalar (float, float);
+    SMIN_F32_SCALAR nothing {}
+  vf __builtin_smin_f32_vector (vf, vf);
+    SMIN_F32_VECTOR nothing {}
+  v256 __builtin_smin_f32_vpair (v256, v256);
+    SMIN_F32_VPAIR vpair_sminv8sf3 {mma}
+
+  double __builtin_smin_f64_scalar (double, double);
+    SMIN_F64_SCALAR nothing {}
+  vd __builtin_smin_f64_vector (vd, vd);
+    SMIN_F64_VECTOR nothing {}
+  v256 __builtin_smin_f64_vpair (v256, v256);
+    SMIN_F64_VPAIR vpair_sminv4df3 {mma}
+
+  float __builtin_sqrt_f32_scalar (float);
+    SQRT_F32_SCALAR nothing {}
+  vf __builtin_sqrt_f32_vector (vf);
+    SQRT_F32_VECTOR nothing {}
+  v256 __builtin_sqrt_f32_vpair (v256);
+    SQRT_F32_VPAIR vpair_sqrtv8sf2 {mma}
+
+  double __builtin_sqrt_f64_scalar (double);
+    SQRT_F64_SCALAR nothing {}
+  vd __builtin_sqrt_f64_vector (vd);
+    SQRT_F64_VECTOR nothing {}
+  v256 __builtin_sqrt_f64_vpair (v256);
+    SQRT_F64_VPAIR vpair_sqrtv4df2 {mma}
+
+  float __builtin_sub_f32_scalar (float, float);
+    SUB_F32_SCALAR nothing {}
+  vf __builtin_sub_f32_vector (vf, vf);
+    SUB_F32_VECTOR nothing {}
+  v256 __builtin_sub_f32_vpair (v256, v256);
+    SUB_F32_VPAIR vpair_subv8sf3 {mma}
+
+  double __builtin_sub_f64_scalar (double, double);
+    SUB_F64_SCALAR nothing {}
+  vd __builtin_sub_f64_vector (vd, vd);
+    SUB_F64_VECTOR nothing {}
+  v256 __builtin_sub_f64_vpair (v256, v256);
+    SUB_F64_VPAIR vpair_subv4df3 {mma}
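
A note on the conventions above: entries whose expander is given as
"nothing" are meant to be handled by the GIMPLE folding in
rs6000-builtin.cc, while the remaining entries name the insn pattern
that expands them.  A sketch of the latter, assuming VSX code
generation:

    vector float
    vdiv (vector float a, vector float b)
    {
      /* DIV_F32_VECTOR expands through divv4sf3 (an xvdivsp insn).  */
      return __builtin_div_f32 (a, b);
    }
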
diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def
index b83946f5ad8..bbc26de4568 100644
--- a/gcc/config/rs6000/rs6000-overload.def
+++ b/gcc/config/rs6000/rs6000-overload.def
@@ -6187,3 +6187,181 @@
     VUPKLSW  VUPKLSW_DEPR1
   vbll __builtin_vec_vupklsw (vbi);
     VUPKLSW  VUPKLSW_DEPR2
+
+;; Overloaded floating point built-in functions
+
+[ABS_F32, SKIP, __builtin_abs_f32]
+  float __builtin_abs_f32 (float);
+    ABS_F32_SCALAR
+  vf __builtin_abs_f32 (vf);
+    ABS_F32_VECTOR
+  v256 __builtin_abs_f32 (v256);
+    ABS_F32_VPAIR
+
+[ABS_F64, SKIP, __builtin_abs_f64]
+  double __builtin_abs_f64 (double);
+    ABS_F64_SCALAR
+  vd __builtin_abs_f64 (vd);
+    ABS_F64_VECTOR
+  v256 __builtin_abs_f64 (v256);
+    ABS_F64_VPAIR
+
+[ADD_F32, SKIP, __builtin_add_f32]
+  float __builtin_add_f32 (float, float);
+    ADD_F32_SCALAR
+  vf __builtin_add_f32 (vf, vf);
+    ADD_F32_VECTOR
+  v256 __builtin_add_f32 (v256, v256);
+    ADD_F32_VPAIR
+
+[ADD_F64, SKIP, __builtin_add_f64]
+  double __builtin_add_f64 (double, double);
+    ADD_F64_SCALAR
+  vd __builtin_add_f64 (vd, vd);
+    ADD_F64_VECTOR
+  v256 __builtin_add_f64 (v256, v256);
+    ADD_F64_VPAIR
+
+[COPYSIGN_F32, SKIP, __builtin_copysign_f32]
+  float __builtin_copysign_f32 (float, float);
+    COPYSIGN_F32_SCALAR
+  vf __builtin_copysign_f32 (vf, vf);
+    COPYSIGN_F32_VECTOR
+  v256 __builtin_copysign_f32 (v256, v256);
+    COPYSIGN_F32_VPAIR
+
+[COPYSIGN_F64, SKIP, __builtin_copysign_f64]
+  double __builtin_copysign_f64 (double, double);
+    COPYSIGN_F64_SCALAR
+  vd __builtin_copysign_f64 (vd, vd);
+    COPYSIGN_F64_VECTOR
+  v256 __builtin_copysign_f64 (v256, v256);
+    COPYSIGN_F64_VPAIR
+
+[DIV_F32, SKIP, __builtin_div_f32]
+  float __builtin_div_f32 (float, float);
+    DIV_F32_SCALAR
+  vf __builtin_div_f32 (vf, vf);
+    DIV_F32_VECTOR
+  v256 __builtin_div_f32 (v256, v256);
+    DIV_F32_VPAIR
+
+[DIV_F64, SKIP, __builtin_div_f64]
+  double __builtin_div_f64 (double, double);
+    DIV_F64_SCALAR
+  vd __builtin_div_f64 (vd, vd);
+    DIV_F64_VECTOR
+  v256 __builtin_div_f64 (v256, v256);
+    DIV_F64_VPAIR
+
+[FMA_F32, SKIP, __builtin_fma_f32]
+  float __builtin_fma_f32 (float, float, float);
+    FMA_F32_SCALAR
+  vf __builtin_fma_f32 (vf, vf, vf);
+    FMA_F32_VECTOR
+  v256 __builtin_fma_f32 (v256, v256, v256);
+    FMA_F32_VPAIR
+
+[FMA_F64, SKIP, __builtin_fma_f64]
+  double __builtin_fma_f64 (double, double, double);
+    FMA_F64_SCALAR
+  vd __builtin_fma_f64 (vd, vd, vd);
+    FMA_F64_VECTOR
+  v256 __builtin_fma_f64 (v256, v256, v256);
+    FMA_F64_VPAIR
+
+[MULT_F32, SKIP, __builtin_mult_f32]
+  float __builtin_mult_f32 (float, float);
+    MULT_F32_SCALAR
+  vf __builtin_mult_f32 (vf, vf);
+    MULT_F32_VECTOR
+  v256 __builtin_mult_f32 (v256, v256);
+    MULT_F32_VPAIR
+
+[MULT_F64, SKIP, __builtin_mult_f64]
+  double __builtin_mult_f64 (double, double);
+    MULT_F64_SCALAR
+  vd __builtin_mult_f64 (vd, vd);
+    MULT_F64_VECTOR
+  v256 __builtin_mult_f64 (v256, v256);
+    MULT_F64_VPAIR
+
+[NEG_F32, SKIP, __builtin_neg_f32]
+  float __builtin_neg_f32 (float);
+    NEG_F32_SCALAR
+  vf __builtin_neg_f32 (vf);
+    NEG_F32_VECTOR
+  v256 __builtin_neg_f32 (v256);
+    NEG_F32_VPAIR
+
+[NEG_F64, SKIP, __builtin_neg_f64]
+  double __builtin_neg_f64 (double);
+    NEG_F64_SCALAR
+  vd __builtin_neg_f64 (vd);
+    NEG_F64_VECTOR
+  v256 __builtin_neg_f64 (v256);
+    NEG_F64_VPAIR
+
+[REDUCE_F32, SKIP, __builtin_reduce_f32]
+  float __builtin_reduce_f32 (float);
+    REDUCE_F32_SCALAR
+  float __builtin_reduce_f32 (vf);
+    REDUCE_F32_VECTOR
+  float __builtin_reduce_f32 (v256);
+    REDUCE_F32_VPAIR
+
+[REDUCE_F64, SKIP, __builtin_reduce_f64]
+  double __builtin_reduce_f64 (double);
+    REDUCE_F64_SCALAR
+  double __builtin_reduce_f64 (vd);
+    REDUCE_F64_VECTOR
+  double __builtin_reduce_f64 (v256);
+    REDUCE_F64_VPAIR
+
+[SMAX_F32, SKIP, __builtin_smax_f32]
+  float __builtin_smax_f32 (float, float);
+    SMAX_F32_SCALAR
+  vf __builtin_smax_f32 (vf, vf);
+    SMAX_F32_VECTOR
+  v256 __builtin_smax_f32 (v256, v256);
+    SMAX_F32_VPAIR
+
+[SMAX_F64, SKIP, __builtin_smax_f64]
+  double __builtin_smax_f64 (double, double);
+    SMAX_F64_SCALAR
+  vd __builtin_smax_f64 (vd, vd);
+    SMAX_F64_VECTOR
+  v256 __builtin_smax_f64 (v256, v256);
+    SMAX_F64_VPAIR
+
+[SMIN_F32, SKIP, __builtin_smin_f32]
+  float __builtin_smin_f32 (float, float);
+    SMIN_F32_SCALAR
+  vf __builtin_smin_f32 (vf, vf);
+    SMIN_F32_VECTOR
+  v256 __builtin_smin_f32 (v256, v256);
+    SMIN_F32_VPAIR
+
+[SMIN_F64, SKIP, __builtin_smin_f64]
+  double __builtin_smin_f64 (double, double);
+    SMIN_F64_SCALAR
+  vd __builtin_smin_f64 (vd, vd);
+    SMIN_F64_VECTOR
+  v256 __builtin_smin_f64 (v256, v256);
+    SMIN_F64_VPAIR
+
+[SUB_F32, SKIP, __builtin_sub_f32]
+  float __builtin_sub_f32 (float, float);
+    SUB_F32_SCALAR
+  vf __builtin_sub_f32 (vf, vf);
+    SUB_F32_VECTOR
+  v256 __builtin_sub_f32 (v256, v256);
+    SUB_F32_VPAIR
+
+[SUB_F64, SKIP, __builtin_sub_f64]
+  double __builtin_sub_f64 (double, double);
+    SUB_F64_SCALAR
+  vd __builtin_sub_f64 (vd, vd);
+    SUB_F64_VECTOR
+  v256 __builtin_sub_f64 (v256, v256);
+    SUB_F64_VPAIR
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index be615c3584e..1cf0ed31fb8 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15778,6 +15778,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1d..5fc89499795 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
 	$(srcdir)/config/rs6000/vsx.md \
 	$(srcdir)/config/rs6000/altivec.md \
 	$(srcdir)/config/rs6000/mma.md \
+	$(srcdir)/config/rs6000/vector-pair.md \
 	$(srcdir)/config/rs6000/crypto.md \
 	$(srcdir)/config/rs6000/htm.md \
 	$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 00000000000..13f6e0464b5
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,417 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers.  Most of the insns use vector pair instructions to load and
+;; possibly store registers, but the operation itself is split after
+;; register allocation into two separate operations on the two halves of
+;; the pair.  The second scheduler pass can then interleave other
+;; instructions between these pairs of operations where possible.
+
+(define_c_enum "unspec"
+  [UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   UNSPEC_VPAIR_V32QI
+   UNSPEC_VPAIR_V16HI
+   UNSPEC_VPAIR_V8SI
+   UNSPEC_VPAIR_V4DI
+   UNSPEC_REDUCE_F32
+   UNSPEC_REDUCE_F64
+   ])
+
+;; Code iterators for unary/binary arithmetic on vector pairs
+(define_code_iterator VPAIR_UNARY  [neg abs sqrt])
+(define_code_iterator VPAIR_BINARY [plus minus mult div copysign smin smax])
+
+;; Give the insn name for the operation
+(define_code_attr vpair_op [(abs      "abs")
+			    (copysign "copysign")
+			    (div      "div")
+			    (minus    "sub")
+			    (mult     "mul")
+			    (neg      "neg")
+			    (plus     "add")
+			    (smin     "smin")
+			    (smax     "smax")
+			    (sqrt     "sqrt")])
+
+;; Iterator for creating the wrapper for vector pair built-ins
+(define_int_iterator VPAIR_WRAPPER [UNSPEC_VPAIR_V4DF UNSPEC_VPAIR_V8SF])
+
+;; Map VPAIR_WRAPPER to vector type (i.e. V2DF or V4SF)
+(define_int_attr VPAIR_VECTOR [(UNSPEC_VPAIR_V4DF "V2DF")
+			       (UNSPEC_VPAIR_V8SF "V4SF")])
+
+(define_int_attr vpair_type [(UNSPEC_VPAIR_V4DF "v4df")
+			     (UNSPEC_VPAIR_V8SF "v8sf")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vpair_op><vpair_type>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VPAIR_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2) (VPAIR_UNARY:<VPAIR_VECTOR> (match_dup 3)))
+   (set (match_dup 4) (VPAIR_UNARY:<VPAIR_VECTOR> (match_dup 5)))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  operands[2] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[3] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value
+(define_insn_and_split "vpair_nabs<vpair_type>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2)
+	(neg:<VPAIR_VECTOR>
+	 (abs:<VPAIR_VECTOR> (match_dup 3))))
+   (set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (abs:<VPAIR_VECTOR> (match_dup 5))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  operands[2] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[3] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point binary operations
+(define_insn_and_split "vpair_<vpair_op><vpair_type>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VPAIR_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3)
+	(VPAIR_BINARY:<VPAIR_VECTOR> (match_dup 4)
+				      (match_dup 5)))
+   (set (match_dup 6)
+	(VPAIR_BINARY:<VPAIR_VECTOR> (match_dup 7)
+				      (match_dup 8)))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+
+  operands[3] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(fma:<VPAIR_VECTOR> (match_dup 5)
+			    (match_dup 6)
+			    (match_dup 7)))
+   (set (match_dup 8)
+	(fma:<VPAIR_VECTOR> (match_dup 9)
+			    (match_dup 10)
+			    (match_dup 11)))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(fma:<VPAIR_VECTOR> (match_dup 5)
+			    (match_dup 6)
+			    (neg:<VPAIR_VECTOR> (match_dup 7))))
+   (set (match_dup 8)
+	(fma:<VPAIR_VECTOR> (match_dup 9)
+			    (match_dup 10)
+			    (neg:<VPAIR_VECTOR> (match_dup 11))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 5)
+			     (match_dup 6)
+			     (match_dup 7))))
+   (set (match_dup 8)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 9)
+			     (match_dup 10)
+			     (match_dup 11))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VPAIR_WRAPPER))]
+	   VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 5)
+			     (match_dup 6)
+			     (neg:<VPAIR_VECTOR> (match_dup 7)))))
+   (set (match_dup 8)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 9)
+			     (match_dup 10)
+			     (neg:<VPAIR_VECTOR> (match_dup 11)))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Reduction for a V4SF vector
+(define_insn_and_split "reduce_v4sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "v")]
+		   UNSPEC_REDUCE_F32))
+   (clobber (match_scratch:V4SF 2 "=&v"))
+   (clobber (match_scratch:V4SF 3 "=&v"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp1 = operands[2];
+  rtx tmp2 = operands[3];
+
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp1, op1, op1, GEN_INT (8)));
+  emit_insn (gen_addv4sf3 (tmp1, op1, tmp1));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (4)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+  DONE;
+}
+  [(set_attr "length" "24")])
+
+;; Reduction for a pair of V4SF vectors
+(define_insn_and_split "reduce_v8sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(unspec:SF [(match_operand:OO 1 "vsx_register_operand" "v")]
+		   UNSPEC_REDUCE_F32))
+   (clobber (match_scratch:V4SF 2 "=&v"))
+   (clobber (match_scratch:V4SF 3 "=&v"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp1 = operands[2];
+  rtx tmp2 = operands[3];
+  unsigned r = reg_or_subregno (op1);
+  rtx op1_hi = gen_rtx_REG (V4SFmode, r);
+  rtx op1_lo = gen_rtx_REG (V4SFmode, r + 1);
+
+  emit_insn (gen_addv4sf3 (tmp1, op1_hi, op1_lo));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp1, tmp2, tmp2, GEN_INT (4)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+  DONE;
+}
+  [(set_attr "length" "24")])
+
+;; Reduction for a V2DF vector
+(define_insn_and_split "reduce_v2df"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+	(unspec:DF [(match_operand:V2DF 1 "vsx_register_operand" "wa")]
+		   UNSPEC_REDUCE_F64))
+   (clobber (match_scratch:DF 2 "=&wa"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2)
+	(vec_select:DF (match_dup 1)
+		       (parallel [(match_dup 3)])))
+   (set (match_dup 0)
+	(plus:DF (match_dup 4)
+		 (match_dup 2)))]
+{
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  operands[3] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+  operands[4] = gen_rtx_REG (DFmode, reg1);
+})
+
+;; Reduction for a pair of V2DF vectors
+(define_insn_and_split "reduce_v4df"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+	(unspec:DF [(match_operand:OO 1 "vsx_register_operand" "wa")]
+		   UNSPEC_REDUCE_F64))
+   (clobber (match_scratch:DF 2 "=&wa"))
+   (clobber (match_scratch:V2DF 3 "=&wa"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3)
+	(plus:V2DF (match_dup 4)
+		   (match_dup 5)))
+   (set (match_dup 2)
+	(vec_select:DF (match_dup 3)
+		       (parallel [(match_dup 6)])))
+   (set (match_dup 0)
+	(plus:DF (match_dup 7)
+		 (match_dup 2)))]
+{
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (V2DFmode, reg1);
+  operands[5] = gen_rtx_REG (V2DFmode, reg1 + 1);
+  operands[6] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+  operands[7] = gen_rtx_REG (DFmode, reg3);
+})
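
As a sketch of the intended code generation (assuming the pairs are
loaded into VSX register pairs), a vector pair add such as

    void
    vpadd (__vector_pair *p, __vector_pair *q, __vector_pair *r)
    {
      *p = __builtin_add_f64 (*q, *r);
    }

is split after register allocation into two independent xvadddp insns,
one per 128-bit half, which the second scheduler pass can then
interleave with surrounding instructions.
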
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 97eaacf8a7e..1e735cdc16f 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -19150,6 +19150,7 @@ The PVIPR documents the following overloaded functions:
 * PowerPC AltiVec Built-in Functions Available on ISA 2.07::
 * PowerPC AltiVec Built-in Functions Available on ISA 3.0::
 * PowerPC AltiVec Built-in Functions Available on ISA 3.1::
+* PowerPC Floating Point Overloaded Built-in Functions::
 @end menu
 
 @node PowerPC AltiVec Built-in Functions on ISA 2.05
@@ -21102,6 +21103,112 @@ int vec_any_le (vector signed __int128, vector signed __int128);
 int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
+@node PowerPC Floating Point Overloaded Built-in Functions
+@subsubsection PowerPC Floating Point Overloaded Built-in Functions
+
+The following additional built-in functions are available for the
+PowerPC family of processors.  They allow programmers to use the same
+built-in function name for scalar values, 128-bit vectors, and, on
+Power10 systems, vector pairs.
+
+The following built-in functions handle 32-bit floating point
+operations on all processors where the VSX registers are available:
+
+@smallexample
+float __builtin_abs_f32 (float);
+vector float __builtin_abs_f32 (vector float);
+float __builtin_add_f32 (float, float);
+vector float __builtin_add_f32 (vector float, vector float);
+float __builtin_copysign_f32 (float, float);
+vector float __builtin_copysign_f32 (vector float, vector float);
+float __builtin_div_f32 (float, float);
+vector float __builtin_div_f32 (vector float, vector float);
+float __builtin_fma_f32 (float, float, float);
+vector float __builtin_fma_f32 (vector float, vector float, vector float);
+float __builtin_mult_f32 (float, float);
+vector float __builtin_mult_f32 (vector float, vector float);
+float __builtin_neg_f32 (float);
+vector float __builtin_neg_f32 (vector float);
+float __builtin_smax_f32 (float, float);
+vector float __builtin_smax_f32 (vector float, vector float);
+float __builtin_smin_f32 (float, float);
+vector float __builtin_smin_f32 (vector float, vector float);
+float __builtin_sub_f32 (float, float);
+vector float __builtin_sub_f32 (vector float, vector float);
+@end smallexample
+
+The following built-in functions handle 32-bit floating point
+operations on Power10 systems that support paired vector load and
+store instructions:
+
+@smallexample
+__vector_pair __builtin_abs_f32 (__vector_pair);
+__vector_pair __builtin_add_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_copysign_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_div_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_fma_f32 (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_mult_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_neg_f32 (__vector_pair);
+__vector_pair __builtin_smax_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_smin_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_sub_f32 (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions handle 64-bit floating point
+operations on all processors where the VSX registers are available:
+
+@smallexample
+double __builtin_abs_f64 (double);
+vector double __builtin_abs_f64 (vector double);
+double __builtin_add_f64 (double, double);
+vector double __builtin_add_f64 (vector double, vector double);
+double __builtin_copysign_f64 (double, double);
+vector double __builtin_copysign_f64 (vector double, vector double);
+double __builtin_div_f64 (double, double);
+vector double __builtin_div_f64 (vector double, vector double);
+double __builtin_fma_f64 (double, double, double);
+vector double __builtin_fma_f64 (vector double, vector double, vector double);
+double __builtin_mult_f64 (double, double);
+vector double __builtin_mult_f64 (vector double, vector double);
+double __builtin_neg_f64 (double);
+vector double __builtin_neg_f64 (vector double);
+double __builtin_smax_f64 (double, double);
+vector double __builtin_smax_f64 (vector double, vector double);
+double __builtin_smin_f64 (double, double);
+vector double __builtin_smin_f64 (vector double, vector double);
+double __builtin_sub_f64 (double, double);
+vector double __builtin_sub_f64 (vector double, vector double);
+@end smallexample
+
+The following built-in functions handle 64-bit floating point
+operations on Power10 systems that support paired vector load and
+store instructions:
+
+@smallexample
+__vector_pair __builtin_abs_f64 (__vector_pair);
+__vector_pair __builtin_add_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_copysign_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_div_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_fma_f64 (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_mult_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_neg_f64 (__vector_pair);
+__vector_pair __builtin_smax_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_smin_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_sub_f64 (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions return the sum of all of the elements
+of a scalar, vector, or vector pair argument:
+
+@smallexample
+float __builtin_reduce_f32 (float);
+float __builtin_reduce_f32 (vector float);
+float __builtin_reduce_f32 (__vector_pair);
+double __builtin_reduce_f64 (double);
+double __builtin_reduce_f64 (vector double);
+double __builtin_reduce_f64 (__vector_pair);
+@end smallexample
+
 
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
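
For instance (a sketch of the semantics, not part of the patch):

    vector float v = { 1.0f, 2.0f, 3.0f, 4.0f };
    float sum = __builtin_reduce_f32 (v);	/* sum == 10.0f.  */

Applied to a scalar, __builtin_reduce_f32 simply returns its argument.
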
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c
new file mode 100644
index 00000000000..400a54f2fd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using scalar float.  */
+
+#include "fp-overload.h"
+
+TEST (float, float, flt, 32)
+
+/* { dg-final { scan-assembler-times {\mfabs\M|\mxsabsdp\M}          1 } } */
+/* { dg-final { scan-assembler-times {\mfadds\M|\mxsaddsp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfmadds\M|\mxsmadd[am]sp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmsubs\M|\mxsmsub[am]sp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmuls\M|\mxsmulsp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnabs\M|\mxsnabsdp\M}        1 } } */
+/* { dg-final { scan-assembler-times {\mfneg\M|\mxsnegdp\M}          1 } } */
+/* { dg-final { scan-assembler-times {\mfnmadds\M|\mxsnmadd[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfnmsubs\M|\mxsnmsub[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfsubs\M|\mxssubsp\M}         1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}                          } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c
new file mode 100644
index 00000000000..14f76d8a8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using vector float.  */
+
+#include "fp-overload.h"
+
+TEST (vector float, float, vect, 32)
+
+/* { dg-final { scan-assembler-times {\mvsldoi\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}       3 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}       1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c
new file mode 100644
index 00000000000..466f056cf9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using __vector_pair.  */
+
+#include "fp-overload.h"
+
+TEST (__vector_pair, float, vpair, 32)
+
+/* { dg-final { scan-assembler-times {\mvsldoi\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}       5 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}       2 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c
new file mode 100644
index 00000000000..28e7c91c77c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using scalar double.  */
+
+#include "fp-overload.h"
+
+TEST (double, double, dbl, 64)
+
+/* { dg-final { scan-assembler-times {\mfabs\M|\mxsabsdp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfadd\M|\mxsadddp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfmadd\M|\mxsmadd[am]dp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmsub\M|\mxsmsub[am]dp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmul\M|\mxsmuldp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnabs\M|\mxsnabsdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mfneg\M|\mxsnegdp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnmadd\M|\mxsnmadd[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfnmsub\M|\mxsnmsub[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfsub\M|\mxssubdp\M}         1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}                         } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c
new file mode 100644
index 00000000000..4289ba4edb9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using vector double.  */
+
+#include "fp-overload.h"
+
+TEST (vector double, double, vect, 64)
+
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]dp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]dp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M}      1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c
new file mode 100644
index 00000000000..7dd0613bf88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using __vector_pair.  */
+
+#include "fp-overload.h"
+
+TEST (__vector_pair, double, vpair, 64)
+
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}        3 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M}       1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}               } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload.h b/gcc/testsuite/gcc.target/powerpc/fp-overload.h
new file mode 100644
index 00000000000..a1ce5f83765
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload.h
@@ -0,0 +1,85 @@
+/* Common code to test the floating point overload functions.  */
+
+#define TEST(TYPE, SCALAR, TYPE_STR, SIZE)				\
+									\
+void									\
+do_add_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_add_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_sub_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_sub_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_mult_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_mult_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_neg_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_neg_f ## SIZE (*q);					\
+}									\
+									\
+void									\
+do_abs_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_abs_f ## SIZE (*q);					\
+}									\
+									\
+void									\
+do_nabs_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_neg_f ## SIZE (__builtin_abs_f ## SIZE (*q));		\
+}									\
+									\
+void									\
+do_fma_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				   TYPE *q,				\
+				   TYPE *r,				\
+				   TYPE *s)				\
+{									\
+  *p = __builtin_fma_f ## SIZE (*q, *r, *s);				\
+}									\
+									\
+void									\
+do_fms_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				   TYPE *q,				\
+				   TYPE *r,				\
+				   TYPE *s)				\
+{									\
+  TYPE neg_s = __builtin_neg_f ## SIZE (*s);				\
+  *p = __builtin_fma_f ## SIZE (*q, *r, neg_s);				\
+}									\
+									\
+void									\
+do_nfma_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				    TYPE *q,				\
+				    TYPE *r,				\
+				    TYPE *s)				\
+{									\
+  TYPE f = __builtin_fma_f ## SIZE (*q, *r, *s);			\
+  *p = __builtin_neg_f ## SIZE (f);					\
+}									\
+									\
+void									\
+do_nfms_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				    TYPE *q,				\
+				    TYPE *r,				\
+				    TYPE *s)				\
+{									\
+  TYPE neg_s = __builtin_neg_f ## SIZE (*s);				\
+  TYPE f = __builtin_fma_f ## SIZE (*q, *r, neg_s);			\
+  *p = __builtin_neg_f ## SIZE (f);					\
+}									\
+									\
+void									\
+do_reduce_ ## TYPE_STR ## _f ## SIZE (SCALAR *p, TYPE *q)		\
+{									\
+  *p = __builtin_reduce_f ## SIZE (*q);					\
+}
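
For reference, one instantiation such as TEST (float, float, flt, 32)
expands to the functions do_add_flt_f32, do_sub_flt_f32,
do_mult_flt_f32, do_neg_flt_f32, do_abs_flt_f32, do_nabs_flt_f32,
do_fma_flt_f32, do_fms_flt_f32, do_nfma_flt_f32, do_nfms_flt_f32 and
do_reduce_flt_f32, for example:

    void
    do_add_flt_f32 (float *p, float *q, float *r)
    {
      *p = __builtin_add_f32 (*q, *r);
    }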
