public inbox for gcc-cvs@sourceware.org

Subject: [gcc(refs/users/meissner/heads/work128-vpair)] Add fp built-in overload support.
From: Michael Meissner @ 2023-07-28 19:25 UTC
To: gcc-cvs

https://gcc.gnu.org/g:6f66fa41caa9dcb7a3f0c94aa892bc7e584aa478

commit 6f66fa41caa9dcb7a3f0c94aa892bc7e584aa478
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Fri Jul 28 15:25:10 2023 -0400

    Add fp built-in overload support.
    
    2023-07-28  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/mma.md (UNSPEC_VPAIR_V4DF): New unspec.
            (UNSPEC_VPAIR_V8SF): Likewise.
            (UNSPEC_REDUCE_F32): Likewise.
            (UNSPEC_REDUCE_F64): Likewise.
            (VPAIR_UNARY): New code iterator.
            (VPAIR_BINARY): Likewise.
            (vpair_op): New code attribute.
            (VPAIR_WRAPPER): New int iterator.
            (VPAIR_VECTOR): New int attribute.
            (vpair_type): Likewise.
            (vpair_<vpair_op><vpair_type>2): New insns to support arithmetic
            operations on vector pairs.
            (vpair_nabs<vpair_type>2): Likewise.
            (vpair_<vpair_op><vpair_type>3): Likewise.
            (vpair_fma<vpair_type>4): Likewise.
            (vpair_fms<vpair_type>4): Likewise.
            (vpair_nfma<vpair_type>4): Likewise.
            (vpair_nfms<vpair_type>4): Likewise.
            (reduce_v4sf): Likewise.
            (reduce_v8sf): Likewise.
            (reduce_v2df): Likewise.
            (reduce_v4df): Likewise.
            * config/rs6000/rs6000-builtin.cc (fold_builtin_overload_fp):
            New helper function for floating point overloaded built-in
            functions.
            (rs6000_gimple_fold_builtin): Add support for floating point
            overloaded built-in functions that map directly to gimple.
            * config/rs6000/rs6000-builtins.def (__builtin_*_f32_scalar): New
            built-in functions for overloaded floating point support.
            (__builtin_*_f32_vector): Likewise.
            (__builtin_*_f32_vpair): Likewise.
            (__builtin_*_f64_scalar): Likewise.
            (__builtin_*_f64_vector): Likewise.
            (__builtin_*_f64_vpair): Likewise.
            * config/rs6000/rs6000-overload.def (__builtin_*_f32): Likewise.
            (__builtin_*_f64): Likewise.
            * doc/extend.texi (PowerPC Built-in Functions): Document the
            floating point overloaded built-in functions.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/fp-overload-f32-scalar.c: New test.
            * gcc.target/powerpc/fp-overload-f32-vector.c: Likewise.
            * gcc.target/powerpc/fp-overload-f32-vpair.c: Likewise.
            * gcc.target/powerpc/fp-overload-f64-scalar.c: Likewise.
            * gcc.target/powerpc/fp-overload-f64-vector.c: Likewise.
            * gcc.target/powerpc/fp-overload-f64-vpair.c: Likewise.
            * gcc.target/powerpc/fp-overload.h: New include file for fp overloaded
            built-in functions.
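
    As a quick orientation (a sketch, not part of the patch): the
    overloaded built-ins accept scalar, 128-bit vector, or
    __vector_pair operands under one name.  Assuming GCC with this
    patch and -mcpu=power10 -mmma, usage looks roughly like:

        float
        add_sf (float a, float b)
        {
          return __builtin_add_f32 (a, b);	/* scalar form */
        }

        vector float
        add_v4sf (vector float a, vector float b)
        {
          return __builtin_add_f32 (a, b);	/* 128-bit vector form */
        }

        void
        add_v8sf (__vector_pair *p, __vector_pair *q, __vector_pair *r)
        {
          *p = __builtin_add_f32 (*q, *r);	/* vector pair form */
        }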

Diff:
---
 gcc/config/rs6000/mma.md                           | 384 +++++++++++++++++++++
 gcc/config/rs6000/rs6000-builtin.cc                | 105 ++++++
 gcc/config/rs6000/rs6000-builtins.def              | 173 ++++++++++
 gcc/config/rs6000/rs6000-overload.def              | 178 ++++++++++
 gcc/doc/extend.texi                                |  95 +++++
 .../gcc.target/powerpc/fp-overload-f32-scalar.c    |  21 ++
 .../gcc.target/powerpc/fp-overload-f32-vector.c    |  23 ++
 .../gcc.target/powerpc/fp-overload-f32-vpair.c     |  23 ++
 .../gcc.target/powerpc/fp-overload-f64-scalar.c    |  22 ++
 .../gcc.target/powerpc/fp-overload-f64-vector.c    |  22 ++
 .../gcc.target/powerpc/fp-overload-f64-vpair.c     |  22 ++
 gcc/testsuite/gcc.target/powerpc/fp-overload.h     |  85 +++++
 12 files changed, 1153 insertions(+)

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index d36dc13872b..20bd17ca720 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -91,6 +91,10 @@
    UNSPEC_MMA_XVI8GER4SPP
    UNSPEC_MMA_XXMFACC
    UNSPEC_MMA_XXMTACC
+   UNSPEC_VPAIR_V4DF
+   UNSPEC_VPAIR_V8SF
+   UNSPEC_REDUCE_F32
+   UNSPEC_REDUCE_F64
   ])
 
 (define_c_enum "unspecv"
@@ -264,6 +268,33 @@
 				 (UNSPEC_MMA_PMXVI8GER4SPP	"pmxvi8ger4spp")])
 
 
+;; Iterators for unary/binary arithmetic on vector pairs
+(define_code_iterator VPAIR_UNARY  [neg abs sqrt])
+(define_code_iterator VPAIR_BINARY [plus minus mult div copysign smin smax])
+
+;; Give the insn name from the operation
+(define_code_attr vpair_op [(abs      "abs")
+			    (copysign "copysign")
+			    (div      "div")
+			    (minus    "sub")
+			    (mult     "mul")
+			    (neg      "neg")
+			    (plus     "add")
+			    (smin     "smin")
+			    (smax     "smax")
+			    (sqrt     "sqrt")])
+
+;; Iterator for creating the wrapper for vector pair built-ins
+(define_int_iterator VPAIR_WRAPPER [UNSPEC_VPAIR_V4DF UNSPEC_VPAIR_V8SF])
+
+;; Map VPAIR_WRAPPER to vector type (i.e. V2DF or V4SF)
+(define_int_attr VPAIR_VECTOR [(UNSPEC_VPAIR_V4DF "V2DF")
+			       (UNSPEC_VPAIR_V8SF "V4SF")])
+
+(define_int_attr vpair_type [(UNSPEC_VPAIR_V4DF "v4df")
+			     (UNSPEC_VPAIR_V8SF "v8sf")])
+
+\f
 ;; Vector pair support.  OOmode can only live in VSRs.
 (define_expand "movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand")
@@ -690,3 +721,356 @@
   "<avvi4i4i4> %A0,%x2,%x3,%4,%5,%6"
   [(set_attr "type" "mma")
    (set_attr "prefixed" "yes")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vpair_op><vpair_type>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VPAIR_UNARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa"))]
+		   VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2) (VPAIR_UNARY:<VPAIR_VECTOR> (match_dup 3)))
+   (set (match_dup 4) (VPAIR_UNARY:<VPAIR_VECTOR> (match_dup 5)))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  operands[2] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[3] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value
+(define_insn_and_split "vpair_nabs<vpair_type>2"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+	    VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2)
+	(neg:<VPAIR_VECTOR>
+	 (abs:<VPAIR_VECTOR> (match_dup 3))))
+   (set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (abs:<VPAIR_VECTOR> (match_dup 5))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  operands[2] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[3] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point binary operations
+(define_insn_and_split "vpair_<vpair_op><vpair_type>3"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+	(unspec:OO [(VPAIR_BINARY:OO
+		     (match_operand:OO 1 "vsx_register_operand" "wa")
+		     (match_operand:OO 2 "vsx_register_operand" "wa"))]
+		   VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3)
+	(VPAIR_BINARY:<VPAIR_VECTOR> (match_dup 4)
+				      (match_dup 5)))
+   (set (match_dup 6)
+	(VPAIR_BINARY:<VPAIR_VECTOR> (match_dup 7)
+				      (match_dup 8)))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+
+  operands[3] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(fma:<VPAIR_VECTOR> (match_dup 5)
+			    (match_dup 6)
+			    (match_dup 7)))
+   (set (match_dup 8)
+	(fma:<VPAIR_VECTOR> (match_dup 9)
+			    (match_dup 10)
+			    (match_dup 11)))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(fma:OO
+	   (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	   (unspec:OO
+	    [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	     VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(fma:<VPAIR_VECTOR> (match_dup 5)
+			    (match_dup 6)
+			    (neg:<VPAIR_VECTOR> (match_dup 7))))
+   (set (match_dup 8)
+	(fma:<VPAIR_VECTOR> (match_dup 9)
+			    (match_dup 10)
+			    (neg:<VPAIR_VECTOR> (match_dup 11))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 5)
+			     (match_dup 6)
+			     (match_dup 7))))
+   (set (match_dup 8)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 9)
+			     (match_dup 10)
+			     (match_dup 11))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VPAIR_WRAPPER))]
+	   VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 5)
+			     (match_dup 6)
+			     (neg:<VPAIR_VECTOR> (match_dup 7)))))
+   (set (match_dup 8)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 9)
+			     (match_dup 10)
+			     (neg:<VPAIR_VECTOR> (match_dup 11)))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Reduction for a V4SF vector
+(define_insn_and_split "reduce_v4sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "v")]
+		   UNSPEC_REDUCE_F32))
+   (clobber (match_scratch:V4SF 2 "=&v"))
+   (clobber (match_scratch:V4SF 3 "=&v"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp1 = operands[2];
+  rtx tmp2 = operands[3];
+
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp1, op1, op1, GEN_INT (8)));
+  emit_insn (gen_addv4sf3 (tmp1, op1, tmp1));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (4)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+  DONE;
+}
+  [(set_attr "length" "24")])
+
+;; Reduction for a pair of V4SF vectors
+(define_insn_and_split "reduce_v8sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(unspec:SF [(match_operand:OO 1 "vsx_register_operand" "v")]
+		   UNSPEC_REDUCE_F32))
+   (clobber (match_scratch:V4SF 2 "=&v"))
+   (clobber (match_scratch:V4SF 3 "=&v"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp1 = operands[2];
+  rtx tmp2 = operands[3];
+  unsigned r = reg_or_subregno (op1);
+  rtx op1_hi = gen_rtx_REG (V4SFmode, r);
+  rtx op1_lo = gen_rtx_REG (V4SFmode, r + 1);
+
+  emit_insn (gen_addv4sf3 (tmp1, op1_hi, op1_lo));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp1, tmp2, tmp2, GEN_INT (4)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+  DONE;
+}
+  [(set_attr "length" "24")])
+
+;; Reduction for a V2DF vector
+(define_insn_and_split "reduce_v2df"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+	(unspec:DF [(match_operand:V2DF 1 "vsx_register_operand" "wa")]
+		   UNSPEC_REDUCE_F64))
+   (clobber (match_scratch:DF 2 "=&wa"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2)
+	(vec_select:DF (match_dup 1)
+		       (parallel [(match_dup 3)])))
+   (set (match_dup 0)
+	(plus:DF (match_dup 4)
+		 (match_dup 2)))]
+{
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  operands[3] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+  operands[4] = gen_rtx_REG (DFmode, reg1);
+})
+
+;; Reduction for a pair of V2DF vectors
+(define_insn_and_split "reduce_v4df"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+	(unspec:DF [(match_operand:OO 1 "vsx_register_operand" "wa")]
+		   UNSPEC_REDUCE_F64))
+   (clobber (match_scratch:DF 2 "=&wa"))
+   (clobber (match_scratch:V2DF 3 "=&wa"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3)
+	(plus:V2DF (match_dup 4)
+		   (match_dup 5)))
+   (set (match_dup 2)
+	(vec_select:DF (match_dup 3)
+		       (parallel [(match_dup 6)])))
+   (set (match_dup 0)
+	(plus:DF (match_dup 7)
+		 (match_dup 2)))]
+{
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (V2DFmode, reg1);
+  operands[5] = gen_rtx_REG (V2DFmode, reg1 + 1);
+  operands[6] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+  operands[7] = gen_rtx_REG (DFmode, reg3);
+})
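
For readers new to the vector-pair idiom above: an OOmode value lives in
two adjacent VSX registers, and each insn splits after reload into the
same operation on the two 128-bit halves.  A rough C model of the
semantics (the type and function names here are illustrative, not GCC
internals; assumes <altivec.h> and VSX):

    #include <altivec.h>

    /* Model an OOmode vector pair as two adjacent 128-bit halves.  */
    typedef struct { vector double hi, lo; } v4df_pair;

    /* What vpair_addv4df3 computes once split: one xvadddp per half.  */
    static inline v4df_pair
    vpair_add_v4df (v4df_pair a, v4df_pair b)
    {
      v4df_pair r;
      r.hi = a.hi + b.hi;
      r.lo = a.lo + b.lo;
      return r;
    }

    /* Rough model of the reduce_v4sf ladder: rotate by 8 bytes and add,
       then rotate by 4 and add, so the total lands in every lane; one
       lane is then converted to scalar (lane choice is endian
       dependent).  */
    static inline float
    reduce_add_v4sf (vector float v)
    {
      vector float t = v + vec_sld (v, v, 8);	/* vsldoi 8; xvaddsp */
      vector float u = t + vec_sld (t, t, 4);	/* vsldoi 4; xvaddsp */
      return u[0];				/* xscvspdp-style extract */
    }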
diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc
index 82cc3a19447..e32d9175a0c 100644
--- a/gcc/config/rs6000/rs6000-builtin.cc
+++ b/gcc/config/rs6000/rs6000-builtin.cc
@@ -1261,6 +1261,49 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi,
   return true;
 }
 
+/* Helper function to fold the overloaded floating point built-in functions
+   for the scalar and vector types that support the operation directly.  */
+
+static void
+fold_builtin_overload_fp (gimple_stmt_iterator *gsi,
+			  gimple *stmt,
+			  enum tree_code code,
+			  int nargs)
+{
+  location_t loc = gimple_location (stmt);
+  tree lhs = gimple_call_lhs (stmt);
+  tree t;
+
+  if (nargs == 1)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      t = build1 (code, TREE_TYPE (lhs), arg0);
+    }
+
+  else if (nargs == 2)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      tree arg1 = gimple_call_arg (stmt, 1);
+      t = build2 (code, TREE_TYPE (lhs), arg0, arg1);
+    }
+
+  else if (nargs == 3)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      tree arg1 = gimple_call_arg (stmt, 1);
+      tree arg2 = gimple_call_arg (stmt, 2);
+      t = build3 (code, TREE_TYPE (lhs), arg0, arg1, arg2);
+    }
+
+  else
+    gcc_unreachable ();
+
+  gimple *g = gimple_build_assign (lhs, t);
+  gimple_set_location (g, loc);
+  gsi_replace (gsi, g, true);
+}
+
 /* Fold a machine-dependent built-in in GIMPLE.  (For folding into
    a constant, use rs6000_fold_builtin.)  */
 bool
@@ -2233,6 +2276,68 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	return true;
       }
 
+    case RS6000_BIF_ABS_F32_SCALAR:
+    case RS6000_BIF_ABS_F32_VECTOR:
+    case RS6000_BIF_ABS_F64_SCALAR:
+    case RS6000_BIF_ABS_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, ABS_EXPR, 1);
+      return true;
+
+    case RS6000_BIF_ADD_F32_SCALAR:
+    case RS6000_BIF_ADD_F32_VECTOR:
+    case RS6000_BIF_ADD_F64_SCALAR:
+    case RS6000_BIF_ADD_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, PLUS_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_MULT_F32_SCALAR:
+    case RS6000_BIF_MULT_F32_VECTOR:
+    case RS6000_BIF_MULT_F64_SCALAR:
+    case RS6000_BIF_MULT_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MULT_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_NEG_F32_SCALAR:
+    case RS6000_BIF_NEG_F32_VECTOR:
+    case RS6000_BIF_NEG_F64_SCALAR:
+    case RS6000_BIF_NEG_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, NEGATE_EXPR, 1);
+      return true;
+
+    case RS6000_BIF_REDUCE_F32_SCALAR:
+    case RS6000_BIF_REDUCE_F64_SCALAR:
+      {
+	location_t loc = gimple_location (stmt);
+	lhs = gimple_call_lhs (stmt);
+	arg0 = gimple_call_arg (stmt, 0);
+	g = gimple_build_assign (lhs, arg0);
+	gimple_set_location (g, loc);
+	gsi_replace (gsi, g, true);
+	return true;
+      }
+
+    case RS6000_BIF_SMAX_F32_SCALAR:
+    case RS6000_BIF_SMAX_F32_VECTOR:
+    case RS6000_BIF_SMAX_F64_SCALAR:
+    case RS6000_BIF_SMAX_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MAX_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_SMIN_F32_SCALAR:
+    case RS6000_BIF_SMIN_F32_VECTOR:
+    case RS6000_BIF_SMIN_F64_SCALAR:
+    case RS6000_BIF_SMIN_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MIN_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_SUB_F32_SCALAR:
+    case RS6000_BIF_SUB_F32_VECTOR:
+    case RS6000_BIF_SUB_F64_SCALAR:
+    case RS6000_BIF_SUB_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MINUS_EXPR, 2);
+      return true;
+
     default:
       if (TARGET_DEBUG_BUILTIN)
 	fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
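
The practical effect of the folding above: the scalar and 128-bit vector
overloads become ordinary tree expressions, so with this patch applied
the two functions below should generate identical code (sketch; names
are illustrative):

    vector float
    via_builtin (vector float a, vector float b)
    {
      return __builtin_add_f32 (a, b);	/* folded to a + b in gimple */
    }

    vector float
    via_operator (vector float a, vector float b)
    {
      return a + b;
    }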
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index 35c4cdf74c5..acc76adca12 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4116,3 +4116,176 @@
 
   void __builtin_vsx_stxvp (v256, unsigned long, const v256 *);
     STXVP nothing {mma,pair}
+
+; Built-ins for overloaded floating point operations.  The scalar and
+; 128-bit vector forms are converted into direct operations, while the
+; 256-bit forms are kept as vector pair insns that are split into
+; separate operations after register allocation.
+
+  float __builtin_abs_f32_scalar (float);
+    ABS_F32_SCALAR nothing {}
+  vf __builtin_abs_f32_vector (vf);
+    ABS_F32_VECTOR nothing {}
+  v256 __builtin_abs_f32_vpair (v256);
+    ABS_F32_VPAIR vpair_absv8sf2 {mma}
+
+  double __builtin_abs_f64_scalar (double);
+    ABS_F64_SCALAR nothing {}
+  vd __builtin_abs_f64_vector (vd);
+    ABS_F64_VECTOR nothing {}
+  v256 __builtin_abs_f64_vpair (v256);
+    ABS_F64_VPAIR vpair_absv4df2 {mma}
+
+  float __builtin_add_f32_scalar (float, float);
+    ADD_F32_SCALAR nothing {}
+  vf __builtin_add_f32_vector (vf, vf);
+    ADD_F32_VECTOR nothing {}
+  v256 __builtin_add_f32_vpair (v256, v256);
+    ADD_F32_VPAIR vpair_addv8sf3 {mma}
+
+  double __builtin_add_f64_scalar (double, double);
+    ADD_F64_SCALAR nothing {}
+  vd __builtin_add_f64_vector (vd, vd);
+    ADD_F64_VECTOR nothing {}
+  v256 __builtin_add_f64_vpair (v256, v256);
+    ADD_F64_VPAIR vpair_addv4df3 {mma}
+
+  float __builtin_copysign_f32_scalar (float, float);
+    COPYSIGN_F32_SCALAR copysignsf3_fcpsgn {}
+  vf __builtin_copysign_f32_vector (vf, vf);
+    COPYSIGN_F32_VECTOR vsx_copysignv4sf3 {}
+  v256 __builtin_copysign_f32_vpair (v256, v256);
+    COPYSIGN_F32_VPAIR vpair_copysignv8sf3 {mma}
+
+  double __builtin_copysign_f64_scalar (double, double);
+    COPYSIGN_F64_SCALAR copysigndf3_fcpsgn {}
+  vd __builtin_copysign_f64_vector (vd, vd);
+    COPYSIGN_F64_VECTOR vsx_copysignv2df3 {}
+  v256 __builtin_copysign_f64_vpair (v256, v256);
+    COPYSIGN_F64_VPAIR vpair_copysignv4df3 {mma}
+
+  float __builtin_div_f32_scalar (float, float);
+    DIV_F32_SCALAR divsf3 {}
+  vf __builtin_div_f32_vector (vf, vf);
+    DIV_F32_VECTOR divv4sf3 {}
+  v256 __builtin_div_f32_vpair (v256, v256);
+    DIV_F32_VPAIR vpair_divv8sf3 {mma}
+
+  double __builtin_div_f64_scalar (double, double);
+    DIV_F64_SCALAR divdf3 {}
+  vd __builtin_div_f64_vector (vd, vd);
+    DIV_F64_VECTOR divv2df3 {}
+  v256 __builtin_div_f64_vpair (v256, v256);
+    DIV_F64_VPAIR vpair_divv4df3 {mma}
+
+  float __builtin_fma_f32_scalar (float, float, float);
+    FMA_F32_SCALAR fmasf4 {}
+  vf __builtin_fma_f32_vector (vf, vf, vf);
+    FMA_F32_VECTOR fmav4sf4 {}
+  v256 __builtin_fma_f32_vpair (v256, v256, v256);
+    FMA_F32_VPAIR vpair_fmav8sf4 {mma}
+
+  double __builtin_fma_f64_scalar (double, double, double);
+    FMA_F64_SCALAR fmadf4 {}
+  vd __builtin_fma_f64_vector (vd, vd, vd);
+    FMA_F64_VECTOR fmav2df4 {}
+  v256 __builtin_fma_f64_vpair (v256, v256, v256);
+    FMA_F64_VPAIR vpair_fmav4df4 {mma}
+
+  float __builtin_mult_f32_scalar (float, float);
+    MULT_F32_SCALAR nothing {}
+  vf __builtin_mult_f32_vector (vf, vf);
+    MULT_F32_VECTOR nothing {}
+  v256 __builtin_mult_f32_vpair (v256, v256);
+    MULT_F32_VPAIR vpair_mulv8sf3 {mma}
+
+  double __builtin_mult_f64_scalar (double, double);
+    MULT_F64_SCALAR nothing {}
+  vd __builtin_mult_f64_vector (vd, vd);
+    MULT_F64_VECTOR nothing {}
+  v256 __builtin_mult_f64_vpair (v256, v256);
+    MULT_F64_VPAIR vpair_mulv4df3 {mma}
+
+  float __builtin_neg_f32_scalar (float);
+    NEG_F32_SCALAR nothing {}
+  vf __builtin_neg_f32_vector (vf);
+    NEG_F32_VECTOR nothing {}
+  v256 __builtin_neg_f32_vpair (v256);
+    NEG_F32_VPAIR vpair_negv8sf2 {mma}
+
+  double __builtin_neg_f64_scalar (double);
+    NEG_F64_SCALAR nothing {}
+  vd __builtin_neg_f64_vector (vd);
+    NEG_F64_VECTOR nothing {}
+  v256 __builtin_neg_f64_vpair (v256);
+    NEG_F64_VPAIR vpair_negv4df2 {mma}
+
+  float __builtin_reduce_f32_scalar (float);
+    REDUCE_F32_SCALAR nothing {}
+  float __builtin_reduce_f32_vector (vf);
+    REDUCE_F32_VECTOR reduce_v4sf {}
+  float __builtin_reduce_f32_vpair (v256);
+    REDUCE_F32_VPAIR reduce_v8sf {mma,pair}
+
+  double __builtin_reduce_f64_scalar (double);
+    REDUCE_F64_SCALAR nothing {}
+  double __builtin_reduce_f64_vector (vd);
+    REDUCE_F64_VECTOR reduce_v2df {}
+  double __builtin_reduce_f64_vpair (v256);
+    REDUCE_F64_VPAIR reduce_v4df {mma,pair}
+
+  float __builtin_smax_f32_scalar (float, float);
+    SMAX_F32_SCALAR nothing {}
+  vf __builtin_smax_f32_vector (vf, vf);
+    SMAX_F32_VECTOR nothing {}
+  v256 __builtin_smax_f32_vpair (v256, v256);
+    SMAX_F32_VPAIR vpair_smaxv8sf3 {mma}
+
+  double __builtin_smax_f64_scalar (double, double);
+    SMAX_F64_SCALAR nothing {}
+  vd __builtin_smax_f64_vector (vd, vd);
+    SMAX_F64_VECTOR nothing {}
+  v256 __builtin_smax_f64_vpair (v256, v256);
+    SMAX_F64_VPAIR vpair_smaxv4df3 {mma}
+
+  float __builtin_smin_f32_scalar (float, float);
+    SMIN_F32_SCALAR nothing {}
+  vf __builtin_smin_f32_vector (vf, vf);
+    SMIN_F32_VECTOR nothing {}
+  v256 __builtin_smin_f32_vpair (v256, v256);
+    SMIN_F32_VPAIR vpair_sminv8sf3 {mma}
+
+  double __builtin_smin_f64_scalar (double, double);
+    SMIN_F64_SCALAR nothing {}
+  vd __builtin_smin_f64_vector (vd, vd);
+    SMIN_F64_VECTOR nothing {}
+  v256 __builtin_smin_f64_vpair (v256, v256);
+    SMIN_F64_VPAIR vpair_sminv4df3 {mma}
+
+  float __builtin_sqrt_f32_scalar (float);
+    SQRT_F32_SCALAR nothing {}
+  vf __builtin_sqrt_f32_vector (vf);
+    SQRT_F32_VECTOR nothing {}
+  v256 __builtin_sqrt_f32_vpair (v256);
+    SQRT_F32_VPAIR vpair_sqrtv8sf2 {mma}
+
+  double __builtin_sqrt_f64_scalar (double);
+    SQRT_F64_SCALAR nothing {}
+  vd __builtin_sqrt_f64_vector (vd);
+    SQRT_F64_VECTOR nothing {}
+  v256 __builtin_sqrt_f64_vpair (v256);
+    SQRT_F64_VPAIR vpair_sqrtv4df2 {mma}
+
+  float __builtin_sub_f32_scalar (float, float);
+    SUB_F32_SCALAR nothing {}
+  vf __builtin_sub_f32_vector (vf, vf);
+    SUB_F32_VECTOR nothing {}
+  v256 __builtin_sub_f32_vpair (v256, v256);
+    SUB_F32_VPAIR vpair_subv8sf3 {mma}
+
+  double __builtin_sub_f64_scalar (double, double);
+    SUB_F64_SCALAR nothing {}
+  vd __builtin_sub_f64_vector (vd, vd);
+    SUB_F64_VECTOR nothing {}
+  v256 __builtin_sub_f64_vpair (v256, v256);
+    SUB_F64_VPAIR vpair_subv4df3 {mma}
diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def
index b83946f5ad8..bbc26de4568 100644
--- a/gcc/config/rs6000/rs6000-overload.def
+++ b/gcc/config/rs6000/rs6000-overload.def
@@ -6187,3 +6187,181 @@
     VUPKLSW  VUPKLSW_DEPR1
   vbll __builtin_vec_vupklsw (vbi);
     VUPKLSW  VUPKLSW_DEPR2
+
+;; Overloaded floating point built-in functions
+
+[ABS_F32, SKIP, __builtin_abs_f32]
+  float __builtin_abs_f32 (float);
+    ABS_F32_SCALAR
+  vf __builtin_abs_f32 (vf);
+    ABS_F32_VECTOR
+  v256 __builtin_abs_f32 (v256);
+    ABS_F32_VPAIR
+
+[ABS_F64, SKIP, __builtin_abs_f64]
+  double __builtin_abs_f64 (double);
+    ABS_F64_SCALAR
+  vd __builtin_abs_f64 (vd);
+    ABS_F64_VECTOR
+  v256 __builtin_abs_f64 (v256);
+    ABS_F64_VPAIR
+
+[ADD_F32, SKIP, __builtin_add_f32]
+  float __builtin_add_f32 (float, float);
+    ADD_F32_SCALAR
+  vf __builtin_add_f32 (vf, vf);
+    ADD_F32_VECTOR
+  v256 __builtin_add_f32 (v256, v256);
+    ADD_F32_VPAIR
+
+[ADD_F64, SKIP, __builtin_add_f64]
+  double __builtin_add_f64 (double, double);
+    ADD_F64_SCALAR
+  vd __builtin_add_f64 (vd, vd);
+    ADD_F64_VECTOR
+  v256 __builtin_add_f64 (v256, v256);
+    ADD_F64_VPAIR
+
+[COPYSIGN_F32, SKIP, __builtin_copysign_f32]
+  float __builtin_copysign_f32 (float, float);
+    COPYSIGN_F32_SCALAR
+  vf __builtin_copysign_f32 (vf, vf);
+    COPYSIGN_F32_VECTOR
+  v256 __builtin_copysign_f32 (v256, v256);
+    COPYSIGN_F32_VPAIR
+
+[COPYSIGN_F64, SKIP, __builtin_copysign_f64]
+  double __builtin_copysign_f64 (double, double);
+    COPYSIGN_F64_SCALAR
+  vd __builtin_copysign_f64 (vd, vd);
+    COPYSIGN_F64_VECTOR
+  v256 __builtin_copysign_f64 (v256, v256);
+    COPYSIGN_F64_VPAIR
+
+[DIV_F32, SKIP, __builtin_div_f32]
+  float __builtin_div_f32 (float, float);
+    DIV_F32_SCALAR
+  vf __builtin_div_f32 (vf, vf);
+    DIV_F32_VECTOR
+  v256 __builtin_div_f32 (v256, v256);
+    DIV_F32_VPAIR
+
+[DIV_F64, SKIP, __builtin_div_f64]
+  double __builtin_div_f64 (double, double);
+    DIV_F64_SCALAR
+  vd __builtin_div_f64 (vd, vd);
+    DIV_F64_VECTOR
+  v256 __builtin_div_f64 (v256, v256);
+    DIV_F64_VPAIR
+
+[FMA_F32, SKIP, __builtin_fma_f32]
+  float __builtin_fma_f32 (float, float, float);
+    FMA_F32_SCALAR
+  vf __builtin_fma_f32 (vf, vf, vf);
+    FMA_F32_VECTOR
+  v256 __builtin_fma_f32 (v256, v256, v256);
+    FMA_F32_VPAIR
+
+[FMA_F64, SKIP, __builtin_fma_f64]
+  double __builtin_fma_f64 (double, double, double);
+    FMA_F64_SCALAR
+  vd __builtin_fma_f64 (vd, vd, vd);
+    FMA_F64_VECTOR
+  v256 __builtin_fma_f64 (v256, v256, v256);
+    FMA_F64_VPAIR
+
+[MULT_F32, SKIP, __builtin_mult_f32]
+  float __builtin_mult_f32 (float, float);
+    MULT_F32_SCALAR
+  vf __builtin_mult_f32 (vf, vf);
+    MULT_F32_VECTOR
+  v256 __builtin_mult_f32 (v256, v256);
+    MULT_F32_VPAIR
+
+[MULT_F64, SKIP, __builtin_mult_f64]
+  double __builtin_mult_f64 (double, double);
+    MULT_F64_SCALAR
+  vd __builtin_mult_f64 (vd, vd);
+    MULT_F64_VECTOR
+  v256 __builtin_mult_f64 (v256, v256);
+    MULT_F64_VPAIR
+
+[NEG_F32, SKIP, __builtin_neg_f32]
+  float __builtin_neg_f32 (float);
+    NEG_F32_SCALAR
+  vf __builtin_neg_f32 (vf);
+    NEG_F32_VECTOR
+  v256 __builtin_neg_f32 (v256);
+    NEG_F32_VPAIR
+
+[NEG_F64, SKIP, __builtin_neg_f64]
+  double __builtin_neg_f64 (double);
+    NEG_F64_SCALAR
+  vd __builtin_neg_f64 (vd);
+    NEG_F64_VECTOR
+  v256 __builtin_neg_f64 (v256);
+    NEG_F64_VPAIR
+
+[REDUCE_F32, SKIP, __builtin_reduce_f32]
+  float __builtin_reduce_f32 (float);
+    REDUCE_F32_SCALAR
+  float __builtin_reduce_f32 (vf);
+    REDUCE_F32_VECTOR
+  float __builtin_reduce_f32 (v256);
+    REDUCE_F32_VPAIR
+
+[REDUCE_F64, SKIP, __builtin_reduce_f64]
+  double __builtin_reduce_f64 (double);
+    REDUCE_F64_SCALAR
+  double __builtin_reduce_f64 (vd);
+    REDUCE_F64_VECTOR
+  double __builtin_reduce_f64 (v256);
+    REDUCE_F64_VPAIR
+
+[SMAX_F32, SKIP, __builtin_smax_f32]
+  float __builtin_smax_f32 (float, float);
+    SMAX_F32_SCALAR
+  vf __builtin_smax_f32 (vf, vf);
+    SMAX_F32_VECTOR
+  v256 __builtin_smax_f32 (v256, v256);
+    SMAX_F32_VPAIR
+
+[SMAX_F64, SKIP, __builtin_smax_f64]
+  double __builtin_smax_f64 (double, double);
+    SMAX_F64_SCALAR
+  vd __builtin_smax_f64 (vd, vd);
+    SMAX_F64_VECTOR
+  v256 __builtin_smax_f64 (v256, v256);
+    SMAX_F64_VPAIR
+
+[SMIN_F32, SKIP, __builtin_smin_f32]
+  float __builtin_smin_f32 (float, float);
+    SMIN_F32_SCALAR
+  vf __builtin_smin_f32 (vf, vf);
+    SMIN_F32_VECTOR
+  v256 __builtin_smin_f32 (v256, v256);
+    SMIN_F32_VPAIR
+
+[SMIN_F64, SKIP, __builtin_smin_f64]
+  double __builtin_smin_f64 (double, double);
+    SMIN_F64_SCALAR
+  vd __builtin_smin_f64 (vd, vd);
+    SMIN_F64_VECTOR
+  v256 __builtin_smin_f64 (v256, v256);
+    SMIN_F64_VPAIR
+
+[SUB_F32, SKIP, __builtin_sub_f32]
+  float __builtin_sub_f32 (float, float);
+    SUB_F32_SCALAR
+  vf __builtin_sub_f32 (vf, vf);
+    SUB_F32_VECTOR
+  v256 __builtin_sub_f32 (v256, v256);
+    SUB_F32_VPAIR
+
+[SUB_F64, SKIP, __builtin_sub_f64]
+  double __builtin_sub_f64 (double, double);
+    SUB_F64_SCALAR
+  vd __builtin_sub_f64 (vd, vd);
+    SUB_F64_VECTOR
+  v256 __builtin_sub_f64 (v256, v256);
+    SUB_F64_VPAIR
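
Note how the REDUCE entries differ from the arithmetic ones: every
instance returns the scalar type and only the argument type varies.  A
dispatch sketch (assumes this patch; operand names are illustrative):

    float
    reduce_all (float s, vector float v, __vector_pair *p)
    {
      float r0 = __builtin_reduce_f32 (s);	/* REDUCE_F32_SCALAR (identity) */
      float r1 = __builtin_reduce_f32 (v);	/* REDUCE_F32_VECTOR */
      float r2 = __builtin_reduce_f32 (*p);	/* REDUCE_F32_VPAIR */
      return r0 + r1 + r2;
    }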
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 97eaacf8a7e..b2212cdc84d 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -19150,6 +19150,7 @@ The PVIPR documents the following overloaded functions:
 * PowerPC AltiVec Built-in Functions Available on ISA 2.07::
 * PowerPC AltiVec Built-in Functions Available on ISA 3.0::
 * PowerPC AltiVec Built-in Functions Available on ISA 3.1::
+* PowerPC Floating Point Overloaded Built-in Functions::
 @end menu
 
 @node PowerPC AltiVec Built-in Functions on ISA 2.05
@@ -21102,6 +21103,100 @@ int vec_any_le (vector signed __int128, vector signed __int128);
 int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
+@node PowerPC Floating Point Overloaded Built-in Functions
+@subsubsection PowerPC Floating Point Overloaded Built-in Functions
+
+The following additional built-in functions are available for the
+PowerPC family of processors.  They allow programmers to use the same
+built-in function name for scalars, 128-bit vectors, and, on Power10
+systems, vector pairs.
+
+The following built-in functions handle 32-bit floating point
+operations on all processors where the VSX registers are available:
+
+@smallexample
+float __builtin_abs_f32 (float);
+vector float __builtin_abs_f32 (vector float);
+float __builtin_add_f32 (float, float);
+vector float __builtin_add_f32 (vector float, vector float);
+float __builtin_copysign_f32 (float, float);
+vector float __builtin_copysign_f32 (vector float, vector float);
+float __builtin_div_f32 (float, float);
+vector float __builtin_div_f32 (vector float, vector float);
+float __builtin_fma_f32 (float, float, float);
+vector float __builtin_fma_f32 (vector float, vector float, vector float);
+float __builtin_mult_f32 (float, float);
+vector float __builtin_mult_f32 (vector float, vector float);
+float __builtin_neg_f32 (float);
+vector float __builtin_neg_f32 (vector float);
+float __builtin_smax_f32 (float, float);
+vector float __builtin_smax_f32 (vector float, vector float);
+float __builtin_smin_f32 (float, float);
+vector float __builtin_smin_f32 (vector float, vector float);
+float __builtin_sub_f32 (float, float);
+vector float __builtin_sub_f32 (vector float, vector float);
+@end smallexample
+
+The following built-in functions handle 32-bit floating point
+operations on Power10 systems that support paired vector load and
+store instructions:
+
+@smallexample
+__vector_pair __builtin_abs_f32 (__vector_pair);
+__vector_pair __builtin_add_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_copysign_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_div_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_fma_f32 (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_mult_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_neg_f32 (__vector_pair);
+__vector_pair __builtin_smax_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_smin_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_sub_f32 (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions handle 64-bit floating point
+operations on all processors where the VSX registers are available:
+
+@smallexample
+double __builtin_abs_f64 (double);
+vector double __builtin_abs_f64 (vector double);
+double __builtin_add_f64 (double, double);
+vector double __builtin_add_f64 (vector double, vector double);
+double __builtin_copysign_f64 (double, double);
+vector double __builtin_copysign_f64 (vector double, vector double);
+double __builtin_div_f64 (double, double);
+vector double __builtin_div_f64 (vector double, vector double);
+double __builtin_fma_f64 (double, double, double);
+vector double __builtin_fma_f64 (vector double, vector double, vector double);
+double __builtin_mult_f64 (double, double);
+vector double __builtin_mult_f64 (vector double, vector double);
+double __builtin_neg_f64 (double);
+vector double __builtin_neg_f64 (vector double);
+double __builtin_smax_f64 (double, double);
+vector double __builtin_smax_f64 (vector double, vector double);
+double __builtin_smin_f64 (double, double);
+vector double __builtin_smin_f64 (vector double, vector double);
+double __builtin_sub_f64 (double, double);
+vector double __builtin_sub_f64 (vector double, vector double);
+@end smallexample
+
+The following built-in functions handle 64-bit floating point
+operations on Power10 systems that support paired vector load and
+store instructions:
+
+@smallexample
+__vector_pair __builtin_abs_f64 (__vector_pair);
+__vector_pair __builtin_add_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_copysign_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_div_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_fma_f64 (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_mult_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_neg_f64 (__vector_pair);
+__vector_pair __builtin_smax_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_smin_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_sub_f64 (__vector_pair, __vector_pair);
+@end smallexample
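+
+For example, given suitably declared operands, the argument types
+select the scalar, vector, or vector pair form of the same built-in
+name:
+
+@smallexample
+double d = __builtin_add_f64 (x, y);
+vector double v = __builtin_add_f64 (v1, v2);
+__vector_pair p = __builtin_add_f64 (p1, p2);
+@end smallexample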
+
 
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c
new file mode 100644
index 00000000000..400a54f2fd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using scalar float.  */
+
+#include "fp-overload.h"
+
+TEST (float, float, flt, 32)
+
+/* { dg-final { scan-assembler-times {\mfabs\M|\mxsabsdp\M}          1 } } */
+/* { dg-final { scan-assembler-times {\mfadds\M|\mxsaddsp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfmadds\M|\mxsmadd[am]sp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmsubs\M|\mxsmsub[am]sp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmuls\M|\mxsmulsp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnabs\M|\mxsnabsdp\M}        1 } } */
+/* { dg-final { scan-assembler-times {\mfneg\M|\mxsnegdp\M}          1 } } */
+/* { dg-final { scan-assembler-times {\mfnmadds\M|\mxsnmadd[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfnmsubs\M|\mxsnmsub[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfsubs\M|\mxssubsp\M}         1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}                          } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c
new file mode 100644
index 00000000000..14f76d8a8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using vector float.  */
+
+#include "fp-overload.h"
+
+TEST (vector float, float, vect, 32)
+
+/* { dg-final { scan-assembler-times {\mvsldoi\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}       3 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}       1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c
new file mode 100644
index 00000000000..466f056cf9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using __vector_pair.  */
+
+#include "fp-overload.h"
+
+TEST (__vector_pair, float, vpair, 32)
+
+/* { dg-final { scan-assembler-times {\mvsldoi\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}       5 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}       2 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c
new file mode 100644
index 00000000000..28e7c91c77c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using scalar double.  */
+
+#include "fp-overload.h"
+
+TEST (double, double, dbl, 64)
+
+/* { dg-final { scan-assembler-times {\mfabs\M|\mxsabsdp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfadd\M|\mxsadddp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfmadd\M|\mxsmadd[am]dp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmsub\M|\mxsmsub[am]dp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmul\M|\mxsmuldp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnabs\M|\mxsnabsdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mfneg\M|\mxsnegdp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnmadd\M|\mxsnmadd[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfnmsub\M|\mxsnmsub[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfsub\M|\mxssubdp\M}         1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}                         } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c
new file mode 100644
index 00000000000..4289ba4edb9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using vector double.  */
+
+#include "fp-overload.h"
+
+TEST (vector double, double, vect, 64)
+
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]dp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]dp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M}      1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c
new file mode 100644
index 00000000000..7dd0613bf88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using __vector_pair.  */
+
+#include "fp-overload.h"
+
+TEST (__vector_pair, double, vpair, 64)
+
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}        3 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M}       1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}               } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload.h b/gcc/testsuite/gcc.target/powerpc/fp-overload.h
new file mode 100644
index 00000000000..a1ce5f83765
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload.h
@@ -0,0 +1,85 @@
+/* Common code to test the floating point overload functions.  */
+
+#define TEST(TYPE, SCALAR, TYPE_STR, SIZE)				\
+									\
+void									\
+do_add_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_add_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_sub_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_sub_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_mult_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_mult_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_neg_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_neg_f ## SIZE (*q);					\
+}									\
+									\
+void									\
+do_abs_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_abs_f ## SIZE (*q);					\
+}									\
+									\
+void									\
+do_nabs_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_neg_f ## SIZE (__builtin_abs_f ## SIZE (*q));		\
+}									\
+									\
+void									\
+do_fma_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				   TYPE *q,				\
+				   TYPE *r,				\
+				   TYPE *s)				\
+{									\
+  *p = __builtin_fma_f ## SIZE (*q, *r, *s);				\
+}									\
+									\
+void									\
+do_fms_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				   TYPE *q,				\
+				   TYPE *r,				\
+				   TYPE *s)				\
+{									\
+  TYPE neg_s = __builtin_neg_f ## SIZE (*s);				\
+  *p = __builtin_fma_f ## SIZE (*q, *r, neg_s);				\
+}									\
+									\
+void									\
+do_nfma_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				    TYPE *q,				\
+				    TYPE *r,				\
+				    TYPE *s)				\
+{									\
+  TYPE f = __builtin_fma_f ## SIZE (*q, *r, *s);			\
+  *p = __builtin_neg_f ## SIZE (f);					\
+}									\
+									\
+void									\
+do_nfms_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				    TYPE *q,				\
+				    TYPE *r,				\
+				    TYPE *s)				\
+{									\
+  TYPE neg_s = __builtin_neg_f ## SIZE (*s);				\
+  TYPE f = __builtin_fma_f ## SIZE (*q, *r, neg_s);			\
+  *p = __builtin_neg_f ## SIZE (f);					\
+}									\
+									\
+void									\
+do_reduce_ ## TYPE_STR ## _f ## SIZE (SCALAR *p, TYPE *q)		\
+{									\
+  *p = __builtin_reduce_f ## SIZE (*q);					\
+}
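
For reference, one instantiation of the macro above, TEST (float,
float, flt, 32), expands (in part) to functions such as:

    void
    do_add_flt_f32 (float *p, float *q, float *r)
    {
      *p = __builtin_add_f32 (*q, *r);
    }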

+
+(define_insn_and_split "vpair_nfma<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	    VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 5)
+			     (match_dup 6)
+			     (match_dup 7))))
+   (set (match_dup 8)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 9)
+			     (match_dup 10)
+			     (match_dup 11))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms<vpair_type>4"
+  [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+	(unspec:OO
+	 [(neg:OO
+	   (unspec:OO
+	    [(fma:OO
+	      (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+	      (match_operand:OO 2 "vsx_register_operand" "wa,0")
+	      (unspec:OO
+	       [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+	       VPAIR_WRAPPER))]
+	   VPAIR_WRAPPER))]
+	 VPAIR_WRAPPER))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 5)
+			     (match_dup 6)
+			     (neg:<VPAIR_VECTOR> (match_dup 7)))))
+   (set (match_dup 8)
+	(neg:<VPAIR_VECTOR>
+	 (fma:<VPAIR_VECTOR> (match_dup 9)
+			     (match_dup 10)
+			     (neg:<VPAIR_VECTOR> (match_dup 11)))))]
+{
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg2 = reg_or_subregno (operands[2]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0);
+  operands[5] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1);
+  operands[6] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2);
+  operands[7] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3);
+
+  operands[8] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg0 + 1);
+  operands[9] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg1 + 1);
+  operands[10] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg2 + 1);
+  operands[11] = gen_rtx_REG (<VPAIR_VECTOR>mode, reg3 + 1);
+}
+  [(set_attr "length" "8")])
+
+;; Reduction for a V4SF vector
+(define_insn_and_split "reduce_v4sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "v")]
+		   UNSPEC_REDUCE_F32))
+   (clobber (match_scratch:V4SF 2 "=&v"))
+   (clobber (match_scratch:V4SF 3 "=&v"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp1 = operands[2];
+  rtx tmp2 = operands[3];
+
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp1, op1, op1, GEN_INT (8)));
+  emit_insn (gen_addv4sf3 (tmp1, op1, tmp1));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (4)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+  DONE;
+}
+  [(set_attr "length" "24")])
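+
+;; Illustrative C model of the sequence above ("vsldoi_bytes" and
+;; "first_lane" are hypothetical stand-ins for the altivec_vsldoi rotate
+;; and the vsx_xscvspdp_scalar2 conversion):
+;;
+;;   float reduce_v4sf_model (vector float v)
+;;   {
+;;     vector float t1 = v + vsldoi_bytes (v, v, 8);    /* pairwise sums */
+;;     vector float t2 = t1 + vsldoi_bytes (t1, t1, 4); /* total in a lane */
+;;     return first_lane (t2);
+;;   }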
+
+;; Reduction for a pair of V4SF vectors
+(define_insn_and_split "reduce_v8sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(unspec:SF [(match_operand:OO 1 "vsx_register_operand" "v")]
+		   UNSPEC_REDUCE_F32))
+   (clobber (match_scratch:V4SF 2 "=&v"))
+   (clobber (match_scratch:V4SF 3 "=&v"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp1 = operands[2];
+  rtx tmp2 = operands[3];
+  unsigned r = reg_or_subregno (op1);
+  rtx op1_hi = gen_rtx_REG (V4SFmode, r);
+  rtx op1_lo = gen_rtx_REG (V4SFmode, r + 1);
+
+  emit_insn (gen_addv4sf3 (tmp1, op1_hi, op1_lo));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_altivec_vsldoi_v4sf (tmp1, tmp2, tmp2, GEN_INT (4)));
+  emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+  DONE;
+}
+  [(set_attr "length" "24")])
+
+;; Reduction for a V2DF vector
+(define_insn_and_split "reduce_v2df"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+	(unspec:DF [(match_operand:V2DF 1 "vsx_register_operand" "wa")]
+		   UNSPEC_REDUCE_F64))
+   (clobber (match_scratch:DF 2 "=&wa"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2)
+	(vec_select:DF (match_dup 1)
+		       (parallel [(match_dup 3)])))
+   (set (match_dup 0)
+	(plus:DF (match_dup 4)
+		 (match_dup 2)))]
+{
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  operands[3] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+  operands[4] = gen_rtx_REG (DFmode, reg1);
+})
+
+;; Reduction for a pair of V2DF vectors
+(define_insn_and_split "reduce_v4df"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+	(unspec:DF [(match_operand:OO 1 "vsx_register_operand" "wa")]
+		   UNSPEC_REDUCE_F64))
+   (clobber (match_scratch:DF 2 "=&wa"))
+   (clobber (match_scratch:V2DF 3 "=&wa"))]
+  "TARGET_MMA"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3)
+	(plus:V2DF (match_dup 4)
+		   (match_dup 5)))
+   (set (match_dup 2)
+	(vec_select:DF (match_dup 3)
+		       (parallel [(match_dup 6)])))
+   (set (match_dup 0)
+	(plus:DF (match_dup 7)
+		 (match_dup 2)))]
+{
+  unsigned reg1 = reg_or_subregno (operands[1]);
+  unsigned reg3 = reg_or_subregno (operands[3]);
+
+  operands[4] = gen_rtx_REG (V2DFmode, reg1);
+  operands[5] = gen_rtx_REG (V2DFmode, reg1 + 1);
+  operands[6] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+  operands[7] = gen_rtx_REG (DFmode, reg3);
+})
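+
+;; Illustrative C model of the pair reduction above, where hi/lo denote the
+;; two V2DF halves held in consecutive VSX registers:
+;;
+;;   double reduce_v4df_model (vector double hi, vector double lo)
+;;   {
+;;     vector double t = hi + lo;  /* add the two halves */
+;;     return t[0] + t[1];         /* then the two elements */
+;;   }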
diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc
index 82cc3a19447..e32d9175a0c 100644
--- a/gcc/config/rs6000/rs6000-builtin.cc
+++ b/gcc/config/rs6000/rs6000-builtin.cc
@@ -1261,6 +1261,49 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi,
   return true;
 }
 
+/* Helper function to fold the overloaded floating point built-in functions
+   for the scalar and vector types that support the operation directly.  */
+
+static void
+fold_builtin_overload_fp (gimple_stmt_iterator *gsi,
+			  gimple *stmt,
+			  enum tree_code code,
+			  int nargs)
+{
+  location_t loc = gimple_location (stmt);
+  tree lhs = gimple_call_lhs (stmt);
+  tree t;
+
+  if (nargs == 1)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      t = build1 (code, TREE_TYPE (lhs), arg0);
+    }
+
+  else if (nargs == 2)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      tree arg1 = gimple_call_arg (stmt, 1);
+      t = build2 (code, TREE_TYPE (lhs), arg0, arg1);
+    }
+
+  else if (nargs == 3)
+    {
+      tree arg0 = gimple_call_arg (stmt, 0);
+      tree arg1 = gimple_call_arg (stmt, 1);
+      tree arg2 = gimple_call_arg (stmt, 2);
+      t = build3 (code, TREE_TYPE (lhs), arg0, arg1, arg2);
+    }
+
+  else
+    gcc_unreachable ();
+
+  gimple *g = gimple_build_assign (lhs, t);
+  gimple_set_location (g, loc);
+  gsi_replace (gsi, g, true);
+}
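+
+/* For example (illustrative only), a call such as
+
+     x = __builtin_add_f32_vector (a, b);
+
+   is replaced in GIMPLE by the equivalent assignment
+
+     x = a + b;
+
+   so later passes see an ordinary PLUS_EXPR instead of a built-in call.  */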
+
 /* Fold a machine-dependent built-in in GIMPLE.  (For folding into
    a constant, use rs6000_fold_builtin.)  */
 bool
@@ -2233,6 +2276,68 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	return true;
       }
 
+    case RS6000_BIF_ABS_F32_SCALAR:
+    case RS6000_BIF_ABS_F32_VECTOR:
+    case RS6000_BIF_ABS_F64_SCALAR:
+    case RS6000_BIF_ABS_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, ABS_EXPR, 1);
+      return true;
+
+    case RS6000_BIF_ADD_F32_SCALAR:
+    case RS6000_BIF_ADD_F32_VECTOR:
+    case RS6000_BIF_ADD_F64_SCALAR:
+    case RS6000_BIF_ADD_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, PLUS_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_MULT_F32_SCALAR:
+    case RS6000_BIF_MULT_F32_VECTOR:
+    case RS6000_BIF_MULT_F64_SCALAR:
+    case RS6000_BIF_MULT_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MULT_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_NEG_F32_SCALAR:
+    case RS6000_BIF_NEG_F32_VECTOR:
+    case RS6000_BIF_NEG_F64_SCALAR:
+    case RS6000_BIF_NEG_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, NEGATE_EXPR, 1);
+      return true;
+
+    case RS6000_BIF_REDUCE_F32_SCALAR:
+    case RS6000_BIF_REDUCE_F64_SCALAR:
+      {
+	location_t loc = gimple_location (stmt);
+	lhs = gimple_call_lhs (stmt);
+	arg0 = gimple_call_arg (stmt, 0);
+	g = gimple_build_assign (lhs, arg0);
+	gimple_set_location (g, loc);
+	gsi_replace (gsi, g, true);
+	return true;
+      }
+
+    case RS6000_BIF_SMAX_F32_SCALAR:
+    case RS6000_BIF_SMAX_F32_VECTOR:
+    case RS6000_BIF_SMAX_F64_SCALAR:
+    case RS6000_BIF_SMAX_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MAX_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_SMIN_F32_SCALAR:
+    case RS6000_BIF_SMIN_F32_VECTOR:
+    case RS6000_BIF_SMIN_F64_SCALAR:
+    case RS6000_BIF_SMIN_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MIN_EXPR, 2);
+      return true;
+
+    case RS6000_BIF_SUB_F32_SCALAR:
+    case RS6000_BIF_SUB_F32_VECTOR:
+    case RS6000_BIF_SUB_F64_SCALAR:
+    case RS6000_BIF_SUB_F64_VECTOR:
+      fold_builtin_overload_fp (gsi, stmt, MINUS_EXPR, 2);
+      return true;
+
     default:
       if (TARGET_DEBUG_BUILTIN)
 	fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index 35c4cdf74c5..acc76adca12 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4116,3 +4116,176 @@
 
   void __builtin_vsx_stxvp (v256, unsigned long, const v256 *);
     STXVP nothing {mma,pair}
+
+; Built-in functions for overloaded floating point operations.  The scalar
+; and 128-bit vector variants are folded into direct GIMPLE operations.  The
+; 256-bit variants are kept as vector pair insns that are split into
+; separate operations after register allocation.
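+;
+; For instance (sketch; assumes the declarations below and, for the vpair
+; form, MMA support):
+;
+;   vector float a, b, c;
+;   c = __builtin_add_f32_vector (a, b);  /* folds to c = a + b */
+;
+;   __vector_pair x, y, z;
+;   z = __builtin_add_f32_vpair (x, y);   /* expands via vpair_addv8sf3 */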
+
+  float __builtin_abs_f32_scalar (float);
+    ABS_F32_SCALAR nothing {}
+  vf __builtin_abs_f32_vector (vf);
+    ABS_F32_VECTOR nothing {}
+  v256 __builtin_abs_f32_vpair (v256);
+    ABS_F32_VPAIR vpair_absv8sf2 {mma}
+
+  double __builtin_abs_f64_scalar (double);
+    ABS_F64_SCALAR nothing {}
+  vd __builtin_abs_f64_vector (vd);
+    ABS_F64_VECTOR nothing {}
+  v256 __builtin_abs_f64_vpair (v256);
+    ABS_F64_VPAIR vpair_absv4df2 {mma}
+
+  float __builtin_add_f32_scalar (float, float);
+    ADD_F32_SCALAR nothing {}
+  vf __builtin_add_f32_vector (vf, vf);
+    ADD_F32_VECTOR nothing {}
+  v256 __builtin_add_f32_vpair (v256, v256);
+    ADD_F32_VPAIR vpair_addv8sf3 {mma}
+
+  double __builtin_add_f64_scalar (double, double);
+    ADD_F64_SCALAR nothing {}
+  vd __builtin_add_f64_vector (vd, vd);
+    ADD_F64_VECTOR nothing {}
+  v256 __builtin_add_f64_vpair (v256, v256);
+    ADD_F64_VPAIR vpair_addv4df3 {mma}
+
+  float __builtin_copysign_f32_scalar (float, float);
+    COPYSIGN_F32_SCALAR copysignsf3_fcpsgn {}
+  vf __builtin_copysign_f32_vector (vf, vf);
+    COPYSIGN_F32_VECTOR vsx_copysignv4sf3 {}
+  v256 __builtin_copysign_f32_vpair (v256, v256);
+    COPYSIGN_F32_VPAIR vpair_copysignv8sf3 {mma}
+
+  double __builtin_copysign_f64_scalar (double, double);
+    COPYSIGN_F64_SCALAR copysigndf3_fcpsgn {}
+  vd __builtin_copysign_f64_vector (vd, vd);
+    COPYSIGN_F64_VECTOR vsx_copysignv2df3 {}
+  v256 __builtin_copysign_f64_vpair (v256, v256);
+    COPYSIGN_F64_VPAIR vpair_copysignv4df3 {mma}
+
+  float __builtin_div_f32_scalar (float, float);
+    DIV_F32_SCALAR divsf3 {}
+  vf __builtin_div_f32_vector (vf, vf);
+    DIV_F32_VECTOR divv4sf3 {}
+  v256 __builtin_div_f32_vpair (v256, v256);
+    DIV_F32_VPAIR vpair_divv8sf3 {mma}
+
+  double __builtin_div_f64_scalar (double, double);
+    DIV_F64_SCALAR divdf3 {}
+  vd __builtin_div_f64_vector (vd, vd);
+    DIV_F64_VECTOR divv2df3 {}
+  v256 __builtin_div_f64_vpair (v256, v256);
+    DIV_F64_VPAIR vpair_divv4df3 {mma}
+
+  float __builtin_fma_f32_scalar (float, float, float);
+    FMA_F32_SCALAR fmasf4 {}
+  vf __builtin_fma_f32_vector (vf, vf, vf);
+    FMA_F32_VECTOR fmav4sf4 {}
+  v256 __builtin_fma_f32_vpair (v256, v256, v256);
+    FMA_F32_VPAIR vpair_fmav8sf4 {mma}
+
+  double __builtin_fma_f64_scalar (double, double, double);
+    FMA_F64_SCALAR fmadf4 {}
+  vd __builtin_fma_f64_vector (vd, vd, vd);
+    FMA_F64_VECTOR fmav2df4 {}
+  v256 __builtin_fma_f64_vpair (v256, v256, v256);
+    FMA_F64_VPAIR vpair_fmav4df4 {mma}
+
+  float __builtin_mult_f32_scalar (float, float);
+    MULT_F32_SCALAR nothing {}
+  vf __builtin_mult_f32_vector (vf, vf);
+    MULT_F32_VECTOR nothing {}
+  v256 __builtin_mult_f32_vpair (v256, v256);
+    MULT_F32_VPAIR vpair_mulv8sf3 {mma}
+
+  double __builtin_mult_f64_scalar (double, double);
+    MULT_F64_SCALAR nothing {}
+  vd __builtin_mult_f64_vector (vd, vd);
+    MULT_F64_VECTOR nothing {}
+  v256 __builtin_mult_f64_vpair (v256, v256);
+    MULT_F64_VPAIR vpair_mulv4df3 {mma}
+
+  float __builtin_neg_f32_scalar (float);
+    NEG_F32_SCALAR nothing {}
+  vf __builtin_neg_f32_vector (vf);
+    NEG_F32_VECTOR nothing {}
+  v256 __builtin_neg_f32_vpair (v256);
+    NEG_F32_VPAIR vpair_negv8sf2 {mma}
+
+  double __builtin_neg_f64_scalar (double);
+    NEG_F64_SCALAR nothing {}
+  vd __builtin_neg_f64_vector (vd);
+    NEG_F64_VECTOR nothing {}
+  v256 __builtin_neg_f64_vpair (v256);
+    NEG_F64_VPAIR vpair_negv4df2 {mma}
+
+  float __builtin_reduce_f32_scalar (float);
+    REDUCE_F32_SCALAR nothing {}
+  float __builtin_reduce_f32_vector (vf);
+    REDUCE_F32_VECTOR reduce_v4sf {}
+  float __builtin_reduce_f32_vpair (v256);
+    REDUCE_F32_VPAIR reduce_v8sf {mma,pair}
+
+  double __builtin_reduce_f64_scalar (double);
+    REDUCE_F64_SCALAR nothing {}
+  double __builtin_reduce_f64_vector (vd);
+    REDUCE_F64_VECTOR reduce_v2df {}
+  double __builtin_reduce_f64_vpair (v256);
+    REDUCE_F64_VPAIR reduce_v4df {mma,pair}
+
+  float __builtin_smax_f32_scalar (float, float);
+    SMAX_F32_SCALAR nothing {}
+  vf __builtin_smax_f32_vector (vf, vf);
+    SMAX_F32_VECTOR nothing {}
+  v256 __builtin_smax_f32_vpair (v256, v256);
+    SMAX_F32_VPAIR vpair_smaxv8sf3 {mma}
+
+  double __builtin_smax_f64_scalar (double, double);
+    SMAX_F64_SCALAR nothing {}
+  vd __builtin_smax_f64_vector (vd, vd);
+    SMAX_F64_VECTOR nothing {}
+  v256 __builtin_smax_f64_vpair (v256, v256);
+    SMAX_F64_VPAIR vpair_smaxv4df3 {mma}
+
+  float __builtin_smin_f32_scalar (float, float);
+    SMIN_F32_SCALAR nothing {}
+  vf __builtin_smin_f32_vector (vf, vf);
+    SMIN_F32_VECTOR nothing {}
+  v256 __builtin_smin_f32_vpair (v256, v256);
+    SMIN_F32_VPAIR vpair_sminv8sf3 {mma}
+
+  double __builtin_smin_f64_scalar (double, double);
+    SMIN_F64_SCALAR nothing {}
+  vd __builtin_smin_f64_vector (vd, vd);
+    SMIN_F64_VECTOR nothing {}
+  v256 __builtin_smin_f64_vpair (v256, v256);
+    SMIN_F64_VPAIR vpair_sminv4df3 {mma}
+
+  float __builtin_sqrt_f32_scalar (float);
+    SQRT_F32_SCALAR nothing {}
+  vf __builtin_sqrt_f32_vector (vf);
+    SQRT_F32_VECTOR nothing {}
+  v256 __builtin_sqrt_f32_vpair (v256);
+    SQRT_F32_VPAIR vpair_sqrtv8sf2 {mma}
+
+  double __builtin_sqrt_f64_scalar (double);
+    SQRT_F64_SCALAR nothing {}
+  vd __builtin_sqrt_f64_vector (vd);
+    SQRT_F64_VECTOR nothing {}
+  v256 __builtin_sqrt_f64_vpair (v256);
+    SQRT_F64_VPAIR vpair_sqrtv4df2 {mma}
+
+  float __builtin_sub_f32_scalar (float, float);
+    SUB_F32_SCALAR nothing {}
+  vf __builtin_sub_f32_vector (vf, vf);
+    SUB_F32_VECTOR nothing {}
+  v256 __builtin_sub_f32_vpair (v256, v256);
+    SUB_F32_VPAIR vpair_subv8sf3 {mma}
+
+  double __builtin_sub_f64_scalar (double, double);
+    SUB_F64_SCALAR nothing {}
+  vd __builtin_sub_f64_vector (vd, vd);
+    SUB_F64_VECTOR nothing {}
+  v256 __builtin_sub_f64_vpair (v256, v256);
+    SUB_F64_VPAIR vpair_subv4df3 {mma}
diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def
index b83946f5ad8..bbc26de4568 100644
--- a/gcc/config/rs6000/rs6000-overload.def
+++ b/gcc/config/rs6000/rs6000-overload.def
@@ -6187,3 +6187,181 @@
     VUPKLSW  VUPKLSW_DEPR1
   vbll __builtin_vec_vupklsw (vbi);
     VUPKLSW  VUPKLSW_DEPR2
+
+;; Overloaded floating point built-in functions
+
+[ABS_F32, SKIP, __builtin_abs_f32]
+  float __builtin_abs_f32 (float);
+    ABS_F32_SCALAR
+  vf __builtin_abs_f32 (vf);
+    ABS_F32_VECTOR
+  v256 __builtin_abs_f32 (v256);
+    ABS_F32_VPAIR
+
+[ABS_F64, SKIP, __builtin_abs_f64]
+  double __builtin_abs_f64 (double);
+    ABS_F64_SCALAR
+  vd __builtin_abs_f64 (vd);
+    ABS_F64_VECTOR
+  v256 __builtin_abs_f64 (v256);
+    ABS_F64_VPAIR
+
+[ADD_F32, SKIP, __builtin_add_f32]
+  float __builtin_add_f32 (float, float);
+    ADD_F32_SCALAR
+  vf __builtin_add_f32 (vf, vf);
+    ADD_F32_VECTOR
+  v256 __builtin_add_f32 (v256, v256);
+    ADD_F32_VPAIR
+
+[ADD_F64, SKIP, __builtin_add_f64]
+  double __builtin_add_f64 (double, double);
+    ADD_F64_SCALAR
+  vd __builtin_add_f64 (vd, vd);
+    ADD_F64_VECTOR
+  v256 __builtin_add_f64 (v256, v256);
+    ADD_F64_VPAIR
+
+[COPYSIGN_F32, SKIP, __builtin_copysign_f32]
+  float __builtin_copysign_f32 (float, float);
+    COPYSIGN_F32_SCALAR
+  vf __builtin_copysign_f32 (vf, vf);
+    COPYSIGN_F32_VECTOR
+  v256 __builtin_copysign_f32 (v256, v256);
+    COPYSIGN_F32_VPAIR
+
+[COPYSIGN_F64, SKIP, __builtin_copysign_f64]
+  double __builtin_copysign_f64 (double, double);
+    COPYSIGN_F64_SCALAR
+  vd __builtin_copysign_f64 (vd, vd);
+    COPYSIGN_F64_VECTOR
+  v256 __builtin_copysign_f64 (v256, v256);
+    COPYSIGN_F64_VPAIR
+
+[DIV_F32, SKIP, __builtin_div_f32]
+  float __builtin_div_f32 (float, float);
+    DIV_F32_SCALAR
+  vf __builtin_div_f32 (vf, vf);
+    DIV_F32_VECTOR
+  v256 __builtin_div_f32 (v256, v256);
+    DIV_F32_VPAIR
+
+[DIV_F64, SKIP, __builtin_div_f64]
+  double __builtin_div_f64 (double, double);
+    DIV_F64_SCALAR
+  vd __builtin_div_f64 (vd, vd);
+    DIV_F64_VECTOR
+  v256 __builtin_div_f64 (v256, v256);
+    DIV_F64_VPAIR
+
+[FMA_F32, SKIP, __builtin_fma_f32]
+  float __builtin_fma_f32 (float, float, float);
+    FMA_F32_SCALAR
+  vf __builtin_fma_f32 (vf, vf, vf);
+    FMA_F32_VECTOR
+  v256 __builtin_fma_f32 (v256, v256, v256);
+    FMA_F32_VPAIR
+
+[FMA_F64, SKIP, __builtin_fma_f64]
+  double __builtin_fma_f64 (double, double, double);
+    FMA_F64_SCALAR
+  vd __builtin_fma_f64 (vd, vd, vd);
+    FMA_F64_VECTOR
+  v256 __builtin_fma_f64 (v256, v256, v256);
+    FMA_F64_VPAIR
+
+[MULT_F32, SKIP, __builtin_mult_f32]
+  float __builtin_mult_f32 (float, float);
+    MULT_F32_SCALAR
+  vf __builtin_mult_f32 (vf, vf);
+    MULT_F32_VECTOR
+  v256 __builtin_mult_f32 (v256, v256);
+    MULT_F32_VPAIR
+
+[MULT_F64, SKIP, __builtin_mult_f64]
+  double __builtin_mult_f64 (double, double);
+    MULT_F64_SCALAR
+  vd __builtin_mult_f64 (vd, vd);
+    MULT_F64_VECTOR
+  v256 __builtin_mult_f64 (v256, v256);
+    MULT_F64_VPAIR
+
+[NEG_F32, SKIP, __builtin_neg_f32]
+  float __builtin_neg_f32 (float);
+    NEG_F32_SCALAR
+  vf __builtin_neg_f32 (vf);
+    NEG_F32_VECTOR
+  v256 __builtin_neg_f32 (v256);
+    NEG_F32_VPAIR
+
+[NEG_F64, SKIP, __builtin_neg_f64]
+  double __builtin_neg_f64 (double);
+    NEG_F64_SCALAR
+  vd __builtin_neg_f64 (vd);
+    NEG_F64_VECTOR
+  v256 __builtin_neg_f64 (v256);
+    NEG_F64_VPAIR
+
+[REDUCE_F32, SKIP, __builtin_reduce_f32]
+  float __builtin_reduce_f32 (float);
+    REDUCE_F32_SCALAR
+  float __builtin_reduce_f32 (vf);
+    REDUCE_F32_VECTOR
+  float __builtin_reduce_f32 (v256);
+    REDUCE_F32_VPAIR
+
+[REDUCE_F64, SKIP, __builtin_reduce_f64]
+  double __builtin_reduce_f64 (double);
+    REDUCE_F64_SCALAR
+  double __builtin_reduce_f64 (vd);
+    REDUCE_F64_VECTOR
+  double __builtin_reduce_f64 (v256);
+    REDUCE_F64_VPAIR
+
+[SMAX_F32, SKIP, __builtin_smax_f32]
+  float __builtin_smax_f32 (float, float);
+    SMAX_F32_SCALAR
+  vf __builtin_smax_f32 (vf, vf);
+    SMAX_F32_VECTOR
+  v256 __builtin_smax_f32 (v256, v256);
+    SMAX_F32_VPAIR
+
+[SMAX_F64, SKIP, __builtin_smax_f64]
+  double __builtin_smax_f64 (double, double);
+    SMAX_F64_SCALAR
+  vd __builtin_smax_f64 (vd, vd);
+    SMAX_F64_VECTOR
+  v256 __builtin_smax_f64 (v256, v256);
+    SMAX_F64_VPAIR
+
+[SMIN_F32, SKIP, __builtin_smin_f32]
+  float __builtin_smin_f32 (float, float);
+    SMIN_F32_SCALAR
+  vf __builtin_smin_f32 (vf, vf);
+    SMIN_F32_VECTOR
+  v256 __builtin_smin_f32 (v256, v256);
+    SMIN_F32_VPAIR
+
+[SMIN_F64, SKIP, __builtin_smin_f64]
+  double __builtin_smin_f64 (double, double);
+    SMIN_F64_SCALAR
+  vd __builtin_smin_f64 (vd, vd);
+    SMIN_F64_VECTOR
+  v256 __builtin_smin_f64 (v256, v256);
+    SMIN_F64_VPAIR
+
+[SUB_F32, SKIP, __builtin_sub_f32]
+  float __builtin_sub_f32 (float, float);
+    SUB_F32_SCALAR
+  vf __builtin_sub_f32 (vf, vf);
+    SUB_F32_VECTOR
+  v256 __builtin_sub_f32 (v256, v256);
+    SUB_F32_VPAIR
+
+[SUB_F64, SKIP, __builtin_sub_f64]
+  double __builtin_sub_f64 (double, double);
+    SUB_F64_SCALAR
+  vd __builtin_sub_f64 (vd, vd);
+    SUB_F64_VECTOR
+  v256 __builtin_sub_f64 (v256, v256);
+    SUB_F64_VPAIR
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 97eaacf8a7e..b2212cdc84d 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -19150,6 +19150,7 @@ The PVIPR documents the following overloaded functions:
 * PowerPC AltiVec Built-in Functions Available on ISA 2.07::
 * PowerPC AltiVec Built-in Functions Available on ISA 3.0::
 * PowerPC AltiVec Built-in Functions Available on ISA 3.1::
+* PowerPC Floating Point Overloaded Built-in Functions::
 @end menu
 
 @node PowerPC AltiVec Built-in Functions on ISA 2.05
@@ -21102,6 +21103,100 @@ int vec_any_le (vector signed __int128, vector signed __int128);
 int vec_any_le (vector unsigned __int128, vector unsigned __int128);
 @end smallexample
 
+@node PowerPC Floating Point Overloaded Built-in Functions
+@subsubsection PowerPC Floating Point Overloaded Built-in Functions
+
+The following additional built-in functions are available for the
+PowerPC family of processors.  They allow programmers to use the same
+built-in function name for scalar values, 128-bit vectors, and, on
+Power10 systems, vector pairs.
+
+The following built-in functions handle 32-bit floating point
+operations on all processors where the VSX registers are available:
+
+@smallexample
+float __builtin_abs_f32 (float);
+vector float __builtin_abs_f32 (vector float);
+float __builtin_add_f32 (float, float);
+vector float __builtin_add_f32 (vector float, vector float);
+float __builtin_copysign_f32 (float, float);
+vector float __builtin_copysign_f32 (vector float, vector float);
+float __builtin_div_f32 (float, float);
+vector float __builtin_div_f32 (vector float, vector float);
+float __builtin_fma_f32 (float, float, float);
+vector float __builtin_fma_f32 (vector float, vector float, vector float);
+float __builtin_mult_f32 (float, float);
+vector float __builtin_mult_f32 (vector float, vector float);
+float __builtin_neg_f32 (float);
+vector float __builtin_neg_f32 (vector float);
+float __builtin_smax_f32 (float, float);
+vector float __builtin_smax_f32 (vector float, vector float);
+float __builtin_smin_f32 (float, float);
+vector float __builtin_smin_f32 (vector float, vector float);
+float __builtin_sub_f32 (float, float);
+vector float __builtin_sub_f32 (vector float, vector float);
+@end smallexample
+
+The following built-in functions handle 32-bit floating point
+operations on Power10 systems that support paired vector load and
+store instructions:
+
+@smallexample
+__vector_pair __builtin_abs_f32 (__vector_pair);
+__vector_pair __builtin_add_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_copysign_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_div_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_fma_f32 (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_mult_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_neg_f32 (__vector_pair);
+__vector_pair __builtin_smax_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_smin_f32 (__vector_pair, __vector_pair);
+__vector_pair __builtin_sub_f32 (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions handle 64-bit floating point
+operations on all processors where the VSX registers are available:
+
+@smallexample
+double __builtin_abs_f64 (double);
+vector double __builtin_abs_f64 (vector double);
+double __builtin_add_f64 (double, double);
+vector double __builtin_add_f64 (vector double, vector double);
+double __builtin_copysign_f64 (double, double);
+vector double __builtin_copysign_f64 (vector double, vector double);
+double __builtin_div_f64 (double, double);
+vector double __builtin_div_f64 (vector double, vector double);
+double __builtin_fma_f64 (double, double, double);
+vector double __builtin_fma_f64 (vector double, vector double, vector double);
+double __builtin_mult_f64 (double, double);
+vector double __builtin_mult_f64 (vector double, vector double);
+double __builtin_neg_f64 (double);
+vector double __builtin_neg_f64 (vector double);
+double __builtin_smax_f64 (double, double);
+vector double __builtin_smax_f64 (vector double, vector double);
+double __builtin_smin_f64 (double, double);
+vector double __builtin_smin_f64 (vector double, vector double);
+double __builtin_sub_f64 (double, double);
+vector double __builtin_sub_f64 (vector double, vector double);
+@end smallexample
+
+The following built-in functions handle 64-bit floating point
+operations on Power10 systems that support paired vector load and
+store instructions:
+
+@smallexample
+__vector_pair __builtin_abs_f64 (__vector_pair);
+__vector_pair __builtin_add_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_copysign_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_div_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_fma_f64 (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_mult_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_neg_f64 (__vector_pair);
+__vector_pair __builtin_smax_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_smin_f64 (__vector_pair, __vector_pair);
+__vector_pair __builtin_sub_f64 (__vector_pair, __vector_pair);
+@end smallexample
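+
+As an illustrative example, the same overloaded name accepts each of the
+supported types, so one spelling covers scalar, vector, and vector pair
+code:
+
+@smallexample
+vector double a, b, c;
+__vector_pair x, y, z;
+
+c = __builtin_add_f64 (a, b);  /* one 128-bit vector add */
+z = __builtin_add_f64 (x, y);  /* adds both halves of the pair */
+@end smallexample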
+
 
 @node PowerPC Hardware Transactional Memory Built-in Functions
 @subsection PowerPC Hardware Transactional Memory Built-in Functions
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c
new file mode 100644
index 00000000000..400a54f2fd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-scalar.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using scalar float.  */
+
+#include "fp-overload.h"
+
+TEST (float, float, flt, 32)
+
+/* { dg-final { scan-assembler-times {\mfabs\M|\mxsabsdp\M}          1 } } */
+/* { dg-final { scan-assembler-times {\mfadds\M|\mxsaddsp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfmadds\M|\mxsmadd[am]sp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmsubs\M|\mxsmsub[am]sp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmuls\M|\mxsmulsp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnabs\M|\mxsnabsdp\M}        1 } } */
+/* { dg-final { scan-assembler-times {\mfneg\M|\mxsnegdp\M}          1 } } */
+/* { dg-final { scan-assembler-times {\mfnmadds\M|\mxsnmadd[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mfnmsubs\M|\mxsnmsub[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mfsubs\M|\mxssubsp\M}         1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}                          } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c
new file mode 100644
index 00000000000..14f76d8a8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vector.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using vector float.  */
+
+#include "fp-overload.h"
+
+TEST (vector float, float, vect, 32)
+
+/* { dg-final { scan-assembler-times {\mvsldoi\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}       3 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]sp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]sp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}       1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c
new file mode 100644
index 00000000000..466f056cf9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f32-vpair.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f32 using __vector_pair.  */
+
+#include "fp-overload.h"
+
+TEST (__vector_pair, float, vpair, 32)
+
+/* { dg-final { scan-assembler-times {\mvsldoi\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}       5 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}       2 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c
new file mode 100644
index 00000000000..28e7c91c77c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-scalar.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using scalar double.  */
+
+#include "fp-overload.h"
+
+TEST (double, double, dbl, 64)
+
+/* { dg-final { scan-assembler-times {\mfabs\M|\mxsabsdp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfadd\M|\mxsadddp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfmadd\M|\mxsmadd[am]dp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmsub\M|\mxsmsub[am]dp\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mfmul\M|\mxsmuldp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnabs\M|\mxsnabsdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mfneg\M|\mxsnegdp\M}         1 } } */
+/* { dg-final { scan-assembler-times {\mfnmadd\M|\mxsnmadd[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfnmsub\M|\mxsnmsub[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mfsub\M|\mxssubdp\M}         1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}                         } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c
new file mode 100644
index 00000000000..4289ba4edb9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vector.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using vector double.  */
+
+#include "fp-overload.h"
+
+TEST (vector double, double, vect, 64)
+
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]dp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]dp\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}      1 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]dp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M}      1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}              } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c
new file mode 100644
index 00000000000..7dd0613bf88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload-f64-vpair.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+/* Test code generation for __builtin_<op>_f64 using __vector_pair.  */
+
+#include "fp-overload.h"
+
+TEST (__vector_pair, double, vpair, 64)
+
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}        3 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd[am]dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub[am]dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}       2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd[am]dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub[am]dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}        2 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M}       1 } } */
+/* { dg-final { scan-assembler-not   {\mbl\M}               } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/fp-overload.h b/gcc/testsuite/gcc.target/powerpc/fp-overload.h
new file mode 100644
index 00000000000..a1ce5f83765
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/fp-overload.h
@@ -0,0 +1,85 @@
+/* Common code to test the floating point overload functions.  */
+
+#define TEST(TYPE, SCALAR, TYPE_STR, SIZE)				\
+									\
+void									\
+do_add_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_add_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_sub_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_sub_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_mult_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q, TYPE *r)		\
+{									\
+  *p = __builtin_mult_f ## SIZE (*q, *r);				\
+}									\
+									\
+void									\
+do_neg_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_neg_f ## SIZE (*q);					\
+}									\
+									\
+void									\
+do_abs_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_abs_f ## SIZE (*q);					\
+}									\
+									\
+void									\
+do_nabs_ ## TYPE_STR ## _f ## SIZE (TYPE *p, TYPE *q)			\
+{									\
+  *p = __builtin_neg_f ## SIZE (__builtin_abs_f ## SIZE (*q));		\
+}									\
+									\
+void									\
+do_fma_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				   TYPE *q,				\
+				   TYPE *r,				\
+				   TYPE *s)				\
+{									\
+  *p = __builtin_fma_f ## SIZE (*q, *r, *s);				\
+}									\
+									\
+void									\
+do_fms_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				   TYPE *q,				\
+				   TYPE *r,				\
+				   TYPE *s)				\
+{									\
+  TYPE neg_s = __builtin_neg_f ## SIZE (*s);				\
+  *p = __builtin_fma_f ## SIZE (*q, *r, neg_s);				\
+}									\
+									\
+void									\
+do_nfma_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				    TYPE *q,				\
+				    TYPE *r,				\
+				    TYPE *s)				\
+{									\
+  TYPE f = __builtin_fma_f ## SIZE (*q, *r, *s);			\
+  *p = __builtin_neg_f ## SIZE (f);					\
+}									\
+									\
+void									\
+do_nfms_ ## TYPE_STR ## _f ## SIZE (TYPE *p,				\
+				    TYPE *q,				\
+				    TYPE *r,				\
+				    TYPE *s)				\
+{									\
+  TYPE neg_s = __builtin_neg_f ## SIZE (*s);				\
+  TYPE f = __builtin_fma_f ## SIZE (*q, *r, neg_s);			\
+  *p = __builtin_neg_f ## SIZE (f);					\
+}									\
+									\
+void									\
+do_reduce_ ## TYPE_STR ## _f ## SIZE (SCALAR *p, TYPE *q)		\
+{									\
+  *p = __builtin_reduce_f ## SIZE (*q);					\
+}
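+
+/* Example instantiation (as used by the individual tests):
+
+     TEST (vector float, float, vect, 32)
+
+   which defines do_add_vect_f32, do_sub_vect_f32, and so on.  */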
