public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/meissner/heads/work146-vpair)] Add support for floating point vector pair built-in functions.
@ 2023-11-17 20:45 Michael Meissner
0 siblings, 0 replies; only message in thread
From: Michael Meissner @ 2023-11-17 20:45 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:4f7e0f9e0f504badfb4a87caaecec4606b8b5297
commit 4f7e0f9e0f504badfb4a87caaecec4606b8b5297
Author: Michael Meissner <meissner@linux.ibm.com>
Date: Fri Nov 17 15:40:20 2023 -0500
Add support for floating point vector pair built-in functions.
This patch adds a series of built-in functions to allow users to write code to
do a number of simple operations where the loop is done using the __vector_pair
type. The __vector_pair type is an opaque type. These built-in functions keep
the two 128-bit vectors within the __vector_pair together, and split the
operation after register allocation.
This patch provides vector pair operations for 32-bit floating point and 64-bit
floating point.
2023-11-17 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_*): Add vector
pair built-in functions for float.
(__builtin_vpair_f64_*): Add vector pair built-in functions for double.
* config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add
declaration.
(split_binary_vector_pair): Likewise.
(split_fma_vector_pair): Likewise.
* config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function
for vector pair built-in functions.
(split_binary_vector_pair): Likewise.
(split_fma_vector_pair): Likewise.
* config/rs6000/rs6000.md (toplevel): Include vector-pair.md.
* config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md.
* config/rs6000/vector-pair.md: New file.
* doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the
floating point and general vector pair built-in functions.
gcc/testsuite/
* gcc.target/powerpc/vector-pair-1.c: New test.
* gcc.target/powerpc/vector-pair-2.c: New test.
* gcc.target/powerpc/vector-pair-3.c: New test.
* gcc.target/powerpc/vector-pair-4.c: New test.
Diff:
---
gcc/config/rs6000/rs6000-builtins.def | 52 ++++
gcc/config/rs6000/rs6000-protos.h | 5 +
gcc/config/rs6000/rs6000.cc | 74 +++++
gcc/config/rs6000/rs6000.md | 1 +
gcc/config/rs6000/t-rs6000 | 1 +
gcc/config/rs6000/vector-pair.md | 329 +++++++++++++++++++++++
gcc/doc/extend.texi | 46 ++++
gcc/testsuite/gcc.target/powerpc/vector-pair-1.c | 135 ++++++++++
gcc/testsuite/gcc.target/powerpc/vector-pair-2.c | 134 +++++++++
gcc/testsuite/gcc.target/powerpc/vector-pair-3.c | 60 +++++
gcc/testsuite/gcc.target/powerpc/vector-pair-4.c | 60 +++++
11 files changed, 897 insertions(+)
diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def
index ce40600e803..89b248b50ef 100644
--- a/gcc/config/rs6000/rs6000-builtins.def
+++ b/gcc/config/rs6000/rs6000-builtins.def
@@ -4131,3 +4131,55 @@
void __builtin_vsx_stxvp (v256, unsigned long, const v256 *);
STXVP nothing {mma,pair}
+
+;; vector pair built-in functions for 8 32-bit float values
+
+ v256 __builtin_vpair_f32_abs (v256);
+ VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair}
+
+ v256 __builtin_vpair_f32_add (v256, v256);
+ VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_fma (v256, v256, v256);
+ VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair}
+
+ v256 __builtin_vpair_f32_max (v256, v256);
+ VPAIR_F32_MAX vpair_smax_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_min (v256, v256);
+ VPAIR_F32_MIN vpair_smin_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_mul (v256, v256);
+ VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair}
+
+ v256 __builtin_vpair_f32_neg (v256);
+ VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair}
+
+ v256 __builtin_vpair_f32_sub (v256, v256);
+ VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair}
+
+;; vector pair built-in functions for 4 64-bit double values
+
+ v256 __builtin_vpair_f64_abs (v256);
+ VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair}
+
+ v256 __builtin_vpair_f64_add (v256, v256);
+ VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_fma (v256, v256, v256);
+ VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair}
+
+ v256 __builtin_vpair_f64_max (v256, v256);
+ VPAIR_F64_MAX vpair_smax_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_min (v256, v256);
+ VPAIR_F64_MIN vpair_smin_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_mul (v256, v256);
+ VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair}
+
+ v256 __builtin_vpair_f64_neg (v256);
+ VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair}
+
+ v256 __builtin_vpair_f64_sub (v256, v256);
+ VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair}
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..bbd899d7562 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
extern void output_toc (FILE *, rtx, int, machine_mode);
extern void rs6000_fatal_bad_address (rtx);
extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+ rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+ rtx (*)(rtx, rtx, rtx, rtx));
extern void rs6000_split_multireg_move (rtx, rtx);
extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 0dd21e67dde..2c30bfb0e70 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27408,6 +27408,80 @@ rs6000_split_logical (rtx operands[3],
return;
}
+/* Split a unary vector pair insn into two separate vector insns.
+
+ MODE is the mode of each vector after the split. OPERANDS[0] is the
+ destination and OPERANDS[1] is the source; the mode of the pair is taken
+ from OPERANDS[0]. FUNC is called once for each half of the pair to create
+ the insn that is then emitted. */
+
+void
+split_unary_vector_pair (machine_mode mode, /* vector mode. */
+ rtx operands[], /* dest, src. */
+ rtx (*func)(rtx, rtx)) /* create insn. */
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ machine_mode orig_mode = GET_MODE (op0);
+
+ /* First vector of the pair, at offset 0. */
+ rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+ rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+ /* Second vector of the pair, at offset 16. */
+ rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+ rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+ /* Emit one insn per vector; the first vector's insn is emitted first. */
+ emit_insn (func (reg0_vector0, reg1_vector0));
+ emit_insn (func (reg0_vector1, reg1_vector1));
+ return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns.
+
+ MODE is the mode of each vector after the split. OPERANDS[0] is the
+ destination, OPERANDS[1] and OPERANDS[2] are the two sources; the mode of
+ the pair is taken from OPERANDS[0]. FUNC is called once for each half of
+ the pair to create the insn that is then emitted. */
+
+void
+split_binary_vector_pair (machine_mode mode, /* vector mode. */
+ rtx operands[], /* dest, src1, src2. */
+ rtx (*func)(rtx, rtx, rtx)) /* create insn. */
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ machine_mode orig_mode = GET_MODE (op0);
+
+ /* First vector of the pair, at offset 0. */
+ rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+ rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+ rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+ /* Second vector of the pair, at offset 16. */
+ rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+ rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+ rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+ /* Emit one insn per vector; the first vector's insn is emitted first. */
+ emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+ emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+ return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+ insns.
+
+ MODE is the mode of each vector after the split. OPERANDS[0] is the
+ destination and OPERANDS[1] through OPERANDS[3] are the three sources, in
+ the order they are passed to FUNC; the mode of the pair is taken from
+ OPERANDS[0]. FUNC is called once for each half of the pair to create the
+ insn that is then emitted. */
+
+void
+split_fma_vector_pair (machine_mode mode, /* vector mode. */
+ rtx operands[], /* dest, src1, src2, src3. */
+ rtx (*func)(rtx, rtx, rtx, rtx)) /* create insn. */
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ rtx op3 = operands[3];
+ machine_mode orig_mode = GET_MODE (op0);
+
+ /* First vector of the pair, at offset 0. */
+ rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+ rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+ rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+ rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+ /* Second vector of the pair, at offset 16. */
+ rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+ rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+ rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+ rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+ /* Emit one insn per vector; the first vector's insn is emitted first. */
+ emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+ emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+ return;
+}
+
/* Emit instructions to move SRC to DST. Called by splitters for
multi-register moves. It will emit at most one instruction for
each register that is accessed; that is, it won't emit li/lis pairs
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index dcf1f3526f5..5a17adc1bc3 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15767,6 +15767,7 @@
(include "vsx.md")
(include "altivec.md")
(include "mma.md")
+(include "vector-pair.md")
(include "dfp.md")
(include "crypto.md")
(include "htm.md")
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1d..5fc89499795 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \
$(srcdir)/config/rs6000/vsx.md \
$(srcdir)/config/rs6000/altivec.md \
$(srcdir)/config/rs6000/mma.md \
+ $(srcdir)/config/rs6000/vector-pair.md \
$(srcdir)/config/rs6000/crypto.md \
$(srcdir)/config/rs6000/htm.md \
$(srcdir)/config/rs6000/dfp.md \
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 00000000000..2dcac6a31e2
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,329 @@
+;; Vector pair arithmetic support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;; Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; This file adds support for doing vector operations on pairs of vector
+;; registers. Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations. The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+(define_c_enum "unspec"
+ [UNSPEC_VPAIR_V4DF
+ UNSPEC_VPAIR_V8SF
+ ])
+
+;; Iterator doing unary/binary arithmetic on vector pairs
+(define_code_iterator VP_FP_UNARY [abs neg])
+(define_code_iterator VP_FP_BINARY [minus mult plus smin smax])
+
+;; Return the insn name from the VP_* code iterator
+(define_code_attr vp_insn [(abs "abs")
+ (minus "sub")
+ (mult "mul")
+ (neg "neg")
+ (plus "add")
+ (smin "smin")
+ (smax "smax")
+ (xor "xor")])
+
+;; Iterator for creating the unspecs for vector pair built-ins
+(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF
+ UNSPEC_VPAIR_V8SF])
+
+;; Map VP_* to vector mode of the arguments after they are split
+(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF "V2DF")
+ (UNSPEC_VPAIR_V8SF "V4SF")])
+
+;; Map VP_* to a lower case name to identify the vector pair.
+(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF "v4df")
+ (UNSPEC_VPAIR_V8SF "v8sf")])
+
+;; Map VP_* to a lower case name to identify the vector after the vector pair
+;; has been split.
+(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF "v2df")
+ (UNSPEC_VPAIR_V8SF "v4sf")])
+
+\f
+;; Vector pair floating point unary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(VP_FP_UNARY:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_<vp_insn><vp_vmode>2);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair negate of absolute value. The pattern matches a
+;; vector pair negate whose input is a vector pair absolute value and, after
+;; reload, is split by split_unary_vector_pair into two vsx_nabs<vp_vmode>2
+;; insns. Operand 1 uses the "wa" constraint like every other vector pair
+;; pattern in this file.
+(define_insn_and_split "vpair_nabs_<vp_pmode>2"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_unary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_vsx_nabs<vp_vmode>2);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Vector pair floating binary operations
+(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa")
+ (unspec:OO [(VP_FP_BINARY:OO
+ (match_operand:OO 1 "vsx_register_operand" "wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa"))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_binary_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_<vp_insn><vp_vmode>3);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Vector pair fused multiply-add floating point operations
+(define_insn_and_split "vpair_fma_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_fma<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_fms_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (unspec:OO
+ [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_fms<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfma_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nfma<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+(define_insn_and_split "vpair_nfms_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0")
+ (unspec:OO
+ [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ split_fma_vector_pair (<VP_VEC_MODE>mode, operands,
+ gen_nfms<vp_vmode>4);
+ DONE;
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c).
+(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(plus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (match_dup 3))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c)
+(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(minus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (unspec:OO
+ [(neg:OO
+ (match_dup 3))]
+ VP_FP))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
+
+
+;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c).
+(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(plus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (match_dup 3))]
+ VP_FP))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c)
+(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
+ [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa")
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(minus:OO
+ (unspec:OO
+ [(mult:OO
+ (match_operand:OO 1 "vsx_register_operand" "%wa,wa")
+ (match_operand:OO 2 "vsx_register_operand" "wa,0"))]
+ VP_FP)
+ (match_operand:OO 3 "vsx_register_operand" "0,wa"))]
+ VP_FP))]
+ VP_FP))]
+ "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:OO
+ [(neg:OO
+ (unspec:OO
+ [(fma:OO
+ (match_dup 1)
+ (match_dup 2)
+ (unspec:OO
+ [(neg:OO
+ (match_dup 3))]
+ VP_FP))]
+ VP_FP))]
+ VP_FP))]
+{
+}
+ [(set_attr "length" "8")])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 8293a7b88a9..0194b6591c5 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -15138,6 +15138,7 @@ instructions, but allow the compiler to schedule those calls.
* NDS32 Built-in Functions::
* Nvidia PTX Built-in Functions::
* Basic PowerPC Built-in Functions::
+* PowerPC Vector Pair Built-in Functions Available on ISA 3.1::
* PowerPC AltiVec/VSX Built-in Functions::
* PowerPC Hardware Transactional Memory Built-in Functions::
* PowerPC Atomic Memory Operation Functions::
@@ -21467,6 +21468,51 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128);
@end smallexample
+@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1
+
+GCC provides functions to speed up processing by using the type
+@code{__vector_pair} to hold two 128-bit vectors on processors that
+support ISA 3.1 (power10). The @code{__vector_pair} type and the
+vector pair built-in functions require the MMA instruction set
+(@option{-mmma}) to be enabled, which is on by default for
+@option{-mcpu=power10}.
+
+By default, @code{__vector_pair} types are loaded into vectors with a
+single load vector pair instruction. The processing for the built-in
+function is done as two separate vector instructions on each of the
+two 128-bit vectors stored in the vector pair. The
+@code{__vector_pair} type is usually stored with a single vector pair
+store instruction.
+
+The following built-in functions operate on pairs of
+@code{vector float} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f32_abs (__vector_pair);
+__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f32_neg (__vector_pair);
+__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair);
+@end smallexample
+
+The following built-in functions operate on pairs of
+@code{vector double} values:
+
+@smallexample
+__vector_pair __builtin_vpair_f64_abs (__vector_pair);
+__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_neg (__vector_pair);
+__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair);
+__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair);
+@end smallexample
+
@node PowerPC Hardware Transactional Memory Built-in Functions
@subsection PowerPC Hardware Transactional Memory Built-in Functions
GCC provides two interfaces for accessing the Hardware Transactional
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
new file mode 100644
index 00000000000..e74840cebc0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c
@@ -0,0 +1,135 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+ vector pairs with 4 double elements. */
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvadddp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvsubdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmuldp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_mul (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmindp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_min (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmaxdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_max (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvnegdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvabsdp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvnabsdp, 1 stxvp. */
+ __vector_pair ab = __builtin_vpair_f64_abs (*x);
+ *dest = __builtin_vpair_f64_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */
+ *dest = __builtin_vpair_f64_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f64_neg (*z);
+ *dest = __builtin_vpair_f64_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */
+ __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f64_neg (*z);
+ __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mxvabsdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmindp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
new file mode 100644
index 00000000000..2facb727053
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c
@@ -0,0 +1,134 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+ vector pairs with 8 float elements. */
+
+void
+test_add (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvaddsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvsubsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_sub (*x, *y);
+}
+
+void
+test_multiply (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmulsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_mul (*x, *y);
+}
+
+void
+test_max (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvmaxsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_max (*x, *y);
+}
+
+void
+test_min (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y)
+{
+ /* 2 lxvp, 2 xvminsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_min (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvnegsp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_neg (*x);
+}
+
+void
+test_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvabssp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_abs (*x);
+}
+
+void
+test_negative_abs (__vector_pair *dest,
+ __vector_pair *x)
+{
+ /* 1 lxvp, 2 xvnabssp, 1 stxvp. */
+ __vector_pair ab = __builtin_vpair_f32_abs (*x);
+ *dest = __builtin_vpair_f32_neg (ab);
+}
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */
+ *dest = __builtin_vpair_f32_fma (*x, *y, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f32_neg (*z);
+ *dest = __builtin_vpair_f32_fma (*x, *y, n);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */
+ __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */
+ __vector_pair n = __builtin_vpair_f32_neg (*z);
+ __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 25 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvminsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
new file mode 100644
index 00000000000..65bfc44f85d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector built-in code combines multiply, add/subtract, and
+ negate operations to the appropriate fused multiply-add instruction for
+ vector pairs with 4 double elements. */
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ *dest = __builtin_vpair_f64_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ *dest = __builtin_vpair_f64_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f64_add (m, *z);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f64_sub (m, *z);
+ *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
new file mode 100644
index 00000000000..b62871be1fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector built-in code combines multiply, add/subtract, and
+ negate operations to the appropriate fused multiply-add instruction for
+ vector pairs with 8 float elements. */
+
+void
+test_fma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ *dest = __builtin_vpair_f32_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ *dest = __builtin_vpair_f32_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f32_add (m, *z);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+ __vector_pair *x,
+ __vector_pair *y,
+ __vector_pair *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */
+ __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+ __vector_pair w = __builtin_vpair_f32_sub (m, *z);
+ *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-11-17 20:45 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-17 20:45 [gcc(refs/users/meissner/heads/work146-vpair)] Add support for floating point vector pair built-in functions Michael Meissner
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).