From: Michael Meissner To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/users/meissner/heads/work139-vpair)] Add support for vector pair built-in functions. X-Git-Author: Michael Meissner X-Git-Refname: refs/users/meissner/heads/work139-vpair X-Git-Oldrev: 1acb44e16a1943d015d5546d8434ca9c6065dc80 X-Git-Newrev: 3184542be55ff6b0612c570ddae91c541418c2b5 Message-Id: <20231017044751.7672B3858423@sourceware.org> Date: Tue, 17 Oct 2023 04:47:51 +0000 (GMT) https://gcc.gnu.org/g:3184542be55ff6b0612c570ddae91c541418c2b5 commit 3184542be55ff6b0612c570ddae91c541418c2b5 Author: Michael Meissner Date: Tue Oct 17 00:47:29 2023 -0400 Add support for vector pair built-in functions. 2023-10-16 Michael Meissner gcc/ * config/rs6000/predicates.md (mma_assemble_input_operand): Allow other 16-byte vectors and not just V16QImode. * config/rs6000/rs6000-builtins.def (__builtin_vpair_*): Add vector pair built-in functions. * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add declaration. (split_binary_vector_pair): Likewise. (split_fma_vector_pair): Likewise. * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function for vector pair built-in functions. (split_binary_vector_pair): Likewise. (split_fma_vector_pair): Likewise. * config/rs6000/rs6000.md (toplevel): Include vector-pair.md. * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md. * config/rs6000/vector-pair.md: New file. * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the vector pair built-in functions. gcc/testsuite/ * gcc.target/powerpc/vector-pair-01.c: New test. * gcc.target/powerpc/vector-pair-02.c: New test. * gcc.target/powerpc/vector-pair-03.c: New test. * gcc.target/powerpc/vector-pair-04.c: New test. * gcc.target/powerpc/vector-pair-05.c: New test. * gcc.target/powerpc/vector-pair-06.c: New test. * gcc.target/powerpc/vector-pair-07.c: New test. * gcc.target/powerpc/vector-pair-08.c: New test.
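As an illustration of what the new built-ins enable, here is a small editorial sketch (not part of the commit) using only signatures documented in the extend.texi hunk below. It assumes a Power10 target (-mcpu=power10, or -mmma) and that n is a multiple of 8:

/* Illustrative only: y[i] = a * x[i] + y[i], processing 8 floats per
   iteration through a single __vector_pair value.  */
#include <stddef.h>

void
saxpy_vpair (size_t n, float a, const float *x, float *y)
{
  /* Duplicate the scalar across all 8 float lanes of a vector pair.  */
  __vector_pair va = __builtin_vpair_f32_splat (a);

  for (size_t i = 0; i < n; i += 8)
    {
      __vector_pair vx = *(__vector_pair *) (x + i);
      __vector_pair vy = *(__vector_pair *) (y + i);

      /* After the post-reload split this becomes two vector FMA
         instructions plus the lxvp/stxvp pair loads and store.  */
      *(__vector_pair *) (y + i) = __builtin_vpair_f32_fma (va, vx, vy);
    }
}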
Diff: --- gcc/config/rs6000/predicates.md | 2 +- gcc/config/rs6000/rs6000-builtins.def | 284 ++++++++ gcc/config/rs6000/rs6000-protos.h | 5 + gcc/config/rs6000/rs6000.cc | 74 ++ gcc/config/rs6000/rs6000.md | 1 + gcc/config/rs6000/t-rs6000 | 1 + gcc/config/rs6000/vector-pair.md | 835 ++++++++++++++++++++++ gcc/doc/extend.texi | 155 ++++ gcc/testsuite/gcc.target/powerpc/vector-pair-01.c | 172 +++++ gcc/testsuite/gcc.target/powerpc/vector-pair-02.c | 176 +++++ gcc/testsuite/gcc.target/powerpc/vector-pair-03.c | 60 ++ gcc/testsuite/gcc.target/powerpc/vector-pair-04.c | 60 ++ gcc/testsuite/gcc.target/powerpc/vector-pair-05.c | 192 +++++ gcc/testsuite/gcc.target/powerpc/vector-pair-06.c | 193 +++++ gcc/testsuite/gcc.target/powerpc/vector-pair-07.c | 193 +++++ gcc/testsuite/gcc.target/powerpc/vector-pair-08.c | 194 +++++ 16 files changed, 2596 insertions(+), 1 deletion(-) diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index ef7d3f214c42..922a77716c41 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1301,7 +1301,7 @@ ;; Return 1 if this operand is valid for a MMA assemble accumulator insn. (define_special_predicate "mma_assemble_input_operand" - (match_test "(mode == V16QImode + (match_test "(VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16 && (vsx_register_operand (op, mode) || (MEM_P (op) && (indexed_or_indirect_address (XEXP (op, 0), mode) diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index b661a2268432..6f91cf4c335d 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -4137,3 +4137,287 @@ void __builtin_vsx_stxvp_internal (v256 *, v256); STXVP_INTERNAL stxvp_internal {mma} + +;; Vector pair built-in functions + + v256 __builtin_vpair_zero (); + VPAIR_ZERO vpair_zero {mma} + + vf __builtin_vpair_f32_get_vector (v256, const int); + VPAIR_F32_GET_VECTOR vpair_get_vector_v8sf {mma,pair} + + v256 __builtin_vpair_f32_assemble (vf, vf); + VPAIR_F32_ASSEMBLE vpair_assemble_v8sf {mma,pair} + + v256 __builtin_vpair_f32_splat (float); + VPAIR_F32_SPLAT vpair_splat_v8sf {mma,pair} + + v256 __builtin_vpair_f32_abs (v256); + VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair} + + v256 __builtin_vpair_f32_add (v256, v256); + VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_fma (v256, v256, v256); + VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair} + + v256 __builtin_vpair_f32_mul (v256, v256); + VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_neg (v256); + VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair} + + v256 __builtin_vpair_f32_smax (v256, v256); + VPAIR_F32_SMAX vpair_smax_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_smin (v256, v256); + VPAIR_F32_SMIN vpair_smin_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_sub (v256, v256); + VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair} + + float __builtin_vpair_f32_add_elements (v256); + VPAIR_F32_ADD_ELEMENTS vpair_reduc_plus_scale_v8sf {mma,pair} + + vd __builtin_vpair_f64_get_vector (v256, const int); + VPAIR_F64_GET_VECTOR vpair_get_vector_v4df {mma,pair} + + v256 __builtin_vpair_f64_assemble (vd, vd); + VPAIR_F64_ASSEMBLE vpair_assemble_v4df {mma,pair} + + v256 __builtin_vpair_f64_splat (double); + VPAIR_F64_SPLAT vpair_splat_v4df {mma,pair} + + v256 __builtin_vpair_f64_abs (v256); + VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair} + + v256 __builtin_vpair_f64_add (v256, v256); + VPAIR_F64_ADD vpair_add_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_fma (v256, v256, v256); + VPAIR_F64_FMA 
vpair_fma_v4df4 {mma,pair} + + v256 __builtin_vpair_f64_mul (v256, v256); + VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_neg (v256); + VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair} + + v256 __builtin_vpair_f64_smax (v256, v256); + VPAIR_F64_SMAX vpair_smax_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_smin (v256, v256); + VPAIR_F64_SMIN vpair_smin_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_sub (v256, v256); + VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair} + + double __builtin_vpair_f64_add_elements (v256); + VPAIR_F64_ADD_ELEMENTS vpair_reduc_plus_scale_v4df {mma,pair} + + vuc __builtin_vpair_i8u_get_vector (v256, const int); + VPAIR_I8U_GET_VECTOR vpair_get_vector_v32qi {mma,pair} + + v256 __builtin_vpair_i8u_assemble (vuc, vuc); + VPAIR_I8U_ASSEMBLE vpair_assemble_v32qi {mma,pair} + + v256 __builtin_vpair_i8u_splat (unsigned char); + VPAIR_I8U_SPLAT vpair_splat_v32qi {mma,pair} + + vsc __builtin_vpair_i8_get_vector (v256, const int); + VPAIR_I8_GET_VECTOR vpair_get_vector_v32qi {mma,pair} + + v256 __builtin_vpair_i8_assemble (vsc, vsc); + VPAIR_I8_ASSEMBLE vpair_assemble_v32qi {mma,pair} + + v256 __builtin_vpair_i8_splat (signed char); + VPAIR_I8_SPLAT vpair_splat_v32qi {mma,pair} + + v256 __builtin_vpair_i8_add (v256, v256); + VPAIR_I8_ADD vpair_add_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_and (v256, v256); + VPAIR_I8_AND vpair_and_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_ior (v256, v256); + VPAIR_I8_IOR vpair_ior_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_neg (v256); + VPAIR_I8_NEG vpair_neg_v32qi2 {mma,pair} + + v256 __builtin_vpair_i8_not (v256); + VPAIR_I8_NOT vpair_not_v32qi2 {mma,pair} + + v256 __builtin_vpair_i8_smax (v256, v256); + VPAIR_I8_SMAX vpair_smax_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_smin (v256, v256); + VPAIR_I8_SMIN vpair_smin_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_sub (v256, v256); + VPAIR_I8_SUB vpair_sub_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_umax (v256, v256); + VPAIR_I8_UMAX vpair_umax_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_umin (v256, v256); + VPAIR_I8_UMIN vpair_umin_v32qi3 {mma,pair} + + v256 __builtin_vpair_i8_xor (v256, v256); + VPAIR_I8_XOR vpair_xor_v32qi3 {mma,pair} + + vus __builtin_vpair_i16u_get_vector (v256, const int); + VPAIR_I16U_GET_VECTOR vpair_get_vector_v16hi {mma,pair} + + v256 __builtin_vpair_i16u_assemble (vus, vus); + VPAIR_I16U_ASSEMBLE vpair_assemble_v16hi {mma,pair} + + v256 __builtin_vpair_i16u_splat (unsigned short); + VPAIR_I16U_SPLAT vpair_splat_v16hi {mma,pair} + + vss __builtin_vpair_i16_get_vector (v256, const int); + VPAIR_I16_GET_VECTOR vpair_get_vector_v16hi {mma,pair} + + v256 __builtin_vpair_i16_assemble (vss, vss); + VPAIR_I16_ASSEMBLE vpair_assemble_v16hi {mma,pair} + + v256 __builtin_vpair_i16_splat (short); + VPAIR_I16_SPLAT vpair_splat_v16hi {mma,pair} + + v256 __builtin_vpair_i16_add (v256, v256); + VPAIR_I16_ADD vpair_add_v16hi3 {mma,pair} + + v256 __builtin_vpair_i16_and (v256, v256); + VPAIR_I16_AND vpair_and_v16hi3 {mma,pair} + + v256 __builtin_vpair_i16_ior (v256, v256); + VPAIR_I16_IOR vpair_ior_v16hi3 {mma,pair} + + v256 __builtin_vpair_i16_neg (v256); + VPAIR_I16_NEG vpair_neg_v16hi2 {mma,pair} + + v256 __builtin_vpair_i16_not (v256); + VPAIR_I16_NOT vpair_not_v16hi2 {mma,pair} + + v256 __builtin_vpair_i16_smax (v256, v256); + VPAIR_I16_SMAX vpair_smax_v16hi3 {mma,pair} + + v256 __builtin_vpair_i16_smin (v256, v256); + VPAIR_I16_SMIN vpair_smin_v16hi3 {mma,pair} + + v256 __builtin_vpair_i16_sub (v256, v256); + VPAIR_I16_SUB vpair_sub_v16hi3 {mma,pair} + 
+ v256 __builtin_vpair_i16_umax (v256, v256); + VPAIR_I16_UMAX vpair_umax_v16hi3 {mma,pair} + + v256 __builtin_vpair_i16_umin (v256, v256); + VPAIR_I16_UMIN vpair_umin_v16hi3 {mma,pair} + + v256 __builtin_vpair_i16_xor (v256, v256); + VPAIR_I16_XOR vpair_xor_v16hi3 {mma,pair} + + vui __builtin_vpair_i32u_get_vector (v256, const int); + VPAIR_I32U_GET_VECTOR vpair_get_vector_v8si {mma,pair} + + v256 __builtin_vpair_i32u_assemble (vui, vui); + VPAIR_I32U_ASSEMBLE vpair_assemble_v8si {mma,pair} + + v256 __builtin_vpair_i32u_splat (unsigned int); + VPAIR_I32U_SPLAT vpair_splat_v8si {mma,pair} + + vsi __builtin_vpair_i32_get_vector (v256, const int); + VPAIR_I32_GET_VECTOR vpair_get_vector_v8si {mma,pair} + + v256 __builtin_vpair_i32_assemble (vsi, vsi); + VPAIR_I32_ASSEMBLE vpair_assemble_v8si {mma,pair} + + v256 __builtin_vpair_i32_splat (int); + VPAIR_I32_SPLAT vpair_splat_v8si {mma,pair} + + v256 __builtin_vpair_i32_add (v256, v256); + VPAIR_I32_ADD vpair_add_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_and (v256, v256); + VPAIR_I32_AND vpair_and_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_ior (v256, v256); + VPAIR_I32_IOR vpair_ior_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_neg (v256); + VPAIR_I32_NEG vpair_neg_v8si2 {mma,pair} + + v256 __builtin_vpair_i32_not (v256); + VPAIR_I32_NOT vpair_not_v8si2 {mma,pair} + + v256 __builtin_vpair_i32_smax (v256, v256); + VPAIR_I32_SMAX vpair_smax_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_smin (v256, v256); + VPAIR_I32_SMIN vpair_smin_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_sub (v256, v256); + VPAIR_I32_SUB vpair_sub_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_umax (v256, v256); + VPAIR_I32_UMAX vpair_umax_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_umin (v256, v256); + VPAIR_I32_UMIN vpair_umin_v8si3 {mma,pair} + + v256 __builtin_vpair_i32_xor (v256, v256); + VPAIR_I32_XOR vpair_xor_v8si3 {mma,pair} + + vull __builtin_vpair_i64u_get_vector (v256, const int); + VPAIR_I64U_GET_VECTOR vpair_get_vector_v4di {mma,pair} + + v256 __builtin_vpair_i64u_assemble (vull, vull); + VPAIR_I64U_ASSEMBLE vpair_assemble_v4di {mma,pair} + + v256 __builtin_vpair_i64u_splat (unsigned long long); + VPAIR_I64U_SPLAT vpair_splat_v4di {mma,pair} + + vsll __builtin_vpair_i64_get_vector (v256, const int); + VPAIR_I64_GET_VECTOR vpair_get_vector_v4di {mma,pair} + + v256 __builtin_vpair_i64_assemble (vsll, vsll); + VPAIR_I64_ASSEMBLE vpair_assemble_v4di {mma,pair} + + v256 __builtin_vpair_i64_splat (long long); + VPAIR_I64_SPLAT vpair_splat_v4di {mma,pair} + + v256 __builtin_vpair_i64_add (v256, v256); + VPAIR_I64_ADD vpair_add_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_and (v256, v256); + VPAIR_I64_AND vpair_and_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_ior (v256, v256); + VPAIR_I64_IOR vpair_ior_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_neg (v256); + VPAIR_I64_NEG vpair_neg_v4di2 {mma,pair} + + v256 __builtin_vpair_i64_not (v256); + VPAIR_I64_NOT vpair_not_v4di2 {mma,pair} + + v256 __builtin_vpair_i64_smax (v256, v256); + VPAIR_I64_SMAX vpair_smax_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_smin (v256, v256); + VPAIR_I64_SMIN vpair_smin_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_sub (v256, v256); + VPAIR_I64_SUB vpair_sub_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_umax (v256, v256); + VPAIR_I64_UMAX vpair_umax_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_umin (v256, v256); + VPAIR_I64_UMIN vpair_umin_v4di3 {mma,pair} + + v256 __builtin_vpair_i64_xor (v256, v256); + VPAIR_I64_XOR vpair_xor_v4di3 {mma,pair} + + long long 
__builtin_vpair_i64_add_elements (v256); + VPAIR_I64_ADD_ELEMENTS vpair_reduc_plus_scale_v4di {mma,pair,no32bit} diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index f70118ea40f5..bbd899d75620 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool); extern void output_toc (FILE *, rtx, int, machine_mode); extern void rs6000_fatal_bad_address (rtx); extern rtx create_TOC_reference (rtx, rtx); +extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx)); +extern void split_binary_vector_pair (machine_mode, rtx [], + rtx (*)(rtx, rtx, rtx)); +extern void split_fma_vector_pair (machine_mode, rtx [], + rtx (*)(rtx, rtx, rtx, rtx)); extern void rs6000_split_multireg_move (rtx, rtx); extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode); extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 8f06b37171a3..0f466f1f7c29 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -27393,6 +27393,80 @@ rs6000_split_logical (rtx operands[3], return; } +/* Split a unary vector pair insn into two separate vector insns. */ + +void +split_unary_vector_pair (machine_mode mode, /* vector mode. */ + rtx operands[], /* dest, src. */ + rtx (*func)(rtx, rtx)) /* create insn. */ +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + machine_mode orig_mode = GET_MODE (op0); + + rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0); + rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0); + rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16); + rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16); + + emit_insn (func (reg0_vector0, reg1_vector0)); + emit_insn (func (reg0_vector1, reg1_vector1)); + return; +} + +/* Split a binary vector pair insn into two separate vector insns. */ + +void +split_binary_vector_pair (machine_mode mode, /* vector mode. */ + rtx operands[], /* dest, src. */ + rtx (*func)(rtx, rtx, rtx)) /* create insn. */ +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + machine_mode orig_mode = GET_MODE (op0); + + rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0); + rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0); + rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0); + rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16); + rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16); + rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16); + + emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0)); + emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1)); + return; +} + +/* Split a fused multiply-add vector pair insn into two separate vector + insns. */ + +void +split_fma_vector_pair (machine_mode mode, /* vector mode. */ + rtx operands[], /* dest, src. */ + rtx (*func)(rtx, rtx, rtx, rtx)) /* create insn. 
*/ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + rtx op3 = operands[3]; + machine_mode orig_mode = GET_MODE (op0); + + rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0); + rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0); + rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0); + rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0); + + rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16); + rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16); + rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16); + rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16); + + emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0)); + emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1)); + return; +} + /* Emit instructions to move SRC to DST. Called by splitters for multi-register moves. It will emit at most one instruction for each register that is accessed; that is, it won't emit li/lis pairs diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 2a1b5ecfaee2..da51029aa1ba 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -15759,6 +15759,7 @@ (include "vsx.md") (include "altivec.md") (include "mma.md") +(include "vector-pair.md") (include "dfp.md") (include "crypto.md") (include "htm.md") diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index f183b42ce1de..5fc89499795d 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \ $(srcdir)/config/rs6000/vsx.md \ $(srcdir)/config/rs6000/altivec.md \ $(srcdir)/config/rs6000/mma.md \ + $(srcdir)/config/rs6000/vector-pair.md \ $(srcdir)/config/rs6000/crypto.md \ $(srcdir)/config/rs6000/htm.md \ $(srcdir)/config/rs6000/dfp.md \ diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md new file mode 100644 index 000000000000..4d422f5a6976 --- /dev/null +++ b/gcc/config/rs6000/vector-pair.md @@ -0,0 +1,835 @@ +;; Vector pair arithmetic support. +;; Copyright (C) 2020-2023 Free Software Foundation, Inc. +;; Contributed by Peter Bergner <bergner@linux.ibm.com> and +;; Michael Meissner <meissner@linux.ibm.com> +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. +;; +;; This file adds support for doing vector operations on pairs of vector +;; registers. Most of the instructions use vector pair instructions to load +;; and possibly store the registers, but the operation itself is split after +;; register allocation into two separate vector operations. The second +;; scheduler pass can interleave other instructions between these pairs of +;; instructions if possible.
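+;; As a concrete illustration (an editorial sketch, not part of the patch):
+;; given
+;;	*dest = __builtin_vpair_f64_add (*x, *y);
+;; the vpair_add_v4df3 insn stays a single OOmode operation until after
+;; reload and then splits into two V2DF adds, so the expected code is
+;; roughly the following (register numbers are arbitrary):
+;;
+;;	lxvp 0,0(4)		# load vector pair *x
+;;	lxvp 2,0(5)		# load vector pair *y
+;;	xvadddp 0,0,2		# add the two 128-bit halves separately
+;;	xvadddp 1,1,3
+;;	stxvp 0,0(3)		# store vector pair *dest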
+ +(define_c_enum "unspec" + [UNSPEC_VPAIR_ZERO + UNSPEC_VPAIR_V4DF + UNSPEC_VPAIR_V8SF + UNSPEC_VPAIR_V32QI + UNSPEC_VPAIR_V16HI + UNSPEC_VPAIR_V8SI + UNSPEC_VPAIR_V4DI + UNSPEC_VPAIR_REDUCE_PLUS_F32 + UNSPEC_VPAIR_REDUCE_PLUS_F64 + UNSPEC_VPAIR_REDUCE_PLUS_I64 + ]) + +;; Iterator doing unary/binary arithmetic on vector pairs +(define_code_iterator VP_FP_UNARY [abs neg]) +(define_code_iterator VP_FP_BINARY [minus mult plus smin smax]) + +(define_code_iterator VP_INT_BINARY [and ior minus plus smax smin umax umin xor]) + +;; Return the insn name from the VP_* code iterator +(define_code_attr vp_insn [(abs "abs") + (and "and") + (ior "ior") + (minus "sub") + (mult "mul") + (not "one_cmpl") + (neg "neg") + (plus "add") + (smin "smin") + (smax "smax") + (umin "umin") + (umax "umax") + (xor "xor")]) + +;; Return the register constraint ("v" or "wa") for the integer code iterator +;; used. For arithmetic operations, we need to use "v" in order to use the +;; Altivec instruction. For logical operations, we can use wa. +(define_code_attr vp_ireg [(and "wa") + (ior "wa") + (minus "v") + (not "wa") + (neg "v") + (plus "v") + (smax "v") + (smin "v") + (umax "v") + (umin "v") + (xor "wa")]) + +;; Return the register predicate for the integer code iterator used +(define_code_attr vp_ipredicate [(and "vsx_register_operand") + (ior "vsx_register_operand") + (minus "altivec_register_operand") + (not "vsx_register_operand") + (neg "altivec_register_operand") + (plus "altivec_register_operand") + (smax "altivec_register_operand") + (smin "altivec_register_operand") + (umax "altivec_register_operand") + (umin "altivec_register_operand") + (xor "vsx_register_operand")]) + +;; Iterator for creating the unspecs for vector pair built-ins +(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF + UNSPEC_VPAIR_V8SF]) + +(define_int_iterator VP_INT [UNSPEC_VPAIR_V4DI + UNSPEC_VPAIR_V8SI + UNSPEC_VPAIR_V16HI + UNSPEC_VPAIR_V32QI]) + +(define_int_iterator VP_ALL [UNSPEC_VPAIR_V4DF + UNSPEC_VPAIR_V8SF + UNSPEC_VPAIR_V4DI + UNSPEC_VPAIR_V8SI + UNSPEC_VPAIR_V16HI + UNSPEC_VPAIR_V32QI]) + +;; Map VP_{INT,FP,ALL} to vector mode of the arguments after they are split +(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF "V2DF") + (UNSPEC_VPAIR_V8SF "V4SF") + (UNSPEC_VPAIR_V32QI "V16QI") + (UNSPEC_VPAIR_V16HI "V8HI") + (UNSPEC_VPAIR_V8SI "V4SI") + (UNSPEC_VPAIR_V4DI "V2DI")]) + +;; Map VP_{INT,FP,ALL} to a lower case name to identify the vector pair. +(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF "v4df") + (UNSPEC_VPAIR_V8SF "v8sf") + (UNSPEC_VPAIR_V32QI "v32qi") + (UNSPEC_VPAIR_V16HI "v16hi") + (UNSPEC_VPAIR_V8SI "v8si") + (UNSPEC_VPAIR_V4DI "v4di")]) + +;; Map VP_{INT,FP,ALL} to a lower case name to identify the vector after the +;; vector pair has been split. +(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF "v2df") + (UNSPEC_VPAIR_V8SF "v4sf") + (UNSPEC_VPAIR_V32QI "v16qi") + (UNSPEC_VPAIR_V16HI "v8hi") + (UNSPEC_VPAIR_V8SI "v4si") + (UNSPEC_VPAIR_V4DI "v2di")]) + +;; Map VP_INT to constraints used for the negate scratch register. For vectors +;; of QI and HI, we need to change -a into 0 - a since we don't have a negate +;; operation. We do have a vnegw/vnegd operation for SI and DI modes.
+(define_int_attr vp_neg_reg [(UNSPEC_VPAIR_V32QI "&v") + (UNSPEC_VPAIR_V16HI "&v") + (UNSPEC_VPAIR_V8SI "X") + (UNSPEC_VPAIR_V4DI "X")]) + +;; Modes of the vector element to splat to a vector pair +(define_mode_iterator VP_SPLAT [DF SF DI SI HI QI]) + +;; Map VP_SPLAT to the mode of the vector pair in the assemble operation +(define_mode_attr vp_splat_pmode [(DF "v4df") + (SF "v8sf") + (DI "v4di") + (SI "v8si") + (HI "v16hi") + (QI "v32qi")]) + +;; Map VP_SPLAT to the mode of the vector containing the element +(define_mode_attr VP_SPLAT_VMODE [(DF "V2DF") + (SF "V4SF") + (DI "V2DI") + (SI "V4SI") + (HI "V8HI") + (QI "V16QI")]) + +;; Initialize a vector pair to 0 +(define_insn_and_split "vpair_zero" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(const_int 0)] UNSPEC_VPAIR_ZERO))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(set (match_dup 1) (match_dup 3)) + (set (match_dup 2) (match_dup 3))] +{ + rtx op0 = operands[0]; + unsigned offset_hi = (WORDS_BIG_ENDIAN) ? 0 : 16; + unsigned offset_lo = (WORDS_BIG_ENDIAN) ? 16 : 0; + + operands[1] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_hi); + operands[2] = simplify_gen_subreg (V2DImode, op0, OOmode, offset_lo); + operands[3] = CONST0_RTX (V2DImode); +} + [(set_attr "length" "8")]) + +;; Assemble a vector pair from two vectors. Unlike +;; __builtin_mma_assemble_pair, this function produces a vector pair output +;; directly and it takes all of the vector types. +;; +;; We cannot update the two output registers atomically, so mark the output as +;; an early clobber so we don't accidentally clobber the input operands. + +(define_insn_and_split "vpair_assemble_<vp_pmode>" + [(set (match_operand:OO 0 "vsx_register_operand" "=&wa") + (unspec:OO + [(match_operand:<VP_VEC_MODE> 1 "mma_assemble_input_operand" "mwa") + (match_operand:<VP_VEC_MODE> 2 "mma_assemble_input_operand" "mwa")] + VP_ALL))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx src = gen_rtx_UNSPEC (OOmode, + gen_rtvec (2, operands[1], operands[2]), + UNSPEC_VSX_ASSEMBLE); + rs6000_split_multireg_move (operands[0], src); + DONE; +} + [(set_attr "length" "8")]) + +;; Extract one of the two 128-bit vectors from a vector pair. +(define_insn_and_split "vpair_get_vector_<vp_pmode>" + [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa") + (unspec:<VP_VEC_MODE> + [(match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand 2 "const_0_to_1_operand" "n")] + VP_ALL))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(set (match_dup 0) (match_dup 3))] +{ + machine_mode vmode = <VP_VEC_MODE>mode; + unsigned reg_num = UINTVAL (operands[2]); + if (!WORDS_BIG_ENDIAN) + reg_num = 1 - reg_num; + + operands[3] = simplify_gen_subreg (vmode, operands[1], OOmode, reg_num * 16); +}) + +;; Optimize extracting a 128-bit vector from a vector pair in memory. +(define_insn_and_split "*vpair_get_vector_<vp_pmode>_mem" + [(set (match_operand:<VP_VEC_MODE> 0 "vsx_register_operand" "=wa") + (unspec:<VP_VEC_MODE> + [(match_operand:OO 1 "memory_operand" "o") + (match_operand 2 "const_0_to_1_operand" "n")] + VP_ALL))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(set (match_dup 0) (match_dup 3))] +{ + operands[3] = adjust_address (operands[1], <VP_VEC_MODE>mode, + 16 * INTVAL (operands[2])); +} + [(set_attr "type" "vecload")]) + +;; Create a vector pair with a value splat'ed (duplicated) to all of the +;; elements.
+(define_expand "vpair_splat_" + [(use (match_operand:OO 0 "vsx_register_operand")) + (use (match_operand:VP_SPLAT 1 "input_operand"))] + "TARGET_MMA" +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + machine_mode element_mode = mode; + machine_mode vector_mode = mode; + + if (op1 == CONST0_RTX (element_mode)) + { + emit_insn (gen_vpair_zero (op0)); + DONE; + } + + rtx vec = gen_reg_rtx (vector_mode); + unsigned num_elements = GET_MODE_NUNITS (vector_mode); + rtvec elements = rtvec_alloc (num_elements); + for (unsigned i = 0; i < num_elements; i++) + RTVEC_ELT (elements, i) = copy_rtx (op1); + + rs6000_expand_vector_init (vec, gen_rtx_PARALLEL (vector_mode, elements)); + emit_insn (gen_vpair_assemble_ (op0, vec, vec)); + DONE; +}) + + +;; Vector pair floating point unary operations +(define_insn_and_split "vpair__2" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(VP_FP_UNARY:OO + (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_unary_vector_pair (mode, operands, + gen_2); + DONE; +} + [(set_attr "length" "8")]) + +;; Optimize vector pair negate of absolute value +(define_insn_and_split "vpair_nabs_2" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(abs:OO (match_operand:OO 1 "vsx_register_operand" "ww"))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_unary_vector_pair (mode, operands, + gen_vsx_nabs2); + DONE; +} + [(set_attr "length" "8")]) + +;; Vector pair floating binary operations +(define_insn_and_split "vpair__3" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(VP_FP_BINARY:OO + (match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_3); + DONE; +} + [(set_attr "length" "8")]) + +;; Vector pair fused multiply-add floating point operations +(define_insn_and_split "vpair_fma_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_fma_vector_pair (mode, operands, + gen_fma4); + DONE; +} + [(set_attr "length" "8")]) + +(define_insn_and_split "vpair_fms_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (unspec:OO + [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_fma_vector_pair (mode, operands, + gen_fms4); + DONE; +} + [(set_attr "length" "8")]) + +(define_insn_and_split "vpair_nfma_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_fma_vector_pair (mode, operands, + gen_nfma4); + DONE; +} + [(set_attr "length" "8")]) + 
+(define_insn_and_split "vpair_nfms_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (unspec:OO + [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_fma_vector_pair (mode, operands, + gen_nfms4); + DONE; +} + [(set_attr "length" "8")]) + +;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c). +(define_insn_and_split "*vpair_fma_fpcontract_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(plus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (match_dup 3))] + VP_FP))] +{ +} + [(set_attr "length" "8")]) + +;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c) +(define_insn_and_split "*vpair_fms_fpcontract_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(minus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (unspec:OO + [(neg:OO + (match_dup 3))] + VP_FP))] + VP_FP))] +{ +} + [(set_attr "length" "8")]) + + +;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c). +(define_insn_and_split "*vpair_nfma_fpcontract_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(plus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (match_dup 3))] + VP_FP))] + VP_FP))] +{ +} + [(set_attr "length" "8")]) + +;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c) +(define_insn_and_split "*vpair_nfms_fpcontract_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(minus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (unspec:OO + [(neg:OO + (match_dup 3))] + VP_FP))] + VP_FP))] + VP_FP))] +{ +} + [(set_attr "length" "8")]) + + +;; Add all elements in a pair of V4SF vectors. 
+(define_insn_and_split "vpair_reduc_plus_scale_v8sf" + [(set (match_operand:SF 0 "vsx_register_operand" "=wa") + (unspec:SF [(match_operand:OO 1 "vsx_register_operand" "v")] + UNSPEC_VPAIR_REDUCE_PLUS_F32)) + (clobber (match_scratch:V4SF 2 "=&v")) + (clobber (match_scratch:V4SF 3 "=&v"))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx tmp1 = operands[2]; + rtx tmp2 = operands[3]; + unsigned r = reg_or_subregno (op1); + rtx op1_hi = gen_rtx_REG (V4SFmode, r); + rtx op1_lo = gen_rtx_REG (V4SFmode, r + 1); + + emit_insn (gen_addv4sf3 (tmp1, op1_hi, op1_lo)); + emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (8))); + emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2)); + emit_insn (gen_altivec_vsldoi_v4sf (tmp1, tmp2, tmp2, GEN_INT (4))); + emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2)); + emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2)); + DONE; +} + [(set_attr "length" "24")]) + +;; Add all elements in a pair of V2DF vectors +(define_insn_and_split "vpair_reduc_plus_scale_v4df" + [(set (match_operand:DF 0 "vsx_register_operand" "=&wa") + (unspec:DF [(match_operand:OO 1 "vsx_register_operand" "wa")] + UNSPEC_VPAIR_REDUCE_PLUS_F64)) + (clobber (match_scratch:DF 2 "=&wa")) + (clobber (match_scratch:V2DF 3 "=&wa"))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(set (match_dup 3) + (plus:V2DF (match_dup 4) + (match_dup 5))) + (set (match_dup 2) + (vec_select:DF (match_dup 3) + (parallel [(match_dup 6)]))) + (set (match_dup 0) + (plus:DF (match_dup 7) + (match_dup 2)))] +{ + unsigned reg1 = reg_or_subregno (operands[1]); + unsigned reg3 = reg_or_subregno (operands[3]); + + operands[4] = gen_rtx_REG (V2DFmode, reg1); + operands[5] = gen_rtx_REG (V2DFmode, reg1 + 1); + operands[6] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0); + operands[7] = gen_rtx_REG (DFmode, reg3); +}) + + +;; Vector pair integer negate support. +(define_insn_and_split "vpair_neg_2" + [(set (match_operand:OO 0 "altivec_register_operand" "=v") + (unspec:OO [(neg:OO + (match_operand:OO 1 "altivec_register_operand" "v"))] + VP_INT)) + (clobber (match_scratch: 2 "="))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (minus: (match_dup 2) + (match_dup 5))) + (set (match_dup 6) (minus: (match_dup 2) + (match_dup 7)))] +{ + unsigned reg0 = reg_or_subregno (operands[0]); + unsigned reg1 = reg_or_subregno (operands[1]); + machine_mode vmode = mode; + + operands[3] = CONST0_RTX (vmode); + + operands[4] = gen_rtx_REG (vmode, reg0); + operands[5] = gen_rtx_REG (vmode, reg1); + + operands[6] = gen_rtx_REG (vmode, reg0 + 1); + operands[7] = gen_rtx_REG (vmode, reg1 + 1); + + /* If the vector integer size is 32 or 64 bits, we can use the vneg{w,d} + instructions. */ + if (vmode == V4SImode) + { + emit_insn (gen_negv4si2 (operands[4], operands[5])); + emit_insn (gen_negv4si2 (operands[6], operands[7])); + DONE; + } + else if (vmode == V2DImode) + { + emit_insn (gen_negv2di2 (operands[4], operands[5])); + emit_insn (gen_negv2di2 (operands[6], operands[7])); + DONE; + } +} + [(set_attr "length" "8")]) + +;; Vector pair integer not support. +(define_insn_and_split "vpair_not_2" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(not:OO (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_unary_vector_pair (mode, operands, + gen_one_cmpl2); + DONE; +} + [(set_attr "length" "8")]) + +;; Vector pair integer binary operations. 
+(define_insn_and_split "vpair__3" + [(set (match_operand:OO 0 "" "=") + (unspec:OO [(VP_INT_BINARY:OO + (match_operand:OO 1 "" "") + (match_operand:OO 2 "" ""))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_3); + DONE; +} + [(set_attr "length" "8")]) + +;; Optimize vector pair a & ~b +(define_insn_and_split "*vpair_andc_" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(and:OO + (unspec:OO + [(not:OO + (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_INT) + (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_andc3); + DONE; +} + [(set_attr "length" "8")]) + +;; Optimize vector pair a | ~b +(define_insn_and_split "*vpair_iorc_" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(ior:OO + (unspec:OO + [(not:OO + (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_INT) + (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_orc3); + DONE; +} + [(set_attr "length" "8")]) + +;; Optiomize vector pair ~(a & b) or ((~a) | (~b)) +(define_insn_and_split "*vpair_nand__1" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO + [(not:OO + (unspec:OO [(and:OO + (match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_INT))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_nand3); + DONE; +} + [(set_attr "length" "8")]) + +(define_insn_and_split "*vpair_nand__2" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO + [(ior:OO + (unspec:OO + [(not:OO + (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_INT) + (unspec:OO + [(not:OO + (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_INT))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_nand3); + DONE; +} + [(set_attr "length" "8")]) + +;; Optiomize vector pair ~(a | b) or ((~a) & (~b)) +(define_insn_and_split "*vpair_nor__1" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO + [(not:OO + (unspec:OO [(ior:OO + (match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_INT))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_nor3); + DONE; +} + [(set_attr "length" "8")]) + +(define_insn_and_split "*vpair_nor__2" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO + [(ior:OO + (unspec:OO + [(not:OO (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_INT) + (unspec:OO + [(not:OO (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_INT))] + VP_INT))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (mode, operands, + gen_nor3); + DONE; +} + [(set_attr "length" "8")]) + +;; Add all elements in a pair of V2DI vectors +(define_insn_and_split "vpair_reduc_plus_scale_v4di" + [(set (match_operand:DI 0 "gpc_reg_operand" "=&r") + (unspec:DI [(match_operand:OO 1 "altivec_register_operand" "v")] + UNSPEC_VPAIR_REDUCE_PLUS_I64)) + (clobber (match_scratch:V2DI 2 "=&v")) + (clobber (match_scratch:DI 3 "=&r"))] + "TARGET_MMA && 
TARGET_POWERPC64" + "#" + "&& reload_completed" + [(set (match_dup 2) + (plus:V2DI (match_dup 4) + (match_dup 5))) + (set (match_dup 3) + (vec_select:DI (match_dup 2) + (parallel [(const_int 0)]))) + (set (match_dup 0) + (vec_select:DI (match_dup 2) + (parallel [(const_int 1)]))) + (set (match_dup 0) + (plus:DI (match_dup 0) + (match_dup 3)))] +{ + unsigned reg1 = reg_or_subregno (operands[1]); + + operands[4] = gen_rtx_REG (V2DImode, reg1); + operands[5] = gen_rtx_REG (V2DImode, reg1 + 1); +} + [(set_attr "length" "16")]) diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index e8180945ab4b..faa41380a464 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -14968,6 +14968,7 @@ instructions, but allow the compiler to schedule those calls. * NDS32 Built-in Functions:: * Nvidia PTX Built-in Functions:: * Basic PowerPC Built-in Functions:: +* PowerPC Vector Pair Built-in Functions Available on ISA 3.1:: * PowerPC AltiVec/VSX Built-in Functions:: * PowerPC Hardware Transactional Memory Built-in Functions:: * PowerPC Atomic Memory Operation Functions:: @@ -21289,6 +21290,160 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128); @end smallexample + +@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1 +@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1 + +GCC provides built-in functions that use @code{__vector_pair} to hold two 128-bit vectors and operate on both of them at once. The load vector pair and store vector pair instructions move the values between memory and registers, while the operation itself is split into two separate vector instructions. To use the vector pair built-in functions, you need to have MMA support enabled (@option{-mmma}, which is enabled by default with @option{-mcpu=power10}).
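+For example (an illustrative sketch, not text generated by the
+compiler), a function that adds two arrays of @code{double} values
+four elements at a time can be written as:
+
+@smallexample
+void
+vadd (__vector_pair *dst, __vector_pair *a, __vector_pair *b, int n)
+@{
+  for (int i = 0; i < n; i++)    /* each pair holds 4 doubles */
+    dst[i] = __builtin_vpair_f64_add (a[i], b[i]);
+@}
+@end smallexample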
+ +The following built-in functions are independent of the type of the underlying vector: + +@smallexample +__vector_pair __builtin_vpair_zero (); +@end smallexample + +The following built-in functions operate on pairs of +@code{vector float} values: + +@smallexample +vector float __builtin_vpair_f32_get_vector (__vector_pair, int); +__vector_pair __builtin_vpair_f32_assemble (vector float, vector float); +__vector_pair __builtin_vpair_f32_splat (float); +__vector_pair __builtin_vpair_f32_abs (__vector_pair); +__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_neg (__vector_pair); +__vector_pair __builtin_vpair_f32_smax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_smin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair); +float __builtin_vpair_f32_add_elements (__vector_pair); +@end smallexample + +The following built-in functions operate on pairs of +@code{vector double} values: + +@smallexample +vector double __builtin_vpair_f64_get_vector (__vector_pair, int); +__vector_pair __builtin_vpair_f64_assemble (vector double, vector double); +__vector_pair __builtin_vpair_f64_splat (double); +__vector_pair __builtin_vpair_f64_abs (__vector_pair); +__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_neg (__vector_pair); +__vector_pair __builtin_vpair_f64_smax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_smin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair); +double __builtin_vpair_f64_add_elements (__vector_pair); +@end smallexample + +The following built-in functions operate on pairs of +@code{vector long long} or @code{vector unsigned long long} values: + +@smallexample +vector long long __builtin_vpair_i64_get_vector (__vector_pair, int); +vector unsigned long long __builtin_vpair_i64u_get_vector (__vector_pair, int); +__vector_pair __builtin_vpair_i64_assemble (vector long long, + vector long long); +__vector_pair __builtin_vpair_i64u_assemble (vector unsigned long long, + vector unsigned long long); +__vector_pair __builtin_vpair_i64_splat (long long); +__vector_pair __builtin_vpair_i64u_splat (unsigned long long); +__vector_pair __builtin_vpair_i64_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_and (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_ior (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_neg (__vector_pair); +__vector_pair __builtin_vpair_i64_not (__vector_pair); +__vector_pair __builtin_vpair_i64_smax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_smin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_sub (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_umax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_umin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i64_xor (__vector_pair, __vector_pair); +long long __builtin_vpair_i64_add_elements (__vector_pair); +@end smallexample + +The following built-in functions operate on pairs of +@code{vector int} or @code{vector unsigned
int} values: + +@smallexample +vector int __builtin_vpair_i32_get_vector (__vector_pair, int); +vector unsigned int __builtin_vpair_i32u_get_vector (__vector_pair, int); +__vector_pair __builtin_vpair_i32_assemble (vector int, vector int); +__vector_pair __builtin_vpair_i32u_assemble (vector unsigned int, + vector unsigned int); +__vector_pair __builtin_vpair_i32_splat (int); +__vector_pair __builtin_vpair_i32u_splat (unsigned int); +__vector_pair __builtin_vpair_i32_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_and (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_ior (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_neg (__vector_pair); +__vector_pair __builtin_vpair_i32_not (__vector_pair); +__vector_pair __builtin_vpair_i32_smax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_smin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_sub (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_umax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_umin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i32_xor (__vector_pair, __vector_pair); +@end smallexample + +The following built-in functions operate on pairs of +@code{vector short} or @code{vector unsigned short} values: + +@smallexample +vector short __builtin_vpair_i16_get_vector (__vector_pair, int); +vector unsigned short __builtin_vpair_i16u_get_vector (__vector_pair, int); +__vector_pair __builtin_vpair_i16_assemble (vector short, + vector short); +__vector_pair __builtin_vpair_i16u_assemble (vector unsigned short, + vector unsigned short); +__vector_pair __builtin_vpair_i16_splat (short); +__vector_pair __builtin_vpair_i16u_splat (unsigned short); +__vector_pair __builtin_vpair_i16_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_and (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_ior (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_neg (__vector_pair); +__vector_pair __builtin_vpair_i16_not (__vector_pair); +__vector_pair __builtin_vpair_i16_smax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_smin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_sub (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_umax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_umin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i16_xor (__vector_pair, __vector_pair); +@end smallexample + +The following built-in functions operate on pairs of +@code{vector signed char} or @code{vector unsigned char} values: + +@smallexample +vector signed char __builtin_vpair_i8_get_vector (__vector_pair, int); +vector unsigned char __builtin_vpair_i8u_get_vector (__vector_pair, int); +__vector_pair __builtin_vpair_i8_assemble (vector signed char, + vector signed char); +__vector_pair __builtin_vpair_i8u_assemble (vector unsigned char, + vector unsigned char); +__vector_pair __builtin_vpair_i8_splat (signed char); +__vector_pair __builtin_vpair_i8u_splat (unsigned char); +__vector_pair __builtin_vpair_i8_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_and (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_ior (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_neg (__vector_pair); +__vector_pair __builtin_vpair_i8_not (__vector_pair); +__vector_pair __builtin_vpair_i8_smax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_smin
(__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_sub (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_umax (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_umin (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_i8_xor (__vector_pair, __vector_pair); +@end smallexample + @node PowerPC Hardware Transactional Memory Built-in Functions @subsection PowerPC Hardware Transactional Memory Built-in Functions GCC provides two interfaces for accessing the Hardware Transactional diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-01.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-01.c new file mode 100644 index 000000000000..0cfdd111afd9 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-01.c @@ -0,0 +1,172 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector built-in code generates the expected instructions for + vector pairs with 4 double elements. */ + +void +test_add (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvadddp, 1 stxvp. */ + *dest = __builtin_vpair_f64_add (*x, *y); +} + +void +test_sub (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvsubdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_sub (*x, *y); +} + +void +test_multiply (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmuldp, 1 stxvp. */ + *dest = __builtin_vpair_f64_mul (*x, *y); +} + +void +test_min (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmindp, 1 stxvp. */ + *dest = __builtin_vpair_f64_smin (*x, *y); +} + +void +test_max (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmaxdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_smax (*x, *y); +} + +void +test_negate (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvnegdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_neg (*x); +} + +void +test_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvabsdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_abs (*x); +} + +void +test_negative_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 2 lxvp, 2 xvnabsdp, 1 stxvp. */ + __vector_pair ab = __builtin_vpair_f64_abs (*x); + *dest = __builtin_vpair_f64_neg (ab); +} + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,q}dp, 1 stxvp. */ + *dest = __builtin_vpair_f64_fma (*x, *y, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,q}dp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f64_neg (*z); + *dest = __builtin_vpair_f64_fma (*x, *y, n); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,q}dp, 1 stxvp. */ + __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z); + *dest = __builtin_vpair_f64_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,q}dp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f64_neg (*z); + __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n); + *dest = __builtin_vpair_f64_neg (w); +} + +void +test_splat (__vector_pair *dest, double x) +{ + /* 1 xxpermdi, 1 stxvp. */ + *dest = __builtin_vpair_f64_splat (x); +} + +void +test_zero (__vector_pair *dest) +{ + /* 2 xxspltib, 1 stxvp. 
*/ + *dest = __builtin_vpair_zero (); +} + +vector double +test_get_vector_0 (__vector_pair *x) +{ + /* 1 lxv. */ + return __builtin_vpair_f64_get_vector (*x, 0); +} + +vector double +test_get_vector_1 (__vector_pair *x) +{ + /* 1 lxv. */ + return __builtin_vpair_f64_get_vector (*x, 1); +} + +double +test_add_elements (__vector_pair *x) +{ + /* 1 lxvp, 1 xvadddp, 1 xxpermdi, 1 fadd/xsadddp. */ + return __builtin_vpair_f64_add_elements (*x); +} + +/* { dg-final { scan-assembler-times {\mlxv\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mlxvp\M} 26 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 14 } } */ +/* { dg-final { scan-assembler-times {\mxvabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvadddp\M} 3 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmindp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmuldp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsubdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-02.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-02.c new file mode 100644 index 000000000000..1e54d4f82a9a --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-02.c @@ -0,0 +1,176 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector built-in code generates the expected instructions for + vector pairs with 8 float elements. */ + +void +test_add (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvaddsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_add (*x, *y); +} + +void +test_sub (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvsubsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_sub (*x, *y); +} + +void +test_multiply (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmulsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_mul (*x, *y); +} + +void +test_max (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmaxsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_smax (*x, *y); +} + +void +test_min (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvminsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_smin (*x, *y); +} + +void +test_negate (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvnegsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_neg (*x); +} + +void +test_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvabssp, 1 stxvp. */ + *dest = __builtin_vpair_f32_abs (*x); +} + +void +test_negative_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 2 lxvp, 2 xvnabssp, 1 stxvp. */ + __vector_pair ab = __builtin_vpair_f32_abs (*x); + *dest = __builtin_vpair_f32_neg (ab); +} + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,q}sp, 1 stxvp. 
*/ + *dest = __builtin_vpair_f32_fma (*x, *y, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,q}sp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f32_neg (*z); + *dest = __builtin_vpair_f32_fma (*x, *y, n); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,q}sp, 1 stxvp. */ + __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z); + *dest = __builtin_vpair_f32_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,q}sp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f32_neg (*z); + __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n); + *dest = __builtin_vpair_f32_neg (w); +} + +void +test_splat (__vector_pair *dest, float x) +{ + /* 1 xxpermdi, 1 stxvp. */ + *dest = __builtin_vpair_f32_splat (x); +} + +void +test_zero (__vector_pair *dest) +{ + /* 2 xxspltib, 1 stxvp. */ + *dest = __builtin_vpair_zero (); +} + +vector float +test_get_vector_0 (__vector_pair *x) +{ + /* 1 lxv. */ + return __builtin_vpair_f32_get_vector (*x, 0); +} + +vector float +test_get_vector_1 (__vector_pair *x) +{ + /* 1 lxv. */ + return __builtin_vpair_f32_get_vector (*x, 1); +} + +float +test_add_elements (__vector_pair *x) +{ + /* 1 lxvp, 3 xvaddsp, 2 vsldoi, 1 xscvspdp. */ + return __builtin_vpair_f32_add_elements (*x); +} + +/* { dg-final { scan-assembler-times {\mlxv\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mlxvp\M} 26 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 14 } } */ +/* { dg-final { scan-assembler-times {\mvsldoi\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxscvdpspn\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mxscvspdp\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mxvabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvaddsp\M} 5 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvminsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmulsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsubsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxxspltib\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxxspltw\M} 1 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-03.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-03.c new file mode 100644 index 000000000000..65bfc44f85d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-03.c @@ -0,0 +1,60 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */ + +/* Test whether the vector built-in code combines multiply, add/subtract, and + negate operations to the appropriate fused multiply-add instruction for + vector pairs with 4 double elements. */ + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. 
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-03.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-03.c
new file mode 100644
index 000000000000..65bfc44f85d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-03.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector built-in code combines multiply, add/subtract, and
+   negate operations into the appropriate fused multiply-add instruction for
+   vector pairs with 4 double elements. */
+
+void
+test_fma (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y,
+          __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y,
+          __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  *dest = __builtin_vpair_f64_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y,
+           __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_add (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y,
+           __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f64_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f64_sub (m, *z);
+  *dest = __builtin_vpair_f64_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */
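
The tests above rely on combine fusing the separate multiply and
add/subtract at -Ofast.  A sketch of the direct form, which does not depend
on that fusion (fma_direct is an invented name):

/* Illustrative: calling the fma built-in directly generates
   xvmadd{a,m}dp without needing -Ofast to fuse mul and add. */
void
fma_direct (__vector_pair *dest, __vector_pair *x, __vector_pair *y,
            __vector_pair *z)
{
  *dest = __builtin_vpair_f64_fma (*x, *y, *z);
}
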
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-04.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-04.c
new file mode 100644
index 000000000000..b62871be1fdf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-04.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */
+
+/* Test whether the vector built-in code combines multiply, add/subtract, and
+   negate operations into the appropriate fused multiply-add instruction for
+   vector pairs with 8 float elements. */
+
+void
+test_fma (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y,
+          __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_add (m, *z);
+}
+
+void
+test_fms (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y,
+          __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  *dest = __builtin_vpair_f32_sub (m, *z);
+}
+
+void
+test_nfma (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y,
+           __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_add (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+void
+test_nfms (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y,
+           __vector_pair *z)
+{
+  /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */
+  __vector_pair m = __builtin_vpair_f32_mul (*x, *y);
+  __vector_pair w = __builtin_vpair_f32_sub (m, *z);
+  *dest = __builtin_vpair_f32_neg (w);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-05.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-05.c
new file mode 100644
index 000000000000..820d6b21d057
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-05.c
@@ -0,0 +1,192 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+   vector pairs with 4 64-bit integer elements. */
+
+void
+test_add (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vaddudm, 1 stxvp. */
+  *dest = __builtin_vpair_i64_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vsubudm, 1 stxvp. */
+  *dest = __builtin_vpair_i64_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp. */
+  *dest = __builtin_vpair_i64_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+         __vector_pair *x,
+         __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp. */
+  *dest = __builtin_vpair_i64_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+  *dest = __builtin_vpair_i64_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxsd, 1 stxvp. */
+  *dest = __builtin_vpair_i64_smax (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminsd, 1 stxvp. */
+  *dest = __builtin_vpair_i64_smin (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxud, 1 stxvp. */
+  *dest = __builtin_vpair_i64_umax (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminud, 1 stxvp. */
+  *dest = __builtin_vpair_i64_umin (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+             __vector_pair *x)
+{
+  /* 1 lxvp, 2 vnegd, 1 stxvp. */
+  *dest = __builtin_vpair_i64_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+          __vector_pair *x)
+{
+  /* 1 lxvp, 2 xxlnor, 1 stxvp. */
+  *dest = __builtin_vpair_i64_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i64_not (*y);
+  *dest = __builtin_vpair_i64_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i64_not (*x);
+  *dest = __builtin_vpair_i64_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i64_not (*y);
+  *dest = __builtin_vpair_i64_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i64_not (*x);
+  *dest = __builtin_vpair_i64_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i64_and (*x, *y);
+  *dest = __builtin_vpair_i64_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair nx = __builtin_vpair_i64_not (*x);
+  __vector_pair ny = __builtin_vpair_i64_not (*y);
+  *dest = __builtin_vpair_i64_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i64_ior (*x, *y);
+  *dest = __builtin_vpair_i64_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvaddudm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxud\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminud\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvnegd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsubudm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
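
The combination tests above check that a not feeding an and/ior collapses to
one VSX instruction per 16-byte half.  A minimal sketch of the same pattern
with nested calls (andc_direct is an invented name):

/* Illustrative: x & ~y on a vector pair should emit two xxlandc
   instructions rather than two xxlnor plus two xxland. */
void
andc_direct (__vector_pair *dest, __vector_pair *x, __vector_pair *y)
{
  *dest = __builtin_vpair_i64_and (*x, __builtin_vpair_i64_not (*y));
}
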
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-06.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-06.c
new file mode 100644
index 000000000000..e61d916adc92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-06.c
@@ -0,0 +1,193 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+   vector pairs with 8 32-bit integer elements. */
+
+void
+test_add (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vadduwm, 1 stxvp. */
+  *dest = __builtin_vpair_i32_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vsubuwm, 1 stxvp. */
+  *dest = __builtin_vpair_i32_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp. */
+  *dest = __builtin_vpair_i32_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+         __vector_pair *x,
+         __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp. */
+  *dest = __builtin_vpair_i32_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+  *dest = __builtin_vpair_i32_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxsw, 1 stxvp. */
+  *dest = __builtin_vpair_i32_smax (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminsw, 1 stxvp. */
+  *dest = __builtin_vpair_i32_smin (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxuw, 1 stxvp. */
+  *dest = __builtin_vpair_i32_umax (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminuw, 1 stxvp. */
+  *dest = __builtin_vpair_i32_umin (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+             __vector_pair *x)
+{
+  /* 1 lxvp, 2 vnegw, 1 stxvp. */
+  *dest = __builtin_vpair_i32_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+          __vector_pair *x)
+{
+  /* 1 lxvp, 2 xxlnor, 1 stxvp. */
+  *dest = __builtin_vpair_i32_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i32_not (*y);
+  *dest = __builtin_vpair_i32_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i32_not (*x);
+  *dest = __builtin_vpair_i32_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i32_not (*y);
+  *dest = __builtin_vpair_i32_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i32_not (*x);
+  *dest = __builtin_vpair_i32_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i32_and (*x, *y);
+  *dest = __builtin_vpair_i32_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair nx = __builtin_vpair_i32_not (*x);
+  __vector_pair ny = __builtin_vpair_i32_not (*y);
+  *dest = __builtin_vpair_i32_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i32_ior (*x, *y);
+  *dest = __builtin_vpair_i32_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 34 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvadduwm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxuw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminuw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvnegw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuwm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-07.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-07.c
new file mode 100644
index 000000000000..cc205c66be62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-07.c
@@ -0,0 +1,193 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+   vector pairs with 16 16-bit integer elements. */
+
+void
+test_add (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vadduhm, 1 stxvp. */
+  *dest = __builtin_vpair_i16_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vsubuhm, 1 stxvp. */
+  *dest = __builtin_vpair_i16_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp. */
+  *dest = __builtin_vpair_i16_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+         __vector_pair *x,
+         __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp. */
+  *dest = __builtin_vpair_i16_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+  *dest = __builtin_vpair_i16_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxsh, 1 stxvp. */
+  *dest = __builtin_vpair_i16_smax (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminsh, 1 stxvp. */
+  *dest = __builtin_vpair_i16_smin (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxuh, 1 stxvp. */
+  *dest = __builtin_vpair_i16_umax (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminuh, 1 stxvp. */
+  *dest = __builtin_vpair_i16_umin (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+             __vector_pair *x)
+{
+  /* 1 lxvp, 1 xxspltib, 2 vsubuhm, 1 stxvp. */
+  *dest = __builtin_vpair_i16_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+          __vector_pair *x)
+{
+  /* 1 lxvp, 2 xxlnor, 1 stxvp. */
+  *dest = __builtin_vpair_i16_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i16_not (*y);
+  *dest = __builtin_vpair_i16_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i16_not (*x);
+  *dest = __builtin_vpair_i16_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i16_not (*y);
+  *dest = __builtin_vpair_i16_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i16_not (*x);
+  *dest = __builtin_vpair_i16_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i16_and (*x, *y);
+  *dest = __builtin_vpair_i16_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair nx = __builtin_vpair_i16_not (*x);
+  __vector_pair ny = __builtin_vpair_i16_not (*y);
+  *dest = __builtin_vpair_i16_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i16_ior (*x, *y);
+  *dest = __builtin_vpair_i16_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 34 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvadduhm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxuh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminuh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuhm\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-08.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-08.c
new file mode 100644
index 000000000000..9e4e4cdfffa7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-08.c
@@ -0,0 +1,194 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector built-in code generates the expected instructions for
+   vector pairs with 32 8-bit integer elements. */
+
+
+void
+test_add (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vaddubm, 1 stxvp. */
+  *dest = __builtin_vpair_i8_add (*x, *y);
+}
+
+void
+test_sub (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 vsububm, 1 stxvp. */
+  *dest = __builtin_vpair_i8_sub (*x, *y);
+}
+
+void
+test_and (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp. */
+  *dest = __builtin_vpair_i8_and (*x, *y);
+}
+
+void
+test_or (__vector_pair *dest,
+         __vector_pair *x,
+         __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp. */
+  *dest = __builtin_vpair_i8_ior (*x, *y);
+}
+
+void
+test_xor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp. */
+  *dest = __builtin_vpair_i8_xor (*x, *y);
+}
+
+void
+test_smax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxsb, 1 stxvp. */
+  *dest = __builtin_vpair_i8_smax (*x, *y);
+}
+
+void
+test_smin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminsb, 1 stxvp. */
+  *dest = __builtin_vpair_i8_smin (*x, *y);
+}
+
+void
+test_umax (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vmaxub, 1 stxvp. */
+  *dest = __builtin_vpair_i8_umax (*x, *y);
+}
+
+void
+test_umin (__vector_pair *dest,
+           __vector_pair *x,
+           __vector_pair *y)
+{
+  /* 2 lxvp, 2 vminub, 1 stxvp. */
+  *dest = __builtin_vpair_i8_umin (*x, *y);
+}
+
+void
+test_negate (__vector_pair *dest,
+             __vector_pair *x)
+{
+  /* 1 lxvp, 1 xxspltib, 2 vsububm, 1 stxvp. */
+  *dest = __builtin_vpair_i8_neg (*x);
+}
+
+void
+test_not (__vector_pair *dest,
+          __vector_pair *x)
+{
+  /* 1 lxvp, 2 xxlnor, 1 stxvp. */
+  *dest = __builtin_vpair_i8_not (*x);
+}
+
+/* Combination of logical operators. */
+
+void
+test_andc_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i8_not (*y);
+  *dest = __builtin_vpair_i8_and (*x, n);
+}
+
+void
+test_andc_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i8_not (*x);
+  *dest = __builtin_vpair_i8_and (n, *y);
+}
+
+void
+test_orc_1 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i8_not (*y);
+  *dest = __builtin_vpair_i8_ior (*x, n);
+}
+
+void
+test_orc_2 (__vector_pair *dest,
+            __vector_pair *x,
+            __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp. */
+  __vector_pair n = __builtin_vpair_i8_not (*x);
+  *dest = __builtin_vpair_i8_ior (n, *y);
+}
+
+void
+test_nand_1 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i8_and (*x, *y);
+  *dest = __builtin_vpair_i8_not (a);
+}
+
+void
+test_nand_2 (__vector_pair *dest,
+             __vector_pair *x,
+             __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp. */
+  __vector_pair nx = __builtin_vpair_i8_not (*x);
+  __vector_pair ny = __builtin_vpair_i8_not (*y);
+  *dest = __builtin_vpair_i8_ior (nx, ny);
+}
+
+void
+test_nor (__vector_pair *dest,
+          __vector_pair *x,
+          __vector_pair *y)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp. */
+  __vector_pair a = __builtin_vpair_i8_ior (*x, *y);
+  *dest = __builtin_vpair_i8_not (a);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 34 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 18 } } */
+/* { dg-final { scan-assembler-times {\mvaddubm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxsb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmaxub\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminsb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvminub\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsububm\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 1 } } */