From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1005) id 23A463858281; Tue, 23 Jan 2024 07:10:49 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 23A463858281 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1705993849; bh=3IZ3G6cJCZq+rFRf/yvcDeAzu66I68qEVWRjpgqB+C0=; h=From:To:Subject:Date:From; b=CatU9RFCzp2ZNDC/V4tZ/smTly9DUzosrzQxKt1yw/atrYPlxDXb9QUfNckzra4U/ aZlCu4l7/UkDaNtrC6jVRAaJKsLlSuQAh+CohXdDskeY2+M8ZaViWKR/+pkNG9S3tL wmWUX3Pv8m2TJj/vMjcabUe7pyPlprqpeF5ceMi0= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Michael Meissner To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/users/meissner/heads/work154-vpair)] Add support for vector pair fma operations. X-Act-Checkin: gcc X-Git-Author: Michael Meissner X-Git-Refname: refs/users/meissner/heads/work154-vpair X-Git-Oldrev: 656c1538a142c9ddbf2e836dde7609569602e15a X-Git-Newrev: d6c48ffd5b8e43023e3efbdd1aa20b7830b82fb8 Message-Id: <20240123071049.23A463858281@sourceware.org> Date: Tue, 23 Jan 2024 07:10:49 +0000 (GMT) List-Id: https://gcc.gnu.org/g:d6c48ffd5b8e43023e3efbdd1aa20b7830b82fb8 commit d6c48ffd5b8e43023e3efbdd1aa20b7830b82fb8 Author: Michael Meissner Date: Tue Jan 23 02:09:18 2024 -0500 Add support for vector pair fma operations. 2024-01-23 Michael Meissner gcc/ * config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_fma): New built-in. (__builtin_vpair_f32_fms): Likewise. (__builtin_vpair_f32_nfma): Likewise. (__builtin_vpair_f32_nfms): Likewise. (__builtin_vpair_f64_fma): Likewise. (__builtin_vpair_f64_fms): Likewise. (__builtin_vpair_f64_nfma): Likewise. (__builtin_vpair_f64_nfms): Likewise. * config/rs6000/rs6000-protos.h (enum vpair_split_fma): New enumeration. (vpair_split_fma): New declaration. * config/rs6000/rs6000.cc (vpair_split_fma): New function to split vector pair FMA operations. * config/rs6000/vector-pair.md (UNSPEC_VPAIR_FMA): New unspec. 
(vpair_stdname): Add UNSPEC_VPAIR_FMA. (VPAIR_OP): Likewise. (vpair_fma_4): New insns. (vpair_fms_4): Likewise. (vpair_nfma_4): Likewise. (vpair_nfms_4): Likewise. * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document new vector pair fma built-in functions. gcc/testsuite/ * gcc.target/powerpc/vector-pair-3.c: New test. * gcc.target/powerpc/vector-pair-4.c: Likewise. Diff: --- gcc/config/rs6000/rs6000-builtins.def | 24 ++++++ gcc/config/rs6000/rs6000-protos.h | 13 ++++ gcc/config/rs6000/rs6000.cc | 71 ++++++++++++++++++ gcc/config/rs6000/vector-pair.md | 96 ++++++++++++++++++++++++ gcc/doc/extend.texi | 25 ++++++ gcc/testsuite/gcc.target/powerpc/vector-pair-3.c | 57 ++++++++++++++ gcc/testsuite/gcc.target/powerpc/vector-pair-4.c | 57 ++++++++++++++ 7 files changed, 343 insertions(+) diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 83e7206e989..4362cbb8fc7 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -4142,6 +4142,12 @@ v256 __builtin_vpair_f32_div (v256, v256); VPAIR_F32_DIV vpair_div_v8sf3 {mma} + v256 __builtin_vpair_f32_fma (v256, v256, v256); + VPAIR_F32_FMA vpair_fma_v8sf4 {mma} + + v256 __builtin_vpair_f32_fms (v256, v256, v256); + VPAIR_F32_FMS vpair_fms_v8sf4 {mma} + v256 __builtin_vpair_f32_max (v256, v256); VPAIR_F32_MAX vpair_smax_v8sf3 {mma} @@ -4157,6 +4163,12 @@ v256 __builtin_vpair_f32_neg (v256); VPAIR_F32_NEG vpair_neg_v8sf2 {mma} + v256 __builtin_vpair_f32_nfma (v256, v256, v256); + VPAIR_F32_NFMA vpair_nfma_v8sf4 {mma} + + v256 __builtin_vpair_f32_nfms (v256, v256, v256); + VPAIR_F32_NFMS vpair_nfms_v8sf4 {mma} + v256 __builtin_vpair_f32_sub (v256, v256); VPAIR_F32_SUB vpair_sub_v8sf3 {mma} @@ -4170,6 +4182,12 @@ v256 __builtin_vpair_f64_div (v256, v256); VPAIR_F64_DIV vpair_div_v4df3 {mma} + v256 __builtin_vpair_f64_fma (v256, v256, v256); + VPAIR_F64_FMA vpair_fma_v4df4 {mma} + + v256 __builtin_vpair_f64_fms (v256, v256, v256); + 
VPAIR_F64_FMS vpair_fms_v4df4 {mma} + v256 __builtin_vpair_f64_max (v256, v256); VPAIR_F64_MAX vpair_smax_v4df3 {mma} @@ -4185,5 +4203,11 @@ v256 __builtin_vpair_f64_neg (v256); VPAIR_F64_NEG vpair_neg_v4df2 {mma} + v256 __builtin_vpair_f64_nfma (v256, v256, v256); + VPAIR_F64_NFMA vpair_nfma_v4df4 {mma} + + v256 __builtin_vpair_f64_nfms (v256, v256, v256); + VPAIR_F64_NFMS vpair_nfms_v4df4 {mma} + v256 __builtin_vpair_f64_sub (v256, v256); VPAIR_F64_SUB vpair_sub_v4df3 {mma} diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 4d6ecc83436..aed4081c87b 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -174,6 +174,19 @@ extern void vpair_split_unary (rtx [], machine_mode, enum rtx_code, enum vpair_split_unary); extern void vpair_split_binary (rtx [], machine_mode, enum rtx_code); +/* When we are splitting a vector pair FMA operation into two vector operations, we + may need to modify the code generated. This enumeration encodes the + different choices. */ + +enum vpair_split_fma { + VPAIR_SPLIT_FMA, /* Fused multiply-add. */ + VPAIR_SPLIT_FMS, /* Fused multiply-subtract. */ + VPAIR_SPLIT_NFMA, /* Fused negate multiply-add. */ + VPAIR_SPLIT_NFMS /* Fused negate multiply-subtract. */ +}; + +extern void vpair_split_fma (rtx [], machine_mode, enum vpair_split_fma); + /* Different PowerPC instruction formats that are used by GCC. There are various other instruction formats used by the PowerPC hardware, but these formats are not currently used by GCC. */ diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index e15669b72fb..39ed73436a7 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -29455,6 +29455,77 @@ vpair_split_binary (rtx operands[], /* Dest, 2 inputs. */ emit_insn (gen_rtx_SET (op0_b, operation_b)); return; } + +/* Split vector pair fma operations. */ + +void +vpair_split_fma (rtx operands[], /* Dest, 3 inputs. 
*/ + machine_mode vmode, /* Vector mode. */ + enum vpair_split_fma action) /* Action to take. */ +{ + rtx op0 = operands[0]; + machine_mode mode0 = GET_MODE (op0); + gcc_assert (GET_MODE_SIZE (mode0) == 32); + rtx op0_a = simplify_gen_subreg (vmode, op0, mode0, 0); + rtx op0_b = simplify_gen_subreg (vmode, op0, mode0, 16); + + rtx op1 = operands[1]; + machine_mode mode1 = GET_MODE (op1); + gcc_assert (GET_MODE_SIZE (mode1) == 32); + rtx op1_a = simplify_gen_subreg (vmode, op1, mode1, 0); + rtx op1_b = simplify_gen_subreg (vmode, op1, mode1, 16); + + rtx op2 = operands[2]; + machine_mode mode2 = GET_MODE (op2); + gcc_assert (GET_MODE_SIZE (mode2) == 32); + rtx op2_a = simplify_gen_subreg (vmode, op2, mode2, 0); + rtx op2_b = simplify_gen_subreg (vmode, op2, mode2, 16); + + rtx op3 = operands[3]; + machine_mode mode3 = GET_MODE (op3); + gcc_assert (GET_MODE_SIZE (mode3) == 32); + rtx op3_a = simplify_gen_subreg (vmode, op3, mode3, 0); + rtx op3_b = simplify_gen_subreg (vmode, op3, mode3, 16); + + switch (action) + { + case VPAIR_SPLIT_FMA: + case VPAIR_SPLIT_NFMA: + break; + + case VPAIR_SPLIT_FMS: + case VPAIR_SPLIT_NFMS: + op3_a = gen_rtx_NEG (vmode, op3_a); + op3_b = gen_rtx_NEG (vmode, op3_b); + break; + + default: + gcc_unreachable (); + } + + rtx operation_a = gen_rtx_fmt_eee (FMA, vmode, op1_a, op2_a, op3_a); + rtx operation_b = gen_rtx_fmt_eee (FMA, vmode, op1_b, op2_b, op3_b); + + switch (action) + { + case VPAIR_SPLIT_FMA: + case VPAIR_SPLIT_FMS: + break; + + case VPAIR_SPLIT_NFMA: + case VPAIR_SPLIT_NFMS: + operation_a = gen_rtx_NEG (vmode, operation_a); + operation_b = gen_rtx_NEG (vmode, operation_b); + break; + + default: + gcc_unreachable (); + } + + emit_insn (gen_rtx_SET (op0_a, operation_a)); + emit_insn (gen_rtx_SET (op0_b, operation_b)); + return; +} struct gcc_target targetm = TARGET_INITIALIZER; diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md index 4b7a8db0d48..73ae46e6d40 100644 --- 
a/gcc/config/rs6000/vector-pair.md +++ b/gcc/config/rs6000/vector-pair.md @@ -32,6 +32,7 @@ (define_c_enum "unspec" [UNSPEC_VPAIR_ABS UNSPEC_VPAIR_DIV + UNSPEC_VPAIR_FMA UNSPEC_VPAIR_MINUS UNSPEC_VPAIR_MULT UNSPEC_VPAIR_NEG @@ -71,6 +72,7 @@ ;; Map the vpair operator unspec number to the standard name. (define_int_attr vpair_stdname [(UNSPEC_VPAIR_ABS "abs") (UNSPEC_VPAIR_DIV "div") + (UNSPEC_VPAIR_FMA "fma") (UNSPEC_VPAIR_MINUS "sub") (UNSPEC_VPAIR_MULT "mul") (UNSPEC_VPAIR_NEG "neg") @@ -81,6 +83,7 @@ ;; Map the vpair operator unspec number to the RTL operator. (define_int_attr VPAIR_OP [(UNSPEC_VPAIR_ABS "ABS") (UNSPEC_VPAIR_DIV "DIV") + (UNSPEC_VPAIR_FMA "FMA") (UNSPEC_VPAIR_MINUS "MINUS") (UNSPEC_VPAIR_MULT "MULT") (UNSPEC_VPAIR_NEG "NEG") @@ -158,3 +161,96 @@ (set (attr "type") (if_then_else (match_test " == DIV") (const_string "") (const_string "")))]) + +;; Vector pair fused-multiply (FMA) operations. The last argument in the +;; UNSPEC is a CONST_INT which identifies what the scalar element is. 
+(define_insn_and_split "vpair_fma_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (match_operand:OO 3 "vsx_register_operand" "0,wa") + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_FMA))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + vpair_split_fma (operands, mode, VPAIR_SPLIT_FMA); + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "")]) + +;; Vector pair fused multiply-subtract +(define_insn_and_split "vpair_fms_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (unspec:OO + [(match_operand:OO 3 "vsx_register_operand" "0,wa") + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_NEG) + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_FMA))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + vpair_split_fma (operands, mode, VPAIR_SPLIT_FMS); + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "")]) + +;; Vector pair negate fused multiply-add +(define_insn_and_split "vpair_nfma_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(unspec:OO + [(match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (match_operand:OO 3 "vsx_register_operand" "0,wa") + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_FMA) + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_NEG))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + vpair_split_fma (operands, mode, VPAIR_SPLIT_NFMA); + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "")]) + +;; Vector pair negate fused multiply-subtract +(define_insn_and_split "vpair_nfms_4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(unspec:OO + [(match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 
"vsx_register_operand" "wa,0") + (unspec:OO + [(match_operand:OO 3 "vsx_register_operand" "0,wa") + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_NEG) + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_FMA) + (const_int VPAIR_FP_ELEMENT)] + UNSPEC_VPAIR_NEG))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + vpair_split_fma (operands, mode, VPAIR_SPLIT_NFMS); + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "")]) diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index e519b71877a..08d977515dc 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -23878,6 +23878,15 @@ store instruction. The @code{nabs} built-in is a combination of @code{neg} and @code{abs}. +The @code{fms} built-in is a combination of @code{fma} and @code{neg} +of the third element. + +The @code{nfma} built-in is a combination of @code{neg} of the +@code{fma} built-in. + +The @code{nfms} built-in is a combination of @code{neg} of the +@code{fms} built-in. + The following built-in functions operate on pairs of @code{vector float} values: @@ -23885,11 +23894,19 @@ The following built-in functions operate on pairs of __vector_pair __builtin_vpair_f32_abs (__vector_pair); __vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f32_div (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, + __vector_pair); +__vector_pair __builtin_vpair_f32_fms (__vector_pair, __vector_pair, + __vector_pair); __vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f32_nabs (__vector_pair); __vector_pair __builtin_vpair_f32_neg (__vector_pair); +__vector_pair __builtin_vpair_f32_nfma (__vector_pair, __vector_pair, + __vector_pair); +__vector_pair __builtin_vpair_f32_nfms (__vector_pair, __vector_pair, + 
__vector_pair); __vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair); @end smallexample @@ -23900,11 +23917,19 @@ The following built-in functions operate on pairs of __vector_pair __builtin_vpair_f64_abs (__vector_pair); __vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f64_div (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, + __vector_pair); +__vector_pair __builtin_vpair_f64_fms (__vector_pair, __vector_pair, + __vector_pair); __vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_f64_nabs (__vector_pair); __vector_pair __builtin_vpair_f64_neg (__vector_pair); +__vector_pair __builtin_vpair_f64_nfma (__vector_pair, __vector_pair, + __vector_pair); +__vector_pair __builtin_vpair_f64_nfms (__vector_pair, __vector_pair, + __vector_pair); __vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair); @end smallexample diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c new file mode 100644 index 00000000000..43b91461759 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c @@ -0,0 +1,57 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector builtin code generates the expected FMA instructions + for vector pairs with 4 double elements. */ + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */ + *dest = __builtin_vpair_f64_fma (*x, *y, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. 
*/ + __vector_pair n = __builtin_vpair_f64_neg (*z); + *dest = __builtin_vpair_f64_fma (*x, *y, n); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */ + __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z); + *dest = __builtin_vpair_f64_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f64_neg (*z); + __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n); + *dest = __builtin_vpair_f64_neg (w); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c new file mode 100644 index 00000000000..d5c55d3883c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c @@ -0,0 +1,57 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector builtin code generates the expected FMA instructions + for vector pairs with 8 float elements. */ + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */ + *dest = __builtin_vpair_f32_fma (*x, *y, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. 
*/ + __vector_pair n = __builtin_vpair_f32_neg (*z); + *dest = __builtin_vpair_f32_fma (*x, *y, n); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */ + __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z); + *dest = __builtin_vpair_f32_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f32_neg (*z); + __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n); + *dest = __builtin_vpair_f32_neg (w); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */