From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2078) id 654523858D33; Wed, 18 Oct 2023 01:15:08 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 654523858D33 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1697591708; bh=ruEUHtY4S2yhm6WUgq0bbr3nPyd3BehpQHqh9dAHzXk=; h=From:To:Subject:Date:From; b=b+g56OYGJ4XoVtINy4ivLK3TdXJtUoJirpKfDj5Glqvl7l2/DC8kv49QGmP0fDc0H flyCwOI5DWQQW4HkyXHzoo4tknRvJeI6Ac6XyMn0DuHU+nJR8CQBy6edm/+g9AKjLb VXFx/mDNmJJbv7VIYRMqbxciqN7WZ25lSTtfHX84= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: hongtao Liu To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-4701] Support 32/64-bit vectorization for _Float16 fma related operations. X-Act-Checkin: gcc X-Git-Author: liuhongt X-Git-Refname: refs/heads/master X-Git-Oldrev: cf7739d4a6ba0b88068877d14439436c22b57630 X-Git-Newrev: cead92b7fc4d7a545dcf2f02397120e3c9afe1a3 Message-Id: <20231018011508.654523858D33@sourceware.org> Date: Wed, 18 Oct 2023 01:15:08 +0000 (GMT) List-Id: https://gcc.gnu.org/g:cead92b7fc4d7a545dcf2f02397120e3c9afe1a3 commit r14-4701-gcead92b7fc4d7a545dcf2f02397120e3c9afe1a3 Author: liuhongt Date: Mon Oct 16 16:22:04 2023 +0800 Support 32/64-bit vectorization for _Float16 fma related operations. gcc/ChangeLog: * config/i386/mmx.md (fma<mode>4): New expander. (fms<mode>4): Ditto. (fnma<mode>4): Ditto. (fnms<mode>4): Ditto. (vec_fmaddsubv4hf4): Ditto. (vec_fmsubaddv4hf4): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/part-vect-fmaddsubhf-1.c: New test. * gcc.target/i386/part-vect-fmahf-1.c: New test. 
Diff: --- gcc/config/i386/mmx.md | 152 ++++++++++++++++++++- .../gcc.target/i386/part-vect-fmaddsubhf-1.c | 22 +++ gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c | 58 ++++++++ 3 files changed, 231 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 82ca49c207b7..491a0a512723 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2365,7 +2365,157 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; -;; Parallel single-precision floating point conversion operations +;; Parallel half-precision FMA multiply/accumulate instructions. +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "fma<mode>4" + [(set (match_operand:VHF_32_64 0 "register_operand") + (fma:VHF_32_64 + (match_operand:VHF_32_64 1 "nonimmediate_operand") + (match_operand:VHF_32_64 2 "nonimmediate_operand") + (match_operand:VHF_32_64 3 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math" +{ + rtx op3 = gen_reg_rtx (V8HFmode); + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3])); + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2])); + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1])); + + emit_insn (gen_fmav8hf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode)); + DONE; +}) + +(define_expand "fms<mode>4" + [(set (match_operand:VHF_32_64 0 "register_operand") + (fma:VHF_32_64 + (match_operand:VHF_32_64 1 "nonimmediate_operand") + (match_operand:VHF_32_64 2 "nonimmediate_operand") + (neg:VHF_32_64 + (match_operand:VHF_32_64 3 "nonimmediate_operand"))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math" +{ + rtx op3 = gen_reg_rtx (V8HFmode); + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3])); 
+ emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2])); + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1])); + + emit_insn (gen_fmsv8hf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode)); + DONE; +}) + +(define_expand "fnma<mode>4" + [(set (match_operand:VHF_32_64 0 "register_operand") + (fma:VHF_32_64 + (neg:VHF_32_64 + (match_operand:VHF_32_64 1 "nonimmediate_operand")) + (match_operand:VHF_32_64 2 "nonimmediate_operand") + (match_operand:VHF_32_64 3 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math" +{ + rtx op3 = gen_reg_rtx (V8HFmode); + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3])); + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2])); + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1])); + + emit_insn (gen_fnmav8hf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode)); + DONE; +}) + +(define_expand "fnms<mode>4" + [(set (match_operand:VHF_32_64 0 "register_operand" "=v,v,x") + (fma:VHF_32_64 + (neg:VHF_32_64 + (match_operand:VHF_32_64 1 "nonimmediate_operand")) + (match_operand:VHF_32_64 2 "nonimmediate_operand") + (neg:VHF_32_64 + (match_operand:VHF_32_64 3 "nonimmediate_operand"))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math" +{ + rtx op3 = gen_reg_rtx (V8HFmode); + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3])); + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2])); + emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1])); + + emit_insn (gen_fnmsv8hf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode)); + DONE; +}) + +(define_expand "vec_fmaddsubv4hf4" + [(match_operand:V4HF 0 "register_operand") + (match_operand:V4HF 1 "nonimmediate_operand") + (match_operand:V4HF 2 
"nonimmediate_operand") + (match_operand:V4HF 3 "nonimmediate_operand")] + "TARGET_AVX512FP16 && TARGET_AVX512VL + && TARGET_MMX_WITH_SSE + && ix86_partial_vec_fp_math" +{ + rtx op3 = gen_reg_rtx (V8HFmode); + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_movq_v4hf_to_sse (op3, operands[3])); + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2])); + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); + + emit_insn (gen_vec_fmaddsubv8hf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); + DONE; +}) + +(define_expand "vec_fmsubaddv4hf4" + [(match_operand:V4HF 0 "register_operand") + (match_operand:V4HF 1 "nonimmediate_operand") + (match_operand:V4HF 2 "nonimmediate_operand") + (match_operand:V4HF 3 "nonimmediate_operand")] + "TARGET_AVX512FP16 && TARGET_AVX512VL + && ix86_partial_vec_fp_math + && TARGET_MMX_WITH_SSE" +{ + rtx op3 = gen_reg_rtx (V8HFmode); + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_movq_v4hf_to_sse (op3, operands[3])); + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2])); + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); + + emit_insn (gen_vec_fmsubaddv8hf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel half-precision floating point conversion operations ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/gcc/testsuite/gcc.target/i386/part-vect-fmaddsubhf-1.c b/gcc/testsuite/gcc.target/i386/part-vect-fmaddsubhf-1.c new file mode 100644 index 000000000000..051f992f66ed --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/part-vect-fmaddsubhf-1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ +/* { dg-final { 
scan-assembler-times "vfmaddsub...ph\[ \t\]+\[^\n\]*%xmm\[0-9\]" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vfmsubadd...ph\[ \t\]+\[^\n\]*%xmm\[0-9\]" 1 { target { ! ia32 } } } } */ + +void vec_fmaddsub_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, _Float16* __restrict z) +{ + for (int i = 0; i < 4; i += 2) + { + z[i] = da_r * x[i] - y[i]; + z[i+1] = da_r * x[i+1] + y[i+1]; + } +} + +void vec_fmasubadd_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, _Float16* __restrict z) +{ + for (int i = 0; i < 4; i += 2) + { + z[i] = da_r * x[i] + y[i]; + z[i+1] = da_r * x[i+1] - y[i+1]; + } +} diff --git a/gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c b/gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c new file mode 100644 index 000000000000..46e3cd34103c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ +/* { dg-final { scan-assembler-times "vfmadd132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vfnmadd132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vfmsub132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vfnmsub132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! 
ia32 } } } } */ + +typedef _Float16 v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 v2hf __attribute__ ((__vector_size__ (4))); + +v4hf +fma_v4hf (v4hf a, v4hf b, v4hf c) +{ + return a * b + c; +} + +v4hf +fnma_v4hf (v4hf a, v4hf b, v4hf c) +{ + return -a * b + c; +} + +v4hf +fms_v4hf (v4hf a, v4hf b, v4hf c) +{ + return a * b - c; +} + +v4hf +fnms_v4hf (v4hf a, v4hf b, v4hf c) +{ + return -a * b - c; +} + +v2hf +fma_v2hf (v2hf a, v2hf b, v2hf c) +{ + return a * b + c; +} + +v2hf +fnma_v2hf (v2hf a, v2hf b, v2hf c) +{ + return -a * b + c; +} + +v2hf +fms_v2hf (v2hf a, v2hf b, v2hf c) +{ + return a * b - c; +} + +v2hf +fnms_v2hf (v2hf a, v2hf b, v2hf c) +{ + return -a * b - c; +} +