From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <liuhongt@sourceware.org>
Received: by sourceware.org (Postfix, from userid 2078)
	id 654523858D33; Wed, 18 Oct 2023 01:15:08 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 654523858D33
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1697591708;
	bh=ruEUHtY4S2yhm6WUgq0bbr3nPyd3BehpQHqh9dAHzXk=;
	h=From:To:Subject:Date:From;
	b=b+g56OYGJ4XoVtINy4ivLK3TdXJtUoJirpKfDj5Glqvl7l2/DC8kv49QGmP0fDc0H
	 flyCwOI5DWQQW4HkyXHzoo4tknRvJeI6Ac6XyMn0DuHU+nJR8CQBy6edm/+g9AKjLb
	 VXFx/mDNmJJbv7VIYRMqbxciqN7WZ25lSTtfHX84=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: hongtao Liu <liuhongt@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-4701] Support 32/64-bit vectorization for _Float16 fma
 related operations.
X-Act-Checkin: gcc
X-Git-Author: liuhongt <hongtao.liu@intel.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: cf7739d4a6ba0b88068877d14439436c22b57630
X-Git-Newrev: cead92b7fc4d7a545dcf2f02397120e3c9afe1a3
Message-Id: <20231018011508.654523858D33@sourceware.org>
Date: Wed, 18 Oct 2023 01:15:08 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:cead92b7fc4d7a545dcf2f02397120e3c9afe1a3

commit r14-4701-gcead92b7fc4d7a545dcf2f02397120e3c9afe1a3
Author: liuhongt <hongtao.liu@intel.com>
Date:   Mon Oct 16 16:22:04 2023 +0800

    Support 32/64-bit vectorization for _Float16 fma related operations.
    
    gcc/ChangeLog:
    
            * config/i386/mmx.md (fma<mode>4): New expander.
            (fms<mode>4): Ditto.
            (fnma<mode>4): Ditto.
            (fnms<mode>4): Ditto.
            (vec_fmaddsubv4hf4): Ditto.
            (vec_fmsubaddv4hf4): Ditto.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/part-vect-fmaddsubhf-1.c: New test.
            * gcc.target/i386/part-vect-fmahf-1.c: New test.

Diff:
---
 gcc/config/i386/mmx.md                             | 152 ++++++++++++++++++++-
 .../gcc.target/i386/part-vect-fmaddsubhf-1.c       |  22 +++
 gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c  |  58 ++++++++
 3 files changed, 231 insertions(+), 1 deletion(-)
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 82ca49c207b7..491a0a512723 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2365,7 +2365,157 @@
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
-;; Parallel single-precision floating point conversion operations
+;; Parallel half-precision FMA multiply/accumulate instructions.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "fma<mode>4"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+	(fma:VHF_32_64
+	  (match_operand:VHF_32_64 1 "nonimmediate_operand")
+	  (match_operand:VHF_32_64 2 "nonimmediate_operand")
+	  (match_operand:VHF_32_64 3 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fmav8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "fms<mode>4"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+	(fma:VHF_32_64
+	  (match_operand:VHF_32_64   1 "nonimmediate_operand")
+	  (match_operand:VHF_32_64   2 "nonimmediate_operand")
+	  (neg:VHF_32_64
+	    (match_operand:VHF_32_64 3 "nonimmediate_operand"))))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fmsv8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "fnma<mode>4"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+	(fma:VHF_32_64
+	  (neg:VHF_32_64
+	    (match_operand:VHF_32_64 1 "nonimmediate_operand"))
+	  (match_operand:VHF_32_64   2 "nonimmediate_operand")
+	  (match_operand:VHF_32_64   3 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fnmav8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "fnms<mode>4"
+  [(set (match_operand:VHF_32_64 0 "register_operand" "=v,v,x")
+	(fma:VHF_32_64
+	  (neg:VHF_32_64
+	    (match_operand:VHF_32_64 1 "nonimmediate_operand"))
+	  (match_operand:VHF_32_64   2 "nonimmediate_operand")
+	  (neg:VHF_32_64
+	    (match_operand:VHF_32_64 3 "nonimmediate_operand"))))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fnmsv8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "vec_fmaddsubv4hf4"
+  [(match_operand:V4HF 0 "register_operand")
+   (match_operand:V4HF 1 "nonimmediate_operand")
+   (match_operand:V4HF 2 "nonimmediate_operand")
+   (match_operand:V4HF 3 "nonimmediate_operand")]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmaddsubv8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "vec_fmsubaddv4hf4"
+  [(match_operand:V4HF 0 "register_operand")
+   (match_operand:V4HF 1 "nonimmediate_operand")
+   (match_operand:V4HF 2 "nonimmediate_operand")
+   (match_operand:V4HF 3 "nonimmediate_operand")]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL
+   && ix86_partial_vec_fp_math
+   && TARGET_MMX_WITH_SSE"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmsubaddv8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel half-precision floating point conversion operations
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-fmaddsubhf-1.c b/gcc/testsuite/gcc.target/i386/part-vect-fmaddsubhf-1.c
new file mode 100644
index 000000000000..051f992f66ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-fmaddsubhf-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vfmaddsub...ph\[ \t\]+\[^\n\]*%xmm\[0-9\]" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfmsubadd...ph\[ \t\]+\[^\n\]*%xmm\[0-9\]" 1 { target { ! ia32 } } } } */
+
+void vec_fmaddsub_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, _Float16* __restrict z)
+{
+  for (int i = 0; i < 4; i += 2)
+    {
+      z[i] =  da_r * x[i] - y[i];
+      z[i+1]  =  da_r * x[i+1] + y[i+1];
+    }
+}
+
+void vec_fmasubadd_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, _Float16* __restrict z)
+{
+  for (int i = 0; i < 4; i += 2)
+    {
+      z[i] =  da_r * x[i] + y[i];
+      z[i+1]  =  da_r * x[i+1] - y[i+1];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c b/gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c
new file mode 100644
index 000000000000..46e3cd34103c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vfmadd132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfnmadd132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfmsub132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfnmsub132ph\[^\n\r\]*xmm\[0-9\]" 2 { target { ! ia32 } } } } */
+
+typedef _Float16 v4hf __attribute__ ((__vector_size__ (8)));
+typedef _Float16 v2hf __attribute__ ((__vector_size__ (4)));
+
+v4hf
+fma_v4hf (v4hf a, v4hf b, v4hf c)
+{
+  return a * b + c;
+}
+
+v4hf
+fnma_v4hf (v4hf a, v4hf b, v4hf c)
+{
+  return -a * b + c;
+}
+
+v4hf
+fms_v4hf (v4hf a, v4hf b, v4hf c)
+{
+  return a * b - c;
+}
+
+v4hf
+fnms_v4hf (v4hf a, v4hf b, v4hf c)
+{
+  return -a * b - c;
+}
+
+v2hf
+fma_v2hf (v2hf a, v2hf b, v2hf c)
+{
+  return a * b + c;
+}
+
+v2hf
+fnma_v2hf (v2hf a, v2hf b, v2hf c)
+{
+  return -a * b + c;
+}
+
+v2hf
+fms_v2hf (v2hf a, v2hf b, v2hf c)
+{
+  return a * b - c;
+}
+
+v2hf
+fnms_v2hf (v2hf a, v2hf b, v2hf c)
+{
+  return -a * b - c;
+}
+