public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.
@ 2021-09-27 10:42 Hongyu Wang
  2021-09-27 11:32 ` Uros Bizjak
  0 siblings, 1 reply; 5+ messages in thread
From: Hongyu Wang @ 2021-09-27 10:42 UTC (permalink / raw)
  To: ubizjak; +Cc: gcc-patches, hongtao.liu, hjl.tools

Hi Uros,

This patch intends to support V4HF/V2HF vector type and basic operations.

For 32-bit targets, a V4HF vector is passed the same way as the __m64
type, while a V2HF vector is passed on the stack and returned in a GPR,
since the ABI does not specify it.

We found that for 64-bit vectors on ia32 there seems to be no
mov<mode>_internal pattern when MMX is disabled, so we added a define_insn
for V4HF mode. It would be much appreciated if you could explain why
64-bit vectors are handled this way and give some advice.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.

OK for master?

gcc/ChangeLog:

	PR target/102230
	* config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
	V4HF and V2HF mode check.
	(VALID_SSE2_REG_VHF_MODE): Likewise.
	(VALID_MMX_REG_MODE): Likewise.
	(SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
	vector mode condition.
	* config/i386/i386.c (classify_argument): Parse V4HF/V2HF
	via sse regs.
	(function_arg_32): Add V4HFmode.
	(function_arg_advance_32): Likewise.
	* config/i386/i386.md (mode): Add V4HF/V2HF.
	(MODE_SIZE): Likewise.
	* config/i386/mmx.md (MMXMODE): Add V4HF mode.
	(V_32): Add V2HF mode.
	(*mov<mode>_internal): Adjust sse alternatives to support
	V4HF mode vector move.
	(*mov<mode>_internal): Adjust sse alternatives
	to support V2HF mode move.
	* config/i386/sse.md (VHF_32_64): New mode iterator.
	(<insn><mode>3): New define_insn for add/sub/mul/div.
	(*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.

gcc/testsuite/ChangeLog:

	PR target/102230
	* gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
	* gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
	* gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
	* gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
	* gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
	* gcc.target/i386/pr102230.c: Ditto.
---
 gcc/config/i386/i386.c                        |  4 +
 gcc/config/i386/i386.h                        | 12 ++-
 gcc/config/i386/i386.md                       |  5 +-
 gcc/config/i386/mmx.md                        | 27 ++++---
 gcc/config/i386/sse.md                        | 49 ++++++++++++
 .../i386/avx512fp16-64-32-vecop-1.c           | 30 ++++++++
 .../i386/avx512fp16-64-32-vecop-2.c           | 75 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-floatvnhf.c    | 12 +--
 .../i386/avx512fp16-trunc-extendvnhf.c        | 12 +--
 .../gcc.target/i386/avx512fp16-truncvnhf.c    | 12 +--
 gcc/testsuite/gcc.target/i386/pr102230.c      | 38 ++++++++++
 11 files changed, 243 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ba89e111d28..b3e4add4b9e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
     case E_V2SFmode:
     case E_V2SImode:
     case E_V4HImode:
+    case E_V4HFmode:
+    case E_V2HFmode:
     case E_V8QImode:
       classes[0] = X86_64_SSE_CLASS;
       return 1;
@@ -2902,6 +2904,7 @@ pass_in_reg:
 
     case E_V8QImode:
     case E_V4HImode:
+    case E_V4HFmode:
     case E_V2SImode:
     case E_V2SFmode:
     case E_V1TImode:
@@ -3149,6 +3152,7 @@ pass_in_reg:
 
     case E_V8QImode:
     case E_V4HImode:
+    case E_V4HFmode:
     case E_V2SImode:
     case E_V2SFmode:
     case E_V1TImode:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8a4251b4926..9f3cad31f96 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == TImode)
 
 #define VALID_AVX512FP16_REG_MODE(MODE)					\
-  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
+  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode	\
+   || (MODE) == V4HFmode || (MODE) == V2HFmode)
 
 #define VALID_SSE2_REG_MODE(MODE)					\
   ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode	\
@@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
 
 #define VALID_SSE2_REG_VHF_MODE(MODE)			\
-  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
+  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode	\
+   || (MODE) == V4HFmode || (MODE) == V2HFmode)
 
 #define VALID_SSE_REG_MODE(MODE)					\
   ((MODE) == V1TImode || (MODE) == TImode				\
@@ -1054,7 +1056,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 #define VALID_MMX_REG_MODE(MODE)					\
   ((MODE) == V1DImode || (MODE) == DImode				\
    || (MODE) == V2SImode || (MODE) == SImode				\
-   || (MODE) == V4HImode || (MODE) == V8QImode)
+   || (MODE) == V4HImode || (MODE) == V8QImode				\
+   || (MODE) == V4HFmode)
 
 #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
 
@@ -1087,7 +1090,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode	\
    || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode	\
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
-   || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
+   || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
+   || (MODE) == V8HFmode)
 
 #define X87_FLOAT_MODE_P(MODE)	\
   (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c6279e620c9..758d7d1e3c0 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -498,7 +498,7 @@
 ;; Main data type used by the insn
 (define_attr "mode"
   "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
-   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
+   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
   (const_string "unknown"))
 
 ;; The CPU unit operations uses.
@@ -1106,7 +1106,8 @@
 			     (V1TI "16") (V2TI "32") (V4TI "64")
 			     (V2DF "16") (V4DF "32") (V8DF "64")
 			     (V4SF "16") (V8SF "32") (V16SF "64")
-			     (V8HF "16") (V16HF "32") (V32HF "64")])
+			     (V8HF "16") (V16HF "32") (V32HF "64")
+			     (V4HF "8") (V2HF "4")])
 
 ;; Double word integer modes as mode attribute.
 (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index b0093778fc6..68e1c4b2dbd 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -48,7 +48,7 @@
 (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
 
 ;; All 8-byte vector modes handled by MMX
-(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
+(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
 (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
 
 ;; Mix-n-match
@@ -57,8 +57,8 @@
 (define_mode_iterator MMXMODE24 [V4HI V2SI])
 (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
 
-;; All 4-byte integer vector modes
-(define_mode_iterator V_32 [V4QI V2HI V1SI])
+;; All 4-byte integer/float16 vector modes
+(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
 
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
@@ -191,6 +191,8 @@
 	    (eq_attr "alternative" "11,12")
 	      (cond [(match_test "<MODE>mode == V2SFmode")
 		       (const_string "V4SF")
+		     (match_test "<MODE>mode == V4HFmode")
+		       (const_string "V4SF")
 		     (ior (not (match_test "TARGET_SSE2"))
 			  (match_test "optimize_function_for_size_p (cfun)"))
 		       (const_string "V4SF")
@@ -198,14 +200,16 @@
 		    (const_string "TI"))
 
 	    (and (eq_attr "alternative" "13")
-		 (ior (and (match_test "<MODE>mode == V2SFmode")
-			   (not (match_test "TARGET_MMX_WITH_SSE")))
-		      (not (match_test "TARGET_SSE2"))))
+		 (ior (ior (and (match_test "<MODE>mode == V2SFmode")
+				(not (match_test "TARGET_MMX_WITH_SSE")))
+			   (not (match_test "TARGET_SSE2")))
+		      (match_test "<MODE>mode == V4HFmode")))
 	      (const_string "V2SF")
 
 	    (and (eq_attr "alternative" "14")
-	    	 (ior (match_test "<MODE>mode == V2SFmode")
-		      (not (match_test "TARGET_SSE2"))))
+		 (ior (ior (match_test "<MODE>mode == V2SFmode")
+			   (not (match_test "TARGET_SSE2")))
+		      (match_test "<MODE>mode == V4HFmode")))
 	      (const_string "V2SF")
 	   ]
 	   (const_string "DI")))
@@ -289,12 +293,17 @@
        (const_string "*")))
    (set (attr "mode")
      (cond [(eq_attr "alternative" "2,3")
-	      (cond [(match_test "TARGET_AVX")
+	      (cond [(match_test "<MODE>mode == V2HFmode")
+		       (const_string "V4SF")
+		     (match_test "TARGET_AVX")
 		       (const_string "TI")
 		     (match_test "optimize_function_for_size_p (cfun)")
 		       (const_string "V4SF")
 		    ]
 		    (const_string "TI"))
+	    (and (eq_attr "alternative" "4,5")
+		 (match_test "<MODE>mode == V2HFmode"))
+	      (const_string "SF")
 	   ]
 	   (const_string "SI")))
    (set (attr "preferred_for_speed")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a446dedb2ec..b7832926287 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -671,6 +671,9 @@
   [(V64QI "TARGET_AVX512BW") (V32QI  "TARGET_AVX512VL")
 	(V16QI  "TARGET_AVX512VL")])
 
+(define_mode_iterator VHF_32_64
+  [V4HF V2HF])
+
 (define_mode_attr avx512
   [(V16QI "avx512vl") (V32QI "avx512vl") (V64QI "avx512bw")
    (V8HI  "avx512vl") (V16HI  "avx512vl") (V32HI "avx512bw")
@@ -1313,6 +1316,36 @@
 	      ]
 	      (symbol_ref "true")))])
 
+(define_insn "*movv4hf_internal_sse"
+  [(set (match_operand:V4HF 0 "nonimmediate_operand"
+	 "=v,v,v,m")
+	(match_operand:V4HF 1 "nonimmediate_or_sse_const_operand"
+	 " C,v,m,v"))]
+  "!TARGET_MMX && TARGET_SSE2
+   && (register_operand (operands[0], V4HFmode)
+       || register_operand (operands[1], V4HFmode))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_SSELOG1:
+      return standard_sse_constant_opcode (insn, operands);
+
+    case TYPE_SSEMOV:
+      return ix86_output_ssemov (insn, operands);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sselog1,ssemov,ssemov,ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "1")
+		 (const_string "V4SF")]
+	      (const_string "V2SF")))]
+)
+
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -2165,6 +2198,22 @@
    (set_attr "prefix" "<bcst_mask_prefix3>")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "<insn><mode>3"
+  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
+	(plusminusmultdiv:VHF_32_64
+	  (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
+	  (match_operand:VHF_32_64 2 "register_operand" "v")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "type")
+      (cond [(match_test "<CODE> == MULT")
+		(const_string "ssemul")
+	     (match_test "<CODE> == DIV")
+		(const_string "ssediv")]
+	     (const_string "sseadd")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8HF")])
+
 ;; Standard scalar operation patterns which preserve the rest of the
 ;; vector for combiner.
 (define_insn "*<sse>_vm<insn><mode>3"
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
new file mode 100644
index 00000000000..754e909d77b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+/* { dg-final { scan-assembler-times "vaddph" 2 } } */
+/* { dg-final { scan-assembler-times "vsubph" 2 } } */
+/* { dg-final { scan-assembler-times "vmulph" 2 } } */
+/* { dg-final { scan-assembler-times "vdivph" 2 } } */
+
+#define DO_PRAGMA(X) _Pragma(#X)
+
+#define VEC_OP_VV(size, op, name)       \
+void \
+__attribute__ ((noinline, noclone, optimize("tree-slp-vectorize"))) \
+vecop_v##size##hf##name (_Float16 * restrict dst,  \
+ _Float16 * restrict src1, _Float16 * restrict src2)   \
+{ \
+    int i;  \
+    DO_PRAGMA (GCC unroll size)   \
+    for (i = 0; i < size; i++)  \
+      dst[i] = src1[i] op src2[i];  \
+}
+
+VEC_OP_VV(4, +, add)
+VEC_OP_VV(2, +, add)
+VEC_OP_VV(4, -, sub)
+VEC_OP_VV(2, -, sub)
+VEC_OP_VV(4, *, mul)
+VEC_OP_VV(2, *, mul)
+VEC_OP_VV(4, /, div)
+VEC_OP_VV(2, /, div)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
new file mode 100644
index 00000000000..4dc6f9fb92e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
@@ -0,0 +1,75 @@
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+static void vec_op_test (void);
+#define DO_TEST vec_op_test
+#define AVX512FP16
+#define AVX512VL
+#include "avx512f-check.h"
+#include "avx512fp16-64-32-vecop-1.c"
+
+_Float16 a[4], b[4], fexp[4], fref[4];
+
+#define EMULATE_VEC_OP_VV(size, op, name) \
+void \
+__attribute__ ((noinline, noclone)) \
+scalar_vecop_v##size##hf##name ( \
+  _Float16 * restrict dst, _Float16 * restrict src1,  \
+  _Float16 * restrict src2)  \
+{ \
+  int i;  \
+  for (i = 0; i < size; i++)  \
+    dst[i] = src1[i] op src2[i];  \
+}
+
+EMULATE_VEC_OP_VV (4, +, add)
+EMULATE_VEC_OP_VV (2, +, add)
+EMULATE_VEC_OP_VV (4, -, sub)
+EMULATE_VEC_OP_VV (2, -, sub)
+EMULATE_VEC_OP_VV (4, *, mul)
+EMULATE_VEC_OP_VV (2, *, mul)
+EMULATE_VEC_OP_VV (4, /, div)
+EMULATE_VEC_OP_VV (2, /, div)
+
+void init()
+{
+  int i;
+  for (i = 0; i < 4; i++)
+    {
+      a[i] = i + 0.5; 
+      b[i] = i * 1.5;
+      fexp[i] = fref[i] = 2.75 * i;
+    }
+}
+
+int check_cond(void *a, void *b, int size)
+{
+  int i;
+  unsigned short *pa = (unsigned short *)a,
+		 *pb = (unsigned short *)b;
+  for (i = 0; i < size; i++)
+    if (pa[i] != pb[i])
+      return 0;
+  return 1;
+}
+
+#define TEST_VEC_OP_VV(size, name)	/* Compare scalar vs. vector result.  */ \
+{ \
+  init ();  \
+  scalar_vecop_v##size##hf##name (fexp, a, b);  /* dst is the 1st arg.  */ \
+  vecop_v##size##hf##name (fref, a, b);  /* a, b are the sources.  */ \
+  if (!check_cond ((void *)fexp, (void *)fref, size)) \
+    abort();  \
+}
+
+static void vec_op_test()
+{
+  TEST_VEC_OP_VV (4, add)
+  TEST_VEC_OP_VV (2, add)
+  TEST_VEC_OP_VV (4, sub)
+  TEST_VEC_OP_VV (2, sub)
+  TEST_VEC_OP_VV (4, mul)
+  TEST_VEC_OP_VV (2, mul)
+  TEST_VEC_OP_VV (4, div)
+  TEST_VEC_OP_VV (2, div)
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
index 112ac3e74d5..8471a1d1d10 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
@@ -43,16 +43,16 @@ FLOATHFVV(2, udi)
 
 /* { dg-final { scan-assembler-times "vcvtqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtuqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtdq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtudq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtdq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtudq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtuw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
index 286ea9f2624..2ef901a0375 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
@@ -41,15 +41,15 @@ EXTENDHFVV(8, sf)
 EXTENDHFVV(4, sf)
 
 /* { dg-final { scan-assembler-times "vcvtpd2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtps2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtps2phxy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
index ee55cd12300..7a51c9dd077 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
@@ -43,16 +43,16 @@ FIX_TRUNCHFVV(2, udi)
 
 /* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2uw\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102230.c b/gcc/testsuite/gcc.target/i386/pr102230.c
new file mode 100644
index 00000000000..60cf1c32afe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102230.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16" } */
+
+typedef _Float16 v4hf __attribute__ ((vector_size (8)));
+typedef _Float16 v2hf __attribute__ ((vector_size (4)));
+
+v4hf
+v4hf_abi_1 (v4hf a)
+{
+  return a;
+}
+
+v4hf
+v4hf_abi_3 (v4hf a, v4hf b, v4hf c)
+{
+  return c;
+}
+
+/* { dg-final { scan-assembler-times "movq\[[\\t \]*%mm2, %mm0" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm2, %xmm0" 1 { target { ! ia32 } } } } */
+
+v4hf
+v4hf_abi_4 (v4hf a, v4hf b, v4hf c, v4hf d)
+{
+  return d;
+}
+
+/* { dg-final { scan-assembler-times "movq\[[\\t \]*4\[(\]%esp\[)\], %mm0" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm3, %xmm0" 1 { target { ! ia32 } } } } */
+
+v2hf
+v2hf_test (v2hf a, v2hf b, v2hf c, v2hf d)
+{
+  return b;
+}
+
+/* { dg-final { scan-assembler-times "movl\[[\\t \]*8\[(\]%esp\[)\], %eax" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm1, %xmm0" 1 { target { ! ia32 } } } } */
-- 
2.18.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.
  2021-09-27 10:42 [PATCH] AVX512FP16:support basic 64/32bit vector type and operation Hongyu Wang
@ 2021-09-27 11:32 ` Uros Bizjak
  2021-09-28  4:42   ` Hongyu Wang
  0 siblings, 1 reply; 5+ messages in thread
From: Uros Bizjak @ 2021-09-27 11:32 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: gcc-patches, Hongtao Liu, H. J. Lu

On Mon, Sep 27, 2021 at 12:42 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> Hi Uros,
>
> This patch intends to support V4HF/V2HF vector type and basic operations.
>
> For 32bit target, V4HF vector is parsed same as __m64 type, V2HF
> is parsed by stack and returned from GPR since it is not specified
> by ABI.
>
> We found for 64bit vector in ia32, when mmx disabled there seems no
> mov<mode>_internal, so we add a define_insn for v4hf mode. It would be very
> appreciated if you know why the handling of 64bit vector looks as is and
> give some advice.

ia32 ABI declares that __m64 values pass via MMX registers. Due to
this, we are not able to fully disable MMX register usage, as is the
case with x86_64. So, V4HFmode values will pass to functions via MMX
registers on ia32 targets.

So, there should be no additional define_insn; the addition to the
existing MMXMODE mode iterator should be enough. V4HFmode values should be
handled in the same way as e.g. V8QImode.

This is not the case with 4-byte values, which should be passed using
integer ABI.

Uros.

>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
>
> OK for master?
>
> gcc/ChangeLog:
>
>         PR target/102230
>         * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
>         V4HF and V2HF mode check.
>         (VALID_SSE2_REG_VHF_MODE): Likewise.
>         (VALID_MMX_REG_MODE): Likewise.
>         (SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
>         vector mode condition.
>         * config/i386/i386.c (classify_argument): Parse V4HF/V2HF
>         via sse regs.
>         (function_arg_32): Add V4HFmode.
>         (function_arg_advance_32): Likewise.
>         * config/i386/i386.md (mode): Add V4HF/V2HF.
>         (MODE_SIZE): Likewise.
>         * config/i386/mmx.md (MMXMODE): Add V4HF mode.
>         (V_32): Add V2HF mode.
>         (*mov<mode>_internal): Adjust sse alternatives to support
>         V4HF mode vector move.
>         (*mov<mode>_internal): Adjust sse alternatives
>         to support V2HF mode move.
>         * config/i386/sse.md (VHF_32_64): New mode iterator.
>         (<insn><mode>3): New define_insn for add/sub/mul/div.
>         (*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/102230
>         * gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
>         * gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
>         * gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
>         * gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
>         * gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
>         * gcc.target/i386/pr102230.c: Ditto.
> ---
>  gcc/config/i386/i386.c                        |  4 +
>  gcc/config/i386/i386.h                        | 12 ++-
>  gcc/config/i386/i386.md                       |  5 +-
>  gcc/config/i386/mmx.md                        | 27 ++++---
>  gcc/config/i386/sse.md                        | 49 ++++++++++++
>  .../i386/avx512fp16-64-32-vecop-1.c           | 30 ++++++++
>  .../i386/avx512fp16-64-32-vecop-2.c           | 75 +++++++++++++++++++
>  .../gcc.target/i386/avx512fp16-floatvnhf.c    | 12 +--
>  .../i386/avx512fp16-trunc-extendvnhf.c        | 12 +--
>  .../gcc.target/i386/avx512fp16-truncvnhf.c    | 12 +--
>  gcc/testsuite/gcc.target/i386/pr102230.c      | 38 ++++++++++
>  11 files changed, 243 insertions(+), 33 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index ba89e111d28..b3e4add4b9e 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
>      case E_V2SFmode:
>      case E_V2SImode:
>      case E_V4HImode:
> +    case E_V4HFmode:
> +    case E_V2HFmode:
>      case E_V8QImode:
>        classes[0] = X86_64_SSE_CLASS;
>        return 1;
> @@ -2902,6 +2904,7 @@ pass_in_reg:
>
>      case E_V8QImode:
>      case E_V4HImode:
> +    case E_V4HFmode:
>      case E_V2SImode:
>      case E_V2SFmode:
>      case E_V1TImode:
> @@ -3149,6 +3152,7 @@ pass_in_reg:
>
>      case E_V8QImode:
>      case E_V4HImode:
> +    case E_V4HFmode:
>      case E_V2SImode:
>      case E_V2SFmode:
>      case E_V1TImode:
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 8a4251b4926..9f3cad31f96 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
>     || (MODE) == TImode)
>
>  #define VALID_AVX512FP16_REG_MODE(MODE)                                        \
> -  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
> +  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode    \
> +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
>
>  #define VALID_SSE2_REG_MODE(MODE)                                      \
>    ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode     \
> @@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
>     || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
>
>  #define VALID_SSE2_REG_VHF_MODE(MODE)                  \
> -  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
> +  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode    \
> +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
>
>  #define VALID_SSE_REG_MODE(MODE)                                       \
>    ((MODE) == V1TImode || (MODE) == TImode                              \
> @@ -1054,7 +1056,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
>  #define VALID_MMX_REG_MODE(MODE)                                       \
>    ((MODE) == V1DImode || (MODE) == DImode                              \
>     || (MODE) == V2SImode || (MODE) == SImode                           \
> -   || (MODE) == V4HImode || (MODE) == V8QImode)
> +   || (MODE) == V4HImode || (MODE) == V8QImode                         \
> +   || (MODE) == V4HFmode)
>
>  #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
>
> @@ -1087,7 +1090,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
>     || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode   \
>     || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode  \
>     || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \
> -   || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
> +   || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
> +   || (MODE) == V8HFmode)
>
>  #define X87_FLOAT_MODE_P(MODE) \
>    (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index c6279e620c9..758d7d1e3c0 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -498,7 +498,7 @@
>  ;; Main data type used by the insn
>  (define_attr "mode"
>    "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
> -   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
> +   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
>    (const_string "unknown"))
>
>  ;; The CPU unit operations uses.
> @@ -1106,7 +1106,8 @@
>                              (V1TI "16") (V2TI "32") (V4TI "64")
>                              (V2DF "16") (V4DF "32") (V8DF "64")
>                              (V4SF "16") (V8SF "32") (V16SF "64")
> -                            (V8HF "16") (V16HF "32") (V32HF "64")])
> +                            (V8HF "16") (V16HF "32") (V32HF "64")
> +                            (V4HF "8") (V2HF "4")])
>
>  ;; Double word integer modes as mode attribute.
>  (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index b0093778fc6..68e1c4b2dbd 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -48,7 +48,7 @@
>  (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
>
>  ;; All 8-byte vector modes handled by MMX
> -(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
> +(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
>  (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
>
>  ;; Mix-n-match
> @@ -57,8 +57,8 @@
>  (define_mode_iterator MMXMODE24 [V4HI V2SI])
>  (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
>
> -;; All 4-byte integer vector modes
> -(define_mode_iterator V_32 [V4QI V2HI V1SI])
> +;; All 4-byte integer/float16 vector modes
> +(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
>
>  ;; 4-byte integer vector modes
>  (define_mode_iterator VI_32 [V4QI V2HI])
> @@ -191,6 +191,8 @@
>             (eq_attr "alternative" "11,12")
>               (cond [(match_test "<MODE>mode == V2SFmode")
>                        (const_string "V4SF")
> +                    (match_test "<MODE>mode == V4HFmode")
> +                      (const_string "V4SF")
>                      (ior (not (match_test "TARGET_SSE2"))
>                           (match_test "optimize_function_for_size_p (cfun)"))
>                        (const_string "V4SF")
> @@ -198,14 +200,16 @@
>                     (const_string "TI"))
>
>             (and (eq_attr "alternative" "13")
> -                (ior (and (match_test "<MODE>mode == V2SFmode")
> -                          (not (match_test "TARGET_MMX_WITH_SSE")))
> -                     (not (match_test "TARGET_SSE2"))))
> +                (ior (ior (and (match_test "<MODE>mode == V2SFmode")
> +                               (not (match_test "TARGET_MMX_WITH_SSE")))
> +                          (not (match_test "TARGET_SSE2")))
> +                     (match_test "<MODE>mode == V4HFmode")))
>               (const_string "V2SF")
>
>             (and (eq_attr "alternative" "14")
> -                (ior (match_test "<MODE>mode == V2SFmode")
> -                     (not (match_test "TARGET_SSE2"))))
> +                (ior (ior (match_test "<MODE>mode == V2SFmode")
> +                          (not (match_test "TARGET_SSE2")))
> +                     (match_test "<MODE>mode == V4HFmode")))
>               (const_string "V2SF")
>            ]
>            (const_string "DI")))
> @@ -289,12 +293,17 @@
>         (const_string "*")))
>     (set (attr "mode")
>       (cond [(eq_attr "alternative" "2,3")
> -             (cond [(match_test "TARGET_AVX")
> +             (cond [(match_test "<MODE>mode == V2HFmode")
> +                      (const_string "V4SF")
> +                    (match_test "TARGET_AVX")
>                        (const_string "TI")
>                      (match_test "optimize_function_for_size_p (cfun)")
>                        (const_string "V4SF")
>                     ]
>                     (const_string "TI"))
> +           (and (eq_attr "alternative" "4,5")
> +                (match_test "<MODE>mode == V2HFmode"))
> +             (const_string "SF")
>            ]
>            (const_string "SI")))
>     (set (attr "preferred_for_speed")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index a446dedb2ec..b7832926287 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -671,6 +671,9 @@
>    [(V64QI "TARGET_AVX512BW") (V32QI  "TARGET_AVX512VL")
>         (V16QI  "TARGET_AVX512VL")])
>
> +(define_mode_iterator VHF_32_64
> +  [V4HF V2HF])
> +
>  (define_mode_attr avx512
>    [(V16QI "avx512vl") (V32QI "avx512vl") (V64QI "avx512bw")
>     (V8HI  "avx512vl") (V16HI  "avx512vl") (V32HI "avx512bw")
> @@ -1313,6 +1316,36 @@
>               ]
>               (symbol_ref "true")))])
>
> +(define_insn "*movv4hf_internal_sse"
> +  [(set (match_operand:V4HF 0 "nonimmediate_operand"
> +        "=v,v,v,m")
> +       (match_operand:V4HF 1 "nonimmediate_or_sse_const_operand"
> +        " C,v,m,v"))]
> +  "!TARGET_MMX && TARGET_SSE2
> +   && (register_operand (operands[0], V4HFmode)
> +       || register_operand (operands[1], V4HFmode))"
> +{
> +  switch (get_attr_type (insn))
> +    {
> +    case TYPE_SSELOG1:
> +      return standard_sse_constant_opcode (insn, operands);
> +
> +    case TYPE_SSEMOV:
> +      return ix86_output_ssemov (insn, operands);
> +
> +    default:
> +      gcc_unreachable ();
> +    }
> +}
> +  [(set_attr "type" "sselog1,ssemov,ssemov,ssemov")
> +   (set_attr "prefix" "maybe_vex")
> +   (set (attr "mode")
> +       (cond [(eq_attr "alternative" "1")
> +                (const_string "V4SF")]
> +             (const_string "V2SF")))]
> +)
> +
> +
>  ;; If mem_addr points to a memory region with less than whole vector size bytes
>  ;; of accessible memory and k is a mask that would prevent reading the inaccessible
>  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> @@ -2165,6 +2198,22 @@
>     (set_attr "prefix" "<bcst_mask_prefix3>")
>     (set_attr "mode" "<MODE>")])
>
> +(define_insn "<insn><mode>3"
> +  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
> +       (plusminusmultdiv:VHF_32_64
> +         (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
> +         (match_operand:VHF_32_64 2 "register_operand" "v")))]
> +  "TARGET_AVX512FP16 && TARGET_AVX512VL"
> +  "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
> +  [(set (attr "type")
> +      (cond [(match_test "<CODE> == MULT")
> +               (const_string "ssemul")
> +            (match_test "<CODE> == DIV")
> +               (const_string "ssediv")]
> +            (const_string "sseadd")))
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "V8HF")])
> +
>  ;; Standard scalar operation patterns which preserve the rest of the
>  ;; vector for combiner.
>  (define_insn "*<sse>_vm<insn><mode>3"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> new file mode 100644
> index 00000000000..754e909d77b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +
> +/* { dg-final { scan-assembler-times "vaddph" 2 } } */
> +/* { dg-final { scan-assembler-times "vsubph" 2 } } */
> +/* { dg-final { scan-assembler-times "vmulph" 2 } } */
> +/* { dg-final { scan-assembler-times "vdivph" 2 } } */
> +
> +#define DO_PRAGMA(X) _Pragma(#X)
> +
> +#define VEC_OP_VV(size, op, name)       \
> +void \
> +__attribute__ ((noinline, noclone, optimize("tree-slp-vectorize"))) \
> +vecop_v##size##hf##name (_Float16 * restrict dst,  \
> + _Float16 * restrict src1, _Float16 * restrict src2)   \
> +{ \
> +    int i;  \
> +    DO_PRAGMA (GCC unroll size)   \
> +    for (i = 0; i < size; i++)  \
> +      dst[i] = src1[i] op src2[i];  \
> +}
> +
> +VEC_OP_VV(4, +, add)
> +VEC_OP_VV(2, +, add)
> +VEC_OP_VV(4, -, sub)
> +VEC_OP_VV(2, -, sub)
> +VEC_OP_VV(4, *, mul)
> +VEC_OP_VV(2, *, mul)
> +VEC_OP_VV(4, /, div)
> +VEC_OP_VV(2, /, div)
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> new file mode 100644
> index 00000000000..4dc6f9fb92e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> @@ -0,0 +1,75 @@
> +/* { dg-do run { target avx512fp16 } } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +
> +static void vec_op_test (void);
> +#define DO_TEST vec_op_test
> +#define AVX512FP16
> +#define AVX512VL
> +#include "avx512f-check.h"
> +#include "avx512fp16-64-32-vecop-1.c"
> +
> +_Float16 a[4], b[4], fexp[4], fref[4];
> +
> +#define EMULATE_VEC_OP_VV(size, op, name) \
> +void \
> +__attribute__ ((noinline, noclone)) \
> +scalar_vecop_v##size##hf##name ( \
> +  _Float16 * restrict dst, _Float16 * restrict src1,  \
> +  _Float16 * restrict src2)  \
> +{ \
> +  int i;  \
> +  for (i = 0; i < size; i++)  \
> +    dst[i] = src1[i] op src2[i];  \
> +}
> +
> +EMULATE_VEC_OP_VV (4, +, add)
> +EMULATE_VEC_OP_VV (2, +, add)
> +EMULATE_VEC_OP_VV (4, -, sub)
> +EMULATE_VEC_OP_VV (2, -, sub)
> +EMULATE_VEC_OP_VV (4, *, mul)
> +EMULATE_VEC_OP_VV (2, *, mul)
> +EMULATE_VEC_OP_VV (4, /, div)
> +EMULATE_VEC_OP_VV (2, /, div)
> +
> +void init()
> +{
> +  int i;
> +  for (i = 0; i < 4; i++)
> +    {
> +      a[i] = i + 0.5;
> +      b[i] = i * 1.5;
> +      fexp[i] = fref[i] = 2.75 * i;
> +    }
> +}
> +
> +int check_cond(void *a, void *b, int size)
> +{
> +  int i;
> +  unsigned short *pa = (unsigned short *)a,
> +                *pb = (unsigned short *)b;
> +  for (i = 0; i < size; i++)
> +    if (pa[i] != pb[i])
> +      return 0;
> +  return 1;
> +}
> +
> +#define TEST_VEC_OP_VV(size, name)     \
> +{ \
> +  init ();  \
> +  scalar_vecop_v##size##hf##name (a, b, fexp);  \
> +  vecop_v##size##hf##name (a, b, fref);  \
> +  if (!check_cond ((void *)fexp, (void *)fref, size)) \
> +    abort();  \
> +}
> +
> +static void vec_op_test()
> +{
> +  TEST_VEC_OP_VV (4, add)
> +  TEST_VEC_OP_VV (2, add)
> +  TEST_VEC_OP_VV (4, sub)
> +  TEST_VEC_OP_VV (2, sub)
> +  TEST_VEC_OP_VV (4, mul)
> +  TEST_VEC_OP_VV (2, mul)
> +  TEST_VEC_OP_VV (4, div)
> +  TEST_VEC_OP_VV (2, div)
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> index 112ac3e74d5..8471a1d1d10 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> @@ -43,16 +43,16 @@ FLOATHFVV(2, udi)
>
>  /* { dg-final { scan-assembler-times "vcvtqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtuqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtdq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtudq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtdq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtudq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtuw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> index 286ea9f2624..2ef901a0375 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> @@ -41,15 +41,15 @@ EXTENDHFVV(8, sf)
>  EXTENDHFVV(4, sf)
>
>  /* { dg-final { scan-assembler-times "vcvtpd2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtps2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtps2phxy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> index ee55cd12300..7a51c9dd077 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> @@ -43,16 +43,16 @@ FIX_TRUNCHFVV(2, udi)
>
>  /* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2uw\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102230.c b/gcc/testsuite/gcc.target/i386/pr102230.c
> new file mode 100644
> index 00000000000..60cf1c32afe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102230.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16" } */
> +
> +typedef _Float16 v4hf __attribute__ ((vector_size (8)));
> +typedef _Float16 v2hf __attribute__ ((vector_size (4)));
> +
> +v4hf
> +v4hf_abi_1 (v4hf a)
> +{
> +  return a;
> +}
> +
> +v4hf
> +v4hf_abi_3 (v4hf a, v4hf b, v4hf c)
> +{
> +  return c;
> +}
> +
> +/* { dg-final { scan-assembler-times "movq\[[\\t \]*%mm2, %mm0" 1 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm2, %xmm0" 1 { target { ! ia32 } } } } */
> +
> +v4hf
> +v4hf_abi_4 (v4hf a, v4hf b, v4hf c, v4hf d)
> +{
> +  return d;
> +}
> +
> +/* { dg-final { scan-assembler-times "movq\[[\\t \]*4\[(\]%esp\[)\], %mm0" 1 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm3, %xmm0" 1 { target { ! ia32 } } } } */
> +
> +v2hf
> +v2hf_test (v2hf a, v2hf b, v2hf c, v2hf d)
> +{
> +  return b;
> +}
> +
> +/* { dg-final { scan-assembler-times "movl\[[\\t \]*8\[(\]%esp\[)\], %eax" 1 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm1, %xmm0" 1 { target { ! ia32 } } } } */
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.
  2021-09-27 11:32 ` Uros Bizjak
@ 2021-09-28  4:42   ` Hongyu Wang
  2021-09-28  6:27     ` Uros Bizjak
  0 siblings, 1 reply; 5+ messages in thread
From: Hongyu Wang @ 2021-09-28  4:42 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 29741 bytes --]

> ia32 ABI declares that __m64 values pass via MMX registers. Due to
> this, we are not able to fully disable MMX register usage, as is the
> case with x86_64. So, V4HFmode values will pass to functions via MMX
> registers on ia32 targets.
>
> So, there should be no additional define_insn, the addition to the
> existing MMXMODE mode iterator should be enough. V4HFmodes should be
> handled in the same way as e.g. V8QImode.
>
> This is not the case with 4-byte values, which should be passed using
> integer ABI.

Thanks for the explanation. I updated the patch by removing the extra
define_insn and dropping V4HFmode from VALID_AVX512FP16_REG_MODE. Now V4HF
behaves the same as V8QI.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.

OK for master with the updated one?

Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年9月27日周一 下午7:35写道:
>
> On Mon, Sep 27, 2021 at 12:42 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
> >
> > Hi Uros,
> >
> > This patch intends to support V4HF/V2HF vector type and basic operations.
> >
> > For 32bit target, V4HF vector is parsed same as __m64 type, V2HF
> > is parsed by stack and returned from GPR since it is not specified
> > by ABI.
> >
> > We found for 64bit vector in ia32, when mmx disabled there seems no
> > mov<mode>_internal, so we add a define_insn for v4hf mode. It would be very
> > > appreciated if you know why the handling of 64bit vector looks as is and
> > give some advice.
>
> ia32 ABI declares that __m64 values pass via MMX registers. Due to
> this, we are not able to fully disable MMX register usage, as is the
> case with x86_64. So, V4HFmode values will pass to functions via MMX
> registers on ia32 targets.
>
> So, there should be no additional define_insn, the addition to the
> existing MMXMODE mode iterator should be enough. V4HFmodes should be
> handled in the same way as e.g. V8QImode.
>
> This is not the case with 4-byte values, which should be passed using
> integer ABI.
>
> Uros.
>
> >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > OK for master?
> >
> > gcc/ChangeLog:
> >
> >         PR target/102230
> >         * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
> >         V4HF and V2HF mode check.
> >         (VALID_SSE2_REG_VHF_MODE): Likewise.
> >         (VALID_MMX_REG_MODE): Likewise.
> >         (SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
> >         vector mode condition.
> >         * config/i386/i386.c (classify_argument): Parse V4HF/V2HF
> >         via sse regs.
> >         (function_arg_32): Add V4HFmode.
> >         (function_arg_advance_32): Likewise.
> >         * config/i386/i386.md (mode): Add V4HF/V2HF.
> >         (MODE_SIZE): Likewise.
> >         * config/i386/mmx.md (MMXMODE): Add V4HF mode.
> >         (V_32): Add V2HF mode.
> >         (*mov<mode>_internal): Adjust sse alternatives to support
> >         V4HF mode vector move.
> >         (*mov<mode>_internal): Adjust sse alternatives
> >         to support V2HF mode move.
> >         * config/i386/sse.md (VHF_32_64): New mode iterator.
> >         (<insn><mode>3): New define_insn for add/sub/mul/div.
> >         (*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/102230
> >         * gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
> >         * gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
> >         * gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
> >         * gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
> >         * gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
> >         * gcc.target/i386/pr102230.c: Ditto.
> > ---
> >  gcc/config/i386/i386.c                        |  4 +
> >  gcc/config/i386/i386.h                        | 12 ++-
> >  gcc/config/i386/i386.md                       |  5 +-
> >  gcc/config/i386/mmx.md                        | 27 ++++---
> >  gcc/config/i386/sse.md                        | 49 ++++++++++++
> >  .../i386/avx512fp16-64-32-vecop-1.c           | 30 ++++++++
> >  .../i386/avx512fp16-64-32-vecop-2.c           | 75 +++++++++++++++++++
> >  .../gcc.target/i386/avx512fp16-floatvnhf.c    | 12 +--
> >  .../i386/avx512fp16-trunc-extendvnhf.c        | 12 +--
> >  .../gcc.target/i386/avx512fp16-truncvnhf.c    | 12 +--
> >  gcc/testsuite/gcc.target/i386/pr102230.c      | 38 ++++++++++
> >  11 files changed, 243 insertions(+), 33 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index ba89e111d28..b3e4add4b9e 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
> >      case E_V2SFmode:
> >      case E_V2SImode:
> >      case E_V4HImode:
> > +    case E_V4HFmode:
> > +    case E_V2HFmode:
> >      case E_V8QImode:
> >        classes[0] = X86_64_SSE_CLASS;
> >        return 1;
> > @@ -2902,6 +2904,7 @@ pass_in_reg:
> >
> >      case E_V8QImode:
> >      case E_V4HImode:
> > +    case E_V4HFmode:
> >      case E_V2SImode:
> >      case E_V2SFmode:
> >      case E_V1TImode:
> > @@ -3149,6 +3152,7 @@ pass_in_reg:
> >
> >      case E_V8QImode:
> >      case E_V4HImode:
> > +    case E_V4HFmode:
> >      case E_V2SImode:
> >      case E_V2SFmode:
> >      case E_V1TImode:
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 8a4251b4926..9f3cad31f96 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> >     || (MODE) == TImode)
> >
> >  #define VALID_AVX512FP16_REG_MODE(MODE)                                        \
> > -  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
> > +  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode    \
> > +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
> >
> >  #define VALID_SSE2_REG_MODE(MODE)                                      \
> >    ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode     \
> > @@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> >     || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
> >
> >  #define VALID_SSE2_REG_VHF_MODE(MODE)                  \
> > -  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
> > +  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode    \
> > +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
> >
> >  #define VALID_SSE_REG_MODE(MODE)                                       \
> >    ((MODE) == V1TImode || (MODE) == TImode                              \
> > @@ -1054,7 +1056,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> >  #define VALID_MMX_REG_MODE(MODE)                                       \
> >    ((MODE) == V1DImode || (MODE) == DImode                              \
> >     || (MODE) == V2SImode || (MODE) == SImode                           \
> > -   || (MODE) == V4HImode || (MODE) == V8QImode)
> > +   || (MODE) == V4HImode || (MODE) == V8QImode                         \
> > +   || (MODE) == V4HFmode)
> >
> >  #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
> >
> > @@ -1087,7 +1090,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> >     || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode   \
> >     || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode  \
> >     || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \
> > -   || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
> > +   || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
> > +   || (MODE) == V8HFmode)
> >
> >  #define X87_FLOAT_MODE_P(MODE) \
> >    (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index c6279e620c9..758d7d1e3c0 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -498,7 +498,7 @@
> >  ;; Main data type used by the insn
> >  (define_attr "mode"
> >    "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
> > -   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
> > +   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
> >    (const_string "unknown"))
> >
> >  ;; The CPU unit operations uses.
> > @@ -1106,7 +1106,8 @@
> >                              (V1TI "16") (V2TI "32") (V4TI "64")
> >                              (V2DF "16") (V4DF "32") (V8DF "64")
> >                              (V4SF "16") (V8SF "32") (V16SF "64")
> > -                            (V8HF "16") (V16HF "32") (V32HF "64")])
> > +                            (V8HF "16") (V16HF "32") (V32HF "64")
> > +                            (V4HF "8") (V2HF "4")])
> >
> >  ;; Double word integer modes as mode attribute.
> >  (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
> > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > index b0093778fc6..68e1c4b2dbd 100644
> > --- a/gcc/config/i386/mmx.md
> > +++ b/gcc/config/i386/mmx.md
> > @@ -48,7 +48,7 @@
> >  (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
> >
> >  ;; All 8-byte vector modes handled by MMX
> > -(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
> > +(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
> >  (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
> >
> >  ;; Mix-n-match
> > @@ -57,8 +57,8 @@
> >  (define_mode_iterator MMXMODE24 [V4HI V2SI])
> >  (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
> >
> > -;; All 4-byte integer vector modes
> > -(define_mode_iterator V_32 [V4QI V2HI V1SI])
> > +;; All 4-byte integer/float16 vector modes
> > +(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
> >
> >  ;; 4-byte integer vector modes
> >  (define_mode_iterator VI_32 [V4QI V2HI])
> > @@ -191,6 +191,8 @@
> >             (eq_attr "alternative" "11,12")
> >               (cond [(match_test "<MODE>mode == V2SFmode")
> >                        (const_string "V4SF")
> > +                    (match_test "<MODE>mode == V4HFmode")
> > +                      (const_string "V4SF")
> >                      (ior (not (match_test "TARGET_SSE2"))
> >                           (match_test "optimize_function_for_size_p (cfun)"))
> >                        (const_string "V4SF")
> > @@ -198,14 +200,16 @@
> >                     (const_string "TI"))
> >
> >             (and (eq_attr "alternative" "13")
> > -                (ior (and (match_test "<MODE>mode == V2SFmode")
> > -                          (not (match_test "TARGET_MMX_WITH_SSE")))
> > -                     (not (match_test "TARGET_SSE2"))))
> > +                (ior (ior (and (match_test "<MODE>mode == V2SFmode")
> > +                               (not (match_test "TARGET_MMX_WITH_SSE")))
> > +                          (not (match_test "TARGET_SSE2")))
> > +                     (match_test "<MODE>mode == V4HFmode")))
> >               (const_string "V2SF")
> >
> >             (and (eq_attr "alternative" "14")
> > -                (ior (match_test "<MODE>mode == V2SFmode")
> > -                     (not (match_test "TARGET_SSE2"))))
> > +                (ior (ior (match_test "<MODE>mode == V2SFmode")
> > +                          (not (match_test "TARGET_SSE2")))
> > +                     (match_test "<MODE>mode == V4HFmode")))
> >               (const_string "V2SF")
> >            ]
> >            (const_string "DI")))
> > @@ -289,12 +293,17 @@
> >         (const_string "*")))
> >     (set (attr "mode")
> >       (cond [(eq_attr "alternative" "2,3")
> > -             (cond [(match_test "TARGET_AVX")
> > +             (cond [(match_test "<MODE>mode == V2HFmode")
> > +                      (const_string "V4SF")
> > +                    (match_test "TARGET_AVX")
> >                        (const_string "TI")
> >                      (match_test "optimize_function_for_size_p (cfun)")
> >                        (const_string "V4SF")
> >                     ]
> >                     (const_string "TI"))
> > +           (and (eq_attr "alternative" "4,5")
> > +                (match_test "<MODE>mode == V2HFmode"))
> > +             (const_string "SF")
> >            ]
> >            (const_string "SI")))
> >     (set (attr "preferred_for_speed")
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index a446dedb2ec..b7832926287 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -671,6 +671,9 @@
> >    [(V64QI "TARGET_AVX512BW") (V32QI  "TARGET_AVX512VL")
> >         (V16QI  "TARGET_AVX512VL")])
> >
> > +(define_mode_iterator VHF_32_64
> > +  [V4HF V2HF])
> > +
> >  (define_mode_attr avx512
> >    [(V16QI "avx512vl") (V32QI "avx512vl") (V64QI "avx512bw")
> >     (V8HI  "avx512vl") (V16HI  "avx512vl") (V32HI "avx512bw")
> > @@ -1313,6 +1316,36 @@
> >               ]
> >               (symbol_ref "true")))])
> >
> > +(define_insn "*movv4hf_internal_sse"
> > +  [(set (match_operand:V4HF 0 "nonimmediate_operand"
> > +        "=v,v,v,m")
> > +       (match_operand:V4HF 1 "nonimmediate_or_sse_const_operand"
> > +        " C,v,m,v"))]
> > +  "!TARGET_MMX && TARGET_SSE2
> > +   && (register_operand (operands[0], V4HFmode)
> > +       || register_operand (operands[1], V4HFmode))"
> > +{
> > +  switch (get_attr_type (insn))
> > +    {
> > +    case TYPE_SSELOG1:
> > +      return standard_sse_constant_opcode (insn, operands);
> > +
> > +    case TYPE_SSEMOV:
> > +      return ix86_output_ssemov (insn, operands);
> > +
> > +    default:
> > +      gcc_unreachable ();
> > +    }
> > +}
> > +  [(set_attr "type" "sselog1,ssemov,ssemov,ssemov")
> > +   (set_attr "prefix" "maybe_vex")
> > +   (set (attr "mode")
> > +       (cond [(eq_attr "alternative" "1")
> > +                (const_string "V4SF")]
> > +             (const_string "V2SF")))]
> > +)
> > +
> > +
> >  ;; If mem_addr points to a memory region with less than whole vector size bytes
> >  ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> >  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> > @@ -2165,6 +2198,22 @@
> >     (set_attr "prefix" "<bcst_mask_prefix3>")
> >     (set_attr "mode" "<MODE>")])
> >
> > +(define_insn "<insn><mode>3"
> > +  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
> > +       (plusminusmultdiv:VHF_32_64
> > +         (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
> > +         (match_operand:VHF_32_64 2 "register_operand" "v")))]
> > +  "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > +  "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
> > +  [(set (attr "type")
> > +      (cond [(match_test "<CODE> == MULT")
> > +               (const_string "ssemul")
> > +            (match_test "<CODE> == DIV")
> > +               (const_string "ssediv")]
> > +            (const_string "sseadd")))
> > +   (set_attr "prefix" "evex")
> > +   (set_attr "mode" "V8HF")])
> > +
> >  ;; Standard scalar operation patterns which preserve the rest of the
> >  ;; vector for combiner.
> >  (define_insn "*<sse>_vm<insn><mode>3"
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > new file mode 100644
> > index 00000000000..754e909d77b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > +
> > +/* { dg-final { scan-assembler-times "vaddph" 2 } } */
> > +/* { dg-final { scan-assembler-times "vsubph" 2 } } */
> > +/* { dg-final { scan-assembler-times "vmulph" 2 } } */
> > +/* { dg-final { scan-assembler-times "vdivph" 2 } } */
> > +
> > +#define DO_PRAGMA(X) _Pragma(#X)
> > +
> > +#define VEC_OP_VV(size, op, name)       \
> > +void \
> > +__attribute__ ((noinline, noclone, optimize("tree-slp-vectorize"))) \
> > +vecop_v##size##hf##name (_Float16 * restrict dst,  \
> > + _Float16 * restrict src1, _Float16 * restrict src2)   \
> > +{ \
> > +    int i;  \
> > +    DO_PRAGMA (GCC unroll size)   \
> > +    for (i = 0; i < size; i++)  \
> > +      dst[i] = src1[i] op src2[i];  \
> > +}
> > +
> > +VEC_OP_VV(4, +, add)
> > +VEC_OP_VV(2, +, add)
> > +VEC_OP_VV(4, -, sub)
> > +VEC_OP_VV(2, -, sub)
> > +VEC_OP_VV(4, *, mul)
> > +VEC_OP_VV(2, *, mul)
> > +VEC_OP_VV(4, /, div)
> > +VEC_OP_VV(2, /, div)
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > new file mode 100644
> > index 00000000000..4dc6f9fb92e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > @@ -0,0 +1,75 @@
> > +/* { dg-do run { target avx512fp16 } } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > +
> > +static void vec_op_test (void);
> > +#define DO_TEST vec_op_test
> > +#define AVX512FP16
> > +#define AVX512VL
> > +#include "avx512f-check.h"
> > +#include "avx512fp16-64-32-vecop-1.c"
> > +
> > +_Float16 a[4], b[4], fexp[4], fref[4];
> > +
> > +#define EMULATE_VEC_OP_VV(size, op, name) \
> > +void \
> > +__attribute__ ((noinline, noclone)) \
> > +scalar_vecop_v##size##hf##name ( \
> > +  _Float16 * restrict dst, _Float16 * restrict src1,  \
> > +  _Float16 * restrict src2)  \
> > +{ \
> > +  int i;  \
> > +  for (i = 0; i < size; i++)  \
> > +    dst[i] = src1[i] op src2[i];  \
> > +}
> > +
> > +EMULATE_VEC_OP_VV (4, +, add)
> > +EMULATE_VEC_OP_VV (2, +, add)
> > +EMULATE_VEC_OP_VV (4, -, sub)
> > +EMULATE_VEC_OP_VV (2, -, sub)
> > +EMULATE_VEC_OP_VV (4, *, mul)
> > +EMULATE_VEC_OP_VV (2, *, mul)
> > +EMULATE_VEC_OP_VV (4, /, div)
> > +EMULATE_VEC_OP_VV (2, /, div)
> > +
> > +void init()
> > +{
> > +  int i;
> > +  for (i = 0; i < 4; i++)
> > +    {
> > +      a[i] = i + 0.5;
> > +      b[i] = i * 1.5;
> > +      fexp[i] = fref[i] = 2.75 * i;
> > +    }
> > +}
> > +
> > +int check_cond(void *a, void *b, int size)
> > +{
> > +  int i;
> > +  unsigned short *pa = (unsigned short *)a,
> > +                *pb = (unsigned short *)b;
> > +  for (i = 0; i < size; i++)
> > +    if (pa[i] != pb[i])
> > +      return 0;
> > +  return 1;
> > +}
> > +
> > +#define TEST_VEC_OP_VV(size, name)     \
> > +{ \
> > +  init ();  \
> > +  scalar_vecop_v##size##hf##name (a, b, fexp);  \
> > +  vecop_v##size##hf##name (a, b, fref);  \
> > +  if (!check_cond ((void *)fexp, (void *)fref, size)) \
> > +    abort();  \
> > +}
> > +
> > +static void vec_op_test()
> > +{
> > +  TEST_VEC_OP_VV (4, add)
> > +  TEST_VEC_OP_VV (2, add)
> > +  TEST_VEC_OP_VV (4, sub)
> > +  TEST_VEC_OP_VV (2, sub)
> > +  TEST_VEC_OP_VV (4, mul)
> > +  TEST_VEC_OP_VV (2, mul)
> > +  TEST_VEC_OP_VV (4, div)
> > +  TEST_VEC_OP_VV (2, div)
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > index 112ac3e74d5..8471a1d1d10 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > @@ -43,16 +43,16 @@ FLOATHFVV(2, udi)
> >
> >  /* { dg-final { scan-assembler-times "vcvtqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtuqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtdq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtudq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtdq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtudq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtuw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > index 286ea9f2624..2ef901a0375 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > @@ -41,15 +41,15 @@ EXTENDHFVV(8, sf)
> >  EXTENDHFVV(4, sf)
> >
> >  /* { dg-final { scan-assembler-times "vcvtpd2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtps2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtps2phxy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > index ee55cd12300..7a51c9dd077 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > @@ -43,16 +43,16 @@ FIX_TRUNCHFVV(2, udi)
> >
> >  /* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > -/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > -/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2uw\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> >  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr102230.c b/gcc/testsuite/gcc.target/i386/pr102230.c
> > new file mode 100644
> > index 00000000000..60cf1c32afe
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr102230.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512fp16" } */
> > +
> > +typedef _Float16 v4hf __attribute__ ((vector_size (8)));
> > +typedef _Float16 v2hf __attribute__ ((vector_size (4)));
> > +
> > +v4hf
> > +v4hf_abi_1 (v4hf a)
> > +{
> > +  return a;
> > +}
> > +
> > +v4hf
> > +v4hf_abi_3 (v4hf a, v4hf b, v4hf c)
> > +{
> > +  return c;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "movq\[[\\t \]*%mm2, %mm0" 1 { target { ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm2, %xmm0" 1 { target { ! ia32 } } } } */
> > +
> > +v4hf
> > +v4hf_abi_4 (v4hf a, v4hf b, v4hf c, v4hf d)
> > +{
> > +  return d;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "movq\[[\\t \]*4\[(\]%esp\[)\], %mm0" 1 { target { ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm3, %xmm0" 1 { target { ! ia32 } } } } */
> > +
> > +v2hf
> > +v2hf_test (v2hf a, v2hf b, v2hf c, v2hf d)
> > +{
> > +  return b;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "movl\[[\\t \]*8\[(\]%esp\[)\], %eax" 1 { target { ia32 } } } } */
> > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm1, %xmm0" 1 { target { ! ia32 } } } } */
> > --
> > 2.18.1
> >

[-- Attachment #2: FP16_64_32bit_v2.patch --]
[-- Type: text/x-patch, Size: 23396 bytes --]

From 4a5086162b81551f79f2483dee39a81c400e3d75 Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Thu, 15 Jul 2021 13:31:24 +0800
Subject: [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.

For 32-bit targets, a V4HF vector is passed the same way as the __m64
type; a V2HF vector is passed on the stack and returned in a GPR, since
the ABI does not specify how it is handled.

gcc/ChangeLog:

	PR target/102230
	* config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
	V2HF mode check.
	(VALID_SSE2_REG_VHF_MODE): Add V4HFmode and V2HFmode.
	(VALID_MMX_REG_MODE): Add V4HFmode.
	(SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
	vector mode condition.
	* config/i386/i386.c (classify_argument): Parse V4HF/V2HF
	via sse regs.
	(function_arg_32): Add V4HFmode.
	(function_arg_advance_32): Likewise.
	* config/i386/i386.md (mode): Add V4HF/V2HF.
	(MODE_SIZE): Likewise.
	* config/i386/mmx.md (MMXMODE): Add V4HF mode.
	(V_32): Add V2HF mode.
	(*mov<mode>_internal): Adjust sse alternatives to support
	V4HF mode move.
	(*mov<mode>_internal): Adjust sse alternatives to support
	V2HF mode move.
	* config/i386/sse.md (VHF_32_64): New mode iterator.
	(<insn><mode>3): New define_insn for add/sub/mul/div.

gcc/testsuite/ChangeLog:

	PR target/102230
	* gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
	* gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
	* gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
	* gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
	* gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
	* gcc.target/i386/pr102230.c: Ditto.
---
 gcc/config/i386/i386.c                        |  4 +
 gcc/config/i386/i386.h                        | 13 +++-
 gcc/config/i386/i386.md                       |  5 +-
 gcc/config/i386/mmx.md                        | 27 ++++---
 gcc/config/i386/sse.md                        | 20 +++++
 .../i386/avx512fp16-64-32-vecop-1.c           | 30 ++++++++
 .../i386/avx512fp16-64-32-vecop-2.c           | 75 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-floatvnhf.c    | 12 +--
 .../i386/avx512fp16-trunc-extendvnhf.c        | 12 +--
 .../gcc.target/i386/avx512fp16-truncvnhf.c    | 12 +--
 gcc/testsuite/gcc.target/i386/pr102230.c      | 38 ++++++++++
 11 files changed, 215 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ba89e111d28..b3e4add4b9e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
     case E_V2SFmode:
     case E_V2SImode:
     case E_V4HImode:
+    case E_V4HFmode:
+    case E_V2HFmode:
     case E_V8QImode:
       classes[0] = X86_64_SSE_CLASS;
       return 1;
@@ -2902,6 +2904,7 @@ pass_in_reg:
 
     case E_V8QImode:
     case E_V4HImode:
+    case E_V4HFmode:
     case E_V2SImode:
     case E_V2SFmode:
     case E_V1TImode:
@@ -3149,6 +3152,7 @@ pass_in_reg:
 
     case E_V8QImode:
     case E_V4HImode:
+    case E_V4HFmode:
     case E_V2SImode:
     case E_V2SFmode:
     case E_V1TImode:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8a4251b4926..cba6d835910 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == TImode)
 
 #define VALID_AVX512FP16_REG_MODE(MODE)					\
-  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
+  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode	\
+   || (MODE) == V2HFmode)
 
 #define VALID_SSE2_REG_MODE(MODE)					\
   ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode	\
@@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
 
 #define VALID_SSE2_REG_VHF_MODE(MODE)			\
-  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
+  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode	\
+   || (MODE) == V4HFmode || (MODE) == V2HFmode)
 
 #define VALID_SSE_REG_MODE(MODE)					\
   ((MODE) == V1TImode || (MODE) == TImode				\
@@ -1051,10 +1053,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 #define VALID_MMX_REG_MODE_3DNOW(MODE) \
   ((MODE) == V2SFmode || (MODE) == SFmode)
 
+/* To match ia32 psABI, V4HFmode should be added here.  */
 #define VALID_MMX_REG_MODE(MODE)					\
   ((MODE) == V1DImode || (MODE) == DImode				\
    || (MODE) == V2SImode || (MODE) == SImode				\
-   || (MODE) == V4HImode || (MODE) == V8QImode)
+   || (MODE) == V4HImode || (MODE) == V8QImode				\
+   || (MODE) == V4HFmode)
 
 #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
 
@@ -1087,7 +1091,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode	\
    || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode	\
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
-   || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
+   || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
+   || (MODE) == V8HFmode)
 
 #define X87_FLOAT_MODE_P(MODE)	\
   (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c6279e620c9..758d7d1e3c0 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -498,7 +498,7 @@
 ;; Main data type used by the insn
 (define_attr "mode"
   "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
-   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
+   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
   (const_string "unknown"))
 
 ;; The CPU unit operations uses.
@@ -1106,7 +1106,8 @@
 			     (V1TI "16") (V2TI "32") (V4TI "64")
 			     (V2DF "16") (V4DF "32") (V8DF "64")
 			     (V4SF "16") (V8SF "32") (V16SF "64")
-			     (V8HF "16") (V16HF "32") (V32HF "64")])
+			     (V8HF "16") (V16HF "32") (V32HF "64")
+			     (V4HF "8") (V2HF "4")])
 
 ;; Double word integer modes as mode attribute.
 (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index b0093778fc6..68e1c4b2dbd 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -48,7 +48,7 @@
 (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
 
 ;; All 8-byte vector modes handled by MMX
-(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
+(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
 (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
 
 ;; Mix-n-match
@@ -57,8 +57,8 @@
 (define_mode_iterator MMXMODE24 [V4HI V2SI])
 (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
 
-;; All 4-byte integer vector modes
-(define_mode_iterator V_32 [V4QI V2HI V1SI])
+;; All 4-byte integer/float16 vector modes
+(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
 
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
@@ -191,6 +191,8 @@
 	    (eq_attr "alternative" "11,12")
 	      (cond [(match_test "<MODE>mode == V2SFmode")
 		       (const_string "V4SF")
+		     (match_test "<MODE>mode == V4HFmode")
+		       (const_string "V4SF")
 		     (ior (not (match_test "TARGET_SSE2"))
 			  (match_test "optimize_function_for_size_p (cfun)"))
 		       (const_string "V4SF")
@@ -198,14 +200,16 @@
 		    (const_string "TI"))
 
 	    (and (eq_attr "alternative" "13")
-		 (ior (and (match_test "<MODE>mode == V2SFmode")
-			   (not (match_test "TARGET_MMX_WITH_SSE")))
-		      (not (match_test "TARGET_SSE2"))))
+		 (ior (ior (and (match_test "<MODE>mode == V2SFmode")
+				(not (match_test "TARGET_MMX_WITH_SSE")))
+			   (not (match_test "TARGET_SSE2")))
+		      (match_test "<MODE>mode == V4HFmode")))
 	      (const_string "V2SF")
 
 	    (and (eq_attr "alternative" "14")
-	    	 (ior (match_test "<MODE>mode == V2SFmode")
-		      (not (match_test "TARGET_SSE2"))))
+		 (ior (ior (match_test "<MODE>mode == V2SFmode")
+			   (not (match_test "TARGET_SSE2")))
+		      (match_test "<MODE>mode == V4HFmode")))
 	      (const_string "V2SF")
 	   ]
 	   (const_string "DI")))
@@ -289,12 +293,17 @@
        (const_string "*")))
    (set (attr "mode")
      (cond [(eq_attr "alternative" "2,3")
-	      (cond [(match_test "TARGET_AVX")
+	      (cond [(match_test "<MODE>mode == V2HFmode")
+		       (const_string "V4SF")
+		     (match_test "TARGET_AVX")
 		       (const_string "TI")
 		     (match_test "optimize_function_for_size_p (cfun)")
 		       (const_string "V4SF")
 		    ]
 		    (const_string "TI"))
+	    (and (eq_attr "alternative" "4,5")
+		 (match_test "<MODE>mode == V2HFmode"))
+	      (const_string "SF")
 	   ]
 	   (const_string "SI")))
    (set (attr "preferred_for_speed")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb7600edbab..173259ffb77 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -671,6 +671,9 @@
   [(V64QI "TARGET_AVX512BW") (V32QI  "TARGET_AVX512VL")
 	(V16QI  "TARGET_AVX512VL")])
 
+(define_mode_iterator VHF_32_64
+  [V4HF V2HF])
+
 (define_mode_attr avx512
   [(V16QI "avx512vl") (V32QI "avx512vl") (V64QI "avx512bw")
    (V8HI  "avx512vl") (V16HI  "avx512vl") (V32HI "avx512bw")
@@ -1313,6 +1316,7 @@
 	      ]
 	      (symbol_ref "true")))])
 
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -2165,6 +2169,22 @@
    (set_attr "prefix" "<bcst_mask_prefix3>")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "<insn><mode>3"
+  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
+	(plusminusmultdiv:VHF_32_64
+	  (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
+	  (match_operand:VHF_32_64 2 "register_operand" "v")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "type")
+      (cond [(match_test "<CODE> == MULT")
+		(const_string "ssemul")
+	     (match_test "<CODE> == DIV")
+		(const_string "ssediv")]
+	     (const_string "sseadd")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8HF")])
+
 ;; Standard scalar operation patterns which preserve the rest of the
 ;; vector for combiner.
 (define_insn "*<sse>_vm<insn><mode>3"
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
new file mode 100644
index 00000000000..754e909d77b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+/* { dg-final { scan-assembler-times "vaddph" 2 } } */
+/* { dg-final { scan-assembler-times "vsubph" 2 } } */
+/* { dg-final { scan-assembler-times "vmulph" 2 } } */
+/* { dg-final { scan-assembler-times "vdivph" 2 } } */
+
+#define DO_PRAGMA(X) _Pragma(#X)
+
+#define VEC_OP_VV(size, op, name)       \
+void \
+__attribute__ ((noinline, noclone, optimize("tree-slp-vectorize"))) \
+vecop_v##size##hf##name (_Float16 * restrict dst,  \
+ _Float16 * restrict src1, _Float16 * restrict src2)   \
+{ \
+    int i;  \
+    DO_PRAGMA (GCC unroll size)   \
+    for (i = 0; i < size; i++)  \
+      dst[i] = src1[i] op src2[i];  \
+}
+
+VEC_OP_VV(4, +, add)
+VEC_OP_VV(2, +, add)
+VEC_OP_VV(4, -, sub)
+VEC_OP_VV(2, -, sub)
+VEC_OP_VV(4, *, mul)
+VEC_OP_VV(2, *, mul)
+VEC_OP_VV(4, /, div)
+VEC_OP_VV(2, /, div)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
new file mode 100644
index 00000000000..4dc6f9fb92e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
@@ -0,0 +1,75 @@
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+static void vec_op_test (void);
+#define DO_TEST vec_op_test
+#define AVX512FP16
+#define AVX512VL
+#include "avx512f-check.h"
+#include "avx512fp16-64-32-vecop-1.c"
+
+_Float16 a[4], b[4], fexp[4], fref[4];
+
+#define EMULATE_VEC_OP_VV(size, op, name) \
+void \
+__attribute__ ((noinline, noclone)) \
+scalar_vecop_v##size##hf##name ( \
+  _Float16 * restrict dst, _Float16 * restrict src1,  \
+  _Float16 * restrict src2)  \
+{ \
+  int i;  \
+  for (i = 0; i < size; i++)  \
+    dst[i] = src1[i] op src2[i];  \
+}
+
+EMULATE_VEC_OP_VV (4, +, add)
+EMULATE_VEC_OP_VV (2, +, add)
+EMULATE_VEC_OP_VV (4, -, sub)
+EMULATE_VEC_OP_VV (2, -, sub)
+EMULATE_VEC_OP_VV (4, *, mul)
+EMULATE_VEC_OP_VV (2, *, mul)
+EMULATE_VEC_OP_VV (4, /, div)
+EMULATE_VEC_OP_VV (2, /, div)
+
+void init()
+{
+  int i;
+  for (i = 0; i < 4; i++)
+    {
+      a[i] = i + 0.5; 
+      b[i] = i * 1.5;
+      fexp[i] = fref[i] = 2.75 * i;
+    }
+}
+
+int check_cond(void *a, void *b, int size)
+{
+  int i;
+  unsigned short *pa = (unsigned short *)a,
+		 *pb = (unsigned short *)b;
+  for (i = 0; i < size; i++)
+    if (pa[i] != pb[i])
+      return 0;
+  return 1;
+}
+
+#define TEST_VEC_OP_VV(size, name)	\
+{ /* dst is the first argument: results must land in fexp/fref.  */ \
+  init ();  \
+  scalar_vecop_v##size##hf##name (fexp, a, b);  \
+  vecop_v##size##hf##name (fref, a, b);  \
+  if (!check_cond ((void *)fexp, (void *)fref, size)) \
+    abort();  \
+}
+
+static void vec_op_test()
+{
+  TEST_VEC_OP_VV (4, add)
+  TEST_VEC_OP_VV (2, add)
+  TEST_VEC_OP_VV (4, sub)
+  TEST_VEC_OP_VV (2, sub)
+  TEST_VEC_OP_VV (4, mul)
+  TEST_VEC_OP_VV (2, mul)
+  TEST_VEC_OP_VV (4, div)
+  TEST_VEC_OP_VV (2, div)
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
index 112ac3e74d5..8471a1d1d10 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
@@ -43,16 +43,16 @@ FLOATHFVV(2, udi)
 
 /* { dg-final { scan-assembler-times "vcvtqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtuqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtdq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtudq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtdq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtudq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtuw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
index 286ea9f2624..2ef901a0375 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
@@ -41,15 +41,15 @@ EXTENDHFVV(8, sf)
 EXTENDHFVV(4, sf)
 
 /* { dg-final { scan-assembler-times "vcvtpd2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtps2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtps2phxy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
index ee55cd12300..7a51c9dd077 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
@@ -43,16 +43,16 @@ FIX_TRUNCHFVV(2, udi)
 
 /* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2uw\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102230.c b/gcc/testsuite/gcc.target/i386/pr102230.c
new file mode 100644
index 00000000000..60cf1c32afe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102230.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16" } */
+
+typedef _Float16 v4hf __attribute__ ((vector_size (8)));
+typedef _Float16 v2hf __attribute__ ((vector_size (4)));
+
+v4hf
+v4hf_abi_1 (v4hf a)
+{
+  return a;
+}
+
+v4hf
+v4hf_abi_3 (v4hf a, v4hf b, v4hf c)
+{
+  return c;
+}
+
+/* { dg-final { scan-assembler-times "movq\[[\\t \]*%mm2, %mm0" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm2, %xmm0" 1 { target { ! ia32 } } } } */
+
+v4hf
+v4hf_abi_4 (v4hf a, v4hf b, v4hf c, v4hf d)
+{
+  return d;
+}
+
+/* { dg-final { scan-assembler-times "movq\[[\\t \]*4\[(\]%esp\[)\], %mm0" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm3, %xmm0" 1 { target { ! ia32 } } } } */
+
+v2hf
+v2hf_test (v2hf a, v2hf b, v2hf c, v2hf d)
+{
+  return b;
+}
+
+/* { dg-final { scan-assembler-times "movl\[[\\t \]*8\[(\]%esp\[)\], %eax" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm1, %xmm0" 1 { target { ! ia32 } } } } */
-- 
2.27.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.
  2021-09-28  4:42   ` Hongyu Wang
@ 2021-09-28  6:27     ` Uros Bizjak
  2021-09-28  7:20       ` Hongyu Wang
  0 siblings, 1 reply; 5+ messages in thread
From: Uros Bizjak @ 2021-09-28  6:27 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

On Tue, Sep 28, 2021 at 6:48 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> > ia32 ABI declares that __m64 values pass via MMX registers. Due to
> > this, we are not able to fully disable MMX register usage, as is the
> > case with x86_64. So, V4HFmode values will pass to functions via MMX
> > registers on ia32 targets.
> >
> > So, there should be no additional define_insn, the addition to the
> > existing MMXMODE mode iterator should be enough. V4HFmodes should be
> > handled in the same way as e.g. V8QImode.
> >
> > This is not the case with 4-byte values, which should be passed using
> > integer ABI.
>
> Thanks for the explanation, updated patch by removing the extra define_insn,
> and drop V4HFmode from VALID_AVX512FP16_REG_MODE. Now v4hf would behave
> same as v8qi.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
>
> OK for master with the updated one?

I'd put this new pattern in mmx.md to keep 64bit/32bit modes in
mmx.md, similar to e.g. FMA patterns among others.

OK with the eventual above change.

Thanks,
Uros.

>
> Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年9月27日周一 下午7:35写道:
> >
> > On Mon, Sep 27, 2021 at 12:42 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
> > >
> > > Hi Uros,
> > >
> > > This patch intends to support V4HF/V2HF vector type and basic operations.
> > >
> > > For 32bit target, V4HF vector is parsed same as __m64 type, V2HF
> > > is parsed by stack and returned from GPR since it is not specified
> > > by ABI.
> > >
> > > We found for 64bit vector in ia32, when mmx disabled there seems no
> > > mov<mode>_internal, so we add a define_insn for v4hf mode. It would be very
> > > appreciated if you know why the handling of 64bit vector looks as is and
> > > give some advice.
> >
> > ia32 ABI declares that __m64 values pass via MMX registers. Due to
> > this, we are not able to fully disable MMX register usage, as is the
> > case with x86_64. So, V4HFmode values will pass to functions via MMX
> > registers on ia32 targets.
> >
> > So, there should be no additional define_insn, the addition to the
> > existing MMXMODE mode iterator should be enough. V4HFmodes should be
> > handled in the same way as e.g. V8QImode.
> >
> > This is not the case with 4-byte values, which should be passed using
> > integer ABI.
> >
> > Uros.
> >
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> > >
> > > OK for master?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/102230
> > >         * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
> > >         V4HF and V2HF mode check.
> > >         (VALID_SSE2_REG_VHF_MODE): Likewise.
> > >         (VALID_MMX_REG_MODE): Likewise.
> > >         (SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
> > >         vector mode condition.
> > >         * config/i386/i386.c (classify_argument): Parse V4HF/V2HF
> > >         via sse regs.
> > >         (function_arg_32): Add V4HFmode.
> > >         (function_arg_advance_32): Likewise.
> > >         * config/i386/i386.md (mode): Add V4HF/V2HF.
> > >         (MODE_SIZE): Likewise.
> > >         * config/i386/mmx.md (MMXMODE): Add V4HF mode.
> > >         (V_32): Add V2HF mode.
> > >         (*mov<mode>_internal): Adjust sse alternatives to support
> > >         V4HF mode vector move.
> > >         (*mov<mode>_internal): Adjust sse alternatives
> > >         to support V2HF mode move.
> > >         * config/i386/sse.md (VHF_32_64): New mode iterator.
> > >         (<insn><mode>3): New define_insn for add/sub/mul/div.
> > >         (*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/102230
> > >         * gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
> > >         * gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
> > >         * gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
> > >         * gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
> > >         * gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
> > >         * gcc.target/i386/pr102230.c: Ditto.
> > > ---
> > >  gcc/config/i386/i386.c                        |  4 +
> > >  gcc/config/i386/i386.h                        | 12 ++-
> > >  gcc/config/i386/i386.md                       |  5 +-
> > >  gcc/config/i386/mmx.md                        | 27 ++++---
> > >  gcc/config/i386/sse.md                        | 49 ++++++++++++
> > >  .../i386/avx512fp16-64-32-vecop-1.c           | 30 ++++++++
> > >  .../i386/avx512fp16-64-32-vecop-2.c           | 75 +++++++++++++++++++
> > >  .../gcc.target/i386/avx512fp16-floatvnhf.c    | 12 +--
> > >  .../i386/avx512fp16-trunc-extendvnhf.c        | 12 +--
> > >  .../gcc.target/i386/avx512fp16-truncvnhf.c    | 12 +--
> > >  gcc/testsuite/gcc.target/i386/pr102230.c      | 38 ++++++++++
> > >  11 files changed, 243 insertions(+), 33 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c
> > >
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index ba89e111d28..b3e4add4b9e 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
> > >      case E_V2SFmode:
> > >      case E_V2SImode:
> > >      case E_V4HImode:
> > > +    case E_V4HFmode:
> > > +    case E_V2HFmode:
> > >      case E_V8QImode:
> > >        classes[0] = X86_64_SSE_CLASS;
> > >        return 1;
> > > @@ -2902,6 +2904,7 @@ pass_in_reg:
> > >
> > >      case E_V8QImode:
> > >      case E_V4HImode:
> > > +    case E_V4HFmode:
> > >      case E_V2SImode:
> > >      case E_V2SFmode:
> > >      case E_V1TImode:
> > > @@ -3149,6 +3152,7 @@ pass_in_reg:
> > >
> > >      case E_V8QImode:
> > >      case E_V4HImode:
> > > +    case E_V4HFmode:
> > >      case E_V2SImode:
> > >      case E_V2SFmode:
> > >      case E_V1TImode:
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index 8a4251b4926..9f3cad31f96 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > >     || (MODE) == TImode)
> > >
> > >  #define VALID_AVX512FP16_REG_MODE(MODE)                                        \
> > > -  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
> > > +  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode    \
> > > +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
> > >
> > >  #define VALID_SSE2_REG_MODE(MODE)                                      \
> > >    ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode     \
> > > @@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > >     || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
> > >
> > >  #define VALID_SSE2_REG_VHF_MODE(MODE)                  \
> > > -  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
> > > +  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode    \
> > > +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
> > >
> > >  #define VALID_SSE_REG_MODE(MODE)                                       \
> > >    ((MODE) == V1TImode || (MODE) == TImode                              \
> > > @@ -1054,7 +1056,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > >  #define VALID_MMX_REG_MODE(MODE)                                       \
> > >    ((MODE) == V1DImode || (MODE) == DImode                              \
> > >     || (MODE) == V2SImode || (MODE) == SImode                           \
> > > -   || (MODE) == V4HImode || (MODE) == V8QImode)
> > > +   || (MODE) == V4HImode || (MODE) == V8QImode                         \
> > > +   || (MODE) == V4HFmode)
> > >
> > >  #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
> > >
> > > @@ -1087,7 +1090,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > >     || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode   \
> > >     || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode  \
> > >     || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \
> > > -   || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
> > > +   || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
> > > +   || (MODE) == V8HFmode)
> > >
> > >  #define X87_FLOAT_MODE_P(MODE) \
> > >    (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
> > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > index c6279e620c9..758d7d1e3c0 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -498,7 +498,7 @@
> > >  ;; Main data type used by the insn
> > >  (define_attr "mode"
> > >    "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
> > > -   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
> > > +   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
> > >    (const_string "unknown"))
> > >
> > >  ;; The CPU unit operations uses.
> > > @@ -1106,7 +1106,8 @@
> > >                              (V1TI "16") (V2TI "32") (V4TI "64")
> > >                              (V2DF "16") (V4DF "32") (V8DF "64")
> > >                              (V4SF "16") (V8SF "32") (V16SF "64")
> > > -                            (V8HF "16") (V16HF "32") (V32HF "64")])
> > > +                            (V8HF "16") (V16HF "32") (V32HF "64")
> > > +                            (V4HF "8") (V2HF "4")])
> > >
> > >  ;; Double word integer modes as mode attribute.
> > >  (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
> > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > > index b0093778fc6..68e1c4b2dbd 100644
> > > --- a/gcc/config/i386/mmx.md
> > > +++ b/gcc/config/i386/mmx.md
> > > @@ -48,7 +48,7 @@
> > >  (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
> > >
> > >  ;; All 8-byte vector modes handled by MMX
> > > -(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
> > > +(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
> > >  (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
> > >
> > >  ;; Mix-n-match
> > > @@ -57,8 +57,8 @@
> > >  (define_mode_iterator MMXMODE24 [V4HI V2SI])
> > >  (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
> > >
> > > -;; All 4-byte integer vector modes
> > > -(define_mode_iterator V_32 [V4QI V2HI V1SI])
> > > +;; All 4-byte integer/float16 vector modes
> > > +(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
> > >
> > >  ;; 4-byte integer vector modes
> > >  (define_mode_iterator VI_32 [V4QI V2HI])
> > > @@ -191,6 +191,8 @@
> > >             (eq_attr "alternative" "11,12")
> > >               (cond [(match_test "<MODE>mode == V2SFmode")
> > >                        (const_string "V4SF")
> > > +                    (match_test "<MODE>mode == V4HFmode")
> > > +                      (const_string "V4SF")
> > >                      (ior (not (match_test "TARGET_SSE2"))
> > >                           (match_test "optimize_function_for_size_p (cfun)"))
> > >                        (const_string "V4SF")
> > > @@ -198,14 +200,16 @@
> > >                     (const_string "TI"))
> > >
> > >             (and (eq_attr "alternative" "13")
> > > -                (ior (and (match_test "<MODE>mode == V2SFmode")
> > > -                          (not (match_test "TARGET_MMX_WITH_SSE")))
> > > -                     (not (match_test "TARGET_SSE2"))))
> > > +                (ior (ior (and (match_test "<MODE>mode == V2SFmode")
> > > +                               (not (match_test "TARGET_MMX_WITH_SSE")))
> > > +                          (not (match_test "TARGET_SSE2")))
> > > +                     (match_test "<MODE>mode == V4HFmode")))
> > >               (const_string "V2SF")
> > >
> > >             (and (eq_attr "alternative" "14")
> > > -                (ior (match_test "<MODE>mode == V2SFmode")
> > > -                     (not (match_test "TARGET_SSE2"))))
> > > +                (ior (ior (match_test "<MODE>mode == V2SFmode")
> > > +                          (not (match_test "TARGET_SSE2")))
> > > +                     (match_test "<MODE>mode == V4HFmode")))
> > >               (const_string "V2SF")
> > >            ]
> > >            (const_string "DI")))
> > > @@ -289,12 +293,17 @@
> > >         (const_string "*")))
> > >     (set (attr "mode")
> > >       (cond [(eq_attr "alternative" "2,3")
> > > -             (cond [(match_test "TARGET_AVX")
> > > +             (cond [(match_test "<MODE>mode == V2HFmode")
> > > +                      (const_string "V4SF")
> > > +                    (match_test "TARGET_AVX")
> > >                        (const_string "TI")
> > >                      (match_test "optimize_function_for_size_p (cfun)")
> > >                        (const_string "V4SF")
> > >                     ]
> > >                     (const_string "TI"))
> > > +           (and (eq_attr "alternative" "4,5")
> > > +                (match_test "<MODE>mode == V2HFmode"))
> > > +             (const_string "SF")
> > >            ]
> > >            (const_string "SI")))
> > >     (set (attr "preferred_for_speed")
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index a446dedb2ec..b7832926287 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -671,6 +671,9 @@
> > >    [(V64QI "TARGET_AVX512BW") (V32QI  "TARGET_AVX512VL")
> > >         (V16QI  "TARGET_AVX512VL")])
> > >
> > > +(define_mode_iterator VHF_32_64
> > > +  [V4HF V2HF])
> > > +
> > >  (define_mode_attr avx512
> > >    [(V16QI "avx512vl") (V32QI "avx512vl") (V64QI "avx512bw")
> > >     (V8HI  "avx512vl") (V16HI  "avx512vl") (V32HI "avx512bw")
> > > @@ -1313,6 +1316,36 @@
> > >               ]
> > >               (symbol_ref "true")))])
> > >
> > > +(define_insn "*movv4hf_internal_sse"
> > > +  [(set (match_operand:V4HF 0 "nonimmediate_operand"
> > > +        "=v,v,v,m")
> > > +       (match_operand:V4HF 1 "nonimmediate_or_sse_const_operand"
> > > +        " C,v,m,v"))]
> > > +  "!TARGET_MMX && TARGET_SSE2
> > > +   && (register_operand (operands[0], V4HFmode)
> > > +       || register_operand (operands[1], V4HFmode))"
> > > +{
> > > +  switch (get_attr_type (insn))
> > > +    {
> > > +    case TYPE_SSELOG1:
> > > +      return standard_sse_constant_opcode (insn, operands);
> > > +
> > > +    case TYPE_SSEMOV:
> > > +      return ix86_output_ssemov (insn, operands);
> > > +
> > > +    default:
> > > +      gcc_unreachable ();
> > > +    }
> > > +}
> > > +  [(set_attr "type" "sselog1,ssemov,ssemov,ssemov")
> > > +   (set_attr "prefix" "maybe_vex")
> > > +   (set (attr "mode")
> > > +       (cond [(eq_attr "alternative" "1")
> > > +                (const_string "V4SF")]
> > > +             (const_string "V2SF")))]
> > > +)
> > > +
> > > +
> > >  ;; If mem_addr points to a memory region with less than whole vector size bytes
> > >  ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> > >  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> > > @@ -2165,6 +2198,22 @@
> > >     (set_attr "prefix" "<bcst_mask_prefix3>")
> > >     (set_attr "mode" "<MODE>")])
> > >
> > > +(define_insn "<insn><mode>3"
> > > +  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
> > > +       (plusminusmultdiv:VHF_32_64
> > > +         (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
> > > +         (match_operand:VHF_32_64 2 "register_operand" "v")))]
> > > +  "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > > +  "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
> > > +  [(set (attr "type")
> > > +      (cond [(match_test "<CODE> == MULT")
> > > +               (const_string "ssemul")
> > > +            (match_test "<CODE> == DIV")
> > > +               (const_string "ssediv")]
> > > +            (const_string "sseadd")))
> > > +   (set_attr "prefix" "evex")
> > > +   (set_attr "mode" "V8HF")])
> > > +
> > >  ;; Standard scalar operation patterns which preserve the rest of the
> > >  ;; vector for combiner.
> > >  (define_insn "*<sse>_vm<insn><mode>3"
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > > new file mode 100644
> > > index 00000000000..754e909d77b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > > +
> > > +/* { dg-final { scan-assembler-times "vaddph" 2 } } */
> > > +/* { dg-final { scan-assembler-times "vsubph" 2 } } */
> > > +/* { dg-final { scan-assembler-times "vmulph" 2 } } */
> > > +/* { dg-final { scan-assembler-times "vdivph" 2 } } */
> > > +
> > > +#define DO_PRAGMA(X) _Pragma(#X)
> > > +
> > > +#define VEC_OP_VV(size, op, name)       \
> > > +void \
> > > +__attribute__ ((noinline, noclone, optimize("tree-slp-vectorize"))) \
> > > +vecop_v##size##hf##name (_Float16 * restrict dst,  \
> > > + _Float16 * restrict src1, _Float16 * restrict src2)   \
> > > +{ \
> > > +    int i;  \
> > > +    DO_PRAGMA (GCC unroll size)   \
> > > +    for (i = 0; i < size; i++)  \
> > > +      dst[i] = src1[i] op src2[i];  \
> > > +}
> > > +
> > > +VEC_OP_VV(4, +, add)
> > > +VEC_OP_VV(2, +, add)
> > > +VEC_OP_VV(4, -, sub)
> > > +VEC_OP_VV(2, -, sub)
> > > +VEC_OP_VV(4, *, mul)
> > > +VEC_OP_VV(2, *, mul)
> > > +VEC_OP_VV(4, /, div)
> > > +VEC_OP_VV(2, /, div)
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > > new file mode 100644
> > > index 00000000000..4dc6f9fb92e
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > > @@ -0,0 +1,75 @@
> > > +/* { dg-do run { target avx512fp16 } } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > > +
> > > +static void vec_op_test (void);
> > > +#define DO_TEST vec_op_test
> > > +#define AVX512FP16
> > > +#define AVX512VL
> > > +#include "avx512f-check.h"
> > > +#include "avx512fp16-64-32-vecop-1.c"
> > > +
> > > +_Float16 a[4], b[4], fexp[4], fref[4];
> > > +
> > > +#define EMULATE_VEC_OP_VV(size, op, name) \
> > > +void \
> > > +__attribute__ ((noinline, noclone)) \
> > > +scalar_vecop_v##size##hf##name ( \
> > > +  _Float16 * restrict dst, _Float16 * restrict src1,  \
> > > +  _Float16 * restrict src2)  \
> > > +{ \
> > > +  int i;  \
> > > +  for (i = 0; i < size; i++)  \
> > > +    dst[i] = src1[i] op src2[i];  \
> > > +}
> > > +
> > > +EMULATE_VEC_OP_VV (4, +, add)
> > > +EMULATE_VEC_OP_VV (2, +, add)
> > > +EMULATE_VEC_OP_VV (4, -, sub)
> > > +EMULATE_VEC_OP_VV (2, -, sub)
> > > +EMULATE_VEC_OP_VV (4, *, mul)
> > > +EMULATE_VEC_OP_VV (2, *, mul)
> > > +EMULATE_VEC_OP_VV (4, /, div)
> > > +EMULATE_VEC_OP_VV (2, /, div)
> > > +
> > > +void init()
> > > +{
> > > +  int i;
> > > +  for (i = 0; i < 4; i++)
> > > +    {
> > > +      a[i] = i + 0.5;
> > > +      b[i] = i * 1.5;
> > > +      fexp[i] = fref[i] = 2.75 * i;
> > > +    }
> > > +}
> > > +
> > > +int check_cond(void *a, void *b, int size)
> > > +{
> > > +  int i;
> > > +  unsigned short *pa = (unsigned short *)a,
> > > +                *pb = (unsigned short *)b;
> > > +  for (i = 0; i < size; i++)
> > > +    if (pa[i] != pb[i])
> > > +      return 0;
> > > +  return 1;
> > > +}
> > > +
> > > +#define TEST_VEC_OP_VV(size, name)     \
> > > +{ \
> > > +  init ();  \
> > > +  scalar_vecop_v##size##hf##name (a, b, fexp);  \
> > > +  vecop_v##size##hf##name (a, b, fref);  \
> > > +  if (!check_cond ((void *)fexp, (void *)fref, size)) \
> > > +    abort();  \
> > > +}
> > > +
> > > +static void vec_op_test()
> > > +{
> > > +  TEST_VEC_OP_VV (4, add)
> > > +  TEST_VEC_OP_VV (2, add)
> > > +  TEST_VEC_OP_VV (4, sub)
> > > +  TEST_VEC_OP_VV (2, sub)
> > > +  TEST_VEC_OP_VV (4, mul)
> > > +  TEST_VEC_OP_VV (2, mul)
> > > +  TEST_VEC_OP_VV (4, div)
> > > +  TEST_VEC_OP_VV (2, div)
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > > index 112ac3e74d5..8471a1d1d10 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > > @@ -43,16 +43,16 @@ FLOATHFVV(2, udi)
> > >
> > >  /* { dg-final { scan-assembler-times "vcvtqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtuqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtdq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtudq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtdq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtudq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtuw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > > index 286ea9f2624..2ef901a0375 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > > @@ -41,15 +41,15 @@ EXTENDHFVV(8, sf)
> > >  EXTENDHFVV(4, sf)
> > >
> > >  /* { dg-final { scan-assembler-times "vcvtpd2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtps2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtps2phxy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > > index ee55cd12300..7a51c9dd077 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > > @@ -43,16 +43,16 @@ FIX_TRUNCHFVV(2, udi)
> > >
> > >  /* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > +/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2uw\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr102230.c b/gcc/testsuite/gcc.target/i386/pr102230.c
> > > new file mode 100644
> > > index 00000000000..60cf1c32afe
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr102230.c
> > > @@ -0,0 +1,38 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx512fp16" } */
> > > +
> > > +typedef _Float16 v4hf __attribute__ ((vector_size (8)));
> > > +typedef _Float16 v2hf __attribute__ ((vector_size (4)));
> > > +
> > > +v4hf
> > > +v4hf_abi_1 (v4hf a)
> > > +{
> > > +  return a;
> > > +}
> > > +
> > > +v4hf
> > > +v4hf_abi_3 (v4hf a, v4hf b, v4hf c)
> > > +{
> > > +  return c;
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "movq\[[\\t \]*%mm2, %mm0" 1 { target { ia32 } } } } */
> > > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm2, %xmm0" 1 { target { ! ia32 } } } } */
> > > +
> > > +v4hf
> > > +v4hf_abi_4 (v4hf a, v4hf b, v4hf c, v4hf d)
> > > +{
> > > +  return d;
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "movq\[[\\t \]*4\[(\]%esp\[)\], %mm0" 1 { target { ia32 } } } } */
> > > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm3, %xmm0" 1 { target { ! ia32 } } } } */
> > > +
> > > +v2hf
> > > +v2hf_test (v2hf a, v2hf b, v2hf c, v2hf d)
> > > +{
> > > +  return b;
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "movl\[[\\t \]*8\[(\]%esp\[)\], %eax" 1 { target { ia32 } } } } */
> > > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm1, %xmm0" 1 { target { ! ia32 } } } } */
> > > --
> > > 2.18.1
> > >

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.
  2021-09-28  6:27     ` Uros Bizjak
@ 2021-09-28  7:20       ` Hongyu Wang
  0 siblings, 0 replies; 5+ messages in thread
From: Hongyu Wang @ 2021-09-28  7:20 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongyu Wang, Hongtao Liu, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 32711 bytes --]

> I'd put this new pattern in mmx.md to keep 64bit/32bit modes in
> mmx.md, similar to e.g. FMA patterns among others.

Yes, I put it after the single-float patterns. Attached is the patch I'm
going to check in.
Thanks for your review.

Uros Bizjak <ubizjak@gmail.com> 于2021年9月28日周二 下午2:27写道:
>
> On Tue, Sep 28, 2021 at 6:48 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> >
> > > ia32 ABI declares that __m64 values pass via MMX registers. Due to
> > > this, we are not able to fully disable MMX register usage, as is the
> > > case with x86_64. So, V4HFmode values will pass to functions via MMX
> > > registers on ia32 targets.
> > >
> > > So, there should be no additional define_insn, the addition to the
> > > existing MMXMODE mode iterator should be enough. V4HFmodes should be
> > > handled in the same way as e.g. V8QImode.
> > >
> > > This is not the case with 4-byte values, which should be passed using
> > > integer ABI.
> >
> > Thanks for the explanation. I updated the patch by removing the extra define_insn
> > and dropping V4HFmode from VALID_AVX512FP16_REG_MODE. Now v4hf behaves the
> > same as v8qi.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > OK for master with the updated one?
>
> I'd put this new pattern in mmx.md to keep 64bit/32bit modes in
> mmx.md, similar to e.g. FMA patterns among others.
>
> OK with the eventual above change.
>
> Thanks,
> Uros.
>
> >
> > Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年9月27日周一 下午7:35写道:
> > >
> > > On Mon, Sep 27, 2021 at 12:42 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
> > > >
> > > > Hi Uros,
> > > >
> > > > This patch intends to support V4HF/V2HF vector type and basic operations.
> > > >
> > > > For 32bit target, V4HF vector is parsed same as __m64 type, V2HF
> > > > is parsed by stack and returned from GPR since it is not specified
> > > > by ABI.
> > > >
> > > > We found that for 64bit vectors on ia32, when mmx is disabled there seems to be no
> > > > mov<mode>_internal, so we add a define_insn for v4hf mode. It would be very
> > > > appreciated if you could explain why the handling of 64bit vectors looks as it does and
> > > > give some advice.
> > >
> > > ia32 ABI declares that __m64 values pass via MMX registers. Due to
> > > this, we are not able to fully disable MMX register usage, as is the
> > > case with x86_64. So, V4HFmode values will pass to functions via MMX
> > > registers on ia32 targets.
> > >
> > > So, there should be no additional define_insn, the addition to the
> > > existing MMXMODE mode iterator should be enough. V4HFmodes should be
> > > handled in the same way as e.g. V8QImode.
> > >
> > > This is not the case with 4-byte values, which should be passed using
> > > integer ABI.
> > >
> > > Uros.
> > >
> > > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> > > >
> > > > OK for master?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR target/102230
> > > >         * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
> > > >         V4HF and V2HF mode check.
> > > >         (VALID_SSE2_REG_VHF_MODE): Likewise.
> > > >         (VALID_MMX_REG_MODE): Likewise.
> > > >         (SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
> > > >         vector mode condition.
> > > >         * config/i386/i386.c (classify_argument): Parse V4HF/V2HF
> > > >         via sse regs.
> > > >         (function_arg_32): Add V4HFmode.
> > > >         (function_arg_advance_32): Likewise.
> > > >         * config/i386/i386.md (mode): Add V4HF/V2HF.
> > > >         (MODE_SIZE): Likewise.
> > > >         * config/i386/mmx.md (MMXMODE): Add V4HF mode.
> > > >         (V_32): Add V2HF mode.
> > > >         (*mov<mode>_internal): Adjust sse alternatives to support
> > > >         V4HF mode vector move.
> > > >         (*mov<mode>_internal): Adjust sse alternatives
> > > >         to support V2HF mode move.
> > > >         * config/i386/sse.md (VHF_32_64): New mode iterator.
> > > >         (<insn><mode>3): New define_insn for add/sub/mul/div.
> > > >         (*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         PR target/102230
> > > >         * gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
> > > >         * gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
> > > >         * gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
> > > >         * gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
> > > >         * gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
> > > >         * gcc.target/i386/pr102230.c: Ditto.
> > > > ---
> > > >  gcc/config/i386/i386.c                        |  4 +
> > > >  gcc/config/i386/i386.h                        | 12 ++-
> > > >  gcc/config/i386/i386.md                       |  5 +-
> > > >  gcc/config/i386/mmx.md                        | 27 ++++---
> > > >  gcc/config/i386/sse.md                        | 49 ++++++++++++
> > > >  .../i386/avx512fp16-64-32-vecop-1.c           | 30 ++++++++
> > > >  .../i386/avx512fp16-64-32-vecop-2.c           | 75 +++++++++++++++++++
> > > >  .../gcc.target/i386/avx512fp16-floatvnhf.c    | 12 +--
> > > >  .../i386/avx512fp16-trunc-extendvnhf.c        | 12 +--
> > > >  .../gcc.target/i386/avx512fp16-truncvnhf.c    | 12 +--
> > > >  gcc/testsuite/gcc.target/i386/pr102230.c      | 38 ++++++++++
> > > >  11 files changed, 243 insertions(+), 33 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c
> > > >
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > index ba89e111d28..b3e4add4b9e 100644
> > > > --- a/gcc/config/i386/i386.c
> > > > +++ b/gcc/config/i386/i386.c
> > > > @@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
> > > >      case E_V2SFmode:
> > > >      case E_V2SImode:
> > > >      case E_V4HImode:
> > > > +    case E_V4HFmode:
> > > > +    case E_V2HFmode:
> > > >      case E_V8QImode:
> > > >        classes[0] = X86_64_SSE_CLASS;
> > > >        return 1;
> > > > @@ -2902,6 +2904,7 @@ pass_in_reg:
> > > >
> > > >      case E_V8QImode:
> > > >      case E_V4HImode:
> > > > +    case E_V4HFmode:
> > > >      case E_V2SImode:
> > > >      case E_V2SFmode:
> > > >      case E_V1TImode:
> > > > @@ -3149,6 +3152,7 @@ pass_in_reg:
> > > >
> > > >      case E_V8QImode:
> > > >      case E_V4HImode:
> > > > +    case E_V4HFmode:
> > > >      case E_V2SImode:
> > > >      case E_V2SFmode:
> > > >      case E_V1TImode:
> > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > index 8a4251b4926..9f3cad31f96 100644
> > > > --- a/gcc/config/i386/i386.h
> > > > +++ b/gcc/config/i386/i386.h
> > > > @@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > > >     || (MODE) == TImode)
> > > >
> > > >  #define VALID_AVX512FP16_REG_MODE(MODE)                                        \
> > > > -  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
> > > > +  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode    \
> > > > +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
> > > >
> > > >  #define VALID_SSE2_REG_MODE(MODE)                                      \
> > > >    ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode     \
> > > > @@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > > >     || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
> > > >
> > > >  #define VALID_SSE2_REG_VHF_MODE(MODE)                  \
> > > > -  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
> > > > +  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode    \
> > > > +   || (MODE) == V4HFmode || (MODE) == V2HFmode)
> > > >
> > > >  #define VALID_SSE_REG_MODE(MODE)                                       \
> > > >    ((MODE) == V1TImode || (MODE) == TImode                              \
> > > > @@ -1054,7 +1056,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > > >  #define VALID_MMX_REG_MODE(MODE)                                       \
> > > >    ((MODE) == V1DImode || (MODE) == DImode                              \
> > > >     || (MODE) == V2SImode || (MODE) == SImode                           \
> > > > -   || (MODE) == V4HImode || (MODE) == V8QImode)
> > > > +   || (MODE) == V4HImode || (MODE) == V8QImode                         \
> > > > +   || (MODE) == V4HFmode)
> > > >
> > > >  #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
> > > >
> > > > @@ -1087,7 +1090,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
> > > >     || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode   \
> > > >     || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode  \
> > > >     || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \
> > > > -   || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
> > > > +   || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
> > > > +   || (MODE) == V8HFmode)
> > > >
> > > >  #define X87_FLOAT_MODE_P(MODE) \
> > > >    (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
> > > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > > index c6279e620c9..758d7d1e3c0 100644
> > > > --- a/gcc/config/i386/i386.md
> > > > +++ b/gcc/config/i386/i386.md
> > > > @@ -498,7 +498,7 @@
> > > >  ;; Main data type used by the insn
> > > >  (define_attr "mode"
> > > >    "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
> > > > -   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
> > > > +   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
> > > >    (const_string "unknown"))
> > > >
> > > >  ;; The CPU unit operations uses.
> > > > @@ -1106,7 +1106,8 @@
> > > >                              (V1TI "16") (V2TI "32") (V4TI "64")
> > > >                              (V2DF "16") (V4DF "32") (V8DF "64")
> > > >                              (V4SF "16") (V8SF "32") (V16SF "64")
> > > > -                            (V8HF "16") (V16HF "32") (V32HF "64")])
> > > > +                            (V8HF "16") (V16HF "32") (V32HF "64")
> > > > +                            (V4HF "8") (V2HF "4")])
> > > >
> > > >  ;; Double word integer modes as mode attribute.
> > > >  (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
> > > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > > > index b0093778fc6..68e1c4b2dbd 100644
> > > > --- a/gcc/config/i386/mmx.md
> > > > +++ b/gcc/config/i386/mmx.md
> > > > @@ -48,7 +48,7 @@
> > > >  (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
> > > >
> > > >  ;; All 8-byte vector modes handled by MMX
> > > > -(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
> > > > +(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
> > > >  (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
> > > >
> > > >  ;; Mix-n-match
> > > > @@ -57,8 +57,8 @@
> > > >  (define_mode_iterator MMXMODE24 [V4HI V2SI])
> > > >  (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
> > > >
> > > > -;; All 4-byte integer vector modes
> > > > -(define_mode_iterator V_32 [V4QI V2HI V1SI])
> > > > +;; All 4-byte integer/float16 vector modes
> > > > +(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
> > > >
> > > >  ;; 4-byte integer vector modes
> > > >  (define_mode_iterator VI_32 [V4QI V2HI])
> > > > @@ -191,6 +191,8 @@
> > > >             (eq_attr "alternative" "11,12")
> > > >               (cond [(match_test "<MODE>mode == V2SFmode")
> > > >                        (const_string "V4SF")
> > > > +                    (match_test "<MODE>mode == V4HFmode")
> > > > +                      (const_string "V4SF")
> > > >                      (ior (not (match_test "TARGET_SSE2"))
> > > >                           (match_test "optimize_function_for_size_p (cfun)"))
> > > >                        (const_string "V4SF")
> > > > @@ -198,14 +200,16 @@
> > > >                     (const_string "TI"))
> > > >
> > > >             (and (eq_attr "alternative" "13")
> > > > -                (ior (and (match_test "<MODE>mode == V2SFmode")
> > > > -                          (not (match_test "TARGET_MMX_WITH_SSE")))
> > > > -                     (not (match_test "TARGET_SSE2"))))
> > > > +                (ior (ior (and (match_test "<MODE>mode == V2SFmode")
> > > > +                               (not (match_test "TARGET_MMX_WITH_SSE")))
> > > > +                          (not (match_test "TARGET_SSE2")))
> > > > +                     (match_test "<MODE>mode == V4HFmode")))
> > > >               (const_string "V2SF")
> > > >
> > > >             (and (eq_attr "alternative" "14")
> > > > -                (ior (match_test "<MODE>mode == V2SFmode")
> > > > -                     (not (match_test "TARGET_SSE2"))))
> > > > +                (ior (ior (match_test "<MODE>mode == V2SFmode")
> > > > +                          (not (match_test "TARGET_SSE2")))
> > > > +                     (match_test "<MODE>mode == V4HFmode")))
> > > >               (const_string "V2SF")
> > > >            ]
> > > >            (const_string "DI")))
> > > > @@ -289,12 +293,17 @@
> > > >         (const_string "*")))
> > > >     (set (attr "mode")
> > > >       (cond [(eq_attr "alternative" "2,3")
> > > > -             (cond [(match_test "TARGET_AVX")
> > > > +             (cond [(match_test "<MODE>mode == V2HFmode")
> > > > +                      (const_string "V4SF")
> > > > +                    (match_test "TARGET_AVX")
> > > >                        (const_string "TI")
> > > >                      (match_test "optimize_function_for_size_p (cfun)")
> > > >                        (const_string "V4SF")
> > > >                     ]
> > > >                     (const_string "TI"))
> > > > +           (and (eq_attr "alternative" "4,5")
> > > > +                (match_test "<MODE>mode == V2HFmode"))
> > > > +             (const_string "SF")
> > > >            ]
> > > >            (const_string "SI")))
> > > >     (set (attr "preferred_for_speed")
> > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > > index a446dedb2ec..b7832926287 100644
> > > > --- a/gcc/config/i386/sse.md
> > > > +++ b/gcc/config/i386/sse.md
> > > > @@ -671,6 +671,9 @@
> > > >    [(V64QI "TARGET_AVX512BW") (V32QI  "TARGET_AVX512VL")
> > > >         (V16QI  "TARGET_AVX512VL")])
> > > >
> > > > +(define_mode_iterator VHF_32_64
> > > > +  [V4HF V2HF])
> > > > +
> > > >  (define_mode_attr avx512
> > > >    [(V16QI "avx512vl") (V32QI "avx512vl") (V64QI "avx512bw")
> > > >     (V8HI  "avx512vl") (V16HI  "avx512vl") (V32HI "avx512bw")
> > > > @@ -1313,6 +1316,36 @@
> > > >               ]
> > > >               (symbol_ref "true")))])
> > > >
> > > > +(define_insn "*movv4hf_internal_sse"
> > > > +  [(set (match_operand:V4HF 0 "nonimmediate_operand"
> > > > +        "=v,v,v,m")
> > > > +       (match_operand:V4HF 1 "nonimmediate_or_sse_const_operand"
> > > > +        " C,v,m,v"))]
> > > > +  "!TARGET_MMX && TARGET_SSE2
> > > > +   && (register_operand (operands[0], V4HFmode)
> > > > +       || register_operand (operands[1], V4HFmode))"
> > > > +{
> > > > +  switch (get_attr_type (insn))
> > > > +    {
> > > > +    case TYPE_SSELOG1:
> > > > +      return standard_sse_constant_opcode (insn, operands);
> > > > +
> > > > +    case TYPE_SSEMOV:
> > > > +      return ix86_output_ssemov (insn, operands);
> > > > +
> > > > +    default:
> > > > +      gcc_unreachable ();
> > > > +    }
> > > > +}
> > > > +  [(set_attr "type" "sselog1,ssemov,ssemov,ssemov")
> > > > +   (set_attr "prefix" "maybe_vex")
> > > > +   (set (attr "mode")
> > > > +       (cond [(eq_attr "alternative" "1")
> > > > +                (const_string "V4SF")]
> > > > +             (const_string "V2SF")))]
> > > > +)
> > > > +
> > > > +
> > > >  ;; If mem_addr points to a memory region with less than whole vector size bytes
> > > >  ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> > > >  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> > > > @@ -2165,6 +2198,22 @@
> > > >     (set_attr "prefix" "<bcst_mask_prefix3>")
> > > >     (set_attr "mode" "<MODE>")])
> > > >
> > > > +(define_insn "<insn><mode>3"
> > > > +  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
> > > > +       (plusminusmultdiv:VHF_32_64
> > > > +         (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
> > > > +         (match_operand:VHF_32_64 2 "register_operand" "v")))]
> > > > +  "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > > > +  "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
> > > > +  [(set (attr "type")
> > > > +      (cond [(match_test "<CODE> == MULT")
> > > > +               (const_string "ssemul")
> > > > +            (match_test "<CODE> == DIV")
> > > > +               (const_string "ssediv")]
> > > > +            (const_string "sseadd")))
> > > > +   (set_attr "prefix" "evex")
> > > > +   (set_attr "mode" "V8HF")])
> > > > +
> > > >  ;; Standard scalar operation patterns which preserve the rest of the
> > > >  ;; vector for combiner.
> > > >  (define_insn "*<sse>_vm<insn><mode>3"
> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > > > new file mode 100644
> > > > index 00000000000..754e909d77b
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> > > > @@ -0,0 +1,30 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > > > +
> > > > +/* { dg-final { scan-assembler-times "vaddph" 2 } } */
> > > > +/* { dg-final { scan-assembler-times "vsubph" 2 } } */
> > > > +/* { dg-final { scan-assembler-times "vmulph" 2 } } */
> > > > +/* { dg-final { scan-assembler-times "vdivph" 2 } } */
> > > > +
> > > > +#define DO_PRAGMA(X) _Pragma(#X)
> > > > +
> > > > +#define VEC_OP_VV(size, op, name)       \
> > > > +void \
> > > > +__attribute__ ((noinline, noclone, optimize("tree-slp-vectorize"))) \
> > > > +vecop_v##size##hf##name (_Float16 * restrict dst,  \
> > > > + _Float16 * restrict src1, _Float16 * restrict src2)   \
> > > > +{ \
> > > > +    int i;  \
> > > > +    DO_PRAGMA (GCC unroll size)   \
> > > > +    for (i = 0; i < size; i++)  \
> > > > +      dst[i] = src1[i] op src2[i];  \
> > > > +}
> > > > +
> > > > +VEC_OP_VV(4, +, add)
> > > > +VEC_OP_VV(2, +, add)
> > > > +VEC_OP_VV(4, -, sub)
> > > > +VEC_OP_VV(2, -, sub)
> > > > +VEC_OP_VV(4, *, mul)
> > > > +VEC_OP_VV(2, *, mul)
> > > > +VEC_OP_VV(4, /, div)
> > > > +VEC_OP_VV(2, /, div)
> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > > > new file mode 100644
> > > > index 00000000000..4dc6f9fb92e
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> > > > @@ -0,0 +1,75 @@
> > > > +/* { dg-do run { target avx512fp16 } } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > > > +
> > > > +static void vec_op_test (void);
> > > > +#define DO_TEST vec_op_test
> > > > +#define AVX512FP16
> > > > +#define AVX512VL
> > > > +#include "avx512f-check.h"
> > > > +#include "avx512fp16-64-32-vecop-1.c"
> > > > +
> > > > +_Float16 a[4], b[4], fexp[4], fref[4];
> > > > +
> > > > +#define EMULATE_VEC_OP_VV(size, op, name) \
> > > > +void \
> > > > +__attribute__ ((noinline, noclone)) \
> > > > +scalar_vecop_v##size##hf##name ( \
> > > > +  _Float16 * restrict dst, _Float16 * restrict src1,  \
> > > > +  _Float16 * restrict src2)  \
> > > > +{ \
> > > > +  int i;  \
> > > > +  for (i = 0; i < size; i++)  \
> > > > +    dst[i] = src1[i] op src2[i];  \
> > > > +}
> > > > +
> > > > +EMULATE_VEC_OP_VV (4, +, add)
> > > > +EMULATE_VEC_OP_VV (2, +, add)
> > > > +EMULATE_VEC_OP_VV (4, -, sub)
> > > > +EMULATE_VEC_OP_VV (2, -, sub)
> > > > +EMULATE_VEC_OP_VV (4, *, mul)
> > > > +EMULATE_VEC_OP_VV (2, *, mul)
> > > > +EMULATE_VEC_OP_VV (4, /, div)
> > > > +EMULATE_VEC_OP_VV (2, /, div)
> > > > +
> > > > +void init()
> > > > +{
> > > > +  int i;
> > > > +  for (i = 0; i < 4; i++)
> > > > +    {
> > > > +      a[i] = i + 0.5;
> > > > +      b[i] = i * 1.5;
> > > > +      fexp[i] = fref[i] = 2.75 * i;
> > > > +    }
> > > > +}
> > > > +
> > > > +int check_cond(void *a, void *b, int size)
> > > > +{
> > > > +  int i;
> > > > +  unsigned short *pa = (unsigned short *)a,
> > > > +                *pb = (unsigned short *)b;
> > > > +  for (i = 0; i < size; i++)
> > > > +    if (pa[i] != pb[i])
> > > > +      return 0;
> > > > +  return 1;
> > > > +}
> > > > +
> > > > +#define TEST_VEC_OP_VV(size, name)     \
> > > > +{ \
> > > > +  init ();  \
> > > > +  scalar_vecop_v##size##hf##name (a, b, fexp);  \
> > > > +  vecop_v##size##hf##name (a, b, fref);  \
> > > > +  if (!check_cond ((void *)fexp, (void *)fref, size)) \
> > > > +    abort();  \
> > > > +}
> > > > +
> > > > +static void vec_op_test()
> > > > +{
> > > > +  TEST_VEC_OP_VV (4, add)
> > > > +  TEST_VEC_OP_VV (2, add)
> > > > +  TEST_VEC_OP_VV (4, sub)
> > > > +  TEST_VEC_OP_VV (2, sub)
> > > > +  TEST_VEC_OP_VV (4, mul)
> > > > +  TEST_VEC_OP_VV (2, mul)
> > > > +  TEST_VEC_OP_VV (4, div)
> > > > +  TEST_VEC_OP_VV (2, div)
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > > > index 112ac3e74d5..8471a1d1d10 100644
> > > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
> > > > @@ -43,16 +43,16 @@ FLOATHFVV(2, udi)
> > > >
> > > >  /* { dg-final { scan-assembler-times "vcvtqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtuqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtdq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtudq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtdq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtudq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtuw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > > > index 286ea9f2624..2ef901a0375 100644
> > > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
> > > > @@ -41,15 +41,15 @@ EXTENDHFVV(8, sf)
> > > >  EXTENDHFVV(4, sf)
> > > >
> > > >  /* { dg-final { scan-assembler-times "vcvtpd2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtps2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtps2phxy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >
> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > > > index ee55cd12300..7a51c9dd077 100644
> > > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
> > > > @@ -43,16 +43,16 @@ FIX_TRUNCHFVV(2, udi)
> > > >
> > > >  /* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > -/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > -/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
> > > > +/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2uw\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > >  /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr102230.c b/gcc/testsuite/gcc.target/i386/pr102230.c
> > > > new file mode 100644
> > > > index 00000000000..60cf1c32afe
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr102230.c
> > > > @@ -0,0 +1,38 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx512fp16" } */
> > > > +
> > > > +typedef _Float16 v4hf __attribute__ ((vector_size (8)));
> > > > +typedef _Float16 v2hf __attribute__ ((vector_size (4)));
> > > > +
> > > > +v4hf
> > > > +v4hf_abi_1 (v4hf a)
> > > > +{
> > > > +  return a;
> > > > +}
> > > > +
> > > > +v4hf
> > > > +v4hf_abi_3 (v4hf a, v4hf b, v4hf c)
> > > > +{
> > > > +  return c;
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-assembler-times "movq\[[\\t \]*%mm2, %mm0" 1 { target { ia32 } } } } */
> > > > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm2, %xmm0" 1 { target { ! ia32 } } } } */
> > > > +
> > > > +v4hf
> > > > +v4hf_abi_4 (v4hf a, v4hf b, v4hf c, v4hf d)
> > > > +{
> > > > +  return d;
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-assembler-times "movq\[[\\t \]*4\[(\]%esp\[)\], %mm0" 1 { target { ia32 } } } } */
> > > > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm3, %xmm0" 1 { target { ! ia32 } } } } */
> > > > +
> > > > +v2hf
> > > > +v2hf_test (v2hf a, v2hf b, v2hf c, v2hf d)
> > > > +{
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-assembler-times "movl\[[\\t \]*8\[(\]%esp\[)\], %eax" 1 { target { ia32 } } } } */
> > > > +/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm1, %xmm0" 1 { target { ! ia32 } } } } */
> > > > --
> > > > 2.18.1
> > > >

[-- Attachment #2: FP16_64_32bit_v3.patch --]
[-- Type: text/x-patch, Size: 22973 bytes --]

From 276076d8d88f9f84361a500135ba61be30611a2a Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Thu, 15 Jul 2021 13:31:24 +0800
Subject: [PATCH] AVX512FP16: Support basic 64/32bit vector type and operation.

For 32-bit targets, a V4HF vector is passed the same way as the __m64
type; V2HF is passed on the stack and returned in a GPR, since the ABI
does not specify how it is handled.

gcc/ChangeLog:

	PR target/102230
	* config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
	V2HF mode check.
	(VALID_SSE2_REG_VHF_MODE): Add V4HFmode and V2HFmode.
	(VALID_MMX_REG_MODE): Add V4HFmode.
	(SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
	vector mode condition.
	* config/i386/i386.c (classify_argument): Parse V4HF/V2HF
	via sse regs.
	(function_arg_32): Add V4HFmode.
	(function_arg_advance_32): Likewise.
	* config/i386/i386.md (mode): Add V4HF/V2HF.
	(MODE_SIZE): Likewise.
	* config/i386/mmx.md (MMXMODE): Add V4HF mode.
	(V_32): Add V2HF mode.
	(VHF_32_64): New mode iterator.
	(*mov<mode>_internal): Adjust sse alternatives to support
	V4HF mode move.
	(*mov<mode>_internal): Adjust sse alternatives to support
	V2HF mode move.
	(<insn><mode>3): New define_insn for add/sub/mul/div.

gcc/testsuite/ChangeLog:

	PR target/102230
	* gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
	* gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
	* gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
	* gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
	* gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
	* gcc.target/i386/pr102230.c: Ditto.
---
 gcc/config/i386/i386.c                        |  4 +
 gcc/config/i386/i386.h                        | 13 +++-
 gcc/config/i386/i386.md                       |  5 +-
 gcc/config/i386/mmx.md                        | 52 ++++++++++---
 .../i386/avx512fp16-64-32-vecop-1.c           | 30 ++++++++
 .../i386/avx512fp16-64-32-vecop-2.c           | 75 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-floatvnhf.c    | 12 +--
 .../i386/avx512fp16-trunc-extendvnhf.c        | 12 +--
 .../gcc.target/i386/avx512fp16-truncvnhf.c    | 12 +--
 gcc/testsuite/gcc.target/i386/pr102230.c      | 38 ++++++++++
 10 files changed, 220 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ba89e111d28..b3e4add4b9e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
     case E_V2SFmode:
     case E_V2SImode:
     case E_V4HImode:
+    case E_V4HFmode:
+    case E_V2HFmode:
     case E_V8QImode:
       classes[0] = X86_64_SSE_CLASS;
       return 1;
@@ -2902,6 +2904,7 @@ pass_in_reg:
 
     case E_V8QImode:
     case E_V4HImode:
+    case E_V4HFmode:
     case E_V2SImode:
     case E_V2SFmode:
     case E_V1TImode:
@@ -3149,6 +3152,7 @@ pass_in_reg:
 
     case E_V8QImode:
     case E_V4HImode:
+    case E_V4HFmode:
     case E_V2SImode:
     case E_V2SFmode:
     case E_V1TImode:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8a4251b4926..cba6d835910 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == TImode)
 
 #define VALID_AVX512FP16_REG_MODE(MODE)					\
-  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
+  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode	\
+   || (MODE) == V2HFmode)
 
 #define VALID_SSE2_REG_MODE(MODE)					\
   ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode	\
@@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
 
 #define VALID_SSE2_REG_VHF_MODE(MODE)			\
-  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
+  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode	\
+   || (MODE) == V4HFmode || (MODE) == V2HFmode)
 
 #define VALID_SSE_REG_MODE(MODE)					\
   ((MODE) == V1TImode || (MODE) == TImode				\
@@ -1051,10 +1053,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 #define VALID_MMX_REG_MODE_3DNOW(MODE) \
   ((MODE) == V2SFmode || (MODE) == SFmode)
 
+/* To match ia32 psABI, V4HFmode should be added here.  */
 #define VALID_MMX_REG_MODE(MODE)					\
   ((MODE) == V1DImode || (MODE) == DImode				\
    || (MODE) == V2SImode || (MODE) == SImode				\
-   || (MODE) == V4HImode || (MODE) == V8QImode)
+   || (MODE) == V4HImode || (MODE) == V8QImode				\
+   || (MODE) == V4HFmode)
 
 #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
 
@@ -1087,7 +1091,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode	\
    || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode	\
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
-   || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
+   || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
+   || (MODE) == V8HFmode)
 
 #define X87_FLOAT_MODE_P(MODE)	\
   (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c6279e620c9..758d7d1e3c0 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -498,7 +498,7 @@
 ;; Main data type used by the insn
 (define_attr "mode"
   "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
-   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
+   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
   (const_string "unknown"))
 
 ;; The CPU unit operations uses.
@@ -1106,7 +1106,8 @@
 			     (V1TI "16") (V2TI "32") (V4TI "64")
 			     (V2DF "16") (V4DF "32") (V8DF "64")
 			     (V4SF "16") (V8SF "32") (V16SF "64")
-			     (V8HF "16") (V16HF "32") (V32HF "64")])
+			     (V8HF "16") (V16HF "32") (V32HF "64")
+			     (V4HF "8") (V2HF "4")])
 
 ;; Double word integer modes as mode attribute.
 (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index b0093778fc6..c9467bc095a 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -48,7 +48,7 @@
 (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
 
 ;; All 8-byte vector modes handled by MMX
-(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
+(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
 (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
 
 ;; Mix-n-match
@@ -57,8 +57,8 @@
 (define_mode_iterator MMXMODE24 [V4HI V2SI])
 (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
 
-;; All 4-byte integer vector modes
-(define_mode_iterator V_32 [V4QI V2HI V1SI])
+;; All 4-byte integer/float16 vector modes
+(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
 
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
@@ -66,6 +66,9 @@
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
+;; 4-byte and 8-byte float16 vector modes
+(define_mode_iterator VHF_32_64 [V4HF V2HF])
+
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
@@ -191,6 +194,8 @@
 	    (eq_attr "alternative" "11,12")
 	      (cond [(match_test "<MODE>mode == V2SFmode")
 		       (const_string "V4SF")
+		     (match_test "<MODE>mode == V4HFmode")
+		       (const_string "V4SF")
 		     (ior (not (match_test "TARGET_SSE2"))
 			  (match_test "optimize_function_for_size_p (cfun)"))
 		       (const_string "V4SF")
@@ -198,14 +203,16 @@
 		    (const_string "TI"))
 
 	    (and (eq_attr "alternative" "13")
-		 (ior (and (match_test "<MODE>mode == V2SFmode")
-			   (not (match_test "TARGET_MMX_WITH_SSE")))
-		      (not (match_test "TARGET_SSE2"))))
+		 (ior (ior (and (match_test "<MODE>mode == V2SFmode")
+				(not (match_test "TARGET_MMX_WITH_SSE")))
+			   (not (match_test "TARGET_SSE2")))
+		      (match_test "<MODE>mode == V4HFmode")))
 	      (const_string "V2SF")
 
 	    (and (eq_attr "alternative" "14")
-	    	 (ior (match_test "<MODE>mode == V2SFmode")
-		      (not (match_test "TARGET_SSE2"))))
+		 (ior (ior (match_test "<MODE>mode == V2SFmode")
+			   (not (match_test "TARGET_SSE2")))
+		      (match_test "<MODE>mode == V4HFmode")))
 	      (const_string "V2SF")
 	   ]
 	   (const_string "DI")))
@@ -289,12 +296,17 @@
        (const_string "*")))
    (set (attr "mode")
      (cond [(eq_attr "alternative" "2,3")
-	      (cond [(match_test "TARGET_AVX")
+	      (cond [(match_test "<MODE>mode == V2HFmode")
+		       (const_string "V4SF")
+		     (match_test "TARGET_AVX")
 		       (const_string "TI")
 		     (match_test "optimize_function_for_size_p (cfun)")
 		       (const_string "V4SF")
 		    ]
 		    (const_string "TI"))
+	    (and (eq_attr "alternative" "4,5")
+		 (match_test "<MODE>mode == V2HFmode"))
+	      (const_string "SF")
 	   ]
 	   (const_string "SI")))
    (set (attr "preferred_for_speed")
@@ -1391,6 +1403,28 @@
   DONE;
 })
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel half-precision floating point arithmetic
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "<insn><mode>3"
+  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
+	(plusminusmultdiv:VHF_32_64
+	  (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
+	  (match_operand:VHF_32_64 2 "register_operand" "v")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "type")
+      (cond [(match_test "<CODE> == MULT")
+		(const_string "ssemul")
+	     (match_test "<CODE> == DIV")
+		(const_string "ssediv")]
+	     (const_string "sseadd")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8HF")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral arithmetic
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
new file mode 100644
index 00000000000..754e909d77b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+/* { dg-final { scan-assembler-times "vaddph" 2 } } */
+/* { dg-final { scan-assembler-times "vsubph" 2 } } */
+/* { dg-final { scan-assembler-times "vmulph" 2 } } */
+/* { dg-final { scan-assembler-times "vdivph" 2 } } */
+
+#define DO_PRAGMA(X) _Pragma(#X)
+
+#define VEC_OP_VV(size, op, name)       \
+void \
+__attribute__ ((noinline, noclone, optimize("tree-slp-vectorize"))) \
+vecop_v##size##hf##name (_Float16 * restrict dst,  \
+ _Float16 * restrict src1, _Float16 * restrict src2)   \
+{ \
+    int i;  \
+    DO_PRAGMA (GCC unroll size)   \
+    for (i = 0; i < size; i++)  \
+      dst[i] = src1[i] op src2[i];  \
+}
+
+VEC_OP_VV(4, +, add)
+VEC_OP_VV(2, +, add)
+VEC_OP_VV(4, -, sub)
+VEC_OP_VV(2, -, sub)
+VEC_OP_VV(4, *, mul)
+VEC_OP_VV(2, *, mul)
+VEC_OP_VV(4, /, div)
+VEC_OP_VV(2, /, div)
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
new file mode 100644
index 00000000000..4dc6f9fb92e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
@@ -0,0 +1,75 @@
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+static void vec_op_test (void);
+#define DO_TEST vec_op_test
+#define AVX512FP16
+#define AVX512VL
+#include "avx512f-check.h"
+#include "avx512fp16-64-32-vecop-1.c"
+
+_Float16 a[4], b[4], fexp[4], fref[4];
+
+#define EMULATE_VEC_OP_VV(size, op, name) \
+void \
+__attribute__ ((noinline, noclone)) \
+scalar_vecop_v##size##hf##name ( \
+  _Float16 * restrict dst, _Float16 * restrict src1,  \
+  _Float16 * restrict src2)  \
+{ \
+  int i;  \
+  for (i = 0; i < size; i++)  \
+    dst[i] = src1[i] op src2[i];  \
+}
+
+EMULATE_VEC_OP_VV (4, +, add)
+EMULATE_VEC_OP_VV (2, +, add)
+EMULATE_VEC_OP_VV (4, -, sub)
+EMULATE_VEC_OP_VV (2, -, sub)
+EMULATE_VEC_OP_VV (4, *, mul)
+EMULATE_VEC_OP_VV (2, *, mul)
+EMULATE_VEC_OP_VV (4, /, div)
+EMULATE_VEC_OP_VV (2, /, div)
+
+void init()
+{
+  int i;
+  for (i = 0; i < 4; i++)
+    {
+      a[i] = i + 0.5; 
+      b[i] = i * 1.5;
+      fexp[i] = fref[i] = 2.75 * i;
+    }
+}
+
+int check_cond(void *a, void *b, int size)
+{
+  int i;
+  unsigned short *pa = (unsigned short *)a,
+		 *pb = (unsigned short *)b;
+  for (i = 0; i < size; i++)
+    if (pa[i] != pb[i])
+      return 0;
+  return 1;
+}
+
+#define TEST_VEC_OP_VV(size, name)	\
+{ \
+  init ();  \
+  scalar_vecop_v##size##hf##name (fexp, a, b);  /* dst comes first */ \
+  vecop_v##size##hf##name (fref, a, b);  /* dst comes first */ \
+  if (!check_cond ((void *)fexp, (void *)fref, size)) \
+    abort();  \
+}
+
+static void vec_op_test()
+{
+  TEST_VEC_OP_VV (4, add)
+  TEST_VEC_OP_VV (2, add)
+  TEST_VEC_OP_VV (4, sub)
+  TEST_VEC_OP_VV (2, sub)
+  TEST_VEC_OP_VV (4, mul)
+  TEST_VEC_OP_VV (2, mul)
+  TEST_VEC_OP_VV (4, div)
+  TEST_VEC_OP_VV (2, div)
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
index 112ac3e74d5..8471a1d1d10 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-floatvnhf.c
@@ -43,16 +43,16 @@ FLOATHFVV(2, udi)
 
 /* { dg-final { scan-assembler-times "vcvtqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtuqq2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtuqq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtuqq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtdq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtudq2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtdq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtudq2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtudq2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtuw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtw2ph\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
index 286ea9f2624..2ef901a0375 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-trunc-extendvnhf.c
@@ -41,15 +41,15 @@ EXTENDHFVV(8, sf)
 EXTENDHFVV(4, sf)
 
 /* { dg-final { scan-assembler-times "vcvtpd2phz\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtpd2phy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtpd2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtps2phx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtps2phxy\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtps2phxx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvtph2psx\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
index ee55cd12300..7a51c9dd077 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-truncvnhf.c
@@ -43,16 +43,16 @@ FIX_TRUNCHFVV(2, udi)
 
 /* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uqq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vcvttph2dq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvttph2udq\[ \\t\]+\[^\{\n\]*\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2uw\[ \\t\]+\[^\{\n\]*\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvttph2w\[ \\t\]+\[^\{\n\]*\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102230.c b/gcc/testsuite/gcc.target/i386/pr102230.c
new file mode 100644
index 00000000000..60cf1c32afe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102230.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16" } */
+
+typedef _Float16 v4hf __attribute__ ((vector_size (8)));
+typedef _Float16 v2hf __attribute__ ((vector_size (4)));
+
+v4hf
+v4hf_abi_1 (v4hf a)
+{
+  return a;
+}
+
+v4hf
+v4hf_abi_3 (v4hf a, v4hf b, v4hf c)
+{
+  return c;
+}
+
+/* { dg-final { scan-assembler-times "movq\[[\\t \]*%mm2, %mm0" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm2, %xmm0" 1 { target { ! ia32 } } } } */
+
+v4hf
+v4hf_abi_4 (v4hf a, v4hf b, v4hf c, v4hf d)
+{
+  return d;
+}
+
+/* { dg-final { scan-assembler-times "movq\[[\\t \]*4\[(\]%esp\[)\], %mm0" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm3, %xmm0" 1 { target { ! ia32 } } } } */
+
+v2hf
+v2hf_test (v2hf a, v2hf b, v2hf c, v2hf d)
+{
+  return b;
+}
+
+/* { dg-final { scan-assembler-times "movl\[[\\t \]*8\[(\]%esp\[)\], %eax" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovaps\[[\\t \]*%xmm1, %xmm0" 1 { target { ! ia32 } } } } */
-- 
2.27.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-09-28  7:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-27 10:42 [PATCH] AVX512FP16:support basic 64/32bit vector type and operation Hongyu Wang
2021-09-27 11:32 ` Uros Bizjak
2021-09-28  4:42   ` Hongyu Wang
2021-09-28  6:27     ` Uros Bizjak
2021-09-28  7:20       ` Hongyu Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).