* [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

From: Hu, Lin1 @ 2024-05-08  1:38 UTC
To: gcc-patches; +Cc: hongtao.liu, ubizjak

Hi all,

This patch aims to optimize __builtin_convertvector.  We want the function
to generate more efficient insns for some situations, like v2si -> v2di.

The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu.
OK for trunk?

BRs,
Lin

gcc/ChangeLog:

	PR target/107432
	* tree-vect-generic.cc (expand_vector_conversion): Support
	conversion for int -> int, float -> float and int <-> float.
	(expand_vector_conversion_no_vec_pack): Check whether we can
	convert int <-> int, float <-> float and int <-> float
	directly.  Support indirect conversion when the direct optab
	is not supported.

gcc/testsuite/ChangeLog:

	PR target/107432
	* gcc.target/i386/pr107432-1.c: New test.
	* gcc.target/i386/pr107432-2.c: Ditto.
	* gcc.target/i386/pr107432-3.c: Ditto.
	* gcc.target/i386/pr107432-4.c: Ditto.
	* gcc.target/i386/pr107432-5.c: Ditto.
	* gcc.target/i386/pr107432-6.c: Ditto.
	* gcc.target/i386/pr107432-7.c: Ditto.
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++ gcc/tree-vect-generic.cc | 107 +++++++++- 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, 
__v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); +} + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); +} + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2su); +} + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); +} + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); +} + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); +} + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return 
__builtin_convertvector((__v4du)a, __v4hu); +} + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); +} + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); +} + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); +} + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); +} + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); +} + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); +} + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); +} + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); +} + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); +} + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); +} + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); +} + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); +} + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); +} + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); +} + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); +} + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return 
(__m256i)__builtin_convertvector((__v32hu)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i 
mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) +{ + return (__m128i)__builtin_convertvector(a, __v8hi); +} + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); +} + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2sf); +} + +__v4sf 
mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2hf); +} + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2si); +} + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); +} + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); +} + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { 
scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); +} + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); +} + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return 
__builtin_convertvector((__v8df)a, __v8qi); +} + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); +} + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); +} + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); +} + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); +} + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); +} + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); +} + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); +} + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); +} + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); +} + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); +} + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); +} + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); +} + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); +} + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); +} + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); +} + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); +} + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf 
a) +{ + return __builtin_convertvector((__v8hf)a, __v8qu); +} + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); +} + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..0ff5a97ed1a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,156 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); +} + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); +} + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); +} + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); +} + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); +} + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); +} + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); +} + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); +} + 
+__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); +} + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); +} + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2sf); +} + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); +} + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); +} + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); +} + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); +} + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); +} + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); +} + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); +} + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); +} + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); +} + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); +} + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); +} + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); +} + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); +} diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index 
ab640096ca2..e14fac9f179 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-match.h"
 #include "recog.h"		/* FIXME: for insn_data */
 #include "optabs-libfuncs.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
 
 /* Build a ternary operation and gimplify it.  Emit code before GSI.
@@ -1834,6 +1836,102 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
   return gimplify_build2 (gsi, code, outer_type, b, c);
 }
 
+/* A subroutine of expand_vector_conversion, support indirect conversion for
+   float <-> int, like char -> double.  */
+bool
+expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
+				      enum tree_code code,
+				      tree lhs,
+				      tree arg)
+{
+  gimple *g;
+  tree ret_type = TREE_TYPE (lhs);
+  tree arg_type = TREE_TYPE (arg);
+  tree new_rhs;
+  enum {NARROW, NONE, WIDEN} modifier = NONE;
+  enum tree_code code1 = ERROR_MARK;
+  enum tree_code codecvt1 = ERROR_MARK;
+  bool float_expr_p = code == FLOAT_EXPR;
+
+  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
+    {
+      g = gimple_build_assign (lhs, code1, arg);
+      gsi_replace (gsi, g, false);
+      return true;
+    }
+
+  unsigned int ret_elt_bits = vector_element_bits (ret_type);
+  unsigned int arg_elt_bits = vector_element_bits (arg_type);
+  if (ret_elt_bits < arg_elt_bits)
+    modifier = NARROW;
+  else if (ret_elt_bits > arg_elt_bits)
+    modifier = WIDEN;
+
+  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW)
+       || (code == FLOAT_EXPR && modifier == WIDEN)))
+    {
+      unsigned short target_size;
+      scalar_mode tmp_cvt_mode;
+      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
+      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
+      tree cvt_type = NULL_TREE;
+      if (modifier == NARROW)
+	{
+	  tmp_cvt_mode = lhs_mode;
+	  target_size = GET_MODE_SIZE (rhs_mode);
+	}
+      else
+	{
+	  target_size = GET_MODE_SIZE (lhs_mode);
+	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
+	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
+	    return false;
+	}
+
+      code1 = float_expr_p ? code : NOP_EXPR;
+      codecvt1 = float_expr_p ? NOP_EXPR : code;
+      opt_scalar_mode mode_iter;
+      enum tree_code tc1, tc2;
+      unsigned HOST_WIDE_INT nelts
+	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
+
+      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
+	{
+	  tmp_cvt_mode = mode_iter.require ();
+
+	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
+	    break;
+
+	  scalar_mode cvt_mode;
+	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
+	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
+	    break;
+
+	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
+	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
+	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
+
+	  cvt_type = build_vector_type (cvt_type, nelts);
+	  if (cvt_type == NULL_TREE
+	      || !supportable_convert_operation ((tree_code) code1,
+						 ret_type,
+						 cvt_type, &tc1)
+	      || !supportable_convert_operation ((tree_code) codecvt1,
+						 cvt_type,
+						 arg_type, &tc2))
+	    continue;
+
+	  new_rhs = make_ssa_name (cvt_type);
+	  g = vect_gimple_build (new_rhs, tc2, arg);
+	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
+	  g = gimple_build_assign (lhs, tc1, new_rhs);
+	  gsi_replace (gsi, g, false);
+	  return true;
+	}
+    }
+  return false;
+}
+
 /* Expand VEC_CONVERT ifn call.  */
 
 static void
@@ -1871,14 +1969,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
   else if (ret_elt_bits > arg_elt_bits)
     modifier = WIDEN;
 
+  if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg))
+    return;
+
   if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
     {
-      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
-	{
-	  g = gimple_build_assign (lhs, code1, arg);
-	  gsi_replace (gsi, g, false);
-	  return;
-	}
       /* Can't use get_compute_type here, as supportable_convert_operation
 	 doesn't necessarily use an optab and needs two arguments.  */
       tree vec_compute_type
-- 
2.31.1
* RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

From: Hu, Lin1 @ 2024-05-14  2:25 UTC
To: Hu, Lin1, gcc-patches; +Cc: Liu, Hongtao, ubizjak, Richard Biener

Do you have any advice?

BRs,
Lin

-----Original Message-----
From: Hu, Lin1 <lin1.hu@intel.com>
Sent: Wednesday, May 8, 2024 9:38 AM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com
Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

[snip]
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++ gcc/tree-vect-generic.cc | 107 +++++++++- 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2di)a, __v2si); } + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); } + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); } + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); } + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); } + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); } + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); } + 
+__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); } + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); } + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); } + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); } + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); } + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); } + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); } + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); } + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); } + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); } + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); } + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2du)a, __v2su); } + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); } + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); } + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); } + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4hu); } 
+ +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); } + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); } + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); } + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); } + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); } + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); } + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); } + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); } + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); } + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); } + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); } + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); } + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); } + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); } + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); } + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); } diff 
--git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); } + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); } + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, 
__v8di); } + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) { + return (__m128i)__builtin_convertvector(a, __v4si); } + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); } + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); } + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) { + return (__m128i)__builtin_convertvector(a, __v4si); } + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); } + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); } + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) { + return (__m128i)__builtin_convertvector(a, __v8hi); } + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); } + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2sf); } + +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); } + +__v8sf 
mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); } + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2hf); } + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); } + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); } + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) { + return __builtin_convertvector(a, __v4hf); } + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); } + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! 
ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) { + return __builtin_convertvector(a, __v2df); } + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); } + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); } + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) { + return __builtin_convertvector(a, __v2df); } + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); } + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); } + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) { + return __builtin_convertvector(a, __v4sf); } + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); } + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" +} */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2si); } + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); } + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); } + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) { + return __builtin_convertvector(a, __v2di); } + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); } + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); } + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) { + return __builtin_convertvector(a, __v4si); } + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); } + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); } + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) { + return __builtin_convertvector(a, __v2di); } + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); } + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq +-fno-trapping-math" } */ +/* { dg-final { scan-assembler-times 
"vcvttpd2dq" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 +} } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 +} } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); typedef char __v16qi +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf +__attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); } + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); } + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return 
__builtin_convertvector((__v8df)a, __v8qi); } + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); } + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); } + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); } + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); } + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); } + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); } + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); } + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); } + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); } + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); } + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); } + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); } + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); } + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); } + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); } + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); } + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) +{ + return 
__builtin_convertvector((__v8hf)a, __v8qu); } + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); } + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..0ff5a97ed1a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,156 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq +-fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } +} } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); typedef char __v16qi +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf +__attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); } + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); } + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); } + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); } + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); } + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); } + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); } + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); } + 
+__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); } + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); } + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2sf); } + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); } + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); } + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); } + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); } + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); } + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); } + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); } + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); } + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); } + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); } + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); } + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); } + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); } diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ab640096ca2..e14fac9f179 100644 --- 
a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-match.h"
 #include "recog.h"		/* FIXME: for insn_data */
 #include "optabs-libfuncs.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"

 /* Build a ternary operation and gimplify it.  Emit code before GSI.
@@ -1834,6 +1836,102 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
   return gimplify_build2 (gsi, code, outer_type, b, c);
 }

+/* A subroutine of expand_vector_conversion, support indirect conversion for
+   float <-> int, like char -> double.  */
+bool
+expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
+				      enum tree_code code,
+				      tree lhs,
+				      tree arg)
+{
+  gimple *g;
+  tree ret_type = TREE_TYPE (lhs);
+  tree arg_type = TREE_TYPE (arg);
+  tree new_rhs;
+  enum {NARROW, NONE, WIDEN} modifier = NONE;
+  enum tree_code code1 = ERROR_MARK;
+  enum tree_code codecvt1 = ERROR_MARK;
+  bool float_expr_p = code == FLOAT_EXPR;
+
+  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
+    {
+      g = gimple_build_assign (lhs, code1, arg);
+      gsi_replace (gsi, g, false);
+      return true;
+    }
+
+  unsigned int ret_elt_bits = vector_element_bits (ret_type);
+  unsigned int arg_elt_bits = vector_element_bits (arg_type);
+  if (ret_elt_bits < arg_elt_bits)
+    modifier = NARROW;
+  else if (ret_elt_bits > arg_elt_bits)
+    modifier = WIDEN;
+
+  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW)
+       || (code == FLOAT_EXPR && modifier == WIDEN)))
+    {
+      unsigned short target_size;
+      scalar_mode tmp_cvt_mode;
+      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
+      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
+      tree cvt_type = NULL_TREE;
+      if (modifier == NARROW)
+	{
+	  tmp_cvt_mode = lhs_mode;
+	  target_size = GET_MODE_SIZE (rhs_mode);
+	}
+      else
+	{
+	  target_size = GET_MODE_SIZE (lhs_mode);
+	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
+	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
+	    return false;
+	}
+
+      code1 = float_expr_p ? code : NOP_EXPR;
+      codecvt1 = float_expr_p ? NOP_EXPR : code;
+      opt_scalar_mode mode_iter;
+      enum tree_code tc1, tc2;
+      unsigned HOST_WIDE_INT nelts
+	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
+
+      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
+	{
+	  tmp_cvt_mode = mode_iter.require ();
+
+	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
+	    break;
+
+	  scalar_mode cvt_mode;
+	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
+	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
+	    break;
+
+	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
+	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
+	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
+
+	  cvt_type = build_vector_type (cvt_type, nelts);
+	  if (cvt_type == NULL_TREE
+	      || !supportable_convert_operation ((tree_code) code1,
+						 ret_type,
+						 cvt_type, &tc1)
+	      || !supportable_convert_operation ((tree_code) codecvt1,
+						 cvt_type,
+						 arg_type, &tc2))
+	    continue;
+
+	  new_rhs = make_ssa_name (cvt_type);
+	  g = vect_gimple_build (new_rhs, tc2, arg);
+	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
+	  g = gimple_build_assign (lhs, tc1, new_rhs);
+	  gsi_replace (gsi, g, false);
+	  return true;
+	}
+    }
+  return false;
+}
+
 /* Expand VEC_CONVERT ifn call.  */

 static void
@@ -1871,14 +1969,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
   else if (ret_elt_bits > arg_elt_bits)
     modifier = WIDEN;

+  if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg))
+    return;
+
   if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
     {
-      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
-	{
-	  g = gimple_build_assign (lhs, code1, arg);
-	  gsi_replace (gsi, g, false);
-	  return;
-	}
       /* Can't use get_compute_type here, as supportable_convert_operation
	  doesn't necessarily use an optab and needs two arguments.  */
       tree vec_compute_type
--
2.31.1

^ permalink raw reply	[flat|nested] 33+ messages in thread
* RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
  2024-05-14  2:25 ` Hu, Lin1
@ 2024-05-14 12:23   ` Richard Biener
  2024-05-15  2:30     ` Hu, Lin1
  0 siblings, 1 reply; 33+ messages in thread
From: Richard Biener @ 2024-05-14 12:23 UTC (permalink / raw)
To: Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao, ubizjak

On Tue, 14 May 2024, Hu, Lin1 wrote:

> Do you have any advice?
>
> BRs,
> Lin
>
> -----Original Message-----
> From: Hu, Lin1 <lin1.hu@intel.com>
> Sent: Wednesday, May 8, 2024 9:38 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com
> Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
>
> Hi, all
>
> This patch aims to optimize __builtin_convertvector.  We want this function to generate more efficient instructions for some situations, such as v2si -> v2di.
>
> The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

I don't like the new code to be in a separate function, not integrated
with the existing handling.  Note the existing handling should get, say,
V8DF -> V8SI correct for SSE by splitting the operation into smaller
vectors, but your code seems to handle only the cases where the vectors
are already properly sized.  Without checking, it seems you are basing
the code on what the vectorizer does?

Maybe we should have some common code that computes intermediate
conversion steps supported by the HW, unifying what for example
supportable_widening_operation or supportable_narrowing_operation can do,
to also cover int <-> float conversions.

That said, if you don't want to do that, please still think about the
core part of tree-vect-generic.cc which is breaking down large emulated
vectors into small supported vectors.

Richard.

> BRs,
> Lin
>
> gcc/ChangeLog:
>
> 	PR target/107432
> 	* tree-vect-generic.cc (expand_vector_conversion): Support
> 	convert for int -> int, float -> float and int <-> float.
> 	(expand_vector_conversion_no_vec_pack): Check if we can convert
> 	int <-> int, float <-> float and int <-> float directly.
> 	Support indirect conversion when the direct optab is not
> 	supported.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/107432
> 	* gcc.target/i386/pr107432-1.c: New test.
> 	* gcc.target/i386/pr107432-2.c: Ditto.
> 	* gcc.target/i386/pr107432-3.c: Ditto.
> 	* gcc.target/i386/pr107432-4.c: Ditto.
> 	* gcc.target/i386/pr107432-5.c: Ditto.
> 	* gcc.target/i386/pr107432-6.c: Ditto.
> 	* gcc.target/i386/pr107432-7.c: Ditto.
> ---
>  gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++
>  gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
>  gcc/tree-vect-generic.cc                   | 107 +++++++++-
>  8 files changed, 918 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> new file mode 100644
> index 00000000000..a4f37447eb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> @@ -0,0 +1,234 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
> +/* { dg-final { scan-assembler-times
"vpmovqb" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } > +} */ > +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } > +} */ > +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi > +__attribute__ ((__vector_size__ (4))); typedef char __v8qi > +__attribute__ ((__vector_size__ (8))); > + > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); > +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); > +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); > +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); > +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); > +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { > + return __builtin_convertvector((__v2di)a, __v2si); } > + > +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } > + > +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } > + > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2hi); } > + > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4hi); } > + > +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } > + > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) > +{ > + 
return __builtin_convertvector((__v2di)a, __v2qi); } > + > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4qi); } > + > +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8di)a, __v8qi); } > + > +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2hi); } > + > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4hi); } > + > +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); } > + > +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); } > + > +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2qi); } > + > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4qi); } > + > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8si)a, __v8qi); } > + > +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); } > + > +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > +{ > + return __builtin_convertvector((__v2hi)a, __v2qi); } > + > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hi)a, __v8qi); } > + > +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); } > + > +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); } > + > +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) { > + return 
__builtin_convertvector((__v2du)a, __v2su); } > + > +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); } > + > +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); } > + > +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2hu); } > + > +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4hu); } > + > +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); } > + > +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2qu); } > + > +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4qu); } > + > +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8du)a, __v8qu); } > + > +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2hu); } > + > +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4hu); } > + > +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); } > + > +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); } > + > +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2qu); } > + > +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4qu); } > + > +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) > +{ > + return 
__builtin_convertvector((__v8su)a, __v8qu); } > + > +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); } > + > +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > +{ > + return __builtin_convertvector((__v2hu)a, __v2qu); } > + > +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hu)a, __v8qu); } > + > +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); } > + > +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); } > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c > new file mode 100644 > index 00000000000..02ffd811cb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c > @@ -0,0 +1,105 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi > +__attribute__ ((__vector_size__ (4))); typedef char __v8qi > +__attribute__ ((__vector_size__ (8))); > + > +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) { > + return __builtin_convertvector(a, __v2di); } > + > +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); } > + > +__m512i 
mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); } > + > +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) { > + return __builtin_convertvector(a, __v2di); } > + > +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); } > + > +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); } > + > +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) { > + return __builtin_convertvector(a, __v2di); } > + > +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); } > + > +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); } > + > +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) { > + return (__m128i)__builtin_convertvector(a, __v4si); } > + > +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); } > + > +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); } > + > +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) { > + return (__m128i)__builtin_convertvector(a, __v4si); } > + > +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); } > + > +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); } > + > +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) { > + return (__m128i)__builtin_convertvector(a, __v8hi); } > + > +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v16hi); } > + > +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) > +{ > + return 
__builtin_convertvector(a, __v32hi); } > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c > new file mode 100644 > index 00000000000..30dc947b6dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c > @@ -0,0 +1,55 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef > +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) { > + return __builtin_convertvector(a, __v2sf); } > + > +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4sf); } > + > +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8sf); } > + > +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) { > + return __builtin_convertvector(a, __v2hf); } > + > +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4hf); } > + > +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8hf); } > + > +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) { > + return __builtin_convertvector(a, __v4hf); } > + > +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8hf); } > + > +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector(a, __v16hf); } > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c > new file mode 100644 > index 00000000000..e537e7349e4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c > @@ -0,0 +1,56 @@ > +/* 
{ dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef > +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) { > + return __builtin_convertvector(a, __v2df); } > + > +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4df); } > + > +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8df); } > + > +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) { > + return __builtin_convertvector(a, __v2df); } > + > +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4df); } > + > +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8df); } > + > +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) { > + return __builtin_convertvector(a, __v4sf); } > + > +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8sf); } > + > +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16sf); } > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c > new file mode 100644 > index 00000000000..5a44ef9f3b9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c > @@ -0,0 +1,72 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" > +} */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ > +/* { dg-final 
{ scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef > +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) { > + return __builtin_convertvector(a, __v2si); } > + > +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4si); } > + > +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8si); } > + > +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) { > + return __builtin_convertvector(a, __v2di); } > + > +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4di); } > + > +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8di); } > + > +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) { > + return __builtin_convertvector(a, __v4si); } > + > +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8si); } > + > +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16si); } > + > +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) { > + return __builtin_convertvector(a, __v2di); } > + > +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4di); } > + > +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8di); } > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c > new file mode 100644 > index 00000000000..4a68a10b089 
> --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c > @@ -0,0 +1,139 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq > +-fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 > +} } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 > +} } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } > +} */ > +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > +__attribute__ ((__vector_size__ (8))); typedef char __v16qi > +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu > +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu > +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu > +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu > +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf > +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf > +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf > +__attribute__ ((__vector_size__ (16))); > + > +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qi); } > + > +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qi); } > + > +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qi); } > + > +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qu); } > + > +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qu); } > + > +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qu); } > + > +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qi); } > + > +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qi); } > + > +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qi); } > + > +__v16qi 
mm512_cvtps_epi8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qi); } > + > +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qu); } > + > +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qu); } > + > +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qu); } > + > +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qu); } > + > +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qi); } > + > +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qi); } > + > +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qi); } > + > +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qi); } > + > +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qu); } > + > +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qu); } > + > +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qu); } > + > +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qu); } > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c > new file mode 100644 > index 00000000000..0ff5a97ed1a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c > @@ -0,0 +1,156 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq > +-fno-trapping-math" } 
*/ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } > +} */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } > +} } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } > +} } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > +__attribute__ ((__vector_size__ (8))); typedef char __v16qi > +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu > +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu > +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu > +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu > +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf > +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf > +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf > +__attribute__ ((__vector_size__ (16))); > + > +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2df); } > + > +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4df); } > + > +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8df); } > + > +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2df); } > + > +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4df); } > + > +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8df); } > + > +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2sf); } > + > +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4sf); } > + > +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8sf); } > + > +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) > +{ > + return 
__builtin_convertvector((__v16qi)a, __v16sf); } > + > +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2sf); } > + > +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4sf); } > + > +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8sf); } > + > +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16sf); } > + > +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2hf); } > + > +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4hf); } > + > +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8hf); } > + > +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) > +{ > + return __builtin_convertvector((__v16qi)a, __v16hf); } > + > +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector((__v32qi)a, __v32hf); } > + > +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2hf); } > + > +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4hf); } > + > +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8hf); } > + > +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16hf); } > + > +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) > +{ > + return __builtin_convertvector((__v32qu)a, __v32hf); } > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ab640096ca2..e14fac9f179 100644 > --- a/gcc/tree-vect-generic.cc > +++ b/gcc/tree-vect-generic.cc > @@ -45,6 
+45,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimple-match.h"
>  #include "recog.h"		/* FIXME: for insn_data */
>  #include "optabs-libfuncs.h"
> +#include "cfgloop.h"
> +#include "tree-vectorizer.h"
>
>
>  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> @@ -1834,6 +1836,102 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
>    return gimplify_build2 (gsi, code, outer_type, b, c);
>  }
>
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like char -> double.  */
> +bool
> +expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
> +				      enum tree_code code,
> +				      tree lhs,
> +				      tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +  enum {NARROW, NONE, WIDEN} modifier = NONE;
> +  enum tree_code code1 = ERROR_MARK;
> +  enum tree_code codecvt1 = ERROR_MARK;
> +  bool float_expr_p = code == FLOAT_EXPR;
> +
> +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> +    {
> +      g = gimple_build_assign (lhs, code1, arg);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> +  if (ret_elt_bits < arg_elt_bits)
> +    modifier = NARROW;
> +  else if (ret_elt_bits > arg_elt_bits)
> +    modifier = WIDEN;
> +
> +  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW)
> +       || (code == FLOAT_EXPR && modifier == WIDEN)))
> +    {
> +      unsigned short target_size;
> +      scalar_mode tmp_cvt_mode;
> +      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +      tree cvt_type = NULL_TREE;
> +      if (modifier == NARROW)
> +	{
> +	  tmp_cvt_mode = lhs_mode;
> +	  target_size = GET_MODE_SIZE (rhs_mode);
> +	}
> +      else
> +	{
> +	  target_size = GET_MODE_SIZE (lhs_mode);
> +	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
> +	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> +	    return false;
> +	}
> +
> +      code1 = float_expr_p ? code : NOP_EXPR;
> +      codecvt1 = float_expr_p ? NOP_EXPR : code;
> +      opt_scalar_mode mode_iter;
> +      enum tree_code tc1, tc2;
> +      unsigned HOST_WIDE_INT nelts
> +	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +	{
> +	  tmp_cvt_mode = mode_iter.require ();
> +
> +	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +	    break;
> +
> +	  scalar_mode cvt_mode;
> +	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +	    break;
> +
> +	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +	  cvt_type = build_vector_type (cvt_type, nelts);
> +	  if (cvt_type == NULL_TREE
> +	      || !supportable_convert_operation ((tree_code) code1,
> +						 ret_type,
> +						 cvt_type, &tc1)
> +	      || !supportable_convert_operation ((tree_code) codecvt1,
> +						 cvt_type,
> +						 arg_type, &tc2))
> +	    continue;
> +
> +	  new_rhs = make_ssa_name (cvt_type);
> +	  g = vect_gimple_build (new_rhs, tc2, arg);
> +	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +	  g = gimple_build_assign (lhs, tc1, new_rhs);
> +	  gsi_replace (gsi, g, false);
> +	  return true;
> +	}
> +    }
> +  return false;
> +}
> +
>  /* Expand VEC_CONVERT ifn call.
*/ > > static void > @@ -1871,14 +1969,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) > else if (ret_elt_bits > arg_elt_bits) > modifier = WIDEN; > > + if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg)) > + return; > + > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) > { > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > - { > - g = gimple_build_assign (lhs, code1, arg); > - gsi_replace (gsi, g, false); > - return; > - } > /* Can't use get_compute_type here, as supportable_convert_operation > doesn't necessarily use an optab and needs two arguments. */ > tree vec_compute_type > -- > 2.31.1 > > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 33+ messages in thread
* RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-05-14 12:23 ` Richard Biener @ 2024-05-15 2:30 ` Hu, Lin1 2024-05-23 6:37 ` [PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-05-15 2:30 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches, Liu, Hongtao, ubizjak > -----Original Message----- > From: Richard Biener <rguenther@suse.de> > Sent: Tuesday, May 14, 2024 8:23 PM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com > Subject: RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> > float and int <-> float. > > On Tue, 14 May 2024, Hu, Lin1 wrote: > > > Do you have any advice? > > > > BRs, > > Lin > > > > -----Original Message----- > > From: Hu, Lin1 <lin1.hu@intel.com> > > Sent: Wednesday, May 8, 2024 9:38 AM > > To: gcc-patches@gcc.gnu.org > > Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com > > Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> > float and int <-> float. > > > > Hi, all > > > > This patch aims to optimize __builtin_convertvector. We want the function > can generate more efficient insn for some situations. Like v2si -> v2di. > > > > The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK > for trunk? > > I don't like the new code to be in a separate function, not integrated with the > existing handling. Note the existing handling should get, say, V8DF -> V8SI > correct for SSE by splitting the operation into smaller vectors but your code > seems to just handle the cases the vectors are already properly sized. > Yes, my code only handles some cases, but others are handled by the core part of tree-vect-generic.cc. I just take care of some special cases up front. So, V8DF -> V8SI is still split into smaller vectors for SSE. 
And for SSE, I have another patch that expands the set of directly usable optabs via ix86_expand_vec_perm_const_1 (...). That patch hasn't been sent yet; I will send it out together with this one after I revise this patch. This gives an overall view of my changes to this function.

> Without checking it seems you are basing the code on what the vectorizer
> does?  Maybe we should have some common code that computes intermediate
> conversion steps supported by the HW unifying what for example
> supportable_widening_operation or supportable_narrowing_operation can do
> to also cover int <-> float conversions.

Yes, my code is based on vectorizable_conversion (...). I will consider splitting the function and defining some new common functions, as you advise, to make my code more generic.

BRs,
Lin

> That said, if you don't want to do that please still think about the core
> part of tree-vect-generic.cc which is breaking down large emulated vectors
> into small supported vectors.
>
> Richard.
>
> > BRs,
> > Lin
> >
> > gcc/ChangeLog:
> >
> > 	PR target/107432
> > 	* tree-vect-generic.cc (expand_vector_conversion): Support
> > 	convert for int -> int, float -> float and int <-> float.
> > 	(expand_vector_conversion_no_vec_pack): Check if we can convert
> > 	int <-> int, float <-> float and int <-> float directly.
> > 	Support indirect conversion when a direct optab is not supported.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR target/107432
> > 	* gcc.target/i386/pr107432-1.c: New test.
> > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > --- > > gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ > gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ > gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ > gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++++++ > gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++ > > gcc/tree-vect-generic.cc | 107 +++++++++- > > 8 files changed, 918 insertions(+), 6 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c > > b/gcc/testsuite/gcc.target/i386/pr107432-1.c > > new file mode 100644 > > index 00000000000..a4f37447eb4 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c > > @@ -0,0 +1,234 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > > +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ > > +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ > > +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ > > +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } > > +} } */ > > +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } > > +} } */ > > +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > > + > > +#include <x86intrin.h> > > + > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > +__attribute__ ((__vector_size__ (8))); > > + > > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); > > +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); > > +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); > > +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); > > +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); > > +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); > > + > > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v2di)a, __v2si); } > > + > > +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } > > + > > +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) > > +{ > > + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } > > + > > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v2di)a, __v2hi); } > > + > > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) > > +{ > > + return __builtin_convertvector((__v4di)a, __v4hi); } > > + > > +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } > > + > > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v2di)a, __v2qi); } > > + > > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) > > +{ > > + return __builtin_convertvector((__v4di)a, __v4qi); } > > + > > +__v8qi 
mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) > > +{ > > + return __builtin_convertvector((__v8di)a, __v8qi); } > > + > > +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) > > +{ > > + return __builtin_convertvector((__v2si)a, __v2hi); } > > + > > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v4si)a, __v4hi); } > > + > > +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); } > > + > > +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) > > +{ > > + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); } > > + > > +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) > > +{ > > + return __builtin_convertvector((__v2si)a, __v2qi); } > > + > > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v4si)a, __v4qi); } > > + > > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) > > +{ > > + return __builtin_convertvector((__v8si)a, __v8qi); } > > + > > +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); } > > + > > +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > > +{ > > + return __builtin_convertvector((__v2hi)a, __v2qi); } > > + > > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v8hi)a, __v8qi); } > > + > > +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); } > > + > > +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) > > +{ > > + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); } > > + > > +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v2du)a, __v2su); } > > + > > +__m128i 
mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); } > > + > > +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) > > +{ > > + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); } > > + > > +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v2du)a, __v2hu); } > > + > > +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) > > +{ > > + return __builtin_convertvector((__v4du)a, __v4hu); } > > + > > +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); } > > + > > +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v2du)a, __v2qu); } > > + > > +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) > > +{ > > + return __builtin_convertvector((__v4du)a, __v4qu); } > > + > > +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) > > +{ > > + return __builtin_convertvector((__v8du)a, __v8qu); } > > + > > +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) > > +{ > > + return __builtin_convertvector((__v2su)a, __v2hu); } > > + > > +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v4su)a, __v4hu); } > > + > > +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); } > > + > > +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) > > +{ > > + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); } > > + > > +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) > > +{ > > + return __builtin_convertvector((__v2su)a, __v2qu); } > > + > > +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v4su)a, __v4qu); } > > + > > +__v8qu 
mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) > > +{ > > + return __builtin_convertvector((__v8su)a, __v8qu); } > > + > > +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); } > > + > > +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > > +{ > > + return __builtin_convertvector((__v2hu)a, __v2qu); } > > + > > +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > > +{ > > + return __builtin_convertvector((__v8hu)a, __v8qu); } > > + > > +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) > > +{ > > + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); } > > + > > +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) > > +{ > > + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c > > b/gcc/testsuite/gcc.target/i386/pr107432-2.c > > new file mode 100644 > > index 00000000000..02ffd811cb4 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c > > @@ -0,0 +1,105 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > > +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ > > + > > +#include <x86intrin.h> > > + > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > +__attribute__ ((__vector_size__ (8))); > > + > > +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) { > > + return __builtin_convertvector(a, __v2di); } > > + > 
> +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) > > +{ > > + return (__m256i)__builtin_convertvector(a, __v4di); } > > + > > +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) > > +{ > > + return (__m512i)__builtin_convertvector(a, __v8di); } > > + > > +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) { > > + return __builtin_convertvector(a, __v2di); } > > + > > +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) > > +{ > > + return (__m256i)__builtin_convertvector(a, __v4di); } > > + > > +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) > > +{ > > + return (__m512i)__builtin_convertvector(a, __v8di); } > > + > > +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) { > > + return __builtin_convertvector(a, __v2di); } > > + > > +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) > > +{ > > + return (__m256i)__builtin_convertvector(a, __v4di); } > > + > > +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) > > +{ > > + return (__m512i)__builtin_convertvector(a, __v8di); } > > + > > +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) { > > + return (__m128i)__builtin_convertvector(a, __v4si); } > > + > > +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) > > +{ > > + return (__m256i)__builtin_convertvector(a, __v8si); } > > + > > +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) > > +{ > > + return (__m512i)__builtin_convertvector(a, __v16si); } > > + > > +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) { > > + return (__m128i)__builtin_convertvector(a, __v4si); } > > + > > +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) > > +{ > > + return (__m256i)__builtin_convertvector(a, __v8si); } > > + > > +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) > > +{ > > + return (__m512i)__builtin_convertvector(a, __v16si); } > > + > > +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) { > > + return 
(__m128i)__builtin_convertvector(a, __v8hi); } > > + > > +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) > > +{ > > + return (__m256i)__builtin_convertvector(a, __v16hi); } > > + > > +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) > > +{ > > + return __builtin_convertvector(a, __v32hi); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c > > b/gcc/testsuite/gcc.target/i386/pr107432-3.c > > new file mode 100644 > > index 00000000000..30dc947b6dd > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c > > @@ -0,0 +1,55 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > > +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ > > +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ > > +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ > > + > > +#include <x86intrin.h> > > + > > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > > +typedef > > +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); > > + > > +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) { > > + return __builtin_convertvector(a, __v2sf); } > > + > > +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) > > +{ > > + return __builtin_convertvector(a, __v4sf); } > > + > > +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) > > +{ > > + return __builtin_convertvector(a, __v8sf); } > > + > > +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) { > > + return __builtin_convertvector(a, __v2hf); } > > + > > +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) > > +{ > > + return __builtin_convertvector(a, __v4hf); } > > + > > +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) > > +{ > > + return __builtin_convertvector(a, __v8hf); } > > + > > +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) { > > + return __builtin_convertvector(a, __v4hf); } > > + > > +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) > > +{ > > + return __builtin_convertvector(a, 
__v8hf); } > > + > > +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) > > +{ > > + return __builtin_convertvector(a, __v16hf); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c > > b/gcc/testsuite/gcc.target/i386/pr107432-4.c > > new file mode 100644 > > index 00000000000..e537e7349e4 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c > > @@ -0,0 +1,56 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > > +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ > > +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ > > + > > +#include <x86intrin.h> > > + > > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > > +typedef > > +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); > > + > > +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) { > > + return __builtin_convertvector(a, __v2df); } > > + > > +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) > > +{ > > + return __builtin_convertvector(a, __v4df); } > > + > > +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) > > +{ > > + return __builtin_convertvector(a, __v8df); } > > + > > +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) { > > + return __builtin_convertvector(a, __v2df); } > > + > > +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) > > +{ > > + return __builtin_convertvector(a, __v4df); } > > + > > +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) > > +{ > > + return __builtin_convertvector(a, __v8df); } > > + > > +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) { > > + return __builtin_convertvector(a, __v4sf); } > > + > > +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) > > +{ > > + return __builtin_convertvector(a, __v8sf); } > > + > > +__v16sf 
mm512_cvtph_ps_builtin_convertvector(__v16hf a) > > +{ > > + return __builtin_convertvector(a, __v16sf); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c > > b/gcc/testsuite/gcc.target/i386/pr107432-5.c > > new file mode 100644 > > index 00000000000..5a44ef9f3b9 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c > > @@ -0,0 +1,72 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" > > +} */ > > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ > > +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ > > +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ > > + > > +#include <x86intrin.h> > > + > > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > > +typedef > > +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); > > + > > +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) { > > + return __builtin_convertvector(a, __v2si); } > > + > > +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) > > +{ > > + return __builtin_convertvector(a, __v4si); } > > + > > +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) > > +{ > > + return __builtin_convertvector(a, __v8si); } > > + > > +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) { > > + return __builtin_convertvector(a, __v2di); } > > + > > +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) > > +{ > > + return __builtin_convertvector(a, __v4di); } > > + > > +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) > > +{ > > + return __builtin_convertvector(a, __v8di); } > > + > > +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) { > > + return __builtin_convertvector(a, __v4si); } > > + > > +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) > > +{ > > + return 
__builtin_convertvector(a, __v8si); } > > + > > +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) > > +{ > > + return __builtin_convertvector(a, __v16si); } > > + > > +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) { > > + return __builtin_convertvector(a, __v2di); } > > + > > +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) > > +{ > > + return __builtin_convertvector(a, __v4di); } > > + > > +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) > > +{ > > + return __builtin_convertvector(a, __v8di); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c > > b/gcc/testsuite/gcc.target/i386/pr107432-6.c > > new file mode 100644 > > index 00000000000..4a68a10b089 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c > > @@ -0,0 +1,139 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq > > +-fno-trapping-math" } */ > > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! > > +ia32 } } } } */ > > +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! > > +ia32 } } } } */ > > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > > +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } > > +} } */ > > +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > > + > > +#include <x86intrin.h> > > + > > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef > > +char __v4qi __attribute__ ((__vector_size__ (4))); typedef char > > +__v8qi __attribute__ ((__vector_size__ (8))); typedef char __v16qi > > +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu > > +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu > > +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu > > +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu > > +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf > > +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf > > +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf > > +__attribute__ ((__vector_size__ (16))); > > + > > +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) > > +{ > > + return __builtin_convertvector((__v2df)a, __v2qi); } > > + > > +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) > > +{ > > + return __builtin_convertvector((__v4df)a, __v4qi); } > > + > > +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) > > +{ > > + return __builtin_convertvector((__v8df)a, __v8qi); } > > + > > +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) > > +{ > > + return __builtin_convertvector((__v2df)a, __v2qu); } > > + > > +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) > > +{ > > + return __builtin_convertvector((__v4df)a, __v4qu); } > > + > > +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) > > +{ > > + return __builtin_convertvector((__v8df)a, __v8qu); } > > + > > +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) > > +{ > > + return __builtin_convertvector((__v2sf)a, __v2qi); } > > + > > +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) > > +{ > > + return __builtin_convertvector((__v4sf)a, __v4qi); } > > + > > +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) > > +{ > > + 
return __builtin_convertvector((__v8sf)a, __v8qi); } > > + > > +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) > > +{ > > + return __builtin_convertvector((__v16sf)a, __v16qi); } > > + > > +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) > > +{ > > + return __builtin_convertvector((__v2sf)a, __v2qu); } > > + > > +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) > > +{ > > + return __builtin_convertvector((__v4sf)a, __v4qu); } > > + > > +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) > > +{ > > + return __builtin_convertvector((__v8sf)a, __v8qu); } > > + > > +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) > > +{ > > + return __builtin_convertvector((__v16sf)a, __v16qu); } > > + > > +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) > > +{ > > + return __builtin_convertvector((__v2hf)a, __v2qi); } > > + > > +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) > > +{ > > + return __builtin_convertvector((__v8hf)a, __v8qi); } > > + > > +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) > > +{ > > + return __builtin_convertvector((__v16hf)a, __v16qi); } > > + > > +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) > > +{ > > + return __builtin_convertvector((__v32hf)a, __v32qi); } > > + > > +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) > > +{ > > + return __builtin_convertvector((__v2hf)a, __v2qu); } > > + > > +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) > > +{ > > + return __builtin_convertvector((__v8hf)a, __v8qu); } > > + > > +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) > > +{ > > + return __builtin_convertvector((__v16hf)a, __v16qu); } > > + > > +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) > > +{ > > + return __builtin_convertvector((__v32hf)a, __v32qu); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c > > b/gcc/testsuite/gcc.target/i386/pr107432-7.c > > new file mode 100644 > > index 00000000000..0ff5a97ed1a > > --- /dev/null > > +++ 
b/gcc/testsuite/gcc.target/i386/pr107432-7.c > > @@ -0,0 +1,156 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq > > +-fno-trapping-math" } */ > > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } > > +} } */ > > +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 > > +} } } } */ > > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } > > +} } } */ > > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> > +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> > +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> > +typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> > +typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
> > +typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
> > +typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
> > +typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
> > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
> > +
> > +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
> > +{
> > +  return __builtin_convertvector((__v2qi)a, __v2df);
> > +}
> > +
> > +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
> > +{
> > +  return __builtin_convertvector((__v4qi)a, __v4df);
> > +}
> > +
> > +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
> > +{
> > +  return __builtin_convertvector((__v8qi)a, __v8df);
> > +}
> > +
> > +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
> > +{
> > +  return __builtin_convertvector((__v2qu)a, __v2df);
> > +}
> > +
> > +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
> > +{
> > +  return __builtin_convertvector((__v4qu)a, __v4df);
> > +}
> > +
> > +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
> > +{
> > +  return __builtin_convertvector((__v8qu)a, __v8df);
> > +}
> > +
> > +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
> > +{
> > +  return __builtin_convertvector((__v2qi)a, __v2sf);
> > +}
> > +
> > +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
> > +{
> > +  return __builtin_convertvector((__v4qi)a, __v4sf);
> > +}
> > +
> > +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
> > +{
> > +  return __builtin_convertvector((__v8qi)a, __v8sf);
> > +}
> > +
> > +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
> > +{
> > +  return __builtin_convertvector((__v16qi)a, __v16sf);
> > +}
> > +
> > +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
> > +{
> > +  return __builtin_convertvector((__v2qu)a, __v2sf);
> > +}
> > +
> > +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
> > +{
> > +  return __builtin_convertvector((__v4qu)a, __v4sf);
> > +}
> > +
> > +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
> > +{
> > +  return __builtin_convertvector((__v8qu)a, __v8sf);
> > +}
> > +
> > +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
> > +{
> > +  return __builtin_convertvector((__v16qu)a, __v16sf);
> > +}
> > +
> > +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
> > +{
> > +  return __builtin_convertvector((__v2qi)a, __v2hf);
> > +}
> > +
> > +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
> > +{
> > +  return __builtin_convertvector((__v4qi)a, __v4hf);
> > +}
> > +
> > +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
> > +{
> > +  return __builtin_convertvector((__v8qi)a, __v8hf);
> > +}
> > +
> > +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
> > +{
> > +  return __builtin_convertvector((__v16qi)a, __v16hf);
> > +}
> > +
> > +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
> > +{
> > +  return __builtin_convertvector((__v32qi)a, __v32hf);
> > +}
> > +
> > +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
> > +{
> > +  return __builtin_convertvector((__v2qu)a, __v2hf);
> > +}
> > +
> > +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
> > +{
> > +  return __builtin_convertvector((__v4qu)a, __v4hf);
> > +}
> > +
> > +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
> > +{
> > +  return __builtin_convertvector((__v8qu)a, __v8hf);
> > +}
> > +
> > +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
> > +{
> > +  return __builtin_convertvector((__v16qu)a, __v16hf);
> > +}
> > +
> > +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
> > +{
> > +  return __builtin_convertvector((__v32qu)a,
__v32hf);
> > +}
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> > index ab640096ca2..e14fac9f179 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "gimple-match.h"
> >  #include "recog.h"		/* FIXME: for insn_data */
> >  #include "optabs-libfuncs.h"
> > +#include "cfgloop.h"
> > +#include "tree-vectorizer.h"
> >
> >
> >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > @@ -1834,6 +1836,102 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> >    return gimplify_build2 (gsi, code, outer_type, b, c);
> >  }
> >
> > +/* A subroutine of expand_vector_conversion, support indirect conversion for
> > +   float <-> int, like char -> double.  */
> > +bool
> > +expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi,
> > +				      enum tree_code code,
> > +				      tree lhs,
> > +				      tree arg)
> > +{
> > +  gimple *g;
> > +  tree ret_type = TREE_TYPE (lhs);
> > +  tree arg_type = TREE_TYPE (arg);
> > +  tree new_rhs;
> > +  enum {NARROW, NONE, WIDEN} modifier = NONE;
> > +  enum tree_code code1 = ERROR_MARK;
> > +  enum tree_code codecvt1 = ERROR_MARK;
> > +  bool float_expr_p = code == FLOAT_EXPR;
> > +
> > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > +    {
> > +      g = gimple_build_assign (lhs, code1, arg);
> > +      gsi_replace (gsi, g, false);
> > +      return true;
> > +    }
> > +
> > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> > +  if (ret_elt_bits < arg_elt_bits)
> > +    modifier = NARROW;
> > +  else if (ret_elt_bits > arg_elt_bits)
> > +    modifier = WIDEN;
> > +
> > +  if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW)
> > +       || (code == FLOAT_EXPR && modifier == WIDEN)))
> > +    {
> > +      unsigned short target_size;
> > +      scalar_mode tmp_cvt_mode;
> > +      scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > +      scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> > +      tree cvt_type = NULL_TREE;
> > +      if (modifier == NARROW)
> > +	{
> > +	  tmp_cvt_mode = lhs_mode;
> > +	  target_size = GET_MODE_SIZE (rhs_mode);
> > +	}
> > +      else
> > +	{
> > +	  target_size = GET_MODE_SIZE (lhs_mode);
> > +	  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
> > +	  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> > +	    return false;
> > +	}
> > +
> > +      code1 = float_expr_p ? code : NOP_EXPR;
> > +      codecvt1 = float_expr_p ? NOP_EXPR : code;
> > +      opt_scalar_mode mode_iter;
> > +      enum tree_code tc1, tc2;
> > +      unsigned HOST_WIDE_INT nelts
> > +	= constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > +
> > +      FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > +	{
> > +	  tmp_cvt_mode = mode_iter.require ();
> > +
> > +	  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > +	    break;
> > +
> > +	  scalar_mode cvt_mode;
> > +	  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > +	  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > +	    break;
> > +
> > +	  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > +	  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> > +	  cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> > +
> > +	  cvt_type = build_vector_type (cvt_type, nelts);
> > +	  if (cvt_type == NULL_TREE
> > +	      || !supportable_convert_operation ((tree_code) code1,
> > +						 ret_type,
> > +						 cvt_type, &tc1)
> > +	      || !supportable_convert_operation ((tree_code) codecvt1,
> > +						 cvt_type,
> > +						 arg_type, &tc2))
> > +	    continue;
> > +
> > +	  new_rhs = make_ssa_name (cvt_type);
> > +	  g = vect_gimple_build (new_rhs, tc2, arg);
> > +	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +	  g = gimple_build_assign (lhs, tc1, new_rhs);
> > +	  gsi_replace (gsi, g, false);
> > +	  return true;
> > +	}
> > +    }
> > +  return false;
> > +}
> > +
> >  /* Expand VEC_CONVERT ifn call.  */
> >
> >  static void
> > @@ -1871,14 +1969,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
> >    else if (ret_elt_bits > arg_elt_bits)
> >      modifier = WIDEN;
> >
> > +  if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg))
> > +    return;
> > +
> >    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
> >      {
> > -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > -	{
> > -	  g = gimple_build_assign (lhs, code1, arg);
> > -	  gsi_replace (gsi, g, false);
> > -	  return;
> > -	}
> >        /* Can't use get_compute_type here, as supportable_convert_operation
> > 	 doesn't necessarily use an optab and needs two arguments.  */
> >        tree vec_compute_type
> > --
> > 2.31.1
> >
> >
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 33+ messages in thread
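[Editorial note: the fallback that the quoted `expand_vector_conversion_no_vec_pack` hunk implements can be pictured in plain C as splitting one unsupported conversion into two supported ones. The sketch below is illustrative only; the typedef and function names are ours, and the explicit two-step split stands in for the GIMPLE (NOP_EXPR then FLOAT_EXPR) that the patch actually emits.]

```c
#include <assert.h>

/* Generic vector types standing in for the V2QI/V2SI/V2DF modes
   discussed in the patch (illustrative names, not GCC internals).  */
typedef signed char v2qi __attribute__ ((vector_size (2)));
typedef int         v2si __attribute__ ((vector_size (8)));
typedef double      v2df __attribute__ ((vector_size (16)));

/* What the user writes: a direct char -> double vector conversion.  */
v2df
cvt_direct (v2qi a)
{
  return __builtin_convertvector (a, v2df);
}

/* Roughly the indirect route the patch searches for when no direct
   char -> double vector optab exists: widen char -> int first (the
   NOP_EXPR-style step), then convert int -> double (the FLOAT_EXPR
   step).  Both routes produce the same values.  */
v2df
cvt_via_int (v2qi a)
{
  v2si t = __builtin_convertvector (a, v2si);
  return __builtin_convertvector (t, v2df);
}
```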
* [PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and
  2024-05-15  2:30 ` Hu, Lin1
@ 2024-05-23  6:37 ` Hu, Lin1
  2024-05-23  6:37   ` [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float Hu, Lin1
                     ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Hu, Lin1 @ 2024-05-23  6:37 UTC (permalink / raw)
To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther

This series improves __builtin_convertvector for x86-64-v4 and
x86-64-v3.  I modified the first patch according to Richard's
suggestion and am sending the three patches together to show the
complete modification of the function.  They have been bootstrapped
and regtested on x86_64-pc-linux-gnu.

BRs,
Lin

Hu, Lin1 (3):
  vect: generate suitable convert insn for int -> int, float -> float
    and int <-> float.
  vect: Support v4hi -> v4qi.
  vect: support direct conversion under x86-64-v3.

 gcc/config/i386/i386-expand.cc             |  47 +++-
 gcc/config/i386/i386-protos.h              |   3 +
 gcc/config/i386/mmx.md                     |  10 +
 gcc/config/i386/sse.md                     |  87 ++++++--
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 244 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 ++++++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 152 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 ++++++
 gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 ++++++++++
 gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
 gcc/tree-vect-generic.cc                   | 157 ++++++++++++-
 15 files changed, 1305 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c

--
2.31.1

^ permalink raw reply	[flat|nested] 33+ messages in thread
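[Editorial note: a concrete instance of what the series targets is the v2di -> v2si narrowing below, which the pr107432-1.c scans expect to become a single `vpmovqd` under AVX-512VL instead of element-wise code. The function name is ours; the instruction choice is taken from the test's scan directives.]

```c
#include <assert.h>

typedef long long v2di __attribute__ ((vector_size (16)));
typedef int       v2si __attribute__ ((vector_size (8)));

/* Narrow each 64-bit lane to 32 bits.  With -mavx512vl this can be a
   single truncating move (vpmovqd).  Out-of-range values wrap modulo
   2^32, which is how GCC defines narrowing integer conversion.  */
v2si
narrow_qd (v2di a)
{
  return __builtin_convertvector (a, v2si);
}
```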
* [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
  2024-05-23  6:37 ` [PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and Hu, Lin1
@ 2024-05-23  6:37 ` Hu, Lin1
  2024-05-29  9:40   ` Richard Biener
  2024-05-23  6:37 ` [PATCH 2/3] vect: Support v4hi -> v4qi Hu, Lin1
  2024-05-23  6:37 ` [PATCH 3/3] vect: support direct conversion under x86-64-v3 Hu, Lin1
  2 siblings, 1 reply; 33+ messages in thread
From: Hu, Lin1 @ 2024-05-23  6:37 UTC (permalink / raw)
To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther

gcc/ChangeLog:

	PR target/107432
	* tree-vect-generic.cc
	(supportable_indirect_narrowing_operation): New function to
	support indirect narrowing conversion.
	(supportable_indirect_widening_operation): New function to
	support indirect widening conversion.
	(expand_vector_conversion): Support conversion for int -> int,
	float -> float and int <-> float.

gcc/testsuite/ChangeLog:

	PR target/107432
	* gcc.target/i386/pr107432-1.c: New test.
	* gcc.target/i386/pr107432-2.c: Ditto.
	* gcc.target/i386/pr107432-3.c: Ditto.
	* gcc.target/i386/pr107432-4.c: Ditto.
	* gcc.target/i386/pr107432-5.c: Ditto.
	* gcc.target/i386/pr107432-6.c: Ditto.
	* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++++++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++
 gcc/tree-vect-generic.cc                   | 157 +++++++++++++-
 8 files changed, 968 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 00000000000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { !
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, 
__v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); +} + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); +} + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2su); +} + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); +} + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); +} + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); +} + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return 
__builtin_convertvector((__v4du)a, __v4hu); +} + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); +} + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); +} + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); +} + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); +} + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); +} + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); +} + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); +} + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); +} + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); +} + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); +} + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); +} + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); +} + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); +} + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); +} + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); +} + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return 
(__m256i)__builtin_convertvector((__v32hu)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i 
mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) +{ + return (__m128i)__builtin_convertvector(a, __v8hi); +} + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); +} + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2sf); +} + +__v4sf 
mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2hf); +} + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2si); +} + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); +} + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); +} + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { 
scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); +} + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); +} + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return 
__builtin_convertvector((__v8df)a, __v8qi); +} + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); +} + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); +} + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); +} + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); +} + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); +} + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); +} + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); +} + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); +} + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); +} + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); +} + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); +} + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); +} + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); +} + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); +} + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); +} + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); +} + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf 
a)
+{
+  return __builtin_convertvector((__v8hf)a, __v8qu);
+}
+
+__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
+{
+  return __builtin_convertvector((__v16hf)a, __v16qu);
+}
+
+__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
+{
+  return __builtin_convertvector((__v32hf)a, __v32qu);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
new file mode 100644
index 00000000000..0ff5a97ed1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
@@ -0,0 +1,156 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
+typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
+typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
+typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+
+__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2df);
+}
+
+__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4df);
+}
+
+__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8df);
+}
+
+__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2df);
+}
+
+__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4df);
+}
+
+__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8df);
+}
+
+__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2sf);
+}
+
+__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4sf);
+}
+
+__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); +} + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); +} + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2sf); +} + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); +} + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); +} + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); +} + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); +} + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); +} + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); +} + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); +} + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); +} + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); +} + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); +} + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); +} + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); +} + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); +} diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index 
ab640096ca2..0bedb53d9f9 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see #include "gimple-match.h" #include "recog.h" /* FIXME: for insn_data */ #include "optabs-libfuncs.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" /* Build a ternary operation and gimplify it. Emit code before GSI. @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a, return gimplify_build2 (gsi, code, outer_type, b, c); } +/* A subroutine of expand_vector_conversion, support indirect conversion for + float <-> int, like double -> char. */ +bool +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, + enum tree_code code, + tree lhs, + tree arg) +{ + gimple *g; + tree ret_type = TREE_TYPE (lhs); + tree arg_type = TREE_TYPE (arg); + tree new_rhs; + + unsigned int ret_elt_bits = vector_element_bits (ret_type); + unsigned int arg_elt_bits = vector_element_bits (arg_type); + if (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= arg_elt_bits) + return false; + + unsigned short target_size; + scalar_mode tmp_cvt_mode; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); + tree cvt_type = NULL_TREE; + tmp_cvt_mode = lhs_mode; + target_size = GET_MODE_SIZE (rhs_mode); + + opt_scalar_mode mode_iter; + enum tree_code tc1, tc2; + unsigned HOST_WIDE_INT nelts + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); + + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) + { + tmp_cvt_mode = mode_iter.require (); + + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) + break; + + scalar_mode cvt_mode; + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) + break; + + int cvt_size = GET_MODE_BITSIZE (cvt_mode); + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type); + cvt_type = 
build_nonstandard_integer_type (cvt_size, isUnsigned); + + cvt_type = build_vector_type (cvt_type, nelts); + if (cvt_type == NULL_TREE + || !supportable_convert_operation ((tree_code) NOP_EXPR, + ret_type, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) code, + cvt_type, + arg_type, &tc2)) + continue; + + new_rhs = make_ssa_name (cvt_type); + g = vect_gimple_build (new_rhs, tc2, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, tc1, new_rhs); + gsi_replace (gsi, g, false); + return true; + } + return false; +} + +/* A subroutine of expand_vector_conversion, support indirect conversion for + float <-> int, like char -> double. */ +bool +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, + enum tree_code code, + tree lhs, + tree arg) +{ + gimple *g; + tree ret_type = TREE_TYPE (lhs); + tree arg_type = TREE_TYPE (arg); + tree new_rhs; + + unsigned int ret_elt_bits = vector_element_bits (ret_type); + unsigned int arg_elt_bits = vector_element_bits (arg_type); + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) + return false; + + unsigned short target_size; + scalar_mode tmp_cvt_mode; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); + tree cvt_type = NULL_TREE; + target_size = GET_MODE_SIZE (lhs_mode); + int rhs_size = GET_MODE_BITSIZE (rhs_mode); + if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode)) + return false; + + opt_scalar_mode mode_iter; + enum tree_code tc1, tc2; + unsigned HOST_WIDE_INT nelts + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); + + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) + { + tmp_cvt_mode = mode_iter.require (); + + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) + break; + + scalar_mode cvt_mode; + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) + break; + + int cvt_size = GET_MODE_BITSIZE 
(cvt_mode); + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type); + cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned); + + cvt_type = build_vector_type (cvt_type, nelts); + if (cvt_type == NULL_TREE + || !supportable_convert_operation ((tree_code) code, + ret_type, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) NOP_EXPR, + cvt_type, + arg_type, &tc2)) + continue; + + new_rhs = make_ssa_name (cvt_type); + g = vect_gimple_build (new_rhs, tc2, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, tc1, new_rhs); + gsi_replace (gsi, g, false); + return true; + } + return false; +} + /* Expand VEC_CONVERT ifn call. */ static void @@ -1871,14 +2009,21 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) else if (ret_elt_bits > arg_elt_bits) modifier = WIDEN; + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + { + g = gimple_build_assign (lhs, code1, arg); + gsi_replace (gsi, g, false); + return; + } + + if (supportable_indirect_narrowing_operation (gsi, code, lhs, arg)) + return; + + if (supportable_indirect_widening_operation (gsi, code, lhs, arg)) + return; + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) { - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) - { - g = gimple_build_assign (lhs, code1, arg); - gsi_replace (gsi, g, false); - return; - } /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. */ tree vec_compute_type -- 2.31.1
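[Editorial note] At the GIMPLE level, the indirect path added by this patch splits one conversion the target has no direct optab for into two conversions it can do in single instructions. The effect can be sketched at the C source level; the helper names and the v2df/v2si/v2qi typedefs below are illustrative only, not part of the patch:

```c
#include <assert.h>

/* Illustrative 2-element vector types (the patch itself rewrites GIMPLE,
   not C source).  */
typedef double v2df __attribute__ ((__vector_size__ (16)));
typedef int    v2si __attribute__ ((__vector_size__ (8)));
typedef char   v2qi __attribute__ ((__vector_size__ (2)));

/* Direct form: no single double -> char vector conversion instruction
   exists, so this previously lowered to scalar element-by-element code.  */
v2qi cvt_direct (v2df a)
{
  return __builtin_convertvector (a, v2qi);
}

/* Two-step form equivalent to what supportable_indirect_narrowing_operation
   emits: a FIX_TRUNC to an intermediate integer vector, then a narrowing
   integer conversion, each supportable by the target.  */
v2qi cvt_two_step (v2df a)
{
  v2si tmp = __builtin_convertvector (a, v2si);  /* double -> int  */
  return __builtin_convertvector (tmp, v2qi);    /* int -> char    */
}
```

With AVX-512 enabled and -fno-trapping-math, the two-step form maps to a truncating float-to-int convert (e.g. vcvttpd2dq) followed by an integer down-convert (e.g. vpmovdb), which is what the pr107432-6.c scans check for.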
* Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-05-23 6:37 ` [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float Hu, Lin1 @ 2024-05-29 9:40 ` Richard Biener 2024-05-31 8:54 ` Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Richard Biener @ 2024-05-29 9:40 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu, ubizjak On Thu, 23 May 2024, Hu, Lin1 wrote: > gcc/ChangeLog: > > PR target/107432 > * tree-vect-generic.cc > (supportable_indirect_narrowing_operation): New function to > support indirect narrowing conversion. > (supportable_indirect_widening_operation): New function to > support indirect widening conversion. > (expand_vector_conversion): Support conversion for int -> int, > float -> float and int <-> float. > > gcc/testsuite/ChangeLog: > > PR target/107432 > * gcc.target/i386/pr107432-1.c: New test. > * gcc.target/i386/pr107432-2.c: Ditto. > * gcc.target/i386/pr107432-3.c: Ditto. > * gcc.target/i386/pr107432-4.c: Ditto. > * gcc.target/i386/pr107432-5.c: Ditto. > * gcc.target/i386/pr107432-6.c: Ditto. > * gcc.target/i386/pr107432-7.c: Ditto.
> --- > gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ > gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ > gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ > gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++++++ > gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++ > gcc/tree-vect-generic.cc | 157 +++++++++++++- > 8 files changed, 968 insertions(+), 6 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c > new file mode 100644 > index 00000000000..a4f37447eb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c > @@ -0,0 +1,234 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); > +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); > +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); > +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); > +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); > +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2si); > +} > + > +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); > +} > + > +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); > +} > + > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2hi); > +} > + > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4hi); > +} > + > +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); > +} > + > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2qi); > +} > + > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4qi); > +} > + > +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8di)a, __v8qi); 
> +} > + > +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2hi); > +} > + > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4hi); > +} > + > +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); > +} > + > +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); > +} > + > +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2qi); > +} > + > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4qi); > +} > + > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8si)a, __v8qi); > +} > + > +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); > +} > + > +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > +{ > + return __builtin_convertvector((__v2hi)a, __v2qi); > +} > + > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hi)a, __v8qi); > +} > + > +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); > +} > + > +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); > +} > + > +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2su); > +} > + > +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); > +} > + > +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) > +{ > + return 
(__m256i)__builtin_convertvector((__v8du)a, __v8su); > +} > + > +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2hu); > +} > + > +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4hu); > +} > + > +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); > +} > + > +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2qu); > +} > + > +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4qu); > +} > + > +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8du)a, __v8qu); > +} > + > +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2hu); > +} > + > +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4hu); > +} > + > +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); > +} > + > +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); > +} > + > +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2qu); > +} > + > +__v4qu mm_cvtepu32_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4qu); > +} > + > +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8su)a, __v8qu); > +} > + > +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); > +} > + > +__v2qu 
mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > +{ > + return __builtin_convertvector((__v2hu)a, __v2qu); > +} > + > +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hu)a, __v8qu); > +} > + > +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); > +} > + > +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c > new file mode 100644 > index 00000000000..02ffd811cb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c > @@ -0,0 +1,105 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) > +{ > + return 
__builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v8hi); > +} > + > +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v16hi); > +} > + > +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector(a, __v32hi); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c > new file 
mode 100644 > index 00000000000..30dc947b6dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c > @@ -0,0 +1,55 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2sf); > +} > + > +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2hf); > +} > + > +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector(a, __v16hf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c > new file mode 100644 > index 00000000000..e537e7349e4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c > @@ -0,0 +1,56 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { 
scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16sf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c > new file mode 100644 > index 00000000000..5a44ef9f3b9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c > @@ -0,0 +1,72 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ > +/* { 
dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2si); > +} > + > +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > + > +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16si); > +} > + > +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c > new file mode 100644 > index 00000000000..4a68a10b089 > --- /dev/null > +++ 
b/gcc/testsuite/gcc.target/i386/pr107432-6.c > @@ -0,0 +1,139 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> +typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
> +typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
> +typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
> +typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
> +
> +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qi);
> +}
> +
> +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qi);
> +}
> +
> +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qi);
> +}
> +
> +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qu);
> +}
> +
> +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qu);
> +}
> +
> +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qu);
> +}
> +
> +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qi);
> +}
> +
> +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qi);
> +}
> +
> +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qi);
> +}
> +
> +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qi);
> +}
> +
> +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qu);
> +}
> +
> +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qu);
> +}
> +
> +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qu);
> +}
> +
> +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qu);
> +}
> +
> +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qi);
> +}
> +
> +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qi);
> +}
> +
> +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qi);
> +}
> +
> +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qi);
> +}
> +
> +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qu);
> +}
> +
> +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qu);
> +}
> +
> +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qu);
> +}
> +
> +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qu);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> new file mode 100644
> index 00000000000..0ff5a97ed1a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> @@ -0,0 +1,156 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
> +
> +#include <x86intrin.h>
> +
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> +typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
> +typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
> +typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
> +typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
> +
> +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2df);
> +}
> +
> +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4df);
> +}
> +
> +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8df);
> +}
> +
> +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2df);
> +}
> +
> +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4df);
> +}
> +
> +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8df);
> +}
> +
> +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2sf);
> +}
> +
> +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4sf);
> +}
> +
> +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8sf);
> +}
> +
> +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
> +{
> +  return __builtin_convertvector((__v16qi)a, __v16sf);
> +}
> +
> +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2sf);
> +}
> +
> +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4sf);
> +}
> +
> +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8sf);
> +}
> +
> +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
> +{
> +  return __builtin_convertvector((__v16qu)a, __v16sf);
> +}
> +
> +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector((__v2qi)a, __v2hf);
> +}
> +
> +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
> +{
> +  return __builtin_convertvector((__v4qi)a, __v4hf);
> +}
> +
> +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
> +{
> +  return __builtin_convertvector((__v8qi)a, __v8hf);
> +}
> +
> +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
> +{
> +  return __builtin_convertvector((__v16qi)a, __v16hf);
> +}
> +
> +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
> +{
> +  return __builtin_convertvector((__v32qi)a, __v32hf);
> +}
> +
> +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
> +{
> +  return __builtin_convertvector((__v2qu)a, __v2hf);
> +}
> +
> +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
> +{
> +  return __builtin_convertvector((__v4qu)a, __v4hf);
> +}
> +
> +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
> +{
> +  return __builtin_convertvector((__v8qu)a, __v8hf);
> +}
> +
> +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
> +{
> +  return __builtin_convertvector((__v16qu)a, __v16hf);
> +}
> +
> +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
> +{
> +  return __builtin_convertvector((__v32qu)a, __v32hf);
> +}
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index ab640096ca2..0bedb53d9f9 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimple-match.h"
>  #include "recog.h"              /* FIXME: for insn_data */
>  #include "optabs-libfuncs.h"
> +#include "cfgloop.h"
> +#include "tree-vectorizer.h"
>
>
>  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a,
>    return gimplify_build2 (gsi, code, outer_type, b, c);
>  }
>
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like double -> char.  */
> +bool
> +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> +                                          enum tree_code code,
> +                                          tree lhs,
> +                                          tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> +  if (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= arg_elt_bits)
> +    return false;
> +
> +  unsigned short target_size;
> +  scalar_mode tmp_cvt_mode;
> +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +  tree cvt_type = NULL_TREE;
> +  tmp_cvt_mode = lhs_mode;
> +  target_size = GET_MODE_SIZE (rhs_mode);
> +
> +  opt_scalar_mode mode_iter;
> +  enum tree_code tc1, tc2;
> +  unsigned HOST_WIDE_INT nelts
> +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +    {
> +      tmp_cvt_mode = mode_iter.require ();
> +
> +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +        break;
> +
> +      scalar_mode cvt_mode;
> +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +        break;
> +
> +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +      cvt_type = build_vector_type (cvt_type, nelts);
> +      if (cvt_type == NULL_TREE
> +          || !supportable_convert_operation ((tree_code) NOP_EXPR,
> +                                             ret_type,
> +                                             cvt_type, &tc1)
> +          || !supportable_convert_operation ((tree_code) code,
> +                                             cvt_type,
> +                                             arg_type, &tc2))
> +        continue;
> +
> +      new_rhs = make_ssa_name (cvt_type);
> +      g = vect_gimple_build (new_rhs, tc2, arg);
> +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +      g = gimple_build_assign (lhs, tc1, new_rhs);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +  return false;
> +}
> +
> +/* A subroutine of expand_vector_conversion, support indirect conversion for
> +   float <-> int, like char -> double.  */
> +bool
> +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi,
> +                                         enum tree_code code,
> +                                         tree lhs,
> +                                         tree arg)
> +{
> +  gimple *g;
> +  tree ret_type = TREE_TYPE (lhs);
> +  tree arg_type = TREE_TYPE (arg);
> +  tree new_rhs;
> +
> +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> +  unsigned int arg_elt_bits = vector_element_bits (arg_type);
> +  if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR)
> +    return false;
> +
> +  unsigned short target_size;
> +  scalar_mode tmp_cvt_mode;
> +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> +  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> +  tree cvt_type = NULL_TREE;
> +  target_size = GET_MODE_SIZE (lhs_mode);
> +  int rhs_size = GET_MODE_BITSIZE (rhs_mode);
> +  if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode))
> +    return false;
> +
> +  opt_scalar_mode mode_iter;
> +  enum tree_code tc1, tc2;
> +  unsigned HOST_WIDE_INT nelts
> +    = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> +
> +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> +    {
> +      tmp_cvt_mode = mode_iter.require ();
> +
> +      if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> +        break;
> +
> +      scalar_mode cvt_mode;
> +      int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> +      if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> +        break;
> +
> +      int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> +      bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type);
> +      cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned);
> +
> +      cvt_type = build_vector_type (cvt_type, nelts);
> +      if (cvt_type == NULL_TREE
> +          || !supportable_convert_operation ((tree_code) code,
> +                                             ret_type,
> +                                             cvt_type, &tc1)
> +          || !supportable_convert_operation ((tree_code) NOP_EXPR,
> +                                             cvt_type,
> +                                             arg_type, &tc2))
> +        continue;
> +
> +      new_rhs = make_ssa_name (cvt_type);
> +      g = vect_gimple_build (new_rhs, tc2, arg);
> +      gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +      g = gimple_build_assign (lhs, tc1, new_rhs);
> +      gsi_replace (gsi, g, false);
> +      return true;
> +    }
> +  return false;
> +}
> +

So the above improves the situation where the target can handle the two-step
conversion.  It doesn't really allow this to work for too large vectors AFAICS
(nor does it try pack/unpack for any of the conversions).  It also still
duplicates code that's in the vectorizer.  I think you should be able to use
supportable_narrowing_operation and possibly even
supportable_widening_operation (though that needs refactoring to avoid the
vectorizer internal stmt_vec_info type - possibly simply by gating the
respective code on a non-NULL vinfo).  Both support multi-step conversions.

>  /* Expand VEC_CONVERT ifn call.  */
>
>  static void
> @@ -1871,14 +2009,21 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
>    else if (ret_elt_bits > arg_elt_bits)
>      modifier = WIDEN;
>
> +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> +    {
> +      g = gimple_build_assign (lhs, code1, arg);
> +      gsi_replace (gsi, g, false);
> +      return;
> +    }
> +
> +  if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg))
> +    return;
> +
> +  if (supportable_indirect_widening_operation(gsi, code, lhs, arg))
> +    return;
> +
>    if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
>      {
> -      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> -        {
> -          g = gimple_build_assign (lhs, code1, arg);
> -          gsi_replace (gsi, g, false);
> -          return;
> -        }
>        /* Can't use get_compute_type here, as supportable_convert_operation
>           doesn't necessarily use an optab and needs two arguments.  */
>        tree vec_compute_type

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
* RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
From: Hu, Lin1 @ 2024-05-31  8:54 UTC (permalink / raw)
To: Richard Biener; +Cc: gcc-patches, Liu, Hongtao, ubizjak

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, May 29, 2024 5:41 PM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com
> Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
>
> On Thu, 23 May 2024, Hu, Lin1 wrote:
>
> > gcc/ChangeLog:
> >
> > 	PR target/107432
> > 	* tree-vect-generic.cc
> > 	(supportable_indirect_narrowing_operation): New function for
> > 	support indirect narrowing convert.
> > 	(supportable_indirect_widening_operation): New function for
> > 	support indirect widening convert.
> > 	(expand_vector_conversion): Support convert for int -> int,
> > 	float -> float and int <-> float.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR target/107432
> > 	* gcc.target/i386/pr107432-1.c: New test.
> > 	* gcc.target/i386/pr107432-2.c: Ditto.
> > 	* gcc.target/i386/pr107432-3.c: Ditto.
> > 	* gcc.target/i386/pr107432-4.c: Ditto.
> > 	* gcc.target/i386/pr107432-5.c: Ditto.
> > 	* gcc.target/i386/pr107432-6.c: Ditto.
> > 	* gcc.target/i386/pr107432-7.c: Ditto.
> > ---
> > [quoted patch trimmed; see the diff quoted in full above]
>
> So the above improves the situation where the target can handle the two-step
> conversion.  It doesn't really allow this to work for too large vectors
> AFAICS (nor does it try pack/unpack for any of the conversions).  It also
> still duplicates code that's in the vectorizer.  I think you should be able
> to use supportable_narrowing_operation and possibly even
> supportable_widening_operation (though that needs refactoring to avoid the
> vectorizer internal stmt_vec_info type - possibly simply by gating the
> respective code on a non-NULL vinfo).  Both support multi-step conversions.
>

I tried to use supportable_narrowing_operation and ran into two problems:

1) supportable_narrowing_operation supports v2df -> v16qi, but I don't know
which optab can help me convert v16qi to v2qi.

2) I tried a testcase (https://godbolt.org/z/z88xYW85e), and the result is
not what I expected, because it only uses vec_pack_trunc.  I expect it to
use vcvttpd2dq + vpmovdw.

If I can solve the first problem and the function can be improved (maybe to
support trunc<vectype_in><vectype_out>), I'd be happy to use it directly.
For now I prefer my scheme; my functions are more like
supportable_convert_operation.  Perhaps we can modify
supportable_narrowing_operation, but I think that should be a separate
patch, since it will influence the vectorizer.

BRs,
Lin

> [trailing quote of the expand_vector_conversion hunk trimmed]
* RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-05-31 8:54 ` Hu, Lin1 @ 2024-05-31 12:41 ` Richard Biener 2024-06-03 8:23 ` Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Richard Biener @ 2024-05-31 12:41 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao, ubizjak On Fri, 31 May 2024, Hu, Lin1 wrote: > > -----Original Message----- > > From: Richard Biener <rguenther@suse.de> > > Sent: Wednesday, May 29, 2024 5:41 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > > -> float and int <-> float. > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > gcc/ChangeLog: > > > > > > PR target/107432 > > > * tree-vect-generic.cc > > > (supportable_indirect_narrowing_operation): New function for > > > support indirect narrowing convert. > > > (supportable_indirect_widening_operation): New function for > > > support indirect widening convert. > > > (expand_vector_conversion): Support convert for int -> int, > > > float -> float and int <-> float. > > > > > > gcc/testsuite/ChangeLog: > > > > > > PR target/107432 > > > * gcc.target/i386/pr107432-1.c: New test. > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > --- > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index > > > ab640096ca2..0bedb53d9f9 100644 > > > --- a/gcc/tree-vect-generic.cc > > > +++ b/gcc/tree-vect-generic.cc > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. 
If not see > > > #include "gimple-match.h" > > > #include "recog.h" /* FIXME: for insn_data */ > > > #include "optabs-libfuncs.h" > > > +#include "cfgloop.h" > > > +#include "tree-vectorizer.h" > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > +/* A subroutine of expand_vector_conversion, support indirect conversion > > for > > > + float <-> int, like double -> char. */ bool > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > + enum tree_code code, > > > + tree lhs, > > > + tree arg) > > > +{ > > > + gimple *g; > > > + tree ret_type = TREE_TYPE (lhs); > > > + tree arg_type = TREE_TYPE (arg); > > > + tree new_rhs; > > > + > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= > > arg_elt_bits) > > > + return false; > > > + > > > + unsigned short target_size; > > > + scalar_mode tmp_cvt_mode; > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); tree > > > + cvt_type = NULL_TREE; tmp_cvt_mode = lhs_mode; target_size = > > > + GET_MODE_SIZE (rhs_mode); > > > + > > > + opt_scalar_mode mode_iter; > > > + enum tree_code tc1, tc2; > > > + unsigned HOST_WIDE_INT nelts > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > + > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > + { > > > + tmp_cvt_mode = mode_iter.require (); > > > + > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > + break; > > > + > > > + scalar_mode cvt_mode; > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists 
(&cvt_mode)) > > > + break; > > > + > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > (arg_type); > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > + isUnsigned); > > > + > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > + if (cvt_type == NULL_TREE > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > + ret_type, > > > + cvt_type, &tc1) > > > + || !supportable_convert_operation ((tree_code) code, > > > + cvt_type, > > > + arg_type, &tc2)) > > > + continue; > > > + > > > + new_rhs = make_ssa_name (cvt_type); > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > + gsi_replace (gsi, g, false); > > > + return true; > > > + } > > > + return false; > > > +} > > > + > > > +/* A subroutine of expand_vector_conversion, support indirect conversion > > for > > > + float <-> int, like char -> double. 
*/ bool > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > + enum tree_code code, > > > + tree lhs, > > > + tree arg) > > > +{ > > > + gimple *g; > > > + tree ret_type = TREE_TYPE (lhs); > > > + tree arg_type = TREE_TYPE (arg); > > > + tree new_rhs; > > > + > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > + return false; > > > + > > > + unsigned short target_size; > > > + scalar_mode tmp_cvt_mode; > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); tree > > > + cvt_type = NULL_TREE; target_size = GET_MODE_SIZE (lhs_mode); int > > > + rhs_size = GET_MODE_BITSIZE (rhs_mode); if (!int_mode_for_size > > > + (rhs_size, 0).exists (&tmp_cvt_mode)) > > > + return false; > > > + > > > + opt_scalar_mode mode_iter; > > > + enum tree_code tc1, tc2; > > > + unsigned HOST_WIDE_INT nelts > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > + > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > + { > > > + tmp_cvt_mode = mode_iter.require (); > > > + > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > + break; > > > + > > > + scalar_mode cvt_mode; > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > + break; > > > + > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > (arg_type); > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > + isUnsigned); > > > + > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > + if (cvt_type == NULL_TREE > > > + || !supportable_convert_operation ((tree_code) code, > > > + ret_type, > > > + cvt_type, &tc1) > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > 
> > + cvt_type, > > > + arg_type, &tc2)) > > > + continue; > > > + > > > + new_rhs = make_ssa_name (cvt_type); > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > + gsi_replace (gsi, g, false); > > > + return true; > > > + } > > > + return false; > > > +} > > > + > > > > So the above improve the situation where the target can handle the two-step > > conversion. It doesn't really allow this to work for too large vectors AFAICS (nor > > does it try pack/unpack for any of the conversions). It also still duplicates code > > that's in the vectorizer. I think you should be able to use > > supportable_narrowing_operation and possibly even > > supportable_widening_operation (though that needs refatoring to avoid the > > vectorizer internal stmt_vec_info type - possibly simply by gating the respective > > code on a non-NULL vinfo). Both support multi-step conversions. > > > > I tried to use supportable_narrowing_operation and I met two questions: > > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know > which optab can help me convert v16qi to v2qi. It's API is a bit tricky but for v2df -> v2qi (I expect you'll have an equal number of lanes in/out for .CONVERT_VECTOR) it likely outputs a multi-step conversion where you have to look into *INTERM_TYPES and second-guess the operation code to use for the intermediate steps (IIRC the intermediate steps all use either PACK/UNPACK or CONVERT, never FLOAT/FIX). > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is > not what I expected, because it only use vec_pack_trunc. I expect it > can use vcvttpd2dq + vpmovdw. With -O3 -fno-tree-loop-vectorize that's what you get. What you see is because of the restriction of the loop vectorizer to work on a single vector size only. 
> If I can solve the first question and the function be better (maybe > support trunc<vectype_in><vectype_out>), I'd be happy to use it > directly. I prefer my scheme for now. My functions is more like > supportable_convert_operation. Perhaps, we can modify > supportable_narrowing_operation, but I think it should be another patch, > it will influence vectorizer. But since you are doing a multi-step conversion this is really what supportable_narrowing_operation is about. I don't think we want to re-invent the wheel here. Likewise your approach won't get you to use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current single-step .CONVERT_VECTOR lowering). supportable_narrowing_operation also checks for this. Richard. > BRs, > Lin > > > > > > /* Expand VEC_CONVERT ifn call. */ > > > > > > static void > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > (gimple_stmt_iterator *gsi) > > > else if (ret_elt_bits > arg_elt_bits) > > > modifier = WIDEN; > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > + { > > > + g = gimple_build_assign (lhs, code1, arg); > > > + gsi_replace (gsi, g, false); > > > + return; > > > + } > > > + > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > + return; > > > + > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > + return; > > > + > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > FLOAT_EXPR)) > > > { > > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > - { > > > - g = gimple_build_assign (lhs, code1, arg); > > > - gsi_replace (gsi, g, false); > > > - return; > > > - } > > > /* Can't use get_compute_type here, as supportable_convert_operation > > > doesn't necessarily use an optab and needs two arguments. 
*/ > > > tree vec_compute_type > > > > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE Software Solutions Germany GmbH, > > Frankenstrasse 146, 90461 Nuernberg, Germany; > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
* RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-05-31 12:41 ` Richard Biener @ 2024-06-03 8:23 ` Hu, Lin1 2024-06-03 9:02 ` Richard Biener 0 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-06-03 8:23 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches, Liu, Hongtao, ubizjak > -----Original Message----- > From: Richard Biener <rguenther@suse.de> > Sent: Friday, May 31, 2024 8:41 PM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > -> float and int <-> float. > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > -----Original Message----- > > > From: Richard Biener <rguenther@suse.de> > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > > ubizjak@gmail.com > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for > > > int -> int, float > > > -> float and int <-> float. > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > gcc/ChangeLog: > > > > > > > > PR target/107432 > > > > * tree-vect-generic.cc > > > > (supportable_indirect_narrowing_operation): New function for > > > > support indirect narrowing convert. > > > > (supportable_indirect_widening_operation): New function for > > > > support indirect widening convert. > > > > (expand_vector_conversion): Support convert for int -> int, > > > > float -> float and int <-> float. > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > PR target/107432 > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > * gcc.target/i386/pr107432-6.c: Ditto. 
> > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > --- > > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc > > > > index > > > > ab640096ca2..0bedb53d9f9 100644 > > > > --- a/gcc/tree-vect-generic.cc > > > > +++ b/gcc/tree-vect-generic.cc > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not > > > > see #include "gimple-match.h" > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > #include "optabs-libfuncs.h" > > > > +#include "cfgloop.h" > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > +conversion > > > for > > > > + float <-> int, like double -> char. */ bool > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > + enum tree_code code, > > > > + tree lhs, > > > > + tree arg) > > > > +{ > > > > + gimple *g; > > > > + tree ret_type = TREE_TYPE (lhs); > > > > + tree arg_type = TREE_TYPE (arg); > > > > + tree new_rhs; > > > > + > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= > > > arg_elt_bits) > > > > + return false; > > > > + > > > > + unsigned short target_size; > > > > + scalar_mode tmp_cvt_mode; > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); > > > > + tree cvt_type = NULL_TREE; tmp_cvt_mode = lhs_mode; > > > > + target_size = GET_MODE_SIZE (rhs_mode); > > > > + > > > > + opt_scalar_mode mode_iter; > > > > + enum tree_code tc1, tc2; > > > > + unsigned HOST_WIDE_INT nelts > > > > 
+ = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > + > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > + { > > > > + tmp_cvt_mode = mode_iter.require (); > > > > + > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > + break; > > > > + > > > > + scalar_mode cvt_mode; > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > + break; > > > > + > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > > (arg_type); > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > + isUnsigned); > > > > + > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > + if (cvt_type == NULL_TREE > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > + ret_type, > > > > + cvt_type, &tc1) > > > > + || !supportable_convert_operation ((tree_code) code, > > > > + cvt_type, > > > > + arg_type, &tc2)) > > > > + continue; > > > > + > > > > + new_rhs = make_ssa_name (cvt_type); > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > + gsi_replace (gsi, g, false); > > > > + return true; > > > > + } > > > > + return false; > > > > +} > > > > + > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > +conversion > > > for > > > > + float <-> int, like char -> double. 
*/ bool > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > > + enum tree_code code, > > > > + tree lhs, > > > > + tree arg) > > > > +{ > > > > + gimple *g; > > > > + tree ret_type = TREE_TYPE (lhs); > > > > + tree arg_type = TREE_TYPE (arg); > > > > + tree new_rhs; > > > > + > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > > + return false; > > > > + > > > > + unsigned short target_size; > > > > + scalar_mode tmp_cvt_mode; > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); > > > > + tree cvt_type = NULL_TREE; target_size = GET_MODE_SIZE > > > > + (lhs_mode); int rhs_size = GET_MODE_BITSIZE (rhs_mode); if > > > > + (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode)) > > > > + return false; > > > > + > > > > + opt_scalar_mode mode_iter; > > > > + enum tree_code tc1, tc2; > > > > + unsigned HOST_WIDE_INT nelts > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > + > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > + { > > > > + tmp_cvt_mode = mode_iter.require (); > > > > + > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > + break; > > > > + > > > > + scalar_mode cvt_mode; > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > + break; > > > > + > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > > (arg_type); > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > + isUnsigned); > > > > + > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > + if (cvt_type == NULL_TREE > > > > + || !supportable_convert_operation ((tree_code) code, > > > > + 
ret_type, > > > > + cvt_type, &tc1) > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > + cvt_type, > > > > + arg_type, &tc2)) > > > > + continue; > > > > + > > > > + new_rhs = make_ssa_name (cvt_type); > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > + gsi_replace (gsi, g, false); > > > > + return true; > > > > + } > > > > + return false; > > > > +} > > > > + > > > > > > So the above improves the situation where the target can handle the > > > > two-step conversion. It doesn't really allow this to work for too > > > > large vectors AFAICS (nor does it try pack/unpack for any of the > > > > conversions). It also still duplicates code that's in the > > > > vectorizer. I think you should be able to use > > > > supportable_narrowing_operation and possibly even > > > > supportable_widening_operation (though that needs refactoring to > > > > avoid the vectorizer internal stmt_vec_info type - possibly simply by gating > > the respective code on a non-NULL vinfo). Both support multi-step conversions. > > > > > > > > > > I tried to use supportable_narrowing_operation and I ran into two questions: > > > > > > 1) supportable_narrowing_operation supports v2df->v16qi, but I don't know > > > which optab can help me convert v16qi to v2qi. > > > > Its API is a bit tricky, but for v2df -> v2qi (I expect you'll have an equal number of > > lanes in/out for .CONVERT_VECTOR) it likely outputs a multi-step conversion > > where you have to look into *INTERM_TYPES and second-guess the operation > > code to use for the intermediate steps (IIRC the intermediate steps all use either > > PACK/UNPACK or CONVERT, never FLOAT/FIX). > > > > I made a mistake in what I said before. I think > supportable_narrowing_operation doesn't support v2df->v2qi; it only uses > VEC_PACK_TRUNC_EXPR in its intermediate steps. This makes it require > that vectype_in and vectype_out have the same size to return true.
I want to make sure I'm doing the right thing: I could build a tmp_type via build_nonstandard_integer_type and get_same_sized_vectype, and then use tree_vec_extract to extract a v2qi from the v16qi after supportable_narrowing_operation. > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), but the result is > > not what I expected, because it only uses vec_pack_trunc. I expect it > > can use vcvttpd2dq + vpmovdw. > > With -O3 -fno-tree-loop-vectorize that's what you get. What you see is because > of the restriction of the loop vectorizer to work on a single vector size only. > Yes, it works, but the program runs the NONE part (tree-vect-stmts.cc:5357) instead of the NARROW_DST part (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the code from line 5373 to line 5455 as a function. This avoids reinventing the wheel, and I get the results I'm looking for. Beyond wrapping that function: if you are concerned that our modifications are not general enough, I think we can add supportable_narrowing/widening_operation after the current single-step VEC_CONVERT (line 1972 and line 2078), so that it tries a single step first and then falls back to multiple steps. If you agree, I'd like to remove my changes for indirect conversions for now and keep only the direct conversions, so that I can merge the three current patches into trunk first, and then add the change about indirect conversions later. BRs, Lin > > > If I can solve the first question and make the function better (maybe > > support trunc<vectype_in><vectype_out>), I'd be happy to use it > > directly. I prefer my scheme for now. My function is more like > > supportable_convert_operation. Perhaps we can modify > > supportable_narrowing_operation, but I think that should be another > > patch; it would influence the vectorizer. > > But since you are doing a multi-step conversion this is really what > supportable_narrowing_operation is about. I don't think we want to re-invent > the wheel here.
Likewise your approach won't get you to use > VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current single- > step .CONVERT_VECTOR lowering). > supportable_narrowing_operation also checks for this. > > Richard. > > > > BRs, > > Lin > > > > > > > > > /* Expand VEC_CONVERT ifn call. */ > > > > > > > > static void > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > > (gimple_stmt_iterator *gsi) > > > > else if (ret_elt_bits > arg_elt_bits) > > > > modifier = WIDEN; > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > + { > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > + gsi_replace (gsi, g, false); > > > > + return; > > > > + } > > > > + > > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > > + return; > > > > + > > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > > + return; > > > > + > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > FLOAT_EXPR)) > > > > { > > > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > - { > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > - gsi_replace (gsi, g, false); > > > > - return; > > > > - } > > > > /* Can't use get_compute_type here, as > supportable_convert_operation > > > > doesn't necessarily use an optab and needs two arguments. */ > > > > tree vec_compute_type > > > > > > > > > > -- > > > Richard Biener <rguenther@suse.de> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > Nuernberg, Germany; > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > Nuernberg) > > > > -- > Richard Biener <rguenther@suse.de> > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 33+ messages in thread
* RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-03 8:23 ` Hu, Lin1 @ 2024-06-03 9:02 ` Richard Biener 2024-06-03 9:26 ` Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Richard Biener @ 2024-06-03 9:02 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao, ubizjak On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > -----Original Message----- > > From: Richard Biener <rguenther@suse.de> > > Sent: Friday, May 31, 2024 8:41 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > > -> float and int <-> float. > > > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > > > -----Original Message----- > > > > From: Richard Biener <rguenther@suse.de> > > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > > > ubizjak@gmail.com > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for > > > > int -> int, float > > > > -> float and int <-> float. > > > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > PR target/107432 > > > > > * tree-vect-generic.cc > > > > > (supportable_indirect_narrowing_operation): New function for > > > > > support indirect narrowing convert. > > > > > (supportable_indirect_widening_operation): New function for > > > > > support indirect widening convert. > > > > > (expand_vector_conversion): Support convert for int -> int, > > > > > float -> float and int <-> float. > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > PR target/107432 > > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > > * gcc.target/i386/pr107432-4.c: Ditto. 
> > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > > --- > > > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc > > > > > index > > > > > ab640096ca2..0bedb53d9f9 100644 > > > > > --- a/gcc/tree-vect-generic.cc > > > > > +++ b/gcc/tree-vect-generic.cc > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not > > > > > see #include "gimple-match.h" > > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > > #include "optabs-libfuncs.h" > > > > > +#include "cfgloop.h" > > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > +conversion > > > > for > > > > > + float <-> int, like double -> char. 
*/ bool > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > > + enum tree_code code, > > > > > + tree lhs, > > > > > + tree arg) > > > > > +{ > > > > > + gimple *g; > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > + tree new_rhs; > > > > > + > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= > > > > arg_elt_bits) > > > > > + return false; > > > > > + > > > > > + unsigned short target_size; > > > > > + scalar_mode tmp_cvt_mode; > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); > > > > > + tree cvt_type = NULL_TREE; tmp_cvt_mode = lhs_mode; > > > > > + target_size = GET_MODE_SIZE (rhs_mode); > > > > > + > > > > > + opt_scalar_mode mode_iter; > > > > > + enum tree_code tc1, tc2; > > > > > + unsigned HOST_WIDE_INT nelts > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > + > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > + { > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > + > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > + break; > > > > > + > > > > > + scalar_mode cvt_mode; > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > + break; > > > > > + > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > > > (arg_type); > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > + isUnsigned); > > > > > + > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > + if (cvt_type == NULL_TREE > > > > > + || !supportable_convert_operation 
((tree_code) NOP_EXPR, > > > > > + ret_type, > > > > > + cvt_type, &tc1) > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > + cvt_type, > > > > > + arg_type, &tc2)) > > > > > + continue; > > > > > + > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > + gsi_replace (gsi, g, false); > > > > > + return true; > > > > > + } > > > > > + return false; > > > > > +} > > > > > + > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > +conversion > > > > for > > > > > + float <-> int, like char -> double. */ bool > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > > > + enum tree_code code, > > > > > + tree lhs, > > > > > + tree arg) > > > > > +{ > > > > > + gimple *g; > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > + tree new_rhs; > > > > > + > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > > > + (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > > > + return false; > > > > > + > > > > > + unsigned short target_size; > > > > > + scalar_mode tmp_cvt_mode; > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); > > > > > + tree cvt_type = NULL_TREE; target_size = GET_MODE_SIZE > > > > > + (lhs_mode); int rhs_size = GET_MODE_BITSIZE (rhs_mode); if > > > > > + (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode)) > > > > > + return false; > > > > > + > > > > > + opt_scalar_mode mode_iter; > > > > > + enum tree_code tc1, tc2; > > > > > + unsigned HOST_WIDE_INT nelts > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > + > > > > > + 
FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > + { > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > + > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > + break; > > > > > + > > > > > + scalar_mode cvt_mode; > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > + break; > > > > > + > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > > > > (arg_type); > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > + isUnsigned); > > > > > + > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > + if (cvt_type == NULL_TREE > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > + ret_type, > > > > > + cvt_type, &tc1) > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > + cvt_type, > > > > > + arg_type, &tc2)) > > > > > + continue; > > > > > + > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > + gsi_replace (gsi, g, false); > > > > > + return true; > > > > > + } > > > > > + return false; > > > > > +} > > > > > + > > > > > > > > So the above improve the situation where the target can handle the > > > > two-step conversion. It doesn't really allow this to work for too > > > > large vectors AFAICS (nor does it try pack/unpack for any of the > > > > conversions). It also still duplicates code that's in the > > > > vectorizer. I think you should be able to use > > > > supportable_narrowing_operation and possibly even > > > > supportable_widening_operation (though that needs refatoring to > > > > avoid the vectorizer internal stmt_vec_info type - possibly simply by gating > > the respective code on a non-NULL vinfo). 
Both support multi-step conversions. > > > > > > > > > I tried to use supportable_narrowing_operation and I ran into two questions: > > > > > > 1) supportable_narrowing_operation supports v2df->v16qi, but I don't know > > > which optab can help me convert v16qi to v2qi. > > > > Its API is a bit tricky, but for v2df -> v2qi (I expect you'll have an equal number of > > lanes in/out for .CONVERT_VECTOR) it likely outputs a multi-step conversion > > where you have to look into *INTERM_TYPES and second-guess the operation > > code to use for the intermediate steps (IIRC the intermediate steps all use either > > PACK/UNPACK or CONVERT, never FLOAT/FIX). > > > > I made a mistake in what I said before. I think > supportable_narrowing_operation doesn't support v2df->v2qi; it only uses > VEC_PACK_TRUNC_EXPR in its intermediate steps. This makes it require > that vectype_in and vectype_out have the same size to return true. I > want to make sure I'm doing the right thing: I could build a tmp_type via > build_nonstandard_integer_type and get_same_sized_vectype, and then use > tree_vec_extract to extract a v2qi from the v16qi after > supportable_narrowing_operation. Yes. It looks like the vectorizer, when the vector types' numbers of lanes agree, goes the 'NONE' conversion path, checks supportable_convert_operation, and then has open-coded handling for /* For conversions between float and integer types try whether we can use intermediate signed integer types to support the conversion. */ That means I was wrong in indicating that supportable_narrowing_operation is for element narrowing; it is for number-of-lane "narrowing". That said, vectorizable_conversion, in the NONE case, has handling that should be split out into a function that's then usable from vector lowering as well, so that both vectorization and lowering handle the same cases. The interface would be similar to supportable_narrowing_operation.
> > > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is > > > not what I expected, because it only use vec_pack_trunc. I expect it > > > can use vcvttpd2dq + vpmovdw. > > > > With -O3 -fno-tree-loop-vectorize that's what you get. What you see is because > > of the restriction of the loop vectorizer to work on a single vector size only. > > > > Yes, it works, but the program runs the NONE part > (tree-vect-stmts.cc:5357) instead of the NARROW_DST part > (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the > code from line:5373 to line:5455 as a function. This avoids duplicating > the wheel, and I get the results I'm looking for. Yeah. > In addition to wrapping the function. If you are motivated by the fact > that our modifications are not generalized enough, I think we can add > supportable_narrowing/widening_operation after the current single step > VEC_CONVERT (line 1972 and line 2078). It should try to use a single > step and then use multiple steps. If you agree, I'd like to remove my > changes about indirect conversions for now, and keep only the direct > conversions, so that I can merge the three current patches into the > trunk first, and then add the change about indirect conversions later. I think it should go like finding the largest compute_vectype pair (source/destination) that we can handle either directly or indirectly via the new function. Richard. > BRs, > Lin > > > > > > If I can solve the first question and the function be better (maybe > > > support trunc<vectype_in><vectype_out>), I'd be happy to use it > > > directly. I prefer my scheme for now. My functions is more like > > > supportable_convert_operation. Perhaps, we can modify > > > supportable_narrowing_operation, but I think it should be another > > > patch, it will influence vectorizer. > > > > But since you are doing a multi-step conversion this is really what > > supportable_narrowing_operation is about. 
I don't think we want to re-invent > > the wheel here. Likewise your approach won't get you to use > > VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current single- > > step .CONVERT_VECTOR lowering). > > supportable_narrowing_operation also checks for this. > > > > Richard. > > > > > > > BRs, > > > Lin > > > > > > > > > > > > /* Expand VEC_CONVERT ifn call. */ > > > > > > > > > > static void > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > > > (gimple_stmt_iterator *gsi) > > > > > else if (ret_elt_bits > arg_elt_bits) > > > > > modifier = WIDEN; > > > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > > + { > > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > > + gsi_replace (gsi, g, false); > > > > > + return; > > > > > + } > > > > > + > > > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > > > + return; > > > > > + > > > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > > > + return; > > > > > + > > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > > FLOAT_EXPR)) > > > > > { > > > > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > > - { > > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > > - gsi_replace (gsi, g, false); > > > > > - return; > > > > > - } > > > > > /* Can't use get_compute_type here, as > > supportable_convert_operation > > > > > doesn't necessarily use an optab and needs two arguments. 
*/ > > > > > tree vec_compute_type > > > > > > > > > > -- > > > > Richard Biener <rguenther@suse.de> > > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > > Nuernberg, Germany; > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > Nuernberg) > > > > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE Software Solutions Germany GmbH, > > Frankenstrasse 146, 90461 Nuernberg, Germany; > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
* RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-03 9:02 ` Richard Biener @ 2024-06-03 9:26 ` Hu, Lin1 2024-06-03 9:30 ` Richard Biener 0 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-06-03 9:26 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches, Liu, Hongtao, ubizjak > -----Original Message----- > From: Richard Biener <rguenther@suse.de> > Sent: Monday, June 3, 2024 5:03 PM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > -> float and int <-> float. > > On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > > > -----Original Message----- > > > From: Richard Biener <rguenther@suse.de> > > > Sent: Friday, May 31, 2024 8:41 PM > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > > ubizjak@gmail.com > > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for > > > int -> int, float > > > -> float and int <-> float. > > > > > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > > > > > -----Original Message----- > > > > > From: Richard Biener <rguenther@suse.de> > > > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao > > > > > <hongtao.liu@intel.com>; ubizjak@gmail.com > > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn > > > > > for int -> int, float > > > > > -> float and int <-> float. > > > > > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > PR target/107432 > > > > > > * tree-vect-generic.cc > > > > > > (supportable_indirect_narrowing_operation): New function for > > > > > > support indirect narrowing convert. 
> > > > > > (supportable_indirect_widening_operation): New function for > > > > > > support indirect widening convert. > > > > > > (expand_vector_conversion): Support convert for int -> int, > > > > > > float -> float and int <-> float. > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > PR target/107432 > > > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > > > --- > > > > > > diff --git a/gcc/tree-vect-generic.cc > > > > > > b/gcc/tree-vect-generic.cc index > > > > > > ab640096ca2..0bedb53d9f9 100644 > > > > > > --- a/gcc/tree-vect-generic.cc > > > > > > +++ b/gcc/tree-vect-generic.cc > > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If > > > > > > not see #include "gimple-match.h" > > > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > > > #include "optabs-libfuncs.h" > > > > > > +#include "cfgloop.h" > > > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > +conversion > > > > > for > > > > > > + float <-> int, like double -> char. 
*/ bool > > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > > > + enum tree_code code, > > > > > > + tree lhs, > > > > > > + tree arg) > > > > > > +{ > > > > > > + gimple *g; > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > + tree new_rhs; > > > > > > + > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > + if (code != FIX_TRUNC_EXPR || flag_trapping_math || > > > > > > + ret_elt_bits >= > > > > > arg_elt_bits) > > > > > > + return false; > > > > > > + > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > + tmp_cvt_mode = lhs_mode; target_size = GET_MODE_SIZE > > > > > > + (rhs_mode); > > > > > > + > > > > > > + opt_scalar_mode mode_iter; > > > > > > + enum tree_code tc1, tc2; > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > + > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > + { > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > + > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > + break; > > > > > > + > > > > > > + scalar_mode cvt_mode; > > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > + break; > > > > > > + > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > + TYPE_UNSIGNED > > > > > (arg_type); > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > + isUnsigned); > > > > > > + > > > > > > + cvt_type = 
build_vector_type (cvt_type, nelts); > > > > > > + if (cvt_type == NULL_TREE > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > + ret_type, > > > > > > + cvt_type, &tc1) > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > + cvt_type, > > > > > > + arg_type, &tc2)) > > > > > > + continue; > > > > > > + > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > + gsi_replace (gsi, g, false); > > > > > > + return true; > > > > > > + } > > > > > > + return false; > > > > > > +} > > > > > > + > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > +conversion > > > > > for > > > > > > + float <-> int, like char -> double. */ bool > > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > > > > + enum tree_code code, > > > > > > + tree lhs, > > > > > > + tree arg) > > > > > > +{ > > > > > > + gimple *g; > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > + tree new_rhs; > > > > > > + > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > > > > + return false; > > > > > > + > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > + target_size = GET_MODE_SIZE (lhs_mode); int rhs_size = > > > > > > + GET_MODE_BITSIZE (rhs_mode); if (!int_mode_for_size (rhs_size, > 0).exists (&tmp_cvt_mode)) > > > > > > + return false; > > > > > > + > > > > > > + 
opt_scalar_mode mode_iter; > > > > > > + enum tree_code tc1, tc2; > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > + > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > + { > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > + > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > + break; > > > > > > + > > > > > > + scalar_mode cvt_mode; > > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > + break; > > > > > > + > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > + TYPE_UNSIGNED > > > > > (arg_type); > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > + isUnsigned); > > > > > > + > > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > > + if (cvt_type == NULL_TREE > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > + ret_type, > > > > > > + cvt_type, &tc1) > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > + cvt_type, > > > > > > + arg_type, &tc2)) > > > > > > + continue; > > > > > > + > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > + gsi_replace (gsi, g, false); > > > > > > + return true; > > > > > > + } > > > > > > + return false; > > > > > > +} > > > > > > + > > > > > > > > > > So the above improve the situation where the target can handle > > > > > the two-step conversion. It doesn't really allow this to work > > > > > for too large vectors AFAICS (nor does it try pack/unpack for > > > > > any of the conversions). 
It also still duplicates code that's > > > > > in the vectorizer. I think you should be able to use > > > > > supportable_narrowing_operation and possibly even > > > > > supportable_widening_operation (though that needs refactoring to > > > > > avoid the vectorizer internal stmt_vec_info type - possibly > > > > > simply by gating > > > the respective code on a non-NULL vinfo). Both support multi-step > conversions. > > > > > > > > > > > > I tried to use supportable_narrowing_operation and I ran into two questions: > > > > > > > > 1) supportable_narrowing_operation supports v2df->v16qi, but I don't know > > > > which optab can help me convert v16qi to v2qi. > > > > > > Its API is a bit tricky but for v2df -> v2qi (I expect you'll have > > > an equal number of lanes in/out for .CONVERT_VECTOR) it likely > > > outputs a multi-step conversion where you have to look into > > > *INTERM_TYPES and second-guess the operation code to use for the > > > intermediate steps (IIRC the intermediate steps all use either PACK/UNPACK > or CONVERT, never FLOAT/FIX). > > > > > > > I made a mistake in what I said before. I think > > supportable_narrowing_operation doesn't support v2df->v2qi, it only > > uses VEC_PACK_TRUNC_EXPR in its intermediate steps. This makes it > > require that vectype_in and vectype_out have the same size to return > > true. I want to make sure I'm doing the right thing, I can build a > > tmp_type by build_nonstandard_integer_type and get_same_sized_vectype. > > And use tree_vec_extract to extract v2qi from v16qi after > > supportable_narrowing_operation. > > Yes. It looks like the vectorizer, when the vector types' number of lanes agree, > goes the 'NONE' conversion path, checks supportable_convert_operation and > then has open-coded handling for > > /* For conversions between float and integer types try whether > we can use intermediate signed integer types to support the > conversion. 
*/ > > that means I was wrong in indicating supportable_narrowing_operation was for > element narrowing; it is for number-of-lane "narrowing". > > That said, vectorizable_conversion, in the NONE case, has handling that should > be split out into a function that's usable also from vector lowering then so that > both vectorization and lowering handle the same cases. The interface would be > similar to supportable_narrowing_operation. > > > > > > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is > > > > not what I expected, because it only uses vec_pack_trunc. I expect it > > > > can use vcvttpd2dq + vpmovdw. 
> Thanks, I will wrap the code in the new function and put out a new version of this patch. I have a small question: what does "finding the largest compute_vectype pair" mean? Is it some piece of code from gcc? BRs, Lin > > > > > > > > > > If I can solve the first question and the function were improved > > > > (maybe to support trunc<vectype_in><vectype_out>), I'd be happy to > > > > use it directly. I prefer my scheme for now. My function is more > > > > like supportable_convert_operation. Perhaps we can modify > > > > supportable_narrowing_operation, but I think it should be another > > > > patch, it will influence the vectorizer. > > > > > > But since you are doing a multi-step conversion this is really what > > > supportable_narrowing_operation is about. I don't think we want to > > > re-invent the wheel here. Likewise your approach won't get you to > > > use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current > > > single-step .CONVERT_VECTOR lowering). > > > supportable_narrowing_operation also checks for this. > > > > > > Richard. > > > > > > > > > > BRs, > > > > Lin > > > > > > > > > > > > > > > /* Expand VEC_CONVERT ifn call. 
*/ > > > > > > > > > > > > static void > > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > > > > (gimple_stmt_iterator *gsi) > > > > > > else if (ret_elt_bits > arg_elt_bits) > > > > > > modifier = WIDEN; > > > > > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, > &code1)) > > > > > > + { > > > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > > > + gsi_replace (gsi, g, false); > > > > > > + return; > > > > > > + } > > > > > > + > > > > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > > > > + return; > > > > > > + > > > > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > > > > + return; > > > > > > + > > > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > > > FLOAT_EXPR)) > > > > > > { > > > > > > - if (supportable_convert_operation (code, ret_type, arg_type, > &code1)) > > > > > > - { > > > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > > > - gsi_replace (gsi, g, false); > > > > > > - return; > > > > > > - } > > > > > > /* Can't use get_compute_type here, as > > > supportable_convert_operation > > > > > > doesn't necessarily use an optab and needs two arguments. 
*/ > > > > > > tree vec_compute_type > > > > > > > > > > > > > > > > -- > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > > Nuernberg) > > > > > > > > > > -- > > > Richard Biener <rguenther@suse.de> > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > Nuernberg, Germany; > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > Nuernberg) > > > > -- > Richard Biener <rguenther@suse.de> > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 33+ messages in thread
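[The double -> char case discussed in the message above can be pictured as a standalone C sketch. This is an illustration of the lowering the patch aims for, not code from the patch; the type and function names are local to the sketch.]

```c
/* Illustration: double -> char vector conversion, written both as a
   single __builtin_convertvector (one .VEC_CONVERT call in GIMPLE)
   and as the hand-written two-step sequence the indirect narrowing
   path is meant to produce: FIX_TRUNC_EXPR to a same-lane-count
   integer vector, then an integer-truncating NOP_EXPR.  */
typedef double v2df __attribute__ ((__vector_size__ (16)));
typedef int v2si __attribute__ ((__vector_size__ (8)));
typedef signed char v2qi __attribute__ ((__vector_size__ (2)));

v2qi cvt_direct (v2df a)
{
  return __builtin_convertvector (a, v2qi);
}

v2qi cvt_two_step (v2df a)
{
  /* FIX_TRUNC_EXPR: float -> int truncation (vcvttpd2dq-style).  */
  v2si t = __builtin_convertvector (a, v2si);
  /* NOP_EXPR: integer narrowing (vpmovdb-style).  */
  return __builtin_convertvector (t, v2qi);
}
```

When the target supports both steps (e.g. AVX512VL), the two-step form avoids scalarizing the conversion.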
* RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-03 9:26 ` Hu, Lin1 @ 2024-06-03 9:30 ` Richard Biener 2024-06-11 6:49 ` [PATCH 1/3 v3] " Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Richard Biener @ 2024-06-03 9:30 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao, ubizjak On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > -----Original Message----- > > From: Richard Biener <rguenther@suse.de> > > Sent: Monday, June 3, 2024 5:03 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float > > -> float and int <-> float. > > > > On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > > > > > -----Original Message----- > > > > From: Richard Biener <rguenther@suse.de> > > > > Sent: Friday, May 31, 2024 8:41 PM > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > > > ubizjak@gmail.com > > > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for > > > > int -> int, float > > > > -> float and int <-> float. > > > > > > > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > > > > > > > -----Original Message----- > > > > > > From: Richard Biener <rguenther@suse.de> > > > > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > > > > To: Hu, Lin1 <lin1.hu@intel.com> > > > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao > > > > > > <hongtao.liu@intel.com>; ubizjak@gmail.com > > > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn > > > > > > for int -> int, float > > > > > > -> float and int <-> float. 
> > > > > > > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > > > PR target/107432 > > > > > > > * tree-vect-generic.cc > > > > > > > (supportable_indirect_narrowing_operation): New function for > > > > > > > support indirect narrowing convert. > > > > > > > (supportable_indirect_widening_operation): New function for > > > > > > > support indirect widening convert. > > > > > > > (expand_vector_conversion): Support convert for int -> int, > > > > > > > float -> float and int <-> float. > > > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > > > PR target/107432 > > > > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > > > > --- > > > > > > > diff --git a/gcc/tree-vect-generic.cc > > > > > > > b/gcc/tree-vect-generic.cc index > > > > > > > ab640096ca2..0bedb53d9f9 100644 > > > > > > > --- a/gcc/tree-vect-generic.cc > > > > > > > +++ b/gcc/tree-vect-generic.cc > > > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If > > > > > > > not see #include "gimple-match.h" > > > > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > > > > #include "optabs-libfuncs.h" > > > > > > > +#include "cfgloop.h" > > > > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. 
> > > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > > > > return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > > +conversion > > > > > > for > > > > > > > + float <-> int, like double -> char. */ bool > > > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > > > > + enum tree_code code, > > > > > > > + tree lhs, > > > > > > > + tree arg) > > > > > > > +{ > > > > > > > + gimple *g; > > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > > + tree new_rhs; > > > > > > > + > > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > > + if (code != FIX_TRUNC_EXPR || flag_trapping_math || > > > > > > > + ret_elt_bits >= > > > > > > arg_elt_bits) > > > > > > > + return false; > > > > > > > + > > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > > + tmp_cvt_mode = lhs_mode; target_size = GET_MODE_SIZE > > > > > > > + (rhs_mode); > > > > > > > + > > > > > > > + opt_scalar_mode mode_iter; > > > > > > > + enum tree_code tc1, tc2; > > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > > + > > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > > + { > > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > > + > > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > > + break; > > > > > > > + > > > > > > > + scalar_mode cvt_mode; > > > > > > > + int 
tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > > + break; > > > > > > > + > > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > > + TYPE_UNSIGNED > > > > > > (arg_type); > > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > > + isUnsigned); > > > > > > > + > > > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > > > + if (cvt_type == NULL_TREE > > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > > + ret_type, > > > > > > > + cvt_type, &tc1) > > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > > + cvt_type, > > > > > > > + arg_type, &tc2)) > > > > > > > + continue; > > > > > > > + > > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > > + gsi_replace (gsi, g, false); > > > > > > > + return true; > > > > > > > + } > > > > > > > + return false; > > > > > > > +} > > > > > > > + > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > > +conversion > > > > > > for > > > > > > > + float <-> int, like char -> double. 
*/ bool > > > > > > > +supportable_indirect_widening_operation (gimple_stmt_iterator *gsi, > > > > > > > + enum tree_code code, > > > > > > > + tree lhs, > > > > > > > + tree arg) > > > > > > > +{ > > > > > > > + gimple *g; > > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > > + tree arg_type = TREE_TYPE (arg); > > > > > > > + tree new_rhs; > > > > > > > + > > > > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); > > > > > > > + if (ret_elt_bits <= arg_elt_bits || code != FLOAT_EXPR) > > > > > > > + return false; > > > > > > > + > > > > > > > + unsigned short target_size; scalar_mode tmp_cvt_mode; > > > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > > > + (ret_type)); scalar_mode rhs_mode = GET_MODE_INNER > > > > > > > + (TYPE_MODE (arg_type)); tree cvt_type = NULL_TREE; > > > > > > > + target_size = GET_MODE_SIZE (lhs_mode); int rhs_size = > > > > > > > + GET_MODE_BITSIZE (rhs_mode); if (!int_mode_for_size (rhs_size, > > 0).exists (&tmp_cvt_mode)) > > > > > > > + return false; > > > > > > > + > > > > > > > + opt_scalar_mode mode_iter; > > > > > > > + enum tree_code tc1, tc2; > > > > > > > + unsigned HOST_WIDE_INT nelts > > > > > > > + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > > > > + > > > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > > > > > > + { > > > > > > > + tmp_cvt_mode = mode_iter.require (); > > > > > > > + > > > > > > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > > > > > > + break; > > > > > > > + > > > > > > > + scalar_mode cvt_mode; > > > > > > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > > > > > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) > > > > > > > + break; > > > > > > > + > > > > > > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > > > > > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || > > > > > > > + TYPE_UNSIGNED > > > > > > 
(arg_type); > > > > > > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > > > > > > + isUnsigned); > > > > > > > + > > > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > > > + if (cvt_type == NULL_TREE > > > > > > > + || !supportable_convert_operation ((tree_code) code, > > > > > > > + ret_type, > > > > > > > + cvt_type, &tc1) > > > > > > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > > > > > > + cvt_type, > > > > > > > + arg_type, &tc2)) > > > > > > > + continue; > > > > > > > + > > > > > > > + new_rhs = make_ssa_name (cvt_type); > > > > > > > + g = vect_gimple_build (new_rhs, tc2, arg); > > > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > > > + g = gimple_build_assign (lhs, tc1, new_rhs); > > > > > > > + gsi_replace (gsi, g, false); > > > > > > > + return true; > > > > > > > + } > > > > > > > + return false; > > > > > > > +} > > > > > > > + > > > > > > > > > > > > So the above improve the situation where the target can handle > > > > > > the two-step conversion. It doesn't really allow this to work > > > > > > for too large vectors AFAICS (nor does it try pack/unpack for > > > > > > any of the conversions). It also still duplicates code that's > > > > > > in the vectorizer. I think you should be able to use > > > > > > supportable_narrowing_operation and possibly even > > > > > > supportable_widening_operation (though that needs refatoring to > > > > > > avoid the vectorizer internal stmt_vec_info type - possibly > > > > > > simply by gating > > > > the respective code on a non-NULL vinfo). Both support multi-step > > conversions. > > > > > > > > > > > > > > > > I tried to use supportable_narrowing_operation and I met two questions: > > > > > > > > > > 1) supportable_narrowing_operation support v2df->v16qi, but I don't know > > > > > which optab can help me convert v16qi to v2qi. 
> > > > > > > > It's API is a bit tricky but for v2df -> v2qi (I expect you'll have > > > > an equal number of lanes in/out for .CONVERT_VECTOR) it likely > > > > outputs a multi-step conversion where you have to look into > > > > *INTERM_TYPES and second-guess the operation code to use for the > > > > intermediate steps (IIRC the intermediate steps all use either PACK/UNPACK > > or CONVERT, never FLOAT/FIX). > > > > > > > > > > I made a mistake in what I said before. I think > > > supportable_narrowing_operation doesn't support v2df->v2qi, it only > > > use VEC_PACK_TRUNC_EXPRT in its intermediate steps. This makes it > > > require that vectype_in and vectype_out have the same size to return > > > true. I want to make sure I'm doing the right thing, I can build a > > > tmp_type by build_nonstandard_integer_type and get_same_sized_vectype. > > > And use tree_vec_extract to extract v2qi from v16qi after > > > supportable_narrowing_operation. > > > > Yes. It looks like the vectorizer, when the vector types number of lanes agree > > goes the 'NONE' conversion path, checks supportable_convert_operation and > > then has open-coded handling for > > > > /* For conversions between float and integer types try whether > > we can use intermediate signed integer types to support the > > conversion. */ > > > > that means I was wrong in indicating supportable_narrowing_operation was for > > element narrowing, it is for number-of-lane "narrowing". > > > > That said, vectorizable_conversion, in the NONE case has handling that should > > be split out into a function that's usable also from vector lowering then so that > > both vectorization and lowering handle the same cases. The interface would be > > similar to supportable_narrowing_operation. > > > > > > > > > > > 2) I tried a testcase (https://godbolt.org/z/z88xYW85e), this result is > > > > > not what I expected, because it only use vec_pack_trunc. I expect it > > > > > can use vcvttpd2dq + vpmovdw. 
> > > > > > > > With -O3 -fno-tree-loop-vectorize that's what you get. What you see > > > > is because of the restriction of the loop vectorizer to work on a single vector > > size only. > > > > > > > > > > Yes, it works, but the program runs the NONE part > > > (tree-vect-stmts.cc:5357) instead of the NARROW_DST part > > > (tree-vect-stmts.cc:5545). I think maybe we can wrap the part of the > > > code from line:5373 to line:5455 as a function. This avoids > > > duplicating the wheel, and I get the results I'm looking for. > > > > Yeah. > > > > > In addition to wrapping the function. If you are motivated by the fact > > > that our modifications are not generalized enough, I think we can add > > > supportable_narrowing/widening_operation after the current single step > > > VEC_CONVERT (line 1972 and line 2078). It should try to use a single > > > step and then use multiple steps. If you agree, I'd like to remove my > > > changes about indirect conversions for now, and keep only the direct > > > conversions, so that I can merge the three current patches into the > > > trunk first, and then add the change about indirect conversions later. > > > > I think it should go like finding the largest compute_vectype pair > > (source/destination) that we can handle either directly or indirectly via the new > > function. > > > > Richard. > > > > Thanks, I will wrap the code in the new function and put out a new > version of this patch. I have a small question, what does "finding the > largest compute_vectype pair" mean? Some piece of code from gcc? No, I mean what vector lowering does for .VEC_CONVERT right now, it uses /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. 
*/ tree vec_compute_type = type_for_widest_vector_mode (arg_type, mov_optab); or if (optab1) compute_type = get_compute_type (code1, optab1, arg_type); and then expand_vector_piecewise to emit code for say V4SF -> V4QI from V16SF -> V16QI .VEC_CONVERT. Richard. > BRs, > Lin > > > > > > > > > > > > > > > If I can solve the first question and the function be better > > > > > (maybe support trunc<vectype_in><vectype_out>), I'd be happy to > > > > > use it directly. I prefer my scheme for now. My functions is more > > > > > like supportable_convert_operation. Perhaps, we can modify > > > > > supportable_narrowing_operation, but I think it should be another > > > > > patch, it will influence vectorizer. > > > > > > > > But since you are doing a multi-step conversion this is really what > > > > supportable_narrowing_operation is about. I don't think we want to > > > > re-invent the wheel here. Likewise your approach won't get you to > > > > use VEC_[UN]PACK_HI/LO/EVEN/ODD either (supported by the current > > > > single- step .CONVERT_VECTOR lowering). > > > > supportable_narrowing_operation also checks for this. > > > > > > > > Richard. > > > > > > > > > > > > > BRs, > > > > > Lin > > > > > > > > > > > > > > > > > > /* Expand VEC_CONVERT ifn call. 
*/ > > > > > > > > > > > > > > static void > > > > > > > @@ -1871,14 +2009,21 @@ expand_vector_conversion > > > > > > (gimple_stmt_iterator *gsi) > > > > > > > else if (ret_elt_bits > arg_elt_bits) > > > > > > > modifier = WIDEN; > > > > > > > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, > > &code1)) > > > > > > > + { > > > > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > > > > + gsi_replace (gsi, g, false); > > > > > > > + return; > > > > > > > + } > > > > > > > + > > > > > > > + if (supportable_indirect_narrowing_operation(gsi, code, lhs, arg)) > > > > > > > + return; > > > > > > > + > > > > > > > + if (supportable_indirect_widening_operation(gsi, code, lhs, arg)) > > > > > > > + return; > > > > > > > + > > > > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > > > > FLOAT_EXPR)) > > > > > > > { > > > > > > > - if (supportable_convert_operation (code, ret_type, arg_type, > > &code1)) > > > > > > > - { > > > > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > > > > - gsi_replace (gsi, g, false); > > > > > > > - return; > > > > > > > - } > > > > > > > /* Can't use get_compute_type here, as > > > > supportable_convert_operation > > > > > > > doesn't necessarily use an optab and needs two arguments. 
*/ > > > > > > > tree vec_compute_type > > > > > > > > > > > > > > > > > > > -- > > > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > > > Nuernberg) > > > > > > > > > > > > > -- > > > > Richard Biener <rguenther@suse.de> > > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 > > > > Nuernberg, Germany; > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG > > > > Nuernberg) > > > > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE Software Solutions Germany GmbH, > > Frankenstrasse 146, 90461 Nuernberg, Germany; > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 33+ messages in thread
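[Richard's pointer to expand_vector_piecewise above can be pictured with a minimal C model. This is a conceptual sketch of the piecewise lowering strategy, not the GIMPLE the pass emits; all type and function names are local, and the piece width (V4SF here) is an assumption standing in for the widest supported vector mode.]

```c
/* Conceptual model of expand_vector_piecewise: when no optab handles
   the full V16SF -> V16QI conversion, lower it to four V4SF -> V4QI
   conversions, each of which the target can handle directly.  */
typedef float v16sf __attribute__ ((__vector_size__ (64)));
typedef float v4sf __attribute__ ((__vector_size__ (16)));
typedef signed char v16qi __attribute__ ((__vector_size__ (16)));
typedef signed char v4qi __attribute__ ((__vector_size__ (4)));

v16qi cvt_piecewise (v16sf a)
{
  v16qi r;
  for (int i = 0; i < 16; i += 4)
    {
      /* Extract one piece, convert it, and write the lanes back.  */
      v4sf piece = { a[i], a[i + 1], a[i + 2], a[i + 3] };
      v4qi c = __builtin_convertvector (piece, v4qi);
      r[i] = c[0];
      r[i + 1] = c[1];
      r[i + 2] = c[2];
      r[i + 3] = c[3];
    }
  return r;
}
```

In the real pass the piece type comes from type_for_widest_vector_mode / get_compute_type rather than being hard-coded.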
* [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-03 9:30 ` Richard Biener @ 2024-06-11 6:49 ` Hu, Lin1 2024-06-17 1:48 ` Hu, Lin1 2024-06-18 11:44 ` Richard Biener 0 siblings, 2 replies; 33+ messages in thread From: Hu, Lin1 @ 2024-06-11 6:49 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther I have wrapped the indirect-conversion code into a new function, supportable_indirect_convert_operation; its API follows that of supportable_narrowing/widening_operation. BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. * tree-vect-stmts.cc (vectorizable_conversion): Factor out the indirect conversion part. (supportable_indirect_convert_operation): New function. * tree-vectorizer.h (supportable_indirect_convert_operation): Declare the new function. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. 
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 ++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +++++++++++++ gcc/tree-vect-generic.cc | 33 ++- gcc/tree-vect-stmts.cc | 244 +++++++++++++-------- gcc/tree-vectorizer.h | 9 + 10 files changed, 1011 insertions(+), 92 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, 
__v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); +} + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); +} + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2su); +} + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); +} + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); +} + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); +} + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return 
__builtin_convertvector((__v4du)a, __v4hu); +} + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); +} + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); +} + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); +} + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); +} + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); +} + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); +} + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); +} + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); +} + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); +} + +__v4qu mm_cvtepu32_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); +} + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); +} + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); +} + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); +} + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); +} + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); +} + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return 
(__m256i)__builtin_convertvector((__v32hu)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i 
mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) +{ + return (__m128i)__builtin_convertvector(a, __v8hi); +} + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); +} + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2sf); +} + +__v4sf 
mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2hf); +} + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2si); +} + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); +} + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); +} + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { 
scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); +} + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); +} + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return 
__builtin_convertvector((__v8df)a, __v8qi); +} + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); +} + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); +} + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); +} + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); +} + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); +} + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); +} + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); +} + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); +} + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); +} + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); +} + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); +} + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); +} + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); +} + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); +} + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); +} + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); +} + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf 
a) +{ + return __builtin_convertvector((__v8hf)a, __v8qu); +} + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); +} + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..0ff5a97ed1a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,156 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); +} + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); +} + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); +} + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); +} + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); +} + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); +} + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); +} + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); +} + 
+__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); +} + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); +} + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2sf); +} + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); +} + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); +} + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); +} + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); +} + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); +} + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); +} + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); +} + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); +} + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); +} + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); +} + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); +} + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); +} + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); +} diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index 
ea0069f7a67..c38c0b9dda8 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see #include "gimple-match.h" #include "recog.h" /* FIXME: for insn_data */ #include "optabs-libfuncs.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" /* Build a ternary operation and gimplify it. Emit code before GSI. @@ -1870,14 +1872,33 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) else if (ret_elt_bits > arg_elt_bits) modifier = WIDEN; + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + { + g = gimple_build_assign (lhs, code1, arg); + gsi_replace (gsi, g, false); + return; + } + + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; + int multi_step_cvt = 0; + vec<tree> interm_types = vNULL; + if (supportable_indirect_convert_operation (NULL, + code, + ret_type, arg_type, + &code2, &code3, + &multi_step_cvt, + &interm_types, arg)) + { + new_rhs = make_ssa_name (interm_types[0]); + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); + gsi_replace (gsi, g, false); + return; + } + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) { - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) - { - g = gimple_build_assign (lhs, code1, arg); - gsi_replace (gsi, g, false); - return; - } /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. 
*/ tree vec_compute_type diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 05a169ecb2d..0aa608202ca 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, tree scalar_dest; tree op0, op1 = NULL_TREE; loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); - tree_code tc1, tc2; + tree_code tc1; code_helper code, code1, code2; code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; tree new_temp; @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, break; } - /* For conversions between float and integer types try whether - we can use intermediate signed integer types to support the - conversion. */ - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) - && (code == FLOAT_EXPR || - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) - { - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); - bool float_expr_p = code == FLOAT_EXPR; - unsigned short target_size; - scalar_mode intermediate_mode; - if (demotion) - { - intermediate_mode = lhs_mode; - target_size = GET_MODE_SIZE (rhs_mode); - } - else - { - target_size = GET_MODE_SIZE (lhs_mode); - if (!int_mode_for_size - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) - goto unsupported; - } - code1 = float_expr_p ? code : NOP_EXPR; - codecvt1 = float_expr_p ? NOP_EXPR : code; - opt_scalar_mode mode_iter; - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) - { - intermediate_mode = mode_iter.require (); - - if (GET_MODE_SIZE (intermediate_mode) > target_size) - break; - - scalar_mode cvt_mode; - if (!int_mode_for_size - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) - break; - - cvt_type = build_nonstandard_integer_type - (GET_MODE_BITSIZE (cvt_mode), 0); - - /* Check if the intermediate type can hold OP0's range. - When converting from float to integer this is not necessary - because values that do not fit the (smaller) target type are - unspecified anyway. 
*/ - if (demotion && float_expr_p) - { - wide_int op_min_value, op_max_value; - if (!vect_get_range_info (op0, &op_min_value, &op_max_value)) - break; - - if (cvt_type == NULL_TREE - || (wi::min_precision (op_max_value, SIGNED) - > TYPE_PRECISION (cvt_type)) - || (wi::min_precision (op_min_value, SIGNED) - > TYPE_PRECISION (cvt_type))) - continue; - } - - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); - /* This should only happened for SLP as long as loop vectorizer - only supports same-sized vector. */ - if (cvt_type == NULL_TREE - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) - || !supportable_convert_operation ((tree_code) code1, - vectype_out, - cvt_type, &tc1) - || !supportable_convert_operation ((tree_code) codecvt1, - cvt_type, - vectype_in, &tc2)) - continue; - - found_mode = true; - break; - } + if (supportable_indirect_convert_operation (vinfo, + code, + vectype_out, + vectype_in, + &code1, + &codecvt1, + &multi_step_cvt, + &interm_types, + op0,slp_node)) + break; - if (found_mode) - { - multi_step_cvt++; - interm_types.safe_push (cvt_type); - cvt_type = NULL_TREE; - code1 = tc1; - codecvt1 = tc2; - break; - } - } /* FALLTHRU */ unsupported: if (dump_enabled_p ()) @@ -14626,6 +14551,153 @@ supportable_narrowing_operation (code_helper code, return false; } +/* Function supportable_indirect_convert_operation + + Check whether an operation represented by the code CODE is two + convert operations that are supported by the target platform in + vector form (i.e., when operating on arguments of type VECTYPE_IN + producing a result of type VECTYPE_OUT). + + Convert operations we currently support directly are FIX_TRUNC and FLOAT. + This function checks if these operations are supported + by the target platform directly (via vector tree-codes). + + Output: + - CODE1 is the code of a vector operation to be used when + converting the operation in the first step, if available. 
+ - CODE2 is the code of a vector operation to be used when + converting the operation in the second step, if available. + - MULTI_STEP_CVT determines the number of required intermediate steps in + case of multi-step conversion (like int->short->char - in that case + MULTI_STEP_CVT will be 1). In the function, it should be 1. + - INTERM_TYPES contains the intermediate type required to perform the + convert operation (short in the above example). */ +bool +supportable_indirect_convert_operation (vec_info *vinfo, + code_helper code, + tree vectype_out, + tree vectype_in, + code_helper *code1, + code_helper *code2, + int *multi_step_cvt, + vec<tree> *interm_types, + tree op0, + slp_tree slp_node) +{ + bool found_mode = false; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); + opt_scalar_mode mode_iter; + tree_code tc1, tc2; + + tree cvt_type = NULL_TREE; + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); + + (*multi_step_cvt) = 0; + /* For conversions between float and integer types try whether + we can use intermediate signed integer types to support the + conversion. */ + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) + && (code == FLOAT_EXPR + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) + { + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); + bool float_expr_p = code == FLOAT_EXPR; + unsigned short target_size; + scalar_mode intermediate_mode; + if (demotion) + { + intermediate_mode = lhs_mode; + target_size = GET_MODE_SIZE (rhs_mode); + } + else + { + target_size = GET_MODE_SIZE (lhs_mode); + if (!int_mode_for_size + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) + return false; + } + *code1 = float_expr_p ? code : NOP_EXPR; + *code2 = float_expr_p ? 
NOP_EXPR : code; + opt_scalar_mode mode_iter; + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) + { + intermediate_mode = mode_iter.require (); + + if (GET_MODE_SIZE (intermediate_mode) > target_size) + break; + + scalar_mode cvt_mode; + if (!int_mode_for_size + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) + break; + + cvt_type = build_nonstandard_integer_type + (GET_MODE_BITSIZE (cvt_mode), 0); + + /* Check if the intermediate type can hold OP0's range. + When converting from float to integer this is not necessary + because values that do not fit the (smaller) target type are + unspecified anyway. */ + if (demotion && float_expr_p) + { + wide_int op_min_value, op_max_value; + /* For vector form, it looks like op0 doesn't have RANGE_INFO. + In the future, if it is supported, changes may need to be made + to this part, such as checking the RANGE of each element + in the vector. */ + if (!SSA_NAME_RANGE_INFO (op0) + || !vect_get_range_info (op0, &op_min_value, &op_max_value)) + break; + + if (cvt_type == NULL_TREE + || (wi::min_precision (op_max_value, SIGNED) + > TYPE_PRECISION (cvt_type)) + || (wi::min_precision (op_min_value, SIGNED) + > TYPE_PRECISION (cvt_type))) + continue; + } + + if (vinfo != NULL && slp_node != NULL) + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); + else + { + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); + cvt_type = build_nonstandard_integer_type + (GET_MODE_BITSIZE (cvt_mode), uns); + cvt_type = build_vector_type (cvt_type, nelts); + } + /* This should only happened for SLP as long as loop vectorizer + only supports same-sized vector. 
*/ + if (cvt_type == NULL_TREE + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) + || !supportable_convert_operation ((tree_code) *code1, + vectype_out, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) *code2, + cvt_type, + vectype_in, &tc2)) + continue; + + found_mode = true; + break; + } + + if (found_mode) + { + (*multi_step_cvt)++; + interm_types->safe_push (cvt_type); + cvt_type = NULL_TREE; + *code1 = tc1; + *code2 = tc2; + return true; + } + } + interm_types->release (); + return false; +} + /* Generate and return a vector mask of MASK_TYPE such that mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 97ec9c341e7..ad65ce71bb7 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2265,6 +2265,15 @@ extern bool supportable_widening_operation (vec_info*, code_helper, extern bool supportable_narrowing_operation (code_helper, tree, tree, code_helper *, int *, vec<tree> *); +extern bool supportable_indirect_convert_operation (vec_info *, + code_helper, + tree, tree, + code_helper *, + code_helper *, + int *, + vec<tree> *, + tree = NULL_TREE, + slp_tree = NULL); extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, enum vect_cost_for_stmt, stmt_vec_info, -- 2.31.1 ^ permalink raw reply [flat|nested] 33+ messages in thread
* RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-11 6:49 ` [PATCH 1/3 v3] " Hu, Lin1 @ 2024-06-17 1:48 ` Hu, Lin1 2024-06-18 11:44 ` Richard Biener 1 sibling, 0 replies; 33+ messages in thread From: Hu, Lin1 @ 2024-06-17 1:48 UTC (permalink / raw) To: Hu, Lin1, gcc-patches; +Cc: Liu, Hongtao, ubizjak, rguenther Ping this thread. BRs, Lin -----Original Message----- From: Hu, Lin1 <lin1.hu@intel.com> Sent: Tuesday, June 11, 2024 2:49 PM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com; rguenther@suse.de Subject: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. I wrapped the indirect-conversion logic into a separate function; its API follows supportable_narrowing/widening_operation. BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. * tree-vect-stmts.cc (vectorizable_conversion): Wrap the indirect convert part. (supportable_indirect_convert_operation): New function. * tree-vectorizer.h (supportable_indirect_convert_operation): Declare the new function. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. 
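The "indirect convert" path this patch factors out splits one conversion with no direct optab into two supported steps, e.g. FIX_TRUNC (float -> int) followed by a narrowing integer conversion. A scalar-equivalent sketch of the v4sf -> v4qi case from the pr107432-6.c testcase (the helper name is made up for illustration; `signed char` is used so lane values are portable):

```c
#include <assert.h>

typedef float       v4sf __attribute__ ((__vector_size__ (16)));
typedef int         v4si __attribute__ ((__vector_size__ (16)));
typedef signed char v4qi __attribute__ ((__vector_size__ (4)));

/* Two-step lowering: float -> int (truncation toward zero, the
   FIX_TRUNC step) followed by int -> char (the narrowing NOP step),
   mirroring the decomposition supportable_indirect_convert_operation
   checks for when no direct v4sf -> v4qi conversion exists.  */
static v4qi
cvt_ps_epi8_two_step (v4sf a)
{
  v4si t = __builtin_convertvector (a, v4si);  /* FIX_TRUNC step */
  return __builtin_convertvector (t, v4qi);    /* narrowing step  */
}
```

Note that fusing the two steps is only valid without -ftrapping-math for the float -> int direction, which is why the code guards the FIX_TRUNC_EXPR case with !flag_trapping_math.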
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 ++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +++++++++++++ gcc/tree-vect-generic.cc | 33 ++- gcc/tree-vect-stmts.cc | 244 +++++++++++++-------- gcc/tree-vectorizer.h | 9 + 10 files changed, 1011 insertions(+), 92 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2di)a, __v2si); } + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); } + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); } + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); } + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); } + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); } + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); } + 
+__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); } + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); } + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); } + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); } + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); } + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); } + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); } + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); } + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); } + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); } + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); } + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2du)a, __v2su); } + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); } + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); } + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); } + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4hu); } 
+ +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); } + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); } + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); } + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); } + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); } + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); } + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); } + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); } + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); } + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); } + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); } + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); } + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); } + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); } + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); } + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); } diff 
--git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); } + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); } + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, 
__v8di); } + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) { + return (__m128i)__builtin_convertvector(a, __v4si); } + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); } + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); } + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) { + return (__m128i)__builtin_convertvector(a, __v4si); } + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); } + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); } + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) { + return (__m128i)__builtin_convertvector(a, __v8hi); } + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); } + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2sf); } + +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); } + +__v8sf 
mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); } + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2hf); } + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); } + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); } + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) { + return __builtin_convertvector(a, __v4hf); } + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); } + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! 
ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) { + return __builtin_convertvector(a, __v2df); } + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); } + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); } + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) { + return __builtin_convertvector(a, __v2df); } + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); } + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); } + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) { + return __builtin_convertvector(a, __v4sf); } + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); } + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" +} */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2si); } + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); } + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); } + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) { + return __builtin_convertvector(a, __v2di); } + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); } + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); } + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) { + return __builtin_convertvector(a, __v4si); } + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); } + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); } + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) { + return __builtin_convertvector(a, __v2di); } + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); } + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq +-fno-trapping-math" } */ +/* { dg-final { scan-assembler-times 
"vcvttpd2dq" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 +} } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 +} } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); typedef char __v16qi +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf +__attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); } + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); } + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return 
__builtin_convertvector((__v8df)a, __v8qi); } + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); } + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); } + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); } + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); } + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); } + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); } + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); } + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); } + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); } + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); } + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); } + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); } + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); } + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); } + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); } + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); } + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) +{ + return 
__builtin_convertvector((__v8hf)a, __v8qu); } + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); } + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..0ff5a97ed1a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,156 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq +-fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
+typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
+typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
+typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
+typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+
+__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2df);
+}
+
+__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4df);
+}
+
+__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8df);
+}
+
+__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2df);
+}
+
+__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4df);
+}
+
+__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8df);
+}
+
+__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2sf);
+}
+
+__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4sf);
+}
+
+__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8sf);
+}
+
+__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16sf);
+}
+
+__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2sf);
+}
+
+__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4sf);
+}
+
+__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8sf);
+}
+
+__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16sf);
+}
+
+__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
+{
+  return __builtin_convertvector((__v2qi)a, __v2hf);
+}
+
+__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
+{
+  return __builtin_convertvector((__v4qi)a, __v4hf);
+}
+
+__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
+{
+  return __builtin_convertvector((__v8qi)a, __v8hf);
+}
+
+__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
+{
+  return __builtin_convertvector((__v16qi)a, __v16hf);
+}
+
+__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
+{
+  return __builtin_convertvector((__v32qi)a, __v32hf);
+}
+
+__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
+{
+  return __builtin_convertvector((__v2qu)a, __v2hf);
+}
+
+__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
+{
+  return __builtin_convertvector((__v4qu)a, __v4hf);
+}
+
+__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
+{
+  return __builtin_convertvector((__v8qu)a, __v8hf);
+}
+
+__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
+{
+  return __builtin_convertvector((__v16qu)a, __v16hf);
+}
+
+__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
+{
+  return __builtin_convertvector((__v32qu)a, __v32hf);
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index ea0069f7a67..c38c0b9dda8 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-match.h"
 #include "recog.h"		/* FIXME: for insn_data */
 #include "optabs-libfuncs.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
 
 /* Build a ternary operation and gimplify it.  Emit code before GSI.
@@ -1870,14 +1872,33 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
   else if (ret_elt_bits > arg_elt_bits)
     modifier = WIDEN;
 
+  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
+    {
+      g = gimple_build_assign (lhs, code1, arg);
+      gsi_replace (gsi, g, false);
+      return;
+    }
+
+  code_helper code2 = ERROR_MARK, code3 = ERROR_MARK;
+  int multi_step_cvt = 0;
+  vec<tree> interm_types = vNULL;
+  if (supportable_indirect_convert_operation (NULL,
+					      code,
+					      ret_type, arg_type,
+					      &code2, &code3,
+					      &multi_step_cvt,
+					      &interm_types, arg))
+    {
+      new_rhs = make_ssa_name (interm_types[0]);
+      g = gimple_build_assign (new_rhs, (tree_code) code3, arg);
+      gsi_insert_before (gsi, g, GSI_SAME_STMT);
+      g = gimple_build_assign (lhs, (tree_code) code2, new_rhs);
+      gsi_replace (gsi, g, false);
+      return;
+    }
+
   if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
     {
-      if (supportable_convert_operation (code, ret_type, arg_type, &code1))
-	{
-	  g = gimple_build_assign (lhs, code1, arg);
-	  gsi_replace (gsi, g, false);
-	  return;
-	}
       /* Can't use get_compute_type here, as supportable_convert_operation
 	 doesn't necessarily use an optab and needs two arguments.  */
       tree vec_compute_type
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 05a169ecb2d..0aa608202ca 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo,
   tree scalar_dest;
   tree op0, op1 = NULL_TREE;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
-  tree_code tc1, tc2;
+  tree_code tc1;
   code_helper code, code1, code2;
   code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
   tree new_temp;
@@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo,
 	  break;
 	}
 
-      /* For conversions between float and integer types try whether
-	 we can use intermediate signed integer types to support the
-	 conversion.  */
-      if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
-	  && (code == FLOAT_EXPR ||
-	      (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
-	{
-	  bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
-	  bool float_expr_p = code == FLOAT_EXPR;
-	  unsigned short target_size;
-	  scalar_mode intermediate_mode;
-	  if (demotion)
-	    {
-	      intermediate_mode = lhs_mode;
-	      target_size = GET_MODE_SIZE (rhs_mode);
-	    }
-	  else
-	    {
-	      target_size = GET_MODE_SIZE (lhs_mode);
-	      if (!int_mode_for_size
-		  (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
-		goto unsupported;
-	    }
-	  code1 = float_expr_p ? code : NOP_EXPR;
-	  codecvt1 = float_expr_p ? NOP_EXPR : code;
-	  opt_scalar_mode mode_iter;
-	  FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
-	    {
-	      intermediate_mode = mode_iter.require ();
-
-	      if (GET_MODE_SIZE (intermediate_mode) > target_size)
-		break;
-
-	      scalar_mode cvt_mode;
-	      if (!int_mode_for_size
-		  (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
-		break;
-
-	      cvt_type = build_nonstandard_integer_type
-		(GET_MODE_BITSIZE (cvt_mode), 0);
-
-	      /* Check if the intermediate type can hold OP0's range.
-		 When converting from float to integer this is not necessary
-		 because values that do not fit the (smaller) target type are
-		 unspecified anyway.  */
-	      if (demotion && float_expr_p)
-		{
-		  wide_int op_min_value, op_max_value;
-		  if (!vect_get_range_info (op0, &op_min_value, &op_max_value))
-		    break;
-
-		  if (cvt_type == NULL_TREE
-		      || (wi::min_precision (op_max_value, SIGNED)
-			  > TYPE_PRECISION (cvt_type))
-		      || (wi::min_precision (op_min_value, SIGNED)
-			  > TYPE_PRECISION (cvt_type)))
-		    continue;
-		}
-
-	      cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node);
-	      /* This should only happened for SLP as long as loop vectorizer
-		 only supports same-sized vector.  */
-	      if (cvt_type == NULL_TREE
-		  || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in)
-		  || !supportable_convert_operation ((tree_code) code1,
-						     vectype_out,
-						     cvt_type, &tc1)
-		  || !supportable_convert_operation ((tree_code) codecvt1,
-						     cvt_type,
-						     vectype_in, &tc2))
-		continue;
-
-	      found_mode = true;
-	      break;
-	    }
+      if (supportable_indirect_convert_operation (vinfo,
+						  code,
+						  vectype_out,
+						  vectype_in,
+						  &code1,
+						  &codecvt1,
+						  &multi_step_cvt,
+						  &interm_types,
+						  op0, slp_node))
+	break;
 
-	  if (found_mode)
-	    {
-	      multi_step_cvt++;
-	      interm_types.safe_push (cvt_type);
-	      cvt_type = NULL_TREE;
-	      code1 = tc1;
-	      codecvt1 = tc2;
-	      break;
-	    }
-	}
       /* FALLTHRU */
     unsupported:
       if (dump_enabled_p ())
@@ -14626,6 +14551,153 @@ supportable_narrowing_operation (code_helper code,
   return false;
 }
 
+/* Function supportable_indirect_convert_operation
+
+   Check whether an operation represented by the code CODE can be
+   performed as two chained convert operations that are supported by
+   the target platform in vector form (i.e., when operating on
+   arguments of type VECTYPE_IN producing a result of type VECTYPE_OUT).
+
+   Convert operations we currently support directly are FIX_TRUNC and FLOAT.
+   This function checks if these operations are supported
+   by the target platform directly (via vector tree-codes).
+
+   Output:
+   - CODE1 is the code of a vector operation to be used when
+   converting the operation in the first step, if available.
+   - CODE2 is the code of a vector operation to be used when
+   converting the operation in the second step, if available.
+   - MULTI_STEP_CVT determines the number of required intermediate steps in
+   case of multi-step conversion (like int->short->char - in that case
+   MULTI_STEP_CVT will be 1).  In this function, it will be 1.
+   - INTERM_TYPES contains the intermediate type required to perform the
+   convert operation (short in the above example).  */
+
+bool
+supportable_indirect_convert_operation (vec_info *vinfo,
+					code_helper code,
+					tree vectype_out,
+					tree vectype_in,
+					code_helper *code1,
+					code_helper *code2,
+					int *multi_step_cvt,
+					vec<tree> *interm_types,
+					tree op0,
+					slp_tree slp_node)
+{
+  bool found_mode = false;
+  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out));
+  scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in));
+  opt_scalar_mode mode_iter;
+  tree_code tc1, tc2;
+
+  tree cvt_type = NULL_TREE;
+  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in);
+
+  (*multi_step_cvt) = 0;
+  /* For conversions between float and integer types try whether
+     we can use intermediate signed integer types to support the
+     conversion.  */
+  if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
+      && (code == FLOAT_EXPR
+	  || (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
+    {
+      bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
+      bool float_expr_p = code == FLOAT_EXPR;
+      unsigned short target_size;
+      scalar_mode intermediate_mode;
+      if (demotion)
+	{
+	  intermediate_mode = lhs_mode;
+	  target_size = GET_MODE_SIZE (rhs_mode);
+	}
+      else
+	{
+	  target_size = GET_MODE_SIZE (lhs_mode);
+	  if (!int_mode_for_size
+	      (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
+	    return false;
+	}
+      *code1 = float_expr_p ? code : NOP_EXPR;
+      *code2 = float_expr_p ? NOP_EXPR : code;
+      opt_scalar_mode mode_iter;
+      FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
+	{
+	  intermediate_mode = mode_iter.require ();
+
+	  if (GET_MODE_SIZE (intermediate_mode) > target_size)
+	    break;
+
+	  scalar_mode cvt_mode;
+	  if (!int_mode_for_size
+	      (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
+	    break;
+
+	  cvt_type = build_nonstandard_integer_type
+	    (GET_MODE_BITSIZE (cvt_mode), 0);
+
+	  /* Check if the intermediate type can hold OP0's range.
+	     When converting from float to integer this is not necessary
+	     because values that do not fit the (smaller) target type are
+	     unspecified anyway.  */
+	  if (demotion && float_expr_p)
+	    {
+	      wide_int op_min_value, op_max_value;
+	      /* For vector form, it looks like op0 doesn't have RANGE_INFO.
+		 In the future, if it is supported, changes may need to be made
+		 to this part, such as checking the RANGE of each element
+		 in the vector.  */
+	      if (!SSA_NAME_RANGE_INFO (op0)
+		  || !vect_get_range_info (op0, &op_min_value, &op_max_value))
+		break;
+
+	      if (cvt_type == NULL_TREE
+		  || (wi::min_precision (op_max_value, SIGNED)
+		      > TYPE_PRECISION (cvt_type))
+		  || (wi::min_precision (op_min_value, SIGNED)
+		      > TYPE_PRECISION (cvt_type)))
+		continue;
+	    }
+
+	  if (vinfo != NULL && slp_node != NULL)
+	    cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node);
+	  else
+	    {
+	      bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out))
+			 || TYPE_UNSIGNED (TREE_TYPE (vectype_in));
+	      cvt_type = build_nonstandard_integer_type
+		(GET_MODE_BITSIZE (cvt_mode), uns);
+	      cvt_type = build_vector_type (cvt_type, nelts);
+	    }
+	  /* This should only happen for SLP as long as the loop vectorizer
+	     only supports same-sized vectors.  */
+	  if (cvt_type == NULL_TREE
+	      || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts)
+	      || !supportable_convert_operation ((tree_code) *code1,
+						 vectype_out,
+						 cvt_type, &tc1)
+	      || !supportable_convert_operation ((tree_code) *code2,
+						 cvt_type,
+						 vectype_in, &tc2))
+	    continue;
+
+	  found_mode = true;
+	  break;
+	}
+
+      if (found_mode)
+	{
+	  (*multi_step_cvt)++;
+	  interm_types->safe_push (cvt_type);
+	  cvt_type = NULL_TREE;
+	  *code1 = tc1;
+	  *code2 = tc2;
+	  return true;
+	}
+    }
+  interm_types->release ();
+  return false;
+}
+
 /* Generate and return a vector mask of MASK_TYPE such that
    mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
    Add the statements to SEQ.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 97ec9c341e7..ad65ce71bb7 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2265,6 +2265,15 @@ extern bool supportable_widening_operation (vec_info*, code_helper,
 extern bool supportable_narrowing_operation (code_helper, tree, tree,
 					     code_helper *, int *,
 					     vec<tree> *);
+extern bool supportable_indirect_convert_operation (vec_info *,
+						    code_helper,
+						    tree, tree,
+						    code_helper *,
+						    code_helper *,
+						    int *,
+						    vec<tree> *,
+						    tree = NULL_TREE,
+						    slp_tree = NULL);
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
 				  enum vect_cost_for_stmt, stmt_vec_info,
-- 
2.31.1

^ permalink raw reply	[flat|nested] 33+ messages in thread
* Re: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
  2024-06-11  6:49 ` [PATCH 1/3 v3] " Hu, Lin1
  2024-06-17  1:48   ` Hu, Lin1
@ 2024-06-18 11:44   ` Richard Biener
  2024-06-20 11:26     ` Hu, Lin1
  1 sibling, 1 reply; 33+ messages in thread
From: Richard Biener @ 2024-06-18 11:44 UTC (permalink / raw)
To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu, ubizjak

On Tue, 11 Jun 2024, Hu, Lin1 wrote:

> I wrapped part of the indirect-conversion code into a new function.  The
> API follows supportable_narrowing/widening_operation.

Sorry for the delay - comments inline.

> BRs,
> Lin
>
> gcc/ChangeLog:
>
> 	PR target/107432
> 	* tree-vect-generic.cc
> 	(expand_vector_conversion): Support convert for int -> int,
> 	float -> float and int <-> float.
> 	* tree-vect-stmts.cc (vectorizable_conversion): Wrap the
> 	indirect convert part.
> 	(supportable_indirect_convert_operation): New function.
> 	* tree-vectorizer.h (supportable_indirect_convert_operation):
> 	Define the new function.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/107432
> 	* gcc.target/i386/pr107432-1.c: New test.
> 	* gcc.target/i386/pr107432-2.c: Ditto.
> 	* gcc.target/i386/pr107432-3.c: Ditto.
> 	* gcc.target/i386/pr107432-4.c: Ditto.
> 	* gcc.target/i386/pr107432-5.c: Ditto.
> 	* gcc.target/i386/pr107432-6.c: Ditto.
> 	* gcc.target/i386/pr107432-7.c: Ditto.
> ---
>  gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +++++
>  gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 ++++++
>  gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +++++++++++++
>  gcc/tree-vect-generic.cc                   |  33 ++-
>  gcc/tree-vect-stmts.cc                     | 244 +++++++++++++--------
>  gcc/tree-vectorizer.h                      |   9 +
>  10 files changed, 1011 insertions(+), 92 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> new file mode 100644
> index 00000000000..a4f37447eb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> @@ -0,0 +1,234 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
> +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4)));
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +
> +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
> +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
> +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
> +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
> +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
> +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
> +
> +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2si);
> +}
> +
> +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
> +}
> +
> +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
> +}
> +
> +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2hi);
> +}
> +
> +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4hi);
> +}
> +
> +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
> +}
> +
> +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2di)a, __v2qi);
> +}
> +
> +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4di)a, __v4qi);
> +}
> +
> +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
> +{
> +  return __builtin_convertvector((__v8di)a, __v8qi);
> +}
> +
> +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector((__v2si)a, __v2hi);
> +}
> +
> +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4hi);
> +}
> +
> +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi);
> +}
> +
> +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi);
> +}
> +
> +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector((__v2si)a, __v2qi);
> +}
> +
> +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4si)a, __v4qi);
> +}
> +
> +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v8si)a, __v8qi);
> +}
> +
> +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16si)a, __v16qi);
> +}
> +
> +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
> +{
> +  return __builtin_convertvector((__v2hi)a, __v2qi);
> +}
> +
> +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v8hi)a, __v8qi);
> +}
> +
> +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi);
> +}
> +
> +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi);
> +}
> +
> +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2su);
> +}
> +
> +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v4du)a, __v4su);
> +}
> +
> +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v8du)a, __v8su);
> +}
> +
> +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2hu);
> +}
> +
> +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4du)a, __v4hu);
> +}
> +
> +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8du)a, __v8hu);
> +}
> +
> +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v2du)a, __v2qu);
> +}
> +
> +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v4du)a, __v4qu);
> +}
> +
> +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
> +{
> +  return __builtin_convertvector((__v8du)a, __v8qu);
> +}
> +
> +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
> +{
> +  return __builtin_convertvector((__v2su)a, __v2hu);
> +}
> +
> +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4su)a, __v4hu);
> +}
> +
> +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v8su)a, __v8hu);
> +}
> +
> +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v16su)a, __v16hu);
> +}
> +
> +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
> +{
> +  return __builtin_convertvector((__v2su)a, __v2qu);
> +}
> +
> +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v4su)a, __v4qu);
> +}
> +
> +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
> +{
> +  return __builtin_convertvector((__v8su)a, __v8qu);
> +}
> +
> +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16su)a, __v16qu);
> +}
> +
> +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
> +{
> +  return __builtin_convertvector((__v2hu)a, __v2qu);
> +}
> +
> +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
> +{
> +  return __builtin_convertvector((__v8hu)a, __v8qu);
> +}
> +
> +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
> +{
> +  return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu);
> +}
> +
> +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
> +{
> +  return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> new file mode 100644
> index 00000000000..02ffd811cb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
> @@ -0,0 +1,105 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef short __v2hi __attribute__ ((__vector_size__ (4)));
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +
> +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di);
> +}
> +
> +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di);
> +}
> +
> +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di);
> +}
> +
> +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di);
> +}
> +
> +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v4di);
> +}
> +
> +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v8di);
> +}
> +
> +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a)
> +{
> +  return (__m128i)__builtin_convertvector(a, __v4si);
> +}
> +
> +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v8si);
> +}
> +
> +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v16si);
> +}
> +
> +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a)
> +{
> +  return (__m128i)__builtin_convertvector(a, __v4si);
> +}
> +
> +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v8si);
> +}
> +
> +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
> +{
> +  return (__m512i)__builtin_convertvector(a, __v16si);
> +}
> +
> +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a)
> +{
> +  return (__m128i)__builtin_convertvector(a, __v8hi);
> +}
> +
> +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
> +{
> +  return (__m256i)__builtin_convertvector(a, __v16hi);
> +}
> +
> +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
> +{
> +  return __builtin_convertvector(a, __v32hi);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> new file mode 100644
> index 00000000000..30dc947b6dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
> @@ -0,0 +1,55 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector(a, __v2sf);
> +}
> +
> +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4sf);
> +}
> +
> +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8sf);
> +}
> +
> +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector(a, __v2hf);
> +}
> +
> +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4hf);
> +}
> +
> +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8hf);
> +}
> +
> +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4hf);
> +}
> +
> +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8hf);
> +}
> +
> +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector(a, __v16hf);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> new file mode 100644
> index 00000000000..e537e7349e4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector(a, __v2df);
> +}
> +
> +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4df);
> +}
> +
> +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8df);
> +}
> +
> +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector(a, __v2df);
> +}
> +
> +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4df);
> +}
> +
> +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8df);
> +}
> +
> +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4sf);
> +}
> +
> +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8sf);
> +}
> +
> +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector(a, __v16sf);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> new file mode 100644
> index 00000000000..5a44ef9f3b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
> @@ -0,0 +1,72 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +
> +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector(a, __v2si);
> +}
> +
> +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector(a, __v4si);
> +}
> +
> +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector(a, __v8si);
> +}
> +
> +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector(a, __v4di);
> +}
> +
> +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector(a, __v8di);
> +}
> +
> +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4si);
> +}
> +
> +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8si);
> +}
> +
> +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector(a, __v16si);
> +}
> +
> +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector(a, __v2di);
> +}
> +
> +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
> +{
> +  return __builtin_convertvector(a, __v4di);
> +}
> +
> +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector(a, __v8di);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> new file mode 100644
> index 00000000000..4a68a10b089
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
> @@ -0,0 +1,139 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
> +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
> +
> +#include <x86intrin.h>
> +
> +typedef char __v2qi __attribute__ ((__vector_size__ (2)));
> +typedef char __v4qi __attribute__ ((__vector_size__ (4)));
> +typedef char __v8qi __attribute__ ((__vector_size__ (8)));
> +typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> +typedef unsigned char __v2qu __attribute__ ((vector_size (2)));
> +typedef unsigned char __v4qu __attribute__ ((vector_size (4)));
> +typedef unsigned char __v8qu __attribute__ ((vector_size (8)));
> +typedef unsigned char __v16qu __attribute__ ((vector_size (16)));
> +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4)));
> +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8)));
> +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
> +
> +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qi);
> +}
> +
> +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qi);
> +}
> +
> +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qi);
> +}
> +
> +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a)
> +{
> +  return __builtin_convertvector((__v2df)a, __v2qu);
> +}
> +
> +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
> +{
> +  return __builtin_convertvector((__v4df)a, __v4qu);
> +}
> +
> +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
> +{
> +  return __builtin_convertvector((__v8df)a, __v8qu);
> +}
> +
> +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qi);
> +}
> +
> +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qi);
> +}
> +
> +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qi);
> +}
> +
> +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qi);
> +}
> +
> +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
> +{
> +  return __builtin_convertvector((__v2sf)a, __v2qu);
> +}
> +
> +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
> +{
> +  return __builtin_convertvector((__v4sf)a, __v4qu);
> +}
> +
> +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
> +{
> +  return __builtin_convertvector((__v8sf)a, __v8qu);
> +}
> +
> +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
> +{
> +  return __builtin_convertvector((__v16sf)a, __v16qu);
> +}
> +
> +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qi);
> +}
> +
> +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qi);
> +}
> +
> +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qi);
> +}
> +
> +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qi);
> +}
> +
> +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
> +{
> +  return __builtin_convertvector((__v2hf)a, __v2qu);
> +}
> +
> +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
> +{
> +  return __builtin_convertvector((__v8hf)a, __v8qu);
> +}
> +
> +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
> +{
> +  return __builtin_convertvector((__v16hf)a, __v16qu);
> +}
> +
> +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
> +{
> +  return __builtin_convertvector((__v32hf)a, __v32qu);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> new file mode 100644
> index 00000000000..0ff5a97ed1a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
> @@ -0,0 +1,156 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { !
ia32 } } } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > +typedef char __v16qi __attribute__ ((__vector_size__ (16))); > +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); > +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); > +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); > +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); > + > +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2df); > +} > + > +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4df); > +} > + > +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8df); > +} > + > +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2df); > +} > + > +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4df); > +} > + > +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8df); > +} > + > +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2sf); > +} > + > +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4sf); > +} > + > +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8sf); > +} > + > +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) > +{ > + return 
__builtin_convertvector((__v16qi)a, __v16sf); > +} > + > +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2sf); > +} > + > +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4sf); > +} > + > +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8sf); > +} > + > +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16sf); > +} > + > +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2hf); > +} > + > +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4hf); > +} > + > +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8hf); > +} > + > +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) > +{ > + return __builtin_convertvector((__v16qi)a, __v16hf); > +} > + > +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector((__v32qi)a, __v32hf); > +} > + > +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2hf); > +} > + > +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4hf); > +} > + > +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8hf); > +} > + > +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16hf); > +} > + > +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) > +{ > + return __builtin_convertvector((__v32qu)a, __v32hf); > +} > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc > index ea0069f7a67..c38c0b9dda8 100644 > --- a/gcc/tree-vect-generic.cc > 
+++ b/gcc/tree-vect-generic.cc > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see > #include "gimple-match.h" > #include "recog.h" /* FIXME: for insn_data */ > #include "optabs-libfuncs.h" > +#include "cfgloop.h" > +#include "tree-vectorizer.h" > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > @@ -1870,14 +1872,33 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) > else if (ret_elt_bits > arg_elt_bits) > modifier = WIDEN; > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > + { > + g = gimple_build_assign (lhs, code1, arg); > + gsi_replace (gsi, g, false); > + return; > + } Given the API change I suggest below it might make sense to have supportable_indirect_convert_operation do the above and represent it as single-step conversion? > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > + int multi_step_cvt = 0; > + vec<tree> interm_types = vNULL; > + if (supportable_indirect_convert_operation (NULL, > + code, > + ret_type, arg_type, > + &code2, &code3, > + &multi_step_cvt, > + &interm_types, arg)) > + { > + new_rhs = make_ssa_name (interm_types[0]); > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > + gsi_replace (gsi, g, false); > + return; > + } > + > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) > { > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > - { > - g = gimple_build_assign (lhs, code1, arg); > - gsi_replace (gsi, g, false); > - return; > - } > /* Can't use get_compute_type here, as supportable_convert_operation > doesn't necessarily use an optab and needs two arguments. 
*/ > tree vec_compute_type > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 05a169ecb2d..0aa608202ca 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > tree scalar_dest; > tree op0, op1 = NULL_TREE; > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > - tree_code tc1, tc2; > + tree_code tc1; > code_helper code, code1, code2; > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > tree new_temp; > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > break; > } > > - /* For conversions between float and integer types try whether > - we can use intermediate signed integer types to support the > - conversion. */ > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > - && (code == FLOAT_EXPR || > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > - { > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); > - bool float_expr_p = code == FLOAT_EXPR; > - unsigned short target_size; > - scalar_mode intermediate_mode; > - if (demotion) > - { > - intermediate_mode = lhs_mode; > - target_size = GET_MODE_SIZE (rhs_mode); > - } > - else > - { > - target_size = GET_MODE_SIZE (lhs_mode); > - if (!int_mode_for_size > - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > - goto unsupported; > - } > - code1 = float_expr_p ? code : NOP_EXPR; > - codecvt1 = float_expr_p ? NOP_EXPR : code; > - opt_scalar_mode mode_iter; > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > - { > - intermediate_mode = mode_iter.require (); > - > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > - break; > - > - scalar_mode cvt_mode; > - if (!int_mode_for_size > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > - break; > - > - cvt_type = build_nonstandard_integer_type > - (GET_MODE_BITSIZE (cvt_mode), 0); > - > - /* Check if the intermediate type can hold OP0's range. 
> - When converting from float to integer this is not necessary > - because values that do not fit the (smaller) target type are > - unspecified anyway. */ > - if (demotion && float_expr_p) > - { > - wide_int op_min_value, op_max_value; > - if (!vect_get_range_info (op0, &op_min_value, &op_max_value)) > - break; > - > - if (cvt_type == NULL_TREE > - || (wi::min_precision (op_max_value, SIGNED) > - > TYPE_PRECISION (cvt_type)) > - || (wi::min_precision (op_min_value, SIGNED) > - > TYPE_PRECISION (cvt_type))) > - continue; > - } > - > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > - /* This should only happened for SLP as long as loop vectorizer > - only supports same-sized vector. */ > - if (cvt_type == NULL_TREE > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > - || !supportable_convert_operation ((tree_code) code1, > - vectype_out, > - cvt_type, &tc1) > - || !supportable_convert_operation ((tree_code) codecvt1, > - cvt_type, > - vectype_in, &tc2)) > - continue; > - > - found_mode = true; > - break; > - } > + if (supportable_indirect_convert_operation (vinfo, > + code, > + vectype_out, > + vectype_in, > + &code1, > + &codecvt1, > + &multi_step_cvt, > + &interm_types, > + op0,slp_node)) > + break; > > - if (found_mode) > - { > - multi_step_cvt++; > - interm_types.safe_push (cvt_type); > - cvt_type = NULL_TREE; > - code1 = tc1; > - codecvt1 = tc2; > - break; > - } > - } > /* FALLTHRU */ > unsupported: > if (dump_enabled_p ()) > @@ -14626,6 +14551,153 @@ supportable_narrowing_operation (code_helper code, > return false; > } > > +/* Function supportable_indirect_convert_operation > + > + Check whether an operation represented by the code CODE is two > + convert operations that are supported by the target platform in > + vector form (i.e., when operating on arguments of type VECTYPE_IN > + producing a result of type VECTYPE_OUT). > + > + Convert operations we currently support directly are FIX_TRUNC and FLOAT. 
> + This function checks if these operations are supported > + by the target platform directly (via vector tree-codes). > + > + Output: > + - CODE1 is the code of a vector operation to be used when > + converting the operation in the first step, if available. > + - CODE2 is the code of a vector operation to be used when > + converting the operation in the second step, if available. > + - MULTI_STEP_CVT determines the number of required intermediate steps in > + case of multi-step conversion (like int->short->char - in that case > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > + - INTERM_TYPES contains the intermediate type required to perform the > + convert operation (short in the above example). */ > +bool > +supportable_indirect_convert_operation (vec_info *vinfo, > + code_helper code, > + tree vectype_out, > + tree vectype_in, > + code_helper *code1, > + code_helper *code2, > + int *multi_step_cvt, > + vec<tree> *interm_types, This API is somewhat awkward; since we're inventing a new one, I guess we can do better. I think we want vec<std::pair<tree, tree_code> > *converts, covering code1, code2, multi_step_cvt and interm_types all at once, with the conversion sequence being converts[0].first tem0 = converts[0].second op0; converts[1].first tem1 = converts[1].second tem0; ... Here converts.length () determines the length of the chain, length one being a direct conversion, in which case converts[0].first is vectype_out. That would allow double -> char to go double -> float -> int -> short -> char, for example. > + tree op0, > + slp_tree slp_node) I would like to avoid passing VINFO and SLP_NODE here, see below. The same is true for OP0, where the existing use is already wrong for SLP, but I guess that can stay for now (I opened PR115538 about the wrong-code issue).
> +{ > + bool found_mode = false; > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); > + opt_scalar_mode mode_iter; > + tree_code tc1, tc2; > + > + tree cvt_type = NULL_TREE; > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > + > + (*multi_step_cvt) = 0; > + /* For conversions between float and integer types try whether > + we can use intermediate signed integer types to support the > + conversion. */ > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > + && (code == FLOAT_EXPR > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > + { > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); > + bool float_expr_p = code == FLOAT_EXPR; > + unsigned short target_size; > + scalar_mode intermediate_mode; > + if (demotion) > + { > + intermediate_mode = lhs_mode; > + target_size = GET_MODE_SIZE (rhs_mode); > + } > + else > + { > + target_size = GET_MODE_SIZE (lhs_mode); > + if (!int_mode_for_size > + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > + return false; > + } > + *code1 = float_expr_p ? code : NOP_EXPR; > + *code2 = float_expr_p ? NOP_EXPR : code; > + opt_scalar_mode mode_iter; > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > + { > + intermediate_mode = mode_iter.require (); > + > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > + break; > + > + scalar_mode cvt_mode; > + if (!int_mode_for_size > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > + break; > + > + cvt_type = build_nonstandard_integer_type > + (GET_MODE_BITSIZE (cvt_mode), 0); > + > + /* Check if the intermediate type can hold OP0's range. > + When converting from float to integer this is not necessary > + because values that do not fit the (smaller) target type are > + unspecified anyway. 
*/ > + if (demotion && float_expr_p) > + { > + wide_int op_min_value, op_max_value; > + /* For vector form, it looks like op0 doesn't have RANGE_INFO. > + In the future, if it is supported, changes may need to be made > + to this part, such as checking the RANGE of each element > + in the vector. */ > + if (!SSA_NAME_RANGE_INFO (op0) > + || !vect_get_range_info (op0, &op_min_value, &op_max_value)) > + break; > + > + if (cvt_type == NULL_TREE > + || (wi::min_precision (op_max_value, SIGNED) > + > TYPE_PRECISION (cvt_type)) > + || (wi::min_precision (op_min_value, SIGNED) > + > TYPE_PRECISION (cvt_type))) > + continue; > + } > + > + if (vinfo != NULL && slp_node != NULL) > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > + else > + { > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > + cvt_type = build_nonstandard_integer_type > + (GET_MODE_BITSIZE (cvt_mode), uns); > + cvt_type = build_vector_type (cvt_type, nelts); > + } So this would then become cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE (vectype_in), cvt_type, TYPE_VECTOR_SUBPARTS (vectype_in)); > + /* This should only happened for SLP as long as loop vectorizer > + only supports same-sized vector. */ > + if (cvt_type == NULL_TREE > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > + || !supportable_convert_operation ((tree_code) *code1, > + vectype_out, > + cvt_type, &tc1) > + || !supportable_convert_operation ((tree_code) *code2, > + cvt_type, > + vectype_in, &tc2)) > + continue; > + > + found_mode = true; > + break; > + } > + > + if (found_mode) > + { > + (*multi_step_cvt)++; > + interm_types->safe_push (cvt_type); > + cvt_type = NULL_TREE; > + *code1 = tc1; > + *code2 = tc2; > + return true; > + } > + } > + interm_types->release (); Hmm, ownership of interm_types is somewhat unclear here - the caller should release it, or is the situation that the caller is confused by stray elements in it? 
In that case I'd suggest to instead do interm_types->truncate (0). > + return false; > +} > + > /* Generate and return a vector mask of MASK_TYPE such that > mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I. > Add the statements to SEQ. */ > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 97ec9c341e7..ad65ce71bb7 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -2265,6 +2265,15 @@ extern bool supportable_widening_operation (vec_info*, code_helper, > extern bool supportable_narrowing_operation (code_helper, tree, tree, > code_helper *, int *, > vec<tree> *); > +extern bool supportable_indirect_convert_operation (vec_info *, > + code_helper, > + tree, tree, > + code_helper *, > + code_helper *, > + int *, > + vec<tree> *, > + tree = NULL_TREE, > + slp_tree = NULL); > > extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, > enum vect_cost_for_stmt, stmt_vec_info, > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 33+ messages in thread
* RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-18 11:44 ` Richard Biener @ 2024-06-20 11:26 ` Hu, Lin1 2024-06-24 12:33 ` Richard Biener 0 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-06-20 11:26 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches, Liu, Hongtao, ubizjak > > else if (ret_elt_bits > arg_elt_bits) > > modifier = WIDEN; > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > + { > > + g = gimple_build_assign (lhs, code1, arg); > > + gsi_replace (gsi, g, false); > > + return; > > + } > > Given the API change I suggest below it might make sense to have > supportable_indirect_convert_operation do the above and represent it as a single-step conversion? > OK, if you want supportable_indirect_convert_operation to handle that case itself, the way supportable_convert_operation does, I'll give it a try; that functionality is indeed within this function's scope. But it would require more than just the API change, because supportable_indirect_convert_operation originally only supported float -> int and int -> float.
> > > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > > + int multi_step_cvt = 0; > > + vec<tree> interm_types = vNULL; > > + if (supportable_indirect_convert_operation (NULL, > > + code, > > + ret_type, arg_type, > > + &code2, &code3, > > + &multi_step_cvt, > > + &interm_types, arg)) > > + { > > + new_rhs = make_ssa_name (interm_types[0]); > > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > > + gsi_replace (gsi, g, false); > > + return; > > + } > > + > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > FLOAT_EXPR)) > > { > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > - { > > - g = gimple_build_assign (lhs, code1, arg); > > - gsi_replace (gsi, g, false); > > - return; > > - } > > /* Can't use get_compute_type here, as supportable_convert_operation > > doesn't necessarily use an optab and needs two arguments. */ > > tree vec_compute_type > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > > 05a169ecb2d..0aa608202ca 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > > tree scalar_dest; > > tree op0, op1 = NULL_TREE; > > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > > - tree_code tc1, tc2; > > + tree_code tc1; > > code_helper code, code1, code2; > > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > > tree new_temp; > > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > > break; > > } > > > > - /* For conversions between float and integer types try whether > > - we can use intermediate signed integer types to support the > > - conversion. 
*/ > > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > - && (code == FLOAT_EXPR || > > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > - { > > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > (lhs_mode); > > - bool float_expr_p = code == FLOAT_EXPR; > > - unsigned short target_size; > > - scalar_mode intermediate_mode; > > - if (demotion) > > - { > > - intermediate_mode = lhs_mode; > > - target_size = GET_MODE_SIZE (rhs_mode); > > - } > > - else > > - { > > - target_size = GET_MODE_SIZE (lhs_mode); > > - if (!int_mode_for_size > > - (GET_MODE_BITSIZE (rhs_mode), 0).exists > (&intermediate_mode)) > > - goto unsupported; > > - } > > - code1 = float_expr_p ? code : NOP_EXPR; > > - codecvt1 = float_expr_p ? NOP_EXPR : code; > > - opt_scalar_mode mode_iter; > > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > - { > > - intermediate_mode = mode_iter.require (); > > - > > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > > - break; > > - > > - scalar_mode cvt_mode; > > - if (!int_mode_for_size > > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists > (&cvt_mode)) > > - break; > > - > > - cvt_type = build_nonstandard_integer_type > > - (GET_MODE_BITSIZE (cvt_mode), 0); > > - > > - /* Check if the intermediate type can hold OP0's range. > > - When converting from float to integer this is not necessary > > - because values that do not fit the (smaller) target type are > > - unspecified anyway. 
*/ > > - if (demotion && float_expr_p) > > - { > > - wide_int op_min_value, op_max_value; > > - if (!vect_get_range_info (op0, &op_min_value, > &op_max_value)) > > - break; > > - > > - if (cvt_type == NULL_TREE > > - || (wi::min_precision (op_max_value, SIGNED) > > - > TYPE_PRECISION (cvt_type)) > > - || (wi::min_precision (op_min_value, SIGNED) > > - > TYPE_PRECISION (cvt_type))) > > - continue; > > - } > > - > > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > - /* This should only happened for SLP as long as loop vectorizer > > - only supports same-sized vector. */ > > - if (cvt_type == NULL_TREE > > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > > - || !supportable_convert_operation ((tree_code) code1, > > - vectype_out, > > - cvt_type, &tc1) > > - || !supportable_convert_operation ((tree_code) codecvt1, > > - cvt_type, > > - vectype_in, &tc2)) > > - continue; > > - > > - found_mode = true; > > - break; > > - } > > + if (supportable_indirect_convert_operation (vinfo, > > + code, > > + vectype_out, > > + vectype_in, > > + &code1, > > + &codecvt1, > > + &multi_step_cvt, > > + &interm_types, > > + op0,slp_node)) > > + break; > > > > - if (found_mode) > > - { > > - multi_step_cvt++; > > - interm_types.safe_push (cvt_type); > > - cvt_type = NULL_TREE; > > - code1 = tc1; > > - codecvt1 = tc2; > > - break; > > - } > > - } > > /* FALLTHRU */ > > unsupported: > > if (dump_enabled_p ()) > > @@ -14626,6 +14551,153 @@ supportable_narrowing_operation > (code_helper code, > > return false; > > } > > > > +/* Function supportable_indirect_convert_operation > > + > > + Check whether an operation represented by the code CODE is two > > + convert operations that are supported by the target platform in > > + vector form (i.e., when operating on arguments of type VECTYPE_IN > > + producing a result of type VECTYPE_OUT). > > + > > + Convert operations we currently support directly are FIX_TRUNC and FLOAT. 
> > + This function checks if these operations are supported > > + by the target platform directly (via vector tree-codes). > > + > > + Output: > > + - CODE1 is the code of a vector operation to be used when > > + converting the operation in the first step, if available. > > + - CODE2 is the code of a vector operation to be used when > > + converting the operation in the second step, if available. > > + - MULTI_STEP_CVT determines the number of required intermediate steps > in > > + case of multi-step conversion (like int->short->char - in that case > > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > > + - INTERM_TYPES contains the intermediate type required to perform the > > + convert operation (short in the above example). */ > > +bool > > +supportable_indirect_convert_operation (vec_info *vinfo, > > + code_helper code, > > + tree vectype_out, > > + tree vectype_in, > > + code_helper *code1, > > + code_helper *code2, > > + int *multi_step_cvt, > > + vec<tree> *interm_types, > > This API is somewhat awkward; since we're inventing a new one, I guess we can do > better. I think we want > > vec<std::pair<tree, tree_code> > *converts, > > covering code1, code2, multi_step_cvt and interm_types all at once, with the conversion > sequence being > > converts[0].first tem0 = converts[0].second op0; > converts[1].first tem1 = converts[1].second tem0; > That's great; this really makes the function work better. > > ... Here converts.length () determines the length of the chain, length one being a direct > conversion, in which case converts[0].first is vectype_out. That would allow > double -> char to go double -> float -> int -> short -> char, for example. > I'm trying to pin down the requirement: do you want this function to support chains of more than two conversions? The current implementation only does a two-step conversion, e.g. double -> char becomes double -> int -> char. Actually, we should be able to do all conversions in two steps if suitable instructions exist.
I can't think of a scenario where multiple conversions are needed yet. Could you give me some examples? Of course, I could tweak this feature in advance if it is for future consideration. > > > + tree op0, > > + slp_tree slp_node) > > I would like to avoid passing VINFO and SLP_NODE here, see below. > The same is true for OP0 where the existing use is wrong for SLP already, but I > guess that can stay for now (I opened PR115538 about the wrong-code issue). > Thanks, I have removed them. > > > +{ > > + bool found_mode = false; > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); > > + opt_scalar_mode mode_iter; > > + tree_code tc1, tc2; > > + > > + tree cvt_type = NULL_TREE; > > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > > + > > + (*multi_step_cvt) = 0; > > + /* For conversions between float and integer types try whether > > + we can use intermediate signed integer types to support the > > + conversion. */ > > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > + && (code == FLOAT_EXPR > > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > + { > > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > (lhs_mode); > > + bool float_expr_p = code == FLOAT_EXPR; > > + unsigned short target_size; > > + scalar_mode intermediate_mode; > > + if (demotion) > > + { > > + intermediate_mode = lhs_mode; > > + target_size = GET_MODE_SIZE (rhs_mode); > > + } > > + else > > + { > > + target_size = GET_MODE_SIZE (lhs_mode); > > + if (!int_mode_for_size > > + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > > + return false; > > + } > > + *code1 = float_expr_p ? code : NOP_EXPR; > > + *code2 = float_expr_p ? 
NOP_EXPR : code; > > + opt_scalar_mode mode_iter; > > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > + { > > + intermediate_mode = mode_iter.require (); > > + > > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > > + break; > > + > > + scalar_mode cvt_mode; > > + if (!int_mode_for_size > > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > > + break; > > + > > + cvt_type = build_nonstandard_integer_type > > + (GET_MODE_BITSIZE (cvt_mode), 0); > > + > > + /* Check if the intermediate type can hold OP0's range. > > + When converting from float to integer this is not necessary > > + because values that do not fit the (smaller) target type are > > + unspecified anyway. */ > > + if (demotion && float_expr_p) > > + { > > + wide_int op_min_value, op_max_value; > > + /* For vector form, it looks like op0 doesn't have RANGE_INFO. > > + In the future, if it is supported, changes may need to be made > > + to this part, such as checking the RANGE of each element > > + in the vector. 
*/ > > + if (!SSA_NAME_RANGE_INFO (op0) > > + || !vect_get_range_info (op0, &op_min_value, > &op_max_value)) > > + break; > > + > > + if (cvt_type == NULL_TREE > > + || (wi::min_precision (op_max_value, SIGNED) > > + > TYPE_PRECISION (cvt_type)) > > + || (wi::min_precision (op_min_value, SIGNED) > > + > TYPE_PRECISION (cvt_type))) > > + continue; > > + } > > + > > + if (vinfo != NULL && slp_node != NULL) > > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > + else > > + { > > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > > + cvt_type = build_nonstandard_integer_type > > + (GET_MODE_BITSIZE (cvt_mode), uns); > > + cvt_type = build_vector_type (cvt_type, nelts); > > + } > > So this would then become > > cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE > (vectype_in), cvt_type, TYPE_VECTOR_SUBPARTS (vectype_in)); > > > + /* This should only happened for SLP as long as loop vectorizer > > + only supports same-sized vector. */ > > + if (cvt_type == NULL_TREE > > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > > + || !supportable_convert_operation ((tree_code) *code1, > > + vectype_out, > > + cvt_type, &tc1) > > + || !supportable_convert_operation ((tree_code) *code2, > > + cvt_type, > > + vectype_in, &tc2)) > > + continue; > > + > > + found_mode = true; > > + break; > > + } > > + > > + if (found_mode) > > + { > > + (*multi_step_cvt)++; > > + interm_types->safe_push (cvt_type); > > + cvt_type = NULL_TREE; > > + *code1 = tc1; > > + *code2 = tc2; > > + return true; > > + } > > + } > > + interm_types->release (); > > Hmm, ownership of interm_types is somewhat unclear here - the caller should > release it, or is the situation that the caller is confused by stray elements in it? In > that case I'd suggest to instead do interm_types->truncate (0). 
> It's my fault, I just imitated supportable_narrowing/widening_operation; I think interm_types->release() is not needed for this function. BRs, Lin ^ permalink raw reply [flat|nested] 33+ messages in thread
* RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-20 11:26 ` Hu, Lin1 @ 2024-06-24 12:33 ` Richard Biener 2024-06-24 14:12 ` Tamar Christina 0 siblings, 1 reply; 33+ messages in thread From: Richard Biener @ 2024-06-24 12:33 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao, ubizjak On Thu, 20 Jun 2024, Hu, Lin1 wrote: > > > else if (ret_elt_bits > arg_elt_bits) > > > modifier = WIDEN; > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > + { > > > + g = gimple_build_assign (lhs, code1, arg); > > > + gsi_replace (gsi, g, false); > > > + return; > > > + } > > > > Given the API change I suggest below it might make sense to have > > supportable_indirect_convert_operation do the above and represent it as single- > > step conversion? > > > > OK, if you want to supportable_indirect_convert_operation can do > something like supportable_convert_operation, I'll give it a try. This > functionality is really the part that this function can cover. But this > would require some changes not only the API change, because > supportable_indirect_convert_operation originally only supported Float > -> Int or Int ->Float. I think I'd like to see a single API to handle direct and (multi-)indirect-level converts that operate on vectors with all the same number of lanes. 
> > > > > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > > > + int multi_step_cvt = 0; > > > + vec<tree> interm_types = vNULL; > > > + if (supportable_indirect_convert_operation (NULL, > > > + code, > > > + ret_type, arg_type, > > > + &code2, &code3, > > > + &multi_step_cvt, > > > + &interm_types, arg)) > > > + { > > > + new_rhs = make_ssa_name (interm_types[0]); > > > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > > > + gsi_replace (gsi, g, false); > > > + return; > > > + } > > > + > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > FLOAT_EXPR)) > > > { > > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > - { > > > - g = gimple_build_assign (lhs, code1, arg); > > > - gsi_replace (gsi, g, false); > > > - return; > > > - } > > > /* Can't use get_compute_type here, as supportable_convert_operation > > > doesn't necessarily use an optab and needs two arguments. */ > > > tree vec_compute_type > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > > > 05a169ecb2d..0aa608202ca 100644 > > > --- a/gcc/tree-vect-stmts.cc > > > +++ b/gcc/tree-vect-stmts.cc > > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > > > tree scalar_dest; > > > tree op0, op1 = NULL_TREE; > > > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > > > - tree_code tc1, tc2; > > > + tree_code tc1; > > > code_helper code, code1, code2; > > > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > > > tree new_temp; > > > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > > > break; > > > } > > > > > > - /* For conversions between float and integer types try whether > > > - we can use intermediate signed integer types to support the > > > - conversion. 
*/ > > > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > > - && (code == FLOAT_EXPR || > > > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > > - { > > > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > > (lhs_mode); > > > - bool float_expr_p = code == FLOAT_EXPR; > > > - unsigned short target_size; > > > - scalar_mode intermediate_mode; > > > - if (demotion) > > > - { > > > - intermediate_mode = lhs_mode; > > > - target_size = GET_MODE_SIZE (rhs_mode); > > > - } > > > - else > > > - { > > > - target_size = GET_MODE_SIZE (lhs_mode); > > > - if (!int_mode_for_size > > > - (GET_MODE_BITSIZE (rhs_mode), 0).exists > > (&intermediate_mode)) > > > - goto unsupported; > > > - } > > > - code1 = float_expr_p ? code : NOP_EXPR; > > > - codecvt1 = float_expr_p ? NOP_EXPR : code; > > > - opt_scalar_mode mode_iter; > > > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > > - { > > > - intermediate_mode = mode_iter.require (); > > > - > > > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > > > - break; > > > - > > > - scalar_mode cvt_mode; > > > - if (!int_mode_for_size > > > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists > > (&cvt_mode)) > > > - break; > > > - > > > - cvt_type = build_nonstandard_integer_type > > > - (GET_MODE_BITSIZE (cvt_mode), 0); > > > - > > > - /* Check if the intermediate type can hold OP0's range. > > > - When converting from float to integer this is not necessary > > > - because values that do not fit the (smaller) target type are > > > - unspecified anyway. 
*/ > > > - if (demotion && float_expr_p) > > > - { > > > - wide_int op_min_value, op_max_value; > > > - if (!vect_get_range_info (op0, &op_min_value, > > &op_max_value)) > > > - break; > > > - > > > - if (cvt_type == NULL_TREE > > > - || (wi::min_precision (op_max_value, SIGNED) > > > - > TYPE_PRECISION (cvt_type)) > > > - || (wi::min_precision (op_min_value, SIGNED) > > > - > TYPE_PRECISION (cvt_type))) > > > - continue; > > > - } > > > - > > > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > > - /* This should only happened for SLP as long as loop vectorizer > > > - only supports same-sized vector. */ > > > - if (cvt_type == NULL_TREE > > > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > > > - || !supportable_convert_operation ((tree_code) code1, > > > - vectype_out, > > > - cvt_type, &tc1) > > > - || !supportable_convert_operation ((tree_code) codecvt1, > > > - cvt_type, > > > - vectype_in, &tc2)) > > > - continue; > > > - > > > - found_mode = true; > > > - break; > > > - } > > > + if (supportable_indirect_convert_operation (vinfo, > > > + code, > > > + vectype_out, > > > + vectype_in, > > > + &code1, > > > + &codecvt1, > > > + &multi_step_cvt, > > > + &interm_types, > > > + op0,slp_node)) > > > + break; > > > > > > - if (found_mode) > > > - { > > > - multi_step_cvt++; > > > - interm_types.safe_push (cvt_type); > > > - cvt_type = NULL_TREE; > > > - code1 = tc1; > > > - codecvt1 = tc2; > > > - break; > > > - } > > > - } > > > /* FALLTHRU */ > > > unsupported: > > > if (dump_enabled_p ()) > > > @@ -14626,6 +14551,153 @@ supportable_narrowing_operation > > (code_helper code, > > > return false; > > > } > > > > > > +/* Function supportable_indirect_convert_operation > > > + > > > + Check whether an operation represented by the code CODE is two > > > + convert operations that are supported by the target platform in > > > + vector form (i.e., when operating on arguments of type VECTYPE_IN > > > + producing a result of type 
VECTYPE_OUT). > > > + > > > + Convert operations we currently support directly are FIX_TRUNC and FLOAT. > > > + This function checks if these operations are supported > > > + by the target platform directly (via vector tree-codes). > > > + > > > + Output: > > > + - CODE1 is the code of a vector operation to be used when > > > + converting the operation in the first step, if available. > > > + - CODE2 is the code of a vector operation to be used when > > > + converting the operation in the second step, if available. > > > + - MULTI_STEP_CVT determines the number of required intermediate steps > > in > > > + case of multi-step conversion (like int->short->char - in that case > > > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > > > + - INTERM_TYPES contains the intermediate type required to perform the > > > + convert operation (short in the above example). */ > > > +bool > > > +supportable_indirect_convert_operation (vec_info *vinfo, > > > + code_helper code, > > > + tree vectype_out, > > > + tree vectype_in, > > > + code_helper *code1, > > > + code_helper *code2, > > > + int *multi_step_cvt, > > > + vec<tree> *interm_types, > > > > This API is somewhat awkward, as we're inventing a new one I guess we can do > > better. I think we want > > > > vec<std::pair<tree, tree_code> > *converts, > > > > covering all code1, code2, multi_step_cvt and interm_types with the conversion > > sequence being > > > > converts[0].first tem0 = converts[0].second op0; > > converts[1].first tem1 = converts[1].second tem; > > > > That's great, this really makes the function work better. > > > > > ... while converts.length () determines the length of the chain, one being a direct > > conversion where then converts[0].first is vectype_out. That would allow > > double -> char to go double -> float -> int -> short -> char for example. 
> > > > I'm trying to determine the requirements, do you want this function to > support multiple conversions (the current implementation just does a > two-step conversion, like double -> char, which becomes double -> int -> > char). Actually we should be able to do all conversions in two steps, if > we have some suitable instructions. I can't think of a scenario where > multiple conversions are needed yet. Could you give me some examples? Of > course, I could tweak this feature in advance if it is for future > consideration. I think the API should support multi-level, not only two levels. The implementation doesn't need to cover that case unless we run into such a requirement. Usually vector ISAs allow 2x integer widening/shortening but not 4x, so a VnDImode -> VnQImode conversion would need to go via VnSImode and VnHImode (of course some targets might "help" the vectorizer by providing a VnDImode -> VnQImode pattern that does the intermediate conversions behind the vectorizers back). But yes, the original motivation for the vectorizer code was that float<->int conversions are limited. Thanks, Richard. > > > > > > + tree op0, > > > + slp_tree slp_node) > > > > I would like to avoid passing VINFO and SLP_NODE here, see below. > > The same is true for OP0 where the existing use is wrong for SLP already, but I > > guess that can stay for now (I opened PR115538 about the wrong-code issue). > > > > Thanks, I have removed them. 
> > > > > > +{ > > > + bool found_mode = false; > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); > > > + opt_scalar_mode mode_iter; > > > + tree_code tc1, tc2; > > > + > > > + tree cvt_type = NULL_TREE; > > > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > > > + > > > + (*multi_step_cvt) = 0; > > > + /* For conversions between float and integer types try whether > > > + we can use intermediate signed integer types to support the > > > + conversion. */ > > > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > > + && (code == FLOAT_EXPR > > > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > > + { > > > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > > (lhs_mode); > > > + bool float_expr_p = code == FLOAT_EXPR; > > > + unsigned short target_size; > > > + scalar_mode intermediate_mode; > > > + if (demotion) > > > + { > > > + intermediate_mode = lhs_mode; > > > + target_size = GET_MODE_SIZE (rhs_mode); > > > + } > > > + else > > > + { > > > + target_size = GET_MODE_SIZE (lhs_mode); > > > + if (!int_mode_for_size > > > + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > > > + return false; > > > + } > > > + *code1 = float_expr_p ? code : NOP_EXPR; > > > + *code2 = float_expr_p ? NOP_EXPR : code; > > > + opt_scalar_mode mode_iter; > > > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > > + { > > > + intermediate_mode = mode_iter.require (); > > > + > > > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > > > + break; > > > + > > > + scalar_mode cvt_mode; > > > + if (!int_mode_for_size > > > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > > > + break; > > > + > > > + cvt_type = build_nonstandard_integer_type > > > + (GET_MODE_BITSIZE (cvt_mode), 0); > > > + > > > + /* Check if the intermediate type can hold OP0's range. 
> > > + When converting from float to integer this is not necessary > > > + because values that do not fit the (smaller) target type are > > > + unspecified anyway. */ > > > + if (demotion && float_expr_p) > > > + { > > > + wide_int op_min_value, op_max_value; > > > + /* For vector form, it looks like op0 doesn't have RANGE_INFO. > > > + In the future, if it is supported, changes may need to be made > > > + to this part, such as checking the RANGE of each element > > > + in the vector. */ > > > + if (!SSA_NAME_RANGE_INFO (op0) > > > + || !vect_get_range_info (op0, &op_min_value, > > &op_max_value)) > > > + break; > > > + > > > + if (cvt_type == NULL_TREE > > > + || (wi::min_precision (op_max_value, SIGNED) > > > + > TYPE_PRECISION (cvt_type)) > > > + || (wi::min_precision (op_min_value, SIGNED) > > > + > TYPE_PRECISION (cvt_type))) > > > + continue; > > > + } > > > + > > > + if (vinfo != NULL && slp_node != NULL) > > > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > > + else > > > + { > > > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > > > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > > > + cvt_type = build_nonstandard_integer_type > > > + (GET_MODE_BITSIZE (cvt_mode), uns); > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > + } > > > > So this would then become > > > > cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE > > (vectype_in), cvt_type, TYPE_VECTOR_SUBPARTS (vectype_in)); > > > > > + /* This should only happened for SLP as long as loop vectorizer > > > + only supports same-sized vector. 
*/ > > > + if (cvt_type == NULL_TREE > > > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > > > + || !supportable_convert_operation ((tree_code) *code1, > > > + vectype_out, > > > + cvt_type, &tc1) > > > + || !supportable_convert_operation ((tree_code) *code2, > > > + cvt_type, > > > + vectype_in, &tc2)) > > > + continue; > > > + > > > + found_mode = true; > > > + break; > > > + } > > > + > > > + if (found_mode) > > > + { > > > + (*multi_step_cvt)++; > > > + interm_types->safe_push (cvt_type); > > > + cvt_type = NULL_TREE; > > > + *code1 = tc1; > > > + *code2 = tc2; > > > + return true; > > > + } > > > + } > > > + interm_types->release (); > > > > Hmm, ownership of interm_types is somewhat unclear here - the caller should > > release it, or is the situation that the caller is confused by stray elements in it? In > > that case I'd suggest to instead do interm_types->truncate (0). > > > > It's my fault, I just imitate supportable_narrowing/widening_operation, > I think for this function, interm_types->release() is not needed. ^ permalink raw reply [flat|nested] 33+ messages in thread
* RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-24 12:33 ` Richard Biener @ 2024-06-24 14:12 ` Tamar Christina 2024-06-25 2:00 ` Hu, Lin1 2024-06-25 3:28 ` [PATCH 1/3 v4] " Hu, Lin1 0 siblings, 2 replies; 33+ messages in thread From: Tamar Christina @ 2024-06-24 14:12 UTC (permalink / raw) To: Richard Biener, Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao, ubizjak > -----Original Message----- > From: Richard Biener <rguenther@suse.de> > Sent: Monday, June 24, 2024 1:34 PM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com > Subject: RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float > -> float and int <-> float. > > On Thu, 20 Jun 2024, Hu, Lin1 wrote: > > > > > else if (ret_elt_bits > arg_elt_bits) > > > > modifier = WIDEN; > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > + { > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > + gsi_replace (gsi, g, false); > > > > + return; > > > > + } > > > > > > Given the API change I suggest below it might make sense to have > > > supportable_indirect_convert_operation do the above and represent it as > single- > > > step conversion? > > > > > > > OK, if you want to supportable_indirect_convert_operation can do > > something like supportable_convert_operation, I'll give it a try. This > > functionality is really the part that this function can cover. But this > > would require some changes not only the API change, because > > supportable_indirect_convert_operation originally only supported Float > > -> Int or Int ->Float. > > I think I'd like to see a single API to handle direct and > (multi-)indirect-level converts that operate on vectors with all > the same number of lanes. 
> > > > > > > > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > > > > + int multi_step_cvt = 0; > > > > + vec<tree> interm_types = vNULL; > > > > + if (supportable_indirect_convert_operation (NULL, > > > > + code, > > > > + ret_type, arg_type, > > > > + &code2, &code3, > > > > + &multi_step_cvt, > > > > + &interm_types, arg)) > > > > + { > > > > + new_rhs = make_ssa_name (interm_types[0]); > > > > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > > > > + gsi_replace (gsi, g, false); > > > > + return; > > > > + } > > > > + > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > FLOAT_EXPR)) > > > > { > > > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > - { > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > - gsi_replace (gsi, g, false); > > > > - return; > > > > - } > > > > /* Can't use get_compute_type here, as supportable_convert_operation > > > > doesn't necessarily use an optab and needs two arguments. 
*/ > > > > tree vec_compute_type > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > > > > 05a169ecb2d..0aa608202ca 100644 > > > > --- a/gcc/tree-vect-stmts.cc > > > > +++ b/gcc/tree-vect-stmts.cc > > > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > > > > tree scalar_dest; > > > > tree op0, op1 = NULL_TREE; > > > > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > > > > - tree_code tc1, tc2; > > > > + tree_code tc1; > > > > code_helper code, code1, code2; > > > > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > > > > tree new_temp; > > > > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > > > > break; > > > > } > > > > > > > > - /* For conversions between float and integer types try whether > > > > - we can use intermediate signed integer types to support the > > > > - conversion. */ > > > > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > > > - && (code == FLOAT_EXPR || > > > > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > > > - { > > > > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > > > (lhs_mode); > > > > - bool float_expr_p = code == FLOAT_EXPR; > > > > - unsigned short target_size; > > > > - scalar_mode intermediate_mode; > > > > - if (demotion) > > > > - { > > > > - intermediate_mode = lhs_mode; > > > > - target_size = GET_MODE_SIZE (rhs_mode); > > > > - } > > > > - else > > > > - { > > > > - target_size = GET_MODE_SIZE (lhs_mode); > > > > - if (!int_mode_for_size > > > > - (GET_MODE_BITSIZE (rhs_mode), 0).exists > > > (&intermediate_mode)) > > > > - goto unsupported; > > > > - } > > > > - code1 = float_expr_p ? code : NOP_EXPR; > > > > - codecvt1 = float_expr_p ? 
NOP_EXPR : code; > > > > - opt_scalar_mode mode_iter; > > > > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > > > - { > > > > - intermediate_mode = mode_iter.require (); > > > > - > > > > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > > > > - break; > > > > - > > > > - scalar_mode cvt_mode; > > > > - if (!int_mode_for_size > > > > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists > > > (&cvt_mode)) > > > > - break; > > > > - > > > > - cvt_type = build_nonstandard_integer_type > > > > - (GET_MODE_BITSIZE (cvt_mode), 0); > > > > - > > > > - /* Check if the intermediate type can hold OP0's range. > > > > - When converting from float to integer this is not necessary > > > > - because values that do not fit the (smaller) target type are > > > > - unspecified anyway. */ > > > > - if (demotion && float_expr_p) > > > > - { > > > > - wide_int op_min_value, op_max_value; > > > > - if (!vect_get_range_info (op0, &op_min_value, > > > &op_max_value)) > > > > - break; > > > > - > > > > - if (cvt_type == NULL_TREE > > > > - || (wi::min_precision (op_max_value, SIGNED) > > > > - > TYPE_PRECISION (cvt_type)) > > > > - || (wi::min_precision (op_min_value, SIGNED) > > > > - > TYPE_PRECISION (cvt_type))) > > > > - continue; > > > > - } > > > > - > > > > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > > > - /* This should only happened for SLP as long as loop vectorizer > > > > - only supports same-sized vector. 
*/ > > > > - if (cvt_type == NULL_TREE > > > > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > > > > - || !supportable_convert_operation ((tree_code) code1, > > > > - vectype_out, > > > > - cvt_type, &tc1) > > > > - || !supportable_convert_operation ((tree_code) codecvt1, > > > > - cvt_type, > > > > - vectype_in, &tc2)) > > > > - continue; > > > > - > > > > - found_mode = true; > > > > - break; > > > > - } > > > > + if (supportable_indirect_convert_operation (vinfo, > > > > + code, > > > > + vectype_out, > > > > + vectype_in, > > > > + &code1, > > > > + &codecvt1, > > > > + &multi_step_cvt, > > > > + &interm_types, > > > > + op0,slp_node)) > > > > + break; > > > > > > > > - if (found_mode) > > > > - { > > > > - multi_step_cvt++; > > > > - interm_types.safe_push (cvt_type); > > > > - cvt_type = NULL_TREE; > > > > - code1 = tc1; > > > > - codecvt1 = tc2; > > > > - break; > > > > - } > > > > - } > > > > /* FALLTHRU */ > > > > unsupported: > > > > if (dump_enabled_p ()) > > > > @@ -14626,6 +14551,153 @@ supportable_narrowing_operation > > > (code_helper code, > > > > return false; > > > > } > > > > > > > > +/* Function supportable_indirect_convert_operation > > > > + > > > > + Check whether an operation represented by the code CODE is two > > > > + convert operations that are supported by the target platform in > > > > + vector form (i.e., when operating on arguments of type VECTYPE_IN > > > > + producing a result of type VECTYPE_OUT). > > > > + > > > > + Convert operations we currently support directly are FIX_TRUNC and > FLOAT. > > > > + This function checks if these operations are supported > > > > + by the target platform directly (via vector tree-codes). > > > > + > > > > + Output: > > > > + - CODE1 is the code of a vector operation to be used when > > > > + converting the operation in the first step, if available. > > > > + - CODE2 is the code of a vector operation to be used when > > > > + converting the operation in the second step, if available. 
> > > > + - MULTI_STEP_CVT determines the number of required intermediate steps > > > in > > > > + case of multi-step conversion (like int->short->char - in that case > > > > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > > > > + - INTERM_TYPES contains the intermediate type required to perform the > > > > + convert operation (short in the above example). */ > > > > +bool > > > > +supportable_indirect_convert_operation (vec_info *vinfo, > > > > + code_helper code, > > > > + tree vectype_out, > > > > + tree vectype_in, > > > > + code_helper *code1, > > > > + code_helper *code2, > > > > + int *multi_step_cvt, > > > > + vec<tree> *interm_types, > > > > > > This API is somewhat awkward, as we're inventing a new one I guess we can do > > > better. I think we want > > > > > > vec<std::pair<tree, tree_code> > *converts, > > > > > > covering all code1, code2, multi_step_cvt and interm_types with the > conversion > > > sequence being > > > > > > converts[0].first tem0 = converts[0].second op0; > > > converts[1].first tem1 = converts[1].second tem; > > > > > > > That's great, this really makes the function work better. > > > > > > > > ... while converts.length () determines the length of the chain, one being a > direct > > > conversion where then converts[0].first is vectype_out. That would allow > > > double -> char to go double -> float -> int -> short -> char for example. > > > > > > > I'm trying to determine the requirements, do you want this function to > > support multiple conversions (the current implementation just does a > > two-step conversion, like double -> char, which becomes double -> int -> > > char). Actually we should be able to do all conversions in two steps, if > > we have some suitable instructions. I can't think of a scenario where > > multiple conversions are needed yet. Could you give me some examples? Of > > course, I could tweak this feature in advance if it is for future > > consideration. 
> > I think the API should support multi-level, not only two levels. The > implementation doesn't need to cover that case unless we run into > such a requirement. Usually vector ISAs allow 2x integer > widening/shortening but not 4x, so a VnDImode -> VnQImode conversion > would need to go via VnSImode and VnHImode (of course some targets > might "help" the vectorizer by providing a VnDImode -> VnQImode > pattern that does the intermediate conversions behind the vectorizers > back). But yes, the original motivation for the vectorizer code > was that float<->int conversions are limited. > I have a similar patch in this area but instead looking at unsigned int <-> double conversion. I would want to avoid complicating this area too much so it would be good if the API doesn't care about sign either and allows the target to choose the operation mode? My current patch has a backend target hook that asks if the current widening is preferred as multi-level or single level. For single level I just generate VEC_PERM_EXPRs with a zero register. Just wanted to bring it up in case we can have a coherent story around these conversions. Thanks, Tamar > Thanks, > Richard. > > > > > > > > + tree op0, > > > > + slp_tree slp_node) > > > > > > I would like to avoid passing VINFO and SLP_NODE here, see below. > > > The same is true for OP0 where the existing use is wrong for SLP already, but I > > > guess that can stay for now (I opened PR115538 about the wrong-code issue). > > > > > > Thanks, I have removed them.
> > > > > > > > > +{ > > > > + bool found_mode = false; > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); > > > > + opt_scalar_mode mode_iter; > > > > + tree_code tc1, tc2; > > > > + > > > > + tree cvt_type = NULL_TREE; > > > > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > > > > + > > > > + (*multi_step_cvt) = 0; > > > > + /* For conversions between float and integer types try whether > > > > + we can use intermediate signed integer types to support the > > > > + conversion. */ > > > > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > > > + && (code == FLOAT_EXPR > > > > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > > > + { > > > > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > > > (lhs_mode); > > > > + bool float_expr_p = code == FLOAT_EXPR; > > > > + unsigned short target_size; > > > > + scalar_mode intermediate_mode; > > > > + if (demotion) > > > > + { > > > > + intermediate_mode = lhs_mode; > > > > + target_size = GET_MODE_SIZE (rhs_mode); > > > > + } > > > > + else > > > > + { > > > > + target_size = GET_MODE_SIZE (lhs_mode); > > > > + if (!int_mode_for_size > > > > + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > > > > + return false; > > > > + } > > > > + *code1 = float_expr_p ? code : NOP_EXPR; > > > > + *code2 = float_expr_p ? 
NOP_EXPR : code; > > > > + opt_scalar_mode mode_iter; > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > > > + { > > > > + intermediate_mode = mode_iter.require (); > > > > + > > > > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > > > > + break; > > > > + > > > > + scalar_mode cvt_mode; > > > > + if (!int_mode_for_size > > > > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > > > > + break; > > > > + > > > > + cvt_type = build_nonstandard_integer_type > > > > + (GET_MODE_BITSIZE (cvt_mode), 0); > > > > + > > > > + /* Check if the intermediate type can hold OP0's range. > > > > + When converting from float to integer this is not necessary > > > > + because values that do not fit the (smaller) target type are > > > > + unspecified anyway. */ > > > > + if (demotion && float_expr_p) > > > > + { > > > > + wide_int op_min_value, op_max_value; > > > > + /* For vector form, it looks like op0 doesn't have RANGE_INFO. > > > > + In the future, if it is supported, changes may need to be made > > > > + to this part, such as checking the RANGE of each element > > > > + in the vector. 
*/ > > > > + if (!SSA_NAME_RANGE_INFO (op0) > > > > + || !vect_get_range_info (op0, &op_min_value, > > > &op_max_value)) > > > > + break; > > > > + > > > > + if (cvt_type == NULL_TREE > > > > + || (wi::min_precision (op_max_value, SIGNED) > > > > + > TYPE_PRECISION (cvt_type)) > > > > + || (wi::min_precision (op_min_value, SIGNED) > > > > + > TYPE_PRECISION (cvt_type))) > > > > + continue; > > > > + } > > > > + > > > > + if (vinfo != NULL && slp_node != NULL) > > > > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > > > + else > > > > + { > > > > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > > > > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > > > > + cvt_type = build_nonstandard_integer_type > > > > + (GET_MODE_BITSIZE (cvt_mode), uns); > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > + } > > > > > > So this would then become > > > > > > cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE > > > (vectype_in), cvt_type, TYPE_VECTOR_SUBPARTS (vectype_in)); > > > > > > > + /* This should only happened for SLP as long as loop vectorizer > > > > + only supports same-sized vector. 
*/ > > > > + if (cvt_type == NULL_TREE > > > > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > > > > + || !supportable_convert_operation ((tree_code) *code1, > > > > + vectype_out, > > > > + cvt_type, &tc1) > > > > + || !supportable_convert_operation ((tree_code) *code2, > > > > + cvt_type, > > > > + vectype_in, &tc2)) > > > > + continue; > > > > + > > > > + found_mode = true; > > > > + break; > > > > + } > > > > + > > > > + if (found_mode) > > > > + { > > > > + (*multi_step_cvt)++; > > > > + interm_types->safe_push (cvt_type); > > > > + cvt_type = NULL_TREE; > > > > + *code1 = tc1; > > > > + *code2 = tc2; > > > > + return true; > > > > + } > > > > + } > > > > + interm_types->release (); > > > > > > Hmm, ownership of interm_types is somewhat unclear here - the caller should > > > release it, or is the situation that the caller is confused by stray elements in it? > In > > > that case I'd suggest to instead do interm_types->truncate (0). > > > > > > > It's my fault, I just imitate supportable_narrowing/widening_operation, > > I think for this function, interm_types->release() is not needed. ^ permalink raw reply [flat|nested] 33+ messages in thread
* RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-24 14:12 ` Tamar Christina @ 2024-06-25 2:00 ` Hu, Lin1 2024-06-25 3:28 ` [PATCH 1/3 v4] " Hu, Lin1 1 sibling, 0 replies; 33+ messages in thread From: Hu, Lin1 @ 2024-06-25 2:00 UTC (permalink / raw) To: Tamar Christina, Richard Biener; +Cc: gcc-patches, Liu, Hongtao, ubizjak > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Monday, June 24, 2024 10:12 PM > To: Richard Biener <rguenther@suse.de>; Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com > Subject: RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, > float -> float and int <-> float. > > > -----Original Message----- > > From: Richard Biener <rguenther@suse.de> > > Sent: Monday, June 24, 2024 1:34 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com > > Subject: RE: [PATCH 1/3 v3] vect: generate suitable convert insn for > > int -> int, float > > -> float and int <-> float. > > > > On Thu, 20 Jun 2024, Hu, Lin1 wrote: > > > > > > > else if (ret_elt_bits > arg_elt_bits) > > > > > modifier = WIDEN; > > > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > > > > + { > > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > > + gsi_replace (gsi, g, false); > > > > > + return; > > > > > + } > > > > > > > > Given the API change I suggest below it might make sense to have > > > > supportable_indirect_convert_operation do the above and represent > > > > it as > > single- > > > > step conversion? > > > > > > > > > > OK, if you want to supportable_indirect_convert_operation can do > > > something like supportable_convert_operation, I'll give it a try. > > > This functionality is really the part that this function can cover. 
> > > But this would require some changes not only the API change, because > > > supportable_indirect_convert_operation originally only supported > > > Float > > > -> Int or Int ->Float. > > > > I think I'd like to see a single API to handle direct and > > (multi-)indirect-level converts that operate on vectors with all the > > same number of lanes. > > > > > > > > > > > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > > > > > + int multi_step_cvt = 0; > > > > > + vec<tree> interm_types = vNULL; > > > > > + if (supportable_indirect_convert_operation (NULL, > > > > > + code, > > > > > + ret_type, arg_type, > > > > > + &code2, &code3, > > > > > + &multi_step_cvt, > > > > > + &interm_types, arg)) > > > > > + { > > > > > + new_rhs = make_ssa_name (interm_types[0]); > > > > > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > > > > > + gsi_replace (gsi, g, false); > > > > > + return; > > > > > + } > > > > > + > > > > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > > FLOAT_EXPR)) > > > > > { > > > > > - if (supportable_convert_operation (code, ret_type, arg_type, > &code1)) > > > > > - { > > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > > - gsi_replace (gsi, g, false); > > > > > - return; > > > > > - } > > > > > /* Can't use get_compute_type here, as > supportable_convert_operation > > > > > doesn't necessarily use an optab and needs two arguments. 
*/ > > > > > tree vec_compute_type > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > > index 05a169ecb2d..0aa608202ca 100644 > > > > > --- a/gcc/tree-vect-stmts.cc > > > > > +++ b/gcc/tree-vect-stmts.cc > > > > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > > > > > tree scalar_dest; > > > > > tree op0, op1 = NULL_TREE; > > > > > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > > > > > - tree_code tc1, tc2; > > > > > + tree_code tc1; > > > > > code_helper code, code1, code2; > > > > > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > > > > > tree new_temp; > > > > > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > > > > > break; > > > > > } > > > > > > > > > > - /* For conversions between float and integer types try whether > > > > > - we can use intermediate signed integer types to support the > > > > > - conversion. */ > > > > > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > > > > - && (code == FLOAT_EXPR || > > > > > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > > > > - { > > > > > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > > > > (lhs_mode); > > > > > - bool float_expr_p = code == FLOAT_EXPR; > > > > > - unsigned short target_size; > > > > > - scalar_mode intermediate_mode; > > > > > - if (demotion) > > > > > - { > > > > > - intermediate_mode = lhs_mode; > > > > > - target_size = GET_MODE_SIZE (rhs_mode); > > > > > - } > > > > > - else > > > > > - { > > > > > - target_size = GET_MODE_SIZE (lhs_mode); > > > > > - if (!int_mode_for_size > > > > > - (GET_MODE_BITSIZE (rhs_mode), 0).exists > > > > (&intermediate_mode)) > > > > > - goto unsupported; > > > > > - } > > > > > - code1 = float_expr_p ? code : NOP_EXPR; > > > > > - codecvt1 = float_expr_p ? 
NOP_EXPR : code; > > > > > - opt_scalar_mode mode_iter; > > > > > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > > > > - { > > > > > - intermediate_mode = mode_iter.require (); > > > > > - > > > > > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > > > > > - break; > > > > > - > > > > > - scalar_mode cvt_mode; > > > > > - if (!int_mode_for_size > > > > > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists > > > > (&cvt_mode)) > > > > > - break; > > > > > - > > > > > - cvt_type = build_nonstandard_integer_type > > > > > - (GET_MODE_BITSIZE (cvt_mode), 0); > > > > > - > > > > > - /* Check if the intermediate type can hold OP0's range. > > > > > - When converting from float to integer this is not necessary > > > > > - because values that do not fit the (smaller) target type are > > > > > - unspecified anyway. */ > > > > > - if (demotion && float_expr_p) > > > > > - { > > > > > - wide_int op_min_value, op_max_value; > > > > > - if (!vect_get_range_info (op0, &op_min_value, > > > > &op_max_value)) > > > > > - break; > > > > > - > > > > > - if (cvt_type == NULL_TREE > > > > > - || (wi::min_precision (op_max_value, SIGNED) > > > > > - > TYPE_PRECISION (cvt_type)) > > > > > - || (wi::min_precision (op_min_value, SIGNED) > > > > > - > TYPE_PRECISION (cvt_type))) > > > > > - continue; > > > > > - } > > > > > - > > > > > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > > > > - /* This should only happened for SLP as long as loop vectorizer > > > > > - only supports same-sized vector. 
*/ > > > > > - if (cvt_type == NULL_TREE > > > > > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > > > > > - || !supportable_convert_operation ((tree_code) code1, > > > > > - vectype_out, > > > > > - cvt_type, &tc1) > > > > > - || !supportable_convert_operation ((tree_code) codecvt1, > > > > > - cvt_type, > > > > > - vectype_in, &tc2)) > > > > > - continue; > > > > > - > > > > > - found_mode = true; > > > > > - break; > > > > > - } > > > > > + if (supportable_indirect_convert_operation (vinfo, > > > > > + code, > > > > > + vectype_out, > > > > > + vectype_in, > > > > > + &code1, > > > > > + &codecvt1, > > > > > + &multi_step_cvt, > > > > > + &interm_types, > > > > > + op0,slp_node)) > > > > > + break; > > > > > > > > > > - if (found_mode) > > > > > - { > > > > > - multi_step_cvt++; > > > > > - interm_types.safe_push (cvt_type); > > > > > - cvt_type = NULL_TREE; > > > > > - code1 = tc1; > > > > > - codecvt1 = tc2; > > > > > - break; > > > > > - } > > > > > - } > > > > > /* FALLTHRU */ > > > > > unsupported: > > > > > if (dump_enabled_p ()) > > > > > @@ -14626,6 +14551,153 @@ supportable_narrowing_operation > > > > (code_helper code, > > > > > return false; > > > > > } > > > > > > > > > > +/* Function supportable_indirect_convert_operation > > > > > + > > > > > + Check whether an operation represented by the code CODE is two > > > > > + convert operations that are supported by the target platform in > > > > > + vector form (i.e., when operating on arguments of type VECTYPE_IN > > > > > + producing a result of type VECTYPE_OUT). > > > > > + > > > > > + Convert operations we currently support directly are > > > > > + FIX_TRUNC and > > FLOAT. > > > > > + This function checks if these operations are supported > > > > > + by the target platform directly (via vector tree-codes). > > > > > + > > > > > + Output: > > > > > + - CODE1 is the code of a vector operation to be used when > > > > > + converting the operation in the first step, if available. 
> > > > > + - CODE2 is the code of a vector operation to be used when > > > > > + converting the operation in the second step, if available. > > > > > + - MULTI_STEP_CVT determines the number of required > > > > > + intermediate steps > > > > in > > > > > + case of multi-step conversion (like int->short->char - in that case > > > > > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > > > > > + - INTERM_TYPES contains the intermediate type required to perform > the > > > > > + convert operation (short in the above example). */ > > > > > +bool > > > > > +supportable_indirect_convert_operation (vec_info *vinfo, > > > > > + code_helper code, > > > > > + tree vectype_out, > > > > > + tree vectype_in, > > > > > + code_helper *code1, > > > > > + code_helper *code2, > > > > > + int *multi_step_cvt, > > > > > + vec<tree> *interm_types, > > > > > > > > This API is somewhat awkward, as we're inventing a new one I guess > > > > we can do better. I think we want > > > > > > > > vec<std::pair<tree, tree_code> > *converts, > > > > > > > > covering all code1, code2, multi_step_cvt and interm_types with > > > > the > > conversion > > > > sequence being > > > > > > > > converts[0].first tem0 = converts[0].second op0; > > > > converts[1].first tem1 = converts[1].second tem; > > > > > > > > > > That's great, this really makes the function work better. > > > > > > > > > > > ... while converts.length () determines the length of the chain, > > > > one being a > > direct > > > > conversion where then converts[0].first is vectype_out. That > > > > would allow double -> char to go double -> float -> int -> short -> char for > example. > > > > > > > > > > I'm trying to determine the requirements, do you want this function > > > to support multiple conversions (the current implementation just > > > does a two-step conversion, like double -> char, which becomes > > > double -> int -> char). 
Actually we should be able to do all > > > conversions in two steps, if we have some suitable instructions. I > > > can't think of a scenario where multiple conversions are needed yet. > > > Could you give me some examples? Of course, I could tweak this > > > feature in advance if it is for future consideration. > > > > I think the API should support multi-level, not only two levels. The > > implementation doesn't need to cover that case unless we run into such > > a requirement. Usually vector ISAs allow 2x integer > > widening/shortening but not 4x, so a VnDImode -> VnQImode conversion > > would need to go via VnSImode and VnHImode (of course some targets > > might "help" the vectorizer by providing a VnDImode -> VnQImode > > pattern that does the intermediate conversions behind the vectorizer's > > back). But yes, the original motivation for the vectorizer code was > > that float<->int conversions are limited. > > > > I have a similar patch in this area, but instead looking at unsigned int <-> double > conversion. I would want to avoid complicating this area too much, so it would > be good if the API doesn't care about sign either and allows the target to choose > the operation mode. > > My current patch has a backend target hook that asks if the current widening is > preferred as multi-level or single-level. For single-level I just generate > VEC_PERM_EXPRs with a zero register. > > Just wanted to bring it up in case we can have a coherent story around these > conversions. > This is the current API. bool supportable_indirect_convert_operation (code_helper code, tree vectype_out, tree vectype_in, vec<std::pair<tree, tree_code> > *converts, tree op0) This API doesn't care about sign; in my opinion, both double -> char and double -> unsigned char will be converted to int first.
About the backend target hook: I guess you use targetm.* to let the target choose the operation mode. If so, I think the API is OK for now; you can use targetm to determine the operation mode and return it through converts. What do you think? BRs, Lin > > > > > > > > > > > > > > + tree op0, > > > > > + slp_tree slp_node) > > > > > > > > I would like to avoid passing VINFO and SLP_NODE here, see below. > > > > The same is true for OP0 where the existing use is wrong for SLP > > > > already, but I guess that can stay for now (I opened PR115538 about the > wrong-code issue). > > > > > > > > > > Thanks, I have removed them. > > > > > > > > > > > > +{ > > > > > + bool found_mode = false; > > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > +(vectype_out)); > > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE > > > > > +(vectype_in)); > > > > > + opt_scalar_mode mode_iter; > > > > > + tree_code tc1, tc2; > > > > > + > > > > > + tree cvt_type = NULL_TREE; > > > > > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > > > > > + > > > > > + (*multi_step_cvt) = 0; > > > > > + /* For conversions between float and integer types try whether > > > > > + we can use intermediate signed integer types to support the > > > > > + conversion.
*/ > > > > > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > > > > + && (code == FLOAT_EXPR > > > > > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > > > > + { > > > > > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > > > > (lhs_mode); > > > > > + bool float_expr_p = code == FLOAT_EXPR; > > > > > + unsigned short target_size; > > > > > + scalar_mode intermediate_mode; > > > > > + if (demotion) > > > > > + { > > > > > + intermediate_mode = lhs_mode; > > > > > + target_size = GET_MODE_SIZE (rhs_mode); > > > > > + } > > > > > + else > > > > > + { > > > > > + target_size = GET_MODE_SIZE (lhs_mode); > > > > > + if (!int_mode_for_size > > > > > + (GET_MODE_BITSIZE (rhs_mode), 0).exists > (&intermediate_mode)) > > > > > + return false; > > > > > + } > > > > > + *code1 = float_expr_p ? code : NOP_EXPR; > > > > > + *code2 = float_expr_p ? NOP_EXPR : code; > > > > > + opt_scalar_mode mode_iter; > > > > > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > > > > + { > > > > > + intermediate_mode = mode_iter.require (); > > > > > + > > > > > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > > > > > + break; > > > > > + > > > > > + scalar_mode cvt_mode; > > > > > + if (!int_mode_for_size > > > > > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists > (&cvt_mode)) > > > > > + break; > > > > > + > > > > > + cvt_type = build_nonstandard_integer_type > > > > > + (GET_MODE_BITSIZE (cvt_mode), 0); > > > > > + > > > > > + /* Check if the intermediate type can hold OP0's range. > > > > > + When converting from float to integer this is not necessary > > > > > + because values that do not fit the (smaller) target type are > > > > > + unspecified anyway. */ > > > > > + if (demotion && float_expr_p) > > > > > + { > > > > > + wide_int op_min_value, op_max_value; > > > > > + /* For vector form, it looks like op0 doesn't have > RANGE_INFO. 
> > > > > + In the future, if it is supported, changes may need to be > made > > > > > + to this part, such as checking the RANGE of each > element > > > > > + in the vector. */ > > > > > + if (!SSA_NAME_RANGE_INFO (op0) > > > > > + || !vect_get_range_info (op0, &op_min_value, > > > > &op_max_value)) > > > > > + break; > > > > > + > > > > > + if (cvt_type == NULL_TREE > > > > > + || (wi::min_precision (op_max_value, SIGNED) > > > > > + > TYPE_PRECISION (cvt_type)) > > > > > + || (wi::min_precision (op_min_value, SIGNED) > > > > > + > TYPE_PRECISION (cvt_type))) > > > > > + continue; > > > > > + } > > > > > + > > > > > + if (vinfo != NULL && slp_node != NULL) > > > > > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, > slp_node); > > > > > + else > > > > > + { > > > > > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > > > > > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > > > > > + cvt_type = build_nonstandard_integer_type > > > > > + (GET_MODE_BITSIZE (cvt_mode), uns); > > > > > + cvt_type = build_vector_type (cvt_type, nelts); > > > > > + } > > > > > > > > So this would then become > > > > > > > > cvt_type = get_related_vectype_for_scalar_type > > > > (TYPE_MODE (vectype_in), cvt_type, TYPE_VECTOR_SUBPARTS > > > > (vectype_in)); > > > > > > > > > + /* This should only happened for SLP as long as loop vectorizer > > > > > + only supports same-sized vector. 
*/ > > > > > + if (cvt_type == NULL_TREE > > > > > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > > > > > + || !supportable_convert_operation ((tree_code) *code1, > > > > > + vectype_out, > > > > > + cvt_type, &tc1) > > > > > + || !supportable_convert_operation ((tree_code) *code2, > > > > > + cvt_type, > > > > > + vectype_in, &tc2)) > > > > > + continue; > > > > > + > > > > > + found_mode = true; > > > > > + break; > > > > > + } > > > > > + > > > > > + if (found_mode) > > > > > + { > > > > > + (*multi_step_cvt)++; > > > > > + interm_types->safe_push (cvt_type); > > > > > + cvt_type = NULL_TREE; > > > > > + *code1 = tc1; > > > > > + *code2 = tc2; > > > > > + return true; > > > > > + } > > > > > + } > > > > > + interm_types->release (); > > > > > > > > Hmm, ownership of interm_types is somewhat unclear here - the > > > > caller should release it, or is the situation that the caller is confused by > stray elements in it? > > In > > > > that case I'd suggest to instead do interm_types->truncate (0). > > > > > > > > > > It's my fault, I just imitate > > > supportable_narrowing/widening_operation, > > > I think for this function, interm_types->release() is not needed. ^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH 1/3 v4] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-24 14:12 ` Tamar Christina 2024-06-25 2:00 ` Hu, Lin1 @ 2024-06-25 3:28 ` Hu, Lin1 2024-06-25 13:30 ` Richard Biener 1 sibling, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-06-25 3:28 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther, Tamar.Christina Hi, This is the current version. I haven't made any major changes to the original code, so it should have less impact on your patch. I also think the current API is sufficient to support the mode selection you mentioned; if you have any concerns, please mention them and I can tweak it further. BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. * tree-vect-stmts.cc (vectorizable_conversion): Wrap the indirect convert part. (supportable_indirect_convert_operation): New function. * tree-vectorizer.h (supportable_indirect_convert_operation): Define the new function. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto.
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 +++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 150 ++++++++++++ gcc/tree-vect-generic.cc | 34 ++- gcc/tree-vect-stmts.cc | 259 ++++++++++++++------- gcc/tree-vectorizer.h | 4 + 10 files changed, 1013 insertions(+), 95 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, 
__v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); +} + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); +} + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2su); +} + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); +} + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); +} + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); +} + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return 
__builtin_convertvector((__v4du)a, __v4hu); +} + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); +} + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); +} + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); +} + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); +} + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); +} + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); +} + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); +} + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); +} + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); +} + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); +} + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); +} + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); +} + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); +} + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); +} + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); +} + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return 
(__m256i)__builtin_convertvector((__v32hu)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i 
mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) +{ + return (__m128i)__builtin_convertvector(a, __v8hi); +} + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); +} + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2sf); +} + +__v4sf 
mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2hf); +} + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2si); +} + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); +} + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); +} + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { 
scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); +} + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); +} + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return 
__builtin_convertvector((__v8df)a, __v8qi); +} + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); +} + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); +} + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); +} + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); +} + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); +} + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); +} + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); +} + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); +} + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); +} + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); +} + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); +} + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); +} + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); +} + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); +} + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); +} + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); +} + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf 
a) +{ + return __builtin_convertvector((__v8hf)a, __v8qu); +} + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); +} + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..1b33e9a9508 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,150 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 6 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 10 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); +} + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); +} + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); +} + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); +} + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); +} + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); +} + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); +} + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); +} + +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); +} + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); +} + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return 
__builtin_convertvector((__v2qu)a, __v2sf); +} + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); +} + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); +} + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); +} + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); +} + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); +} + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); +} + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); +} + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); +} + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); +} + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); +} + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); +} + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); +} + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); +} diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ea0069f7a67..b5c87bb0e13 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. 
If not see #include "gimple-match.h" #include "recog.h" /* FIXME: for insn_data */ #include "optabs-libfuncs.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" /* Build a ternary operation and gimplify it. Emit code before GSI. @@ -1870,14 +1872,36 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) else if (ret_elt_bits > arg_elt_bits) modifier = WIDEN; - if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) + /* + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) { - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + g = gimple_build_assign (lhs, code1, arg); + gsi_replace (gsi, g, false); + return; + } + */ + + vec<std::pair<tree, tree_code> > converts = vNULL; + if (supportable_indirect_convert_operation (code, + ret_type, arg_type, + &converts, + arg)) + { + if (TYPE_MODE (converts[0].first) == TYPE_MODE (ret_type)) + g = gimple_build_assign (lhs, converts[0].second, arg); + else { - g = gimple_build_assign (lhs, code1, arg); - gsi_replace (gsi, g, false); - return; + new_rhs = make_ssa_name (converts[0].first); + g = gimple_build_assign (new_rhs, converts[0].second, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, converts[1].second, new_rhs); } + gsi_replace (gsi, g, false); + return; + } + + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) + { /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. 
*/ tree vec_compute_type diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 05a169ecb2d..f4c829ff6af 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, tree scalar_dest; tree op0, op1 = NULL_TREE; loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); - tree_code tc1, tc2; + tree_code tc1; code_helper code, code1, code2; code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; tree new_temp; @@ -5367,6 +5367,7 @@ vectorizable_conversion (vec_info *vinfo, scalar_mode lhs_mode = SCALAR_TYPE_MODE (lhs_type); scalar_mode rhs_mode = SCALAR_TYPE_MODE (rhs_type); opt_scalar_mode rhs_mode_iter; + vec<std::pair<tree, tree_code> > converts = vNULL; /* Supportable by target? */ switch (modifier) @@ -5377,99 +5378,25 @@ vectorizable_conversion (vec_info *vinfo, && !CONVERT_EXPR_CODE_P (code)) return false; gcc_assert (code.is_tree_code ()); - if (supportable_convert_operation ((tree_code) code, vectype_out, - vectype_in, &tc1)) - { - code1 = tc1; - break; - } - - /* For conversions between float and integer types try whether - we can use intermediate signed integer types to support the - conversion. */ - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) - && (code == FLOAT_EXPR || - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) - { - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); - bool float_expr_p = code == FLOAT_EXPR; - unsigned short target_size; - scalar_mode intermediate_mode; - if (demotion) - { - intermediate_mode = lhs_mode; - target_size = GET_MODE_SIZE (rhs_mode); - } + if (supportable_indirect_convert_operation (code, + vectype_out, + vectype_in, + &converts, + op0)) + { + if (converts.length () == 1) + code1 = converts[0].second; else { - target_size = GET_MODE_SIZE (lhs_mode); - if (!int_mode_for_size - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) - goto unsupported; - } - code1 = float_expr_p ? 
code : NOP_EXPR; - codecvt1 = float_expr_p ? NOP_EXPR : code; - opt_scalar_mode mode_iter; - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) - { - intermediate_mode = mode_iter.require (); - - if (GET_MODE_SIZE (intermediate_mode) > target_size) - break; - - scalar_mode cvt_mode; - if (!int_mode_for_size - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) - break; - - cvt_type = build_nonstandard_integer_type - (GET_MODE_BITSIZE (cvt_mode), 0); - - /* Check if the intermediate type can hold OP0's range. - When converting from float to integer this is not necessary - because values that do not fit the (smaller) target type are - unspecified anyway. */ - if (demotion && float_expr_p) - { - wide_int op_min_value, op_max_value; - if (!vect_get_range_info (op0, &op_min_value, &op_max_value)) - break; - - if (cvt_type == NULL_TREE - || (wi::min_precision (op_max_value, SIGNED) - > TYPE_PRECISION (cvt_type)) - || (wi::min_precision (op_min_value, SIGNED) - > TYPE_PRECISION (cvt_type))) - continue; - } - - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); - /* This should only happened for SLP as long as loop vectorizer - only supports same-sized vector. 
*/ - if (cvt_type == NULL_TREE - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) - || !supportable_convert_operation ((tree_code) code1, - vectype_out, - cvt_type, &tc1) - || !supportable_convert_operation ((tree_code) codecvt1, - cvt_type, - vectype_in, &tc2)) - continue; - - found_mode = true; - break; - } - - if (found_mode) - { - multi_step_cvt++; - interm_types.safe_push (cvt_type); - cvt_type = NULL_TREE; - code1 = tc1; - codecvt1 = tc2; - break; + gcc_assert (converts.length () == 2); + multi_step_cvt = converts.length () - 1; + codecvt1 = converts[0].second; + code1 = converts[1].second; + interm_types.safe_push (converts[0].first); } + break; } + /* FALLTHRU */ unsupported: if (dump_enabled_p ()) @@ -14626,6 +14553,158 @@ supportable_narrowing_operation (code_helper code, return false; } +/* Function supportable_indirect_convert_operation + + Check whether the operation represented by the code CODE can be + performed on arguments of type VECTYPE_IN, producing a result of type + VECTYPE_OUT, either directly or via a chain of two convert operations + that are supported by the target platform in vector form. + + Convert operations we currently support directly are FIX_TRUNC and FLOAT. + This function checks if these operations are supported + by the target platform directly (via vector tree-codes). + + Output: + - CONVERTS contains the chain of conversions to perform, in execution + order: each element pairs an intermediate vector type with the tree + code used to convert into it. A single element means the conversion + is supported directly; two elements mean one intermediate step is + required (like int->short->char, where short is the intermediate + type).
*/ +bool +supportable_indirect_convert_operation (code_helper code, + tree vectype_out, + tree vectype_in, + vec<std::pair<tree, tree_code> > *converts, + tree op0) +{ + bool found_mode = false; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); + opt_scalar_mode mode_iter; + tree_code tc1, tc2, code1, code2; + + tree cvt_type = NULL_TREE; + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); + + if (supportable_convert_operation ((tree_code) code, + vectype_out, + vectype_in, + &tc1)) + { + converts->safe_push (std::make_pair (vectype_out, tc1)); + return true; + } + + /* For conversions between float and integer types try whether + we can use intermediate signed integer types to support the + conversion. */ + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) + && (code == FLOAT_EXPR + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) + { + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); + bool float_expr_p = code == FLOAT_EXPR; + unsigned short target_size; + scalar_mode intermediate_mode; + if (demotion) + { + intermediate_mode = lhs_mode; + target_size = GET_MODE_SIZE (rhs_mode); + } + else + { + target_size = GET_MODE_SIZE (lhs_mode); + if (!int_mode_for_size + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) + return false; + } + code1 = float_expr_p ? (tree_code) code : NOP_EXPR; + code2 = float_expr_p ? NOP_EXPR : (tree_code) code; + opt_scalar_mode mode_iter; + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) + { + intermediate_mode = mode_iter.require (); + + if (GET_MODE_SIZE (intermediate_mode) > target_size) + break; + + scalar_mode cvt_mode; + if (!int_mode_for_size + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) + break; + + cvt_type = build_nonstandard_integer_type + (GET_MODE_BITSIZE (cvt_mode), 0); + + /* Check if the intermediate type can hold OP0's range. 
+ When converting from float to integer this is not necessary + because values that do not fit the (smaller) target type are + unspecified anyway. */ + if (demotion && float_expr_p) + { + wide_int op_min_value, op_max_value; + /* For vector form, it looks like op0 doesn't have RANGE_INFO. + In the future, if it is supported, changes may need to be made + to this part, such as checking the RANGE of each element + in the vector. */ + if (!SSA_NAME_RANGE_INFO (op0) + || !vect_get_range_info (op0, &op_min_value, &op_max_value)) + break; + + if (cvt_type == NULL_TREE + || (wi::min_precision (op_max_value, SIGNED) + > TYPE_PRECISION (cvt_type)) + || (wi::min_precision (op_min_value, SIGNED) + > TYPE_PRECISION (cvt_type))) + continue; + } + + /* + if (vinfo != NULL && slp_node != NULL) + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); + else + { + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); + cvt_type = build_nonstandard_integer_type + (GET_MODE_BITSIZE (cvt_mode), uns); + cvt_type = build_vector_type (cvt_type, nelts); + } + */ + cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE (vectype_in), + cvt_type, + nelts); + /* This should only happen for SLP as long as the loop vectorizer + only supports same-sized vectors. */ + if (cvt_type == NULL_TREE + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) + || !supportable_convert_operation ((tree_code) code1, + vectype_out, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) code2, + cvt_type, + vectype_in, &tc2)) + continue; + + found_mode = true; + break; + } + + if (found_mode) + { + converts->safe_push (std::make_pair (cvt_type, tc2)); + if (TYPE_MODE (cvt_type) != TYPE_MODE (vectype_out)) + converts->safe_push (std::make_pair (vectype_out, tc1)); + return true; + } + } + return false; +} + +/* Generate and return a vector mask of MASK_TYPE such that + mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
Add the statements to SEQ. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 97ec9c341e7..53b3f24cb2e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2265,6 +2265,10 @@ extern bool supportable_widening_operation (vec_info*, code_helper, extern bool supportable_narrowing_operation (code_helper, tree, tree, code_helper *, int *, vec<tree> *); +extern bool supportable_indirect_convert_operation (code_helper, + tree, tree, + vec<std::pair<tree, tree_code> > *, + tree = NULL_TREE); extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, enum vect_cost_for_stmt, stmt_vec_info, -- 2.31.1 ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 1/3 v4] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-25 3:28 ` [PATCH 1/3 v4] " Hu, Lin1 @ 2024-06-25 13:30 ` Richard Biener 2024-06-26 10:54 ` [PATCH 1/3 v5] " Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Richard Biener @ 2024-06-25 13:30 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu, ubizjak, Tamar.Christina On Tue, 25 Jun 2024, Hu, Lin1 wrote: > Hi, > > This is the current version. > > I haven't made any major changes to the original code, so it should have less impact on your code. I think the current API is sufficient to support the mode selection you mentioned; if you have any concerns, please mention them and I can tweak it further. > > BRs, > Lin > > gcc/ChangeLog: > > PR target/107432 > * tree-vect-generic.cc > (expand_vector_conversion): Support convert for int -> int, > float -> float and int <-> float. > * tree-vect-stmts.cc (vectorizable_conversion): Wrap the > indirect convert part. > (supportable_indirect_convert_operation): New function. > * tree-vectorizer.h (supportable_indirect_convert_operation): > Define the new function. > > gcc/testsuite/ChangeLog: > > PR target/107432 > * gcc.target/i386/pr107432-1.c: New test. > * gcc.target/i386/pr107432-2.c: Ditto. > * gcc.target/i386/pr107432-3.c: Ditto. > * gcc.target/i386/pr107432-4.c: Ditto. > * gcc.target/i386/pr107432-5.c: Ditto. > * gcc.target/i386/pr107432-6.c: Ditto. > * gcc.target/i386/pr107432-7.c: Ditto.
> --- > gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ > gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ > gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ > gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++++++ > gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 +++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-7.c | 150 ++++++++++++ > gcc/tree-vect-generic.cc | 34 ++- > gcc/tree-vect-stmts.cc | 259 ++++++++++++++------- > gcc/tree-vectorizer.h | 4 + > 10 files changed, 1013 insertions(+), 95 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c > new file mode 100644 > index 00000000000..a4f37447eb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c > @@ -0,0 +1,234 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); > +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); > +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); > +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); > +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); > +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2si); > +} > + > +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); > +} > + > +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); > +} > + > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2hi); > +} > + > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4hi); > +} > + > +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); > +} > + > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2qi); > +} > + > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4qi); > +} > + > +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8di)a, __v8qi); 
> +} > + > +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2hi); > +} > + > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4hi); > +} > + > +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); > +} > + > +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); > +} > + > +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2qi); > +} > + > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4qi); > +} > + > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8si)a, __v8qi); > +} > + > +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); > +} > + > +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > +{ > + return __builtin_convertvector((__v2hi)a, __v2qi); > +} > + > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hi)a, __v8qi); > +} > + > +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); > +} > + > +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); > +} > + > +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2su); > +} > + > +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); > +} > + > +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) > +{ > + return 
(__m256i)__builtin_convertvector((__v8du)a, __v8su); > +} > + > +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2hu); > +} > + > +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4hu); > +} > + > +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); > +} > + > +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2qu); > +} > + > +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4qu); > +} > + > +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8du)a, __v8qu); > +} > + > +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2hu); > +} > + > +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4hu); > +} > + > +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); > +} > + > +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); > +} > + > +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2qu); > +} > + > +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4qu); > +} > + > +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8su)a, __v8qu); > +} > + > +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); > +} > + > +__v2qu 
mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > +{ > + return __builtin_convertvector((__v2hu)a, __v2qu); > +} > + > +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hu)a, __v8qu); > +} > + > +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); > +} > + > +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c > new file mode 100644 > index 00000000000..02ffd811cb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c > @@ -0,0 +1,105 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) > +{ > + return 
__builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v8hi); > +} > + > +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v16hi); > +} > + > +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector(a, __v32hi); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c > new file 
mode 100644 > index 00000000000..30dc947b6dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c > @@ -0,0 +1,55 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2sf); > +} > + > +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2hf); > +} > + > +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector(a, __v16hf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c > new file mode 100644 > index 00000000000..e537e7349e4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c > @@ -0,0 +1,56 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { 
scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16sf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c > new file mode 100644 > index 00000000000..5a44ef9f3b9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c > @@ -0,0 +1,72 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ > +/* { 
dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2si); > +} > + > +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > + > +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16si); > +} > + > +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c > new file mode 100644 > index 00000000000..4a68a10b089 > --- /dev/null > +++ 
b/gcc/testsuite/gcc.target/i386/pr107432-6.c > @@ -0,0 +1,139 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > +typedef char __v16qi __attribute__ ((__vector_size__ (16))); > +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); > +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); > +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); > +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); > + > +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qi); > +} > + > +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qi); > +} > + > +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qi); > +} > + > +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qu); > +} > + > +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qu); > +} > + > +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qu); > +} > + > +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qi); > +} > + > +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qi); > +} > + > +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qi); > +} > + > +__v16qi 
mm512_cvtps_epi8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qi); > +} > + > +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qu); > +} > + > +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qu); > +} > + > +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qu); > +} > + > +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qu); > +} > + > +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qi); > +} > + > +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qi); > +} > + > +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qi); > +} > + > +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qi); > +} > + > +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qu); > +} > + > +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qu); > +} > + > +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qu); > +} > + > +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qu); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c > new file mode 100644 > index 00000000000..1b33e9a9508 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c > @@ -0,0 +1,150 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw 
-O2 -mavx512dq -fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 6 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 6 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 8 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 8 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 10 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > +typedef char __v16qi __attribute__ ((__vector_size__ (16))); > +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); > +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); > +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); > +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); > + > +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2df); > +} > + > +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4df); > +} > + > +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8df); > +} > + > +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2df); > +} > + > +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4df); > +} > + > +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8df); > +} > + > +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2sf); > +} > + > +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4sf); > +} > + > +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8sf); > +} > + > +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) > +{ > + return 
__builtin_convertvector((__v16qi)a, __v16sf); > +} > + > +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2sf); > +} > + > +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4sf); > +} > + > +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8sf); > +} > + > +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16sf); > +} > + > +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2hf); > +} > + > +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4hf); > +} > + > +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8hf); > +} > + > +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) > +{ > + return __builtin_convertvector((__v16qi)a, __v16hf); > +} > + > +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector((__v32qi)a, __v32hf); > +} > + > +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2hf); > +} > + > +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4hf); > +} > + > +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8hf); > +} > + > +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16hf); > +} > + > +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) > +{ > + return __builtin_convertvector((__v32qu)a, __v32hf); > +} > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc > index ea0069f7a67..b5c87bb0e13 100644 > --- a/gcc/tree-vect-generic.cc > 
+++ b/gcc/tree-vect-generic.cc > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see > #include "gimple-match.h" > #include "recog.h" /* FIXME: for insn_data */ > #include "optabs-libfuncs.h" > +#include "cfgloop.h" > +#include "tree-vectorizer.h" > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > @@ -1870,14 +1872,36 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) > else if (ret_elt_bits > arg_elt_bits) > modifier = WIDEN; > > - if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) > + /* > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > { > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > + g = gimple_build_assign (lhs, code1, arg); > + gsi_replace (gsi, g, false); > + return; > + } > + */ > + > + vec<std::pair<tree, tree_code> > converts = vNULL; This leaks, please simply use auto_vec<std::pair<... > + if (supportable_indirect_convert_operation (code, > + ret_type, arg_type, > + &converts, > + arg)) > + { > + if (TYPE_MODE (converts[0].first) == TYPE_MODE (ret_type)) > + g = gimple_build_assign (lhs, converts[0].second, arg); > + else > { > - g = gimple_build_assign (lhs, code1, arg); > - gsi_replace (gsi, g, false); > - return; > + new_rhs = make_ssa_name (converts[0].first); > + g = gimple_build_assign (new_rhs, converts[0].second, arg); > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > + g = gimple_build_assign (lhs, converts[1].second, new_rhs); > } > + gsi_replace (gsi, g, false); I would like to see a loop over 'converts' here. > + return; > + } > + > + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) > + { > /* Can't use get_compute_type here, as supportable_convert_operation > doesn't necessarily use an optab and needs two arguments. 
*/ > tree vec_compute_type > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 05a169ecb2d..f4c829ff6af 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > tree scalar_dest; > tree op0, op1 = NULL_TREE; > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > - tree_code tc1, tc2; > + tree_code tc1; > code_helper code, code1, code2; > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > tree new_temp; > @@ -5367,6 +5367,7 @@ vectorizable_conversion (vec_info *vinfo, > scalar_mode lhs_mode = SCALAR_TYPE_MODE (lhs_type); > scalar_mode rhs_mode = SCALAR_TYPE_MODE (rhs_type); > opt_scalar_mode rhs_mode_iter; > + vec<std::pair<tree, tree_code> > converts = vNULL; > > /* Supportable by target? */ > switch (modifier) > @@ -5377,99 +5378,25 @@ vectorizable_conversion (vec_info *vinfo, > && !CONVERT_EXPR_CODE_P (code)) > return false; > gcc_assert (code.is_tree_code ()); > - if (supportable_convert_operation ((tree_code) code, vectype_out, > - vectype_in, &tc1)) > - { > - code1 = tc1; > - break; > - } > - > - /* For conversions between float and integer types try whether > - we can use intermediate signed integer types to support the > - conversion. 
*/ > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > - && (code == FLOAT_EXPR || > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > - { > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); > - bool float_expr_p = code == FLOAT_EXPR; > - unsigned short target_size; > - scalar_mode intermediate_mode; > - if (demotion) > - { > - intermediate_mode = lhs_mode; > - target_size = GET_MODE_SIZE (rhs_mode); > - } > + if (supportable_indirect_convert_operation (code, > + vectype_out, > + vectype_in, > + &converts, > + op0)) > + { > + if (converts.length () == 1) > + code1 = converts[0].second; > else > { > - target_size = GET_MODE_SIZE (lhs_mode); > - if (!int_mode_for_size > - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > - goto unsupported; > - } > - code1 = float_expr_p ? code : NOP_EXPR; > - codecvt1 = float_expr_p ? NOP_EXPR : code; > - opt_scalar_mode mode_iter; > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > - { > - intermediate_mode = mode_iter.require (); > - > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > - break; > - > - scalar_mode cvt_mode; > - if (!int_mode_for_size > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > - break; > - > - cvt_type = build_nonstandard_integer_type > - (GET_MODE_BITSIZE (cvt_mode), 0); > - > - /* Check if the intermediate type can hold OP0's range. > - When converting from float to integer this is not necessary > - because values that do not fit the (smaller) target type are > - unspecified anyway. 
*/ > - if (demotion && float_expr_p) > - { > - wide_int op_min_value, op_max_value; > - if (!vect_get_range_info (op0, &op_min_value, &op_max_value)) > - break; > - > - if (cvt_type == NULL_TREE > - || (wi::min_precision (op_max_value, SIGNED) > - > TYPE_PRECISION (cvt_type)) > - || (wi::min_precision (op_min_value, SIGNED) > - > TYPE_PRECISION (cvt_type))) > - continue; > - } > - > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > - /* This should only happened for SLP as long as loop vectorizer > - only supports same-sized vector. */ > - if (cvt_type == NULL_TREE > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > - || !supportable_convert_operation ((tree_code) code1, > - vectype_out, > - cvt_type, &tc1) > - || !supportable_convert_operation ((tree_code) codecvt1, > - cvt_type, > - vectype_in, &tc2)) > - continue; > - > - found_mode = true; > - break; > - } > - > - if (found_mode) > - { > - multi_step_cvt++; > - interm_types.safe_push (cvt_type); > - cvt_type = NULL_TREE; > - code1 = tc1; > - codecvt1 = tc2; > - break; > + gcc_assert (converts.length () == 2); I'd rather see a && converts.length () <= 2 check after the supportable_indirect_convert_operation call. > + multi_step_cvt = converts.length () - 1; > + codecvt1 = converts[0].second; > + code1 = converts[1].second; > + interm_types.safe_push (converts[0].first); cvt_type is no longer assigned from NULL_TREE, not sure if that matters. OK with those changes. Thanks, Richard. > } > + break; > } > + > /* FALLTHRU */ > unsupported: > if (dump_enabled_p ()) > @@ -14626,6 +14553,158 @@ supportable_narrowing_operation (code_helper code, > return false; > } > > +/* Function supportable_indirect_convert_operation > + > + Check whether an operation represented by the code CODE is two > + convert operations that are supported by the target platform in > + vector form (i.e., when operating on arguments of type VECTYPE_IN > + producing a result of type VECTYPE_OUT). 
> + > + Convert operations we currently support directly are FIX_TRUNC and FLOAT. > + This function checks if these operations are supported > + by the target platform directly (via vector tree-codes). > + > + Output: > + - CODE1 is the code of a vector operation to be used when > + converting the operation in the first step, if available. > + - CODE2 is the code of a vector operation to be used when > + converting the operation in the second step, if available. > + - MULTI_STEP_CVT determines the number of required intermediate steps in > + case of multi-step conversion (like int->short->char - in that case > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > + - INTERM_TYPES contains the intermediate type required to perform the > + convert operation (short in the above example). */ > +bool > +supportable_indirect_convert_operation (code_helper code, > + tree vectype_out, > + tree vectype_in, > + vec<std::pair<tree, tree_code> > *converts, > + tree op0) > +{ > + bool found_mode = false; > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); > + opt_scalar_mode mode_iter; > + tree_code tc1, tc2, code1, code2; > + > + tree cvt_type = NULL_TREE; > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > + > + if (supportable_convert_operation ((tree_code) code, > + vectype_out, > + vectype_in, > + &tc1)) > + { > + converts->safe_push (std::make_pair (vectype_out, tc1)); > + return true; > + } > + > + /* For conversions between float and integer types try whether > + we can use intermediate signed integer types to support the > + conversion. 
*/ > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > + && (code == FLOAT_EXPR > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > + { > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); > + bool float_expr_p = code == FLOAT_EXPR; > + unsigned short target_size; > + scalar_mode intermediate_mode; > + if (demotion) > + { > + intermediate_mode = lhs_mode; > + target_size = GET_MODE_SIZE (rhs_mode); > + } > + else > + { > + target_size = GET_MODE_SIZE (lhs_mode); > + if (!int_mode_for_size > + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > + return false; > + } > + code1 = float_expr_p ? (tree_code) code : NOP_EXPR; > + code2 = float_expr_p ? NOP_EXPR : (tree_code) code; > + opt_scalar_mode mode_iter; > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > + { > + intermediate_mode = mode_iter.require (); > + > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > + break; > + > + scalar_mode cvt_mode; > + if (!int_mode_for_size > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > + break; > + > + cvt_type = build_nonstandard_integer_type > + (GET_MODE_BITSIZE (cvt_mode), 0); > + > + /* Check if the intermediate type can hold OP0's range. > + When converting from float to integer this is not necessary > + because values that do not fit the (smaller) target type are > + unspecified anyway. */ > + if (demotion && float_expr_p) > + { > + wide_int op_min_value, op_max_value; > + /* For vector form, it looks like op0 doesn't have RANGE_INFO. > + In the future, if it is supported, changes may need to be made > + to this part, such as checking the RANGE of each element > + in the vector. 
*/ > + if (!SSA_NAME_RANGE_INFO (op0) > + || !vect_get_range_info (op0, &op_min_value, &op_max_value)) > + break; > + > + if (cvt_type == NULL_TREE > + || (wi::min_precision (op_max_value, SIGNED) > + > TYPE_PRECISION (cvt_type)) > + || (wi::min_precision (op_min_value, SIGNED) > + > TYPE_PRECISION (cvt_type))) > + continue; > + } > + > + /* > + if (vinfo != NULL && slp_node != NULL) > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > + else > + { > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > + cvt_type = build_nonstandard_integer_type > + (GET_MODE_BITSIZE (cvt_mode), uns); > + cvt_type = build_vector_type (cvt_type, nelts); > + } > + */ > + cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE (vectype_in), > + cvt_type, > + nelts); > + /* This should only happened for SLP as long as loop vectorizer > + only supports same-sized vector. */ > + if (cvt_type == NULL_TREE > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > + || !supportable_convert_operation ((tree_code) code1, > + vectype_out, > + cvt_type, &tc1) > + || !supportable_convert_operation ((tree_code) code2, > + cvt_type, > + vectype_in, &tc2)) > + continue; > + > + found_mode = true; > + break; > + } > + > + if (found_mode) > + { > + converts->safe_push (std::make_pair (cvt_type, tc2)); > + if (TYPE_MODE (cvt_type) != TYPE_MODE (vectype_out)) > + converts->safe_push (std::make_pair (vectype_out, tc1)); > + return true; > + } > + } > + return false; > +} > + > /* Generate and return a vector mask of MASK_TYPE such that > mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I. > Add the statements to SEQ. 
*/ > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 97ec9c341e7..53b3f24cb2e 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -2265,6 +2265,10 @@ extern bool supportable_widening_operation (vec_info*, code_helper, > extern bool supportable_narrowing_operation (code_helper, tree, tree, > code_helper *, int *, > vec<tree> *); > +extern bool supportable_indirect_convert_operation (code_helper, > + tree, tree, > + vec<std::pair<tree, tree_code> > *, > + tree = NULL_TREE); > > extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, > enum vect_cost_for_stmt, stmt_vec_info, > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
* [PATCH 1/3 v5] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. 2024-06-25 13:30 ` Richard Biener @ 2024-06-26 10:54 ` Hu, Lin1 0 siblings, 0 replies; 33+ messages in thread From: Hu, Lin1 @ 2024-06-26 10:54 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther, Tamar.Christina Hi, This is the latest version; I have modified some comments and retested the patch on x86_64-linux-gnu. I'll wait another day to see whether Tamar has anything else to say about the API; if not, I will upstream this patch tomorrow. BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. * tree-vect-stmts.cc (vectorizable_conversion): Wrap the indirect convert part. (supportable_indirect_convert_operation): New function. * tree-vectorizer.h (supportable_indirect_convert_operation): Declare the new function. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. 
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 ++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 150 +++++++++++++ gcc/tree-vect-generic.cc | 29 ++- gcc/tree-vect-stmts.cc | 241 +++++++++++++-------- gcc/tree-vectorizer.h | 4 + 10 files changed, 990 insertions(+), 95 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, 
__v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); +} + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); +} + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2su); +} + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); +} + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); +} + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); +} + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return 
__builtin_convertvector((__v4du)a, __v4hu); +} + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); +} + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); +} + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); +} + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); +} + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); +} + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); +} + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); +} + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); +} + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); +} + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); +} + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); +} + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); +} + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); +} + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); +} + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); +} + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return 
(__m256i)__builtin_convertvector((__v32hu)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i 
mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) +{ + return (__m128i)__builtin_convertvector(a, __v8hi); +} + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); +} + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2sf); +} + +__v4sf 
mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2hf); +} + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2si); +} + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); +} + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); +} + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { 
scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); +} + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); +} + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return 
__builtin_convertvector((__v8df)a, __v8qi); +} + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); +} + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); +} + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); +} + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); +} + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); +} + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); +} + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); +} + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); +} + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); +} + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); +} + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); +} + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); +} + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); +} + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); +} + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); +} + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); +} + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf 
a) +{ + return __builtin_convertvector((__v8hf)a, __v8qu); +} + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); +} + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..1b33e9a9508 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,150 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 6 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 10 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! 
ia32 } } } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); +} + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); +} + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); +} + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); +} + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); +} + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); +} + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); +} + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); +} + +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); +} + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); +} + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return 
__builtin_convertvector((__v2qu)a, __v2sf); +} + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); +} + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); +} + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); +} + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); +} + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); +} + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); +} + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); +} + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); +} + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); +} + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); +} + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); +} + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); +} + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); +} diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ea0069f7a67..8336cbb8c73 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. 
If not see #include "gimple-match.h" #include "recog.h" /* FIXME: for insn_data */ #include "optabs-libfuncs.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" /* Build a ternary operation and gimplify it. Emit code before GSI. @@ -1850,7 +1852,7 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) tree arg = gimple_call_arg (stmt, 0); tree ret_type = TREE_TYPE (lhs); tree arg_type = TREE_TYPE (arg); - tree new_rhs, compute_type = TREE_TYPE (arg_type); + tree new_rhs, new_lhs, compute_type = TREE_TYPE (arg_type); enum tree_code code = NOP_EXPR; enum tree_code code1 = ERROR_MARK; enum { NARROW, NONE, WIDEN } modifier = NONE; @@ -1870,14 +1872,29 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) else if (ret_elt_bits > arg_elt_bits) modifier = WIDEN; - if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) + auto_vec<std::pair<tree, tree_code> > converts; + if (supportable_indirect_convert_operation (code, + ret_type, arg_type, + &converts, + arg)) { - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + new_rhs = arg; + for (unsigned int i = 0; i < converts.length () - 1; i++) { - g = gimple_build_assign (lhs, code1, arg); - gsi_replace (gsi, g, false); - return; + new_lhs = make_ssa_name (converts[i].first); + g = gimple_build_assign (new_lhs, converts[i].second, new_rhs); + new_rhs = new_lhs; + gsi_insert_before (gsi, g, GSI_SAME_STMT); } + g = gimple_build_assign (lhs, + converts[converts.length() - 1].second, + new_rhs); + gsi_replace (gsi, g, false); + return; + } + + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) + { /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. 
*/ tree vec_compute_type diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 05a169ecb2d..cfa56801822 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, tree scalar_dest; tree op0, op1 = NULL_TREE; loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); - tree_code tc1, tc2; + tree_code tc1; code_helper code, code1, code2; code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; tree new_temp; @@ -5367,6 +5367,7 @@ vectorizable_conversion (vec_info *vinfo, scalar_mode lhs_mode = SCALAR_TYPE_MODE (lhs_type); scalar_mode rhs_mode = SCALAR_TYPE_MODE (rhs_type); opt_scalar_mode rhs_mode_iter; + vec<std::pair<tree, tree_code> > converts = vNULL; /* Supportable by target? */ switch (modifier) @@ -5377,99 +5378,26 @@ vectorizable_conversion (vec_info *vinfo, && !CONVERT_EXPR_CODE_P (code)) return false; gcc_assert (code.is_tree_code ()); - if (supportable_convert_operation ((tree_code) code, vectype_out, - vectype_in, &tc1)) - { - code1 = tc1; - break; - } - - /* For conversions between float and integer types try whether - we can use intermediate signed integer types to support the - conversion. 
*/ - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) - && (code == FLOAT_EXPR || - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) - { - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); - bool float_expr_p = code == FLOAT_EXPR; - unsigned short target_size; - scalar_mode intermediate_mode; - if (demotion) - { - intermediate_mode = lhs_mode; - target_size = GET_MODE_SIZE (rhs_mode); - } + if (supportable_indirect_convert_operation (code, + vectype_out, + vectype_in, + &converts, + op0)) + { + gcc_assert (converts.length () <= 2); + if (converts.length () == 1) + code1 = converts[0].second; else { - target_size = GET_MODE_SIZE (lhs_mode); - if (!int_mode_for_size - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) - goto unsupported; - } - code1 = float_expr_p ? code : NOP_EXPR; - codecvt1 = float_expr_p ? NOP_EXPR : code; - opt_scalar_mode mode_iter; - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) - { - intermediate_mode = mode_iter.require (); - - if (GET_MODE_SIZE (intermediate_mode) > target_size) - break; - - scalar_mode cvt_mode; - if (!int_mode_for_size - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) - break; - - cvt_type = build_nonstandard_integer_type - (GET_MODE_BITSIZE (cvt_mode), 0); - - /* Check if the intermediate type can hold OP0's range. - When converting from float to integer this is not necessary - because values that do not fit the (smaller) target type are - unspecified anyway. 
*/ - if (demotion && float_expr_p) - { - wide_int op_min_value, op_max_value; - if (!vect_get_range_info (op0, &op_min_value, &op_max_value)) - break; - - if (cvt_type == NULL_TREE - || (wi::min_precision (op_max_value, SIGNED) - > TYPE_PRECISION (cvt_type)) - || (wi::min_precision (op_min_value, SIGNED) - > TYPE_PRECISION (cvt_type))) - continue; - } - - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); - /* This should only happened for SLP as long as loop vectorizer - only supports same-sized vector. */ - if (cvt_type == NULL_TREE - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) - || !supportable_convert_operation ((tree_code) code1, - vectype_out, - cvt_type, &tc1) - || !supportable_convert_operation ((tree_code) codecvt1, - cvt_type, - vectype_in, &tc2)) - continue; - - found_mode = true; - break; - } - - if (found_mode) - { - multi_step_cvt++; - interm_types.safe_push (cvt_type); cvt_type = NULL_TREE; - code1 = tc1; - codecvt1 = tc2; - break; + multi_step_cvt = converts.length () - 1; + codecvt1 = converts[0].second; + code1 = converts[1].second; + interm_types.safe_push (converts[0].first); } + break; } + /* FALLTHRU */ unsupported: if (dump_enabled_p ()) @@ -14626,6 +14554,141 @@ supportable_narrowing_operation (code_helper code, return false; } +/* Function supportable_indirect_convert_operation + + Check whether an operation represented by the code CODE is single or multi + operations that are supported by the target platform in + vector form (i.e., when operating on arguments of type VECTYPE_IN + producing a result of type VECTYPE_OUT). + + Convert operations we currently support directly are FIX_TRUNC and FLOAT. + This function checks if these operations are supported + by the target platform directly (via vector tree-codes). 
+ + Output: + - converts contains some pairs to perform the convert operation, + the pair's first is the intermediate type, and its second is the code of + a vector operation to be used when converting the operation from the + previous type to the intermediate type. */ +bool +supportable_indirect_convert_operation (code_helper code, + tree vectype_out, + tree vectype_in, + vec<std::pair<tree, tree_code> > *converts, + tree op0) +{ + bool found_mode = false; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); + opt_scalar_mode mode_iter; + tree_code tc1, tc2, code1, code2; + + tree cvt_type = NULL_TREE; + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); + + if (supportable_convert_operation ((tree_code) code, + vectype_out, + vectype_in, + &tc1)) + { + converts->safe_push (std::make_pair (vectype_out, tc1)); + return true; + } + + /* For conversions between float and integer types try whether + we can use intermediate signed integer types to support the + conversion. */ + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) + && (code == FLOAT_EXPR + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) + { + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); + bool float_expr_p = code == FLOAT_EXPR; + unsigned short target_size; + scalar_mode intermediate_mode; + if (demotion) + { + intermediate_mode = lhs_mode; + target_size = GET_MODE_SIZE (rhs_mode); + } + else + { + target_size = GET_MODE_SIZE (lhs_mode); + if (!int_mode_for_size + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) + return false; + } + code1 = float_expr_p ? (tree_code) code : NOP_EXPR; + code2 = float_expr_p ? 
NOP_EXPR : (tree_code) code; + opt_scalar_mode mode_iter; + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) + { + intermediate_mode = mode_iter.require (); + + if (GET_MODE_SIZE (intermediate_mode) > target_size) + break; + + scalar_mode cvt_mode; + if (!int_mode_for_size + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) + break; + + cvt_type = build_nonstandard_integer_type + (GET_MODE_BITSIZE (cvt_mode), 0); + + /* Check if the intermediate type can hold OP0's range. + When converting from float to integer this is not necessary + because values that do not fit the (smaller) target type are + unspecified anyway. */ + if (demotion && float_expr_p) + { + wide_int op_min_value, op_max_value; + /* For vector form, it looks like op0 doesn't have RANGE_INFO. + In the future, if it is supported, changes may need to be made + to this part, such as checking the RANGE of each element + in the vector. */ + if (!SSA_NAME_RANGE_INFO (op0) + || !vect_get_range_info (op0, &op_min_value, &op_max_value)) + break; + + if (cvt_type == NULL_TREE + || (wi::min_precision (op_max_value, SIGNED) + > TYPE_PRECISION (cvt_type)) + || (wi::min_precision (op_min_value, SIGNED) + > TYPE_PRECISION (cvt_type))) + continue; + } + + cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE (vectype_in), + cvt_type, + nelts); + /* This should only happened for SLP as long as loop vectorizer + only supports same-sized vector. 
*/ + if (cvt_type == NULL_TREE + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) + || !supportable_convert_operation ((tree_code) code1, + vectype_out, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) code2, + cvt_type, + vectype_in, &tc2)) + continue; + + found_mode = true; + break; + } + + if (found_mode) + { + converts->safe_push (std::make_pair (cvt_type, tc2)); + if (TYPE_MODE (cvt_type) != TYPE_MODE (vectype_out)) + converts->safe_push (std::make_pair (vectype_out, tc1)); + return true; + } + } + return false; +} + /* Generate and return a vector mask of MASK_TYPE such that mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 97ec9c341e7..53b3f24cb2e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2265,6 +2265,10 @@ extern bool supportable_widening_operation (vec_info*, code_helper, extern bool supportable_narrowing_operation (code_helper, tree, tree, code_helper *, int *, vec<tree> *); +extern bool supportable_indirect_convert_operation (code_helper, + tree, tree, + vec<std::pair<tree, tree_code> > *, + tree = NULL_TREE); extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, enum vect_cost_for_stmt, stmt_vec_info, -- 2.31.1 ^ permalink raw reply [flat|nested] 33+ messages in thread
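The conversions this first patch targets can be exercised from plain C through `__builtin_convertvector` on GCC generic vector types; a minimal sketch (type and function names here are illustrative, not taken from the patch or its testcases):

```c
/* GCC/Clang generic vector types; the names are illustrative only.  */
typedef int v4si __attribute__((vector_size(16)));
typedef long long v4di __attribute__((vector_size(32)));
typedef float v4sf __attribute__((vector_size(16)));

/* int -> int widening: one of the int <-> int cases the patch wants
   lowered to a single sign-extending vpmov* instruction.  */
v4di widen_si_di(v4si a)
{
  return __builtin_convertvector(a, v4di);
}

/* float -> int (FIX_TRUNC_EXPR): truncates toward zero, like a C cast.  */
v4si fix_sf_si(v4sf a)
{
  return __builtin_convertvector(a, v4si);
}
```

The intent of the patch is that such conversions lower to single `vpmov*`/`vcvt*` instructions when the target supports them directly, falling back to the new indirect (two-step) path otherwise.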
* [PATCH 2/3] vect: Support v4hi -> v4qi. 2024-05-23 6:37 ` [PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and Hu, Lin1 2024-05-23 6:37 ` [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float Hu, Lin1 @ 2024-05-23 6:37 ` Hu, Lin1 2024-05-27 2:11 ` Hongtao Liu 2024-05-23 6:37 ` [PATCH 3/3] vect: support direct conversion under x86-64-v3 Hu, Lin1 2 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-05-23 6:37 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther gcc/ChangeLog: PR target/107432 * config/i386/mmx.md (truncv4hiv4qi2): New define_insn. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-6.c: Add test. --- gcc/config/i386/mmx.md | 10 ++++++++++ gcc/testsuite/gcc.target/i386/pr107432-1.c | 12 +++++++++++- gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 ++++++++++++++++--- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 5f342497885..30f0d88af9f 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -4883,6 +4883,16 @@ (define_insn "truncv2hiv2qi2" (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn "truncv4hiv4qi2" + [(set (match_operand:V4QI 0 "register_operand" "=v") + (truncate:V4QI + (match_operand:V4HI 1 "register_operand" "v")))] + "TARGET_AVX512VL && TARGET_AVX512BW" + "vpmovwb\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + (define_mode_iterator V2QI_V2HI [V2QI V2HI]) (define_insn "truncv2si<mode>2" [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v") diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c index a4f37447eb4..e0c7ffc8e5b 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-1.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -7,7 +7,7 @@ /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! 
ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 } } */ #include <x86intrin.h> @@ -113,6 +113,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) return __builtin_convertvector((__v2hi)a, __v2qi); } +__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a) +{ + return __builtin_convertvector((__v4hi)a, __v4qi); +} + __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hi)a, __v8qi); @@ -218,6 +223,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) return __builtin_convertvector((__v2hu)a, __v2qu); } +__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a) +{ + return __builtin_convertvector((__v4hu)a, __v4qu); +} + __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hu)a, __v8qu); diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c index 4a68a10b089..7d3717d45bc 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-6.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -8,11 +8,14 @@ /* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ -/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! 
ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */ #include <x86intrin.h> @@ -103,6 +106,11 @@ __v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) return __builtin_convertvector((__v2hf)a, __v2qi); } +__v4qi mm64_cvtph_epi8_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector((__v4hf)a, __v4qi); +} + __v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) { return __builtin_convertvector((__v8hf)a, __v8qi); @@ -123,6 +131,11 @@ __v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) return __builtin_convertvector((__v2hf)a, __v2qu); } +__v4qu mm64_cvtph_epu8_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector((__v4hf)a, __v4qu); +} + __v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) { return __builtin_convertvector((__v8hf)a, __v8qu); -- 2.31.1 ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/3] vect: Support v4hi -> v4qi. 2024-05-23 6:37 ` [PATCH 2/3] vect: Support v4hi -> v4qi Hu, Lin1 @ 2024-05-27 2:11 ` Hongtao Liu 2024-05-29 8:55 ` [PATCH 2/3 v2] " Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Hongtao Liu @ 2024-05-27 2:11 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu, ubizjak, rguenther On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > gcc/ChangeLog: > > PR target/107432 > * config/i386/mmx.md (truncv4hiv4qi2): New define_insn. > > gcc/testsuite/ChangeLog: > > PR target/107432 > * gcc.target/i386/pr107432-6.c: Add test. > --- > gcc/config/i386/mmx.md | 10 ++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-1.c | 12 +++++++++++- > gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 ++++++++++++++++--- > 3 files changed, 37 insertions(+), 4 deletions(-) > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 5f342497885..30f0d88af9f 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -4883,6 +4883,16 @@ (define_insn "truncv2hiv2qi2" > (set_attr "prefix" "evex") > (set_attr "mode" "TI")]) > > +(define_insn "truncv4hiv4qi2" > + [(set (match_operand:V4QI 0 "register_operand" "=v") > + (truncate:V4QI > + (match_operand:V4HI 1 "register_operand" "v")))] > + "TARGET_AVX512VL && TARGET_AVX512BW" Please also add TARGET_MMX_WITH_SSE since v4hi is 64-bit vector. Others LGTM. 
> + "vpmovwb\t{%1, %0|%0, %1}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "evex") > + (set_attr "mode" "TI")]) > + > (define_mode_iterator V2QI_V2HI [V2QI V2HI]) > (define_insn "truncv2si<mode>2" > [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v") > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c > index a4f37447eb4..e0c7ffc8e5b 100644 > --- a/gcc/testsuite/gcc.target/i386/pr107432-1.c > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c > @@ -7,7 +7,7 @@ > /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ > -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 10 } } */ > > #include <x86intrin.h> > > @@ -113,6 +113,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > return __builtin_convertvector((__v2hi)a, __v2qi); > } > > +__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a) > +{ > + return __builtin_convertvector((__v4hi)a, __v4qi); > +} > + > __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > { > return __builtin_convertvector((__v8hi)a, __v8qi); > @@ -218,6 +223,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > return __builtin_convertvector((__v2hu)a, __v2qu); > } > > +__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a) > +{ > + return __builtin_convertvector((__v4hu)a, __v4qu); > +} > + > __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > { > return __builtin_convertvector((__v8hu)a, __v8qu); > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c > index 4a68a10b089..7d3717d45bc 100644 > --- a/gcc/testsuite/gcc.target/i386/pr107432-6.c > +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c > @@ -8,11 +8,14 @@ > /* { dg-final { scan-assembler-times 
"vcvttps2dq" 4 { target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ > /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ > -/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > -/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ > -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */ > > #include <x86intrin.h> > > @@ -103,6 +106,11 @@ __v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) > return __builtin_convertvector((__v2hf)a, __v2qi); > } > > +__v4qi mm64_cvtph_epi8_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector((__v4hf)a, __v4qi); > +} > + > __v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) > { > return __builtin_convertvector((__v8hf)a, __v8qi); > @@ -123,6 +131,11 @@ __v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) > return __builtin_convertvector((__v2hf)a, __v2qu); > } > > +__v4qu mm64_cvtph_epu8_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector((__v4hf)a, __v4qu); > +} > + > __v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) > { > return __builtin_convertvector((__v8hf)a, __v8qu); > -- > 2.31.1 > -- BR, Hongtao ^ permalink raw reply [flat|nested] 33+ messages in thread
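At the C level, the new `truncv4hiv4qi2` pattern corresponds to the 64-bit narrowing case below; a minimal sketch using GCC vector types (names are illustrative, though the testcase in the patch uses equivalent declarations):

```c
/* 8-byte and 4-byte vectors matching the V4HI and V4QI machine modes.  */
typedef short v4hi __attribute__((vector_size(8)));
typedef signed char v4qi __attribute__((vector_size(4)));

/* v4hi -> v4qi truncation; with -mavx512vl -mavx512bw the point of the
   patch is that this becomes a single vpmovwb.  */
v4qi narrow_hi_qi(v4hi a)
{
  return __builtin_convertvector(a, v4qi);
}
```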
* [PATCH 2/3 v2] vect: Support v4hi -> v4qi. 2024-05-27 2:11 ` Hongtao Liu @ 2024-05-29 8:55 ` Hu, Lin1 2024-05-29 9:09 ` Hongtao Liu 0 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-05-29 8:55 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther Besides adding TARGET_MMX_WITH_SSE, I merged the two patterns. BRs, Lin gcc/ChangeLog: PR target/107432 * config/i386/mmx.md (VI2_32_64): New mode iterator. (mmxhalfmode): New mode attr. (mmxhalfmodelower): Ditto. (truncv2hiv2qi2): Extend mode v4hi and change name from truncv2hiv2qi to trunc<mode><mmxhalfmodelower>2. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: Modify test. * gcc.target/i386/pr107432-6.c: Add test. --- gcc/config/i386/mmx.md | 17 +++++++++++++---- gcc/testsuite/gcc.target/i386/pr107432-1.c | 13 ++++++++++++- gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 ++++++++++++++++--- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 5f342497885..27b080bfeb6 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -67,6 +67,9 @@ (define_mode_iterator V2F_32 [V2HF V2BF]) ;; 4-byte integer vector modes (define_mode_iterator VI_32 [V4QI V2HI]) +;; 8-byte and 4-byte HImode vector modes +(define_mode_iterator VI2_32_64 [(V4HI "TARGET_MMX_WITH_SSE") V2HI]) + ;; 4-byte and 2-byte integer vector modes (define_mode_iterator VI_16_32 [V4QI V2QI V2HI]) @@ -106,6 +109,12 @@ (define_mode_attr mmxinsnmode (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) +(define_mode_attr mmxhalfmode + [(V4HI "V4QI") (V2HI "V2QI")]) + +(define_mode_attr mmxhalfmodelower + [(V4HI "v4qi") (V2HI "v2qi")]) + ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr mmxintvecmode [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI") @@ -4873,10 +4882,10 @@ (define_expand "<insn>v2qiv2hi2" DONE; }) -(define_insn "truncv2hiv2qi2" - [(set (match_operand:V2QI 0

"register_operand" "=v") - (truncate:V2QI - (match_operand:V2HI 1 "register_operand" "v")))] +(define_insn "trunc<mode><mmxhalfmodelower>2" + [(set (match_operand:<mmxhalfmode> 0 "register_operand" "=v") + (truncate:<mmxhalfmode> + (match_operand:VI2_32_64 1 "register_operand" "v")))] "TARGET_AVX512VL && TARGET_AVX512BW" "vpmovwb\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c index a4f37447eb4..afdf367afe2 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-1.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -7,7 +7,8 @@ /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! 
ia32 } } } } */ #include <x86intrin.h> @@ -113,6 +114,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) return __builtin_convertvector((__v2hi)a, __v2qi); } +__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a) +{ + return __builtin_convertvector((__v4hi)a, __v4qi); +} + __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hi)a, __v8qi); @@ -218,6 +224,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) return __builtin_convertvector((__v2hu)a, __v2qu); } +__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a) +{ + return __builtin_convertvector((__v4hu)a, __v4qu); +} + __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hu)a, __v8qu); diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c index 4a68a10b089..7d3717d45bc 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-6.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -8,11 +8,14 @@ /* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ -/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */ #include <x86intrin.h> @@ -103,6 +106,11 @@ __v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) return __builtin_convertvector((__v2hf)a, __v2qi); } +__v4qi mm64_cvtph_epi8_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector((__v4hf)a, __v4qi); +} + __v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) { return __builtin_convertvector((__v8hf)a, __v8qi); @@ -123,6 +131,11 @@ __v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) return __builtin_convertvector((__v2hf)a, __v2qu); } +__v4qu mm64_cvtph_epu8_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector((__v4hf)a, __v4qu); +} + __v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) { return __builtin_convertvector((__v8hf)a, __v8qu); -- 2.31.1 ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/3 v2] vect: Support v4hi -> v4qi. 2024-05-29 8:55 ` [PATCH 2/3 v2] " Hu, Lin1 @ 2024-05-29 9:09 ` Hongtao Liu 0 siblings, 0 replies; 33+ messages in thread From: Hongtao Liu @ 2024-05-29 9:09 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu, ubizjak, rguenther On Wed, May 29, 2024 at 4:56 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > Exclude add TARGET_MMX_WITH_SSE, I merge two patterns. Ok. > > BRs, > Lin > > gcc/ChangeLog: > > PR target/107432 > * config/i386/mmx.md > (VI2_32_64): New mode iterator. > (mmxhalfmode): New mode atter. > (mmxhalfmodelower): Ditto. > (truncv2hiv2qi2): Extend mode v4hi and change name from > truncv2hiv2qi to trunc<mode><mmxhalfmodelower>2. > > gcc/testsuite/ChangeLog: > > PR target/107432 > * gcc.target/i386/pr107432-1.c: Modify test. > * gcc.target/i386/pr107432-6.c: Add test. > --- > gcc/config/i386/mmx.md | 17 +++++++++++++---- > gcc/testsuite/gcc.target/i386/pr107432-1.c | 13 ++++++++++++- > gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 ++++++++++++++++--- > 3 files changed, 41 insertions(+), 8 deletions(-) > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 5f342497885..27b080bfeb6 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -67,6 +67,9 @@ (define_mode_iterator V2F_32 [V2HF V2BF]) > ;; 4-byte integer vector modes > (define_mode_iterator VI_32 [V4QI V2HI]) > > +;; 8-byte and 4-byte HImode vector modes > +(define_mode_iterator VI2_32_64 [(V4HI "TARGET_MMX_WITH_SSE") V2HI]) > + > ;; 4-byte and 2-byte integer vector modes > (define_mode_iterator VI_16_32 [V4QI V2QI V2HI]) > > @@ -106,6 +109,12 @@ (define_mode_attr mmxinsnmode > (define_mode_attr mmxdoublemode > [(V8QI "V8HI") (V4HI "V4SI")]) > > +(define_mode_attr mmxhalfmode > + [(V4HI "V4QI") (V2HI "V2QI")]) > + > +(define_mode_attr mmxhalfmodelower > + [(V4HI "v4qi") (V2HI "v2qi")]) > + > ;; Mapping of vector float modes to an integer mode of the same size > (define_mode_attr mmxintvecmode > [(V2SF 
"V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI") > @@ -4873,10 +4882,10 @@ (define_expand "<insn>v2qiv2hi2" > DONE; > }) > > -(define_insn "truncv2hiv2qi2" > - [(set (match_operand:V2QI 0 "register_operand" "=v") > - (truncate:V2QI > - (match_operand:V2HI 1 "register_operand" "v")))] > +(define_insn "trunc<mode><mmxhalfmodelower>2" > + [(set (match_operand:<mmxhalfmode> 0 "register_operand" "=v") > + (truncate:<mmxhalfmode> > + (match_operand:VI2_32_64 1 "register_operand" "v")))] > "TARGET_AVX512VL && TARGET_AVX512BW" > "vpmovwb\t{%1, %0|%0, %1}" > [(set_attr "type" "ssemov") > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c > index a4f37447eb4..afdf367afe2 100644 > --- a/gcc/testsuite/gcc.target/i386/pr107432-1.c > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c > @@ -7,7 +7,8 @@ > /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ > -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! 
ia32 } } } } */ > > #include <x86intrin.h> > > @@ -113,6 +114,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > return __builtin_convertvector((__v2hi)a, __v2qi); > } > > +__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a) > +{ > + return __builtin_convertvector((__v4hi)a, __v4qi); > +} > + > __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > { > return __builtin_convertvector((__v8hi)a, __v8qi); > @@ -218,6 +224,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > return __builtin_convertvector((__v2hu)a, __v2qu); > } > > +__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a) > +{ > + return __builtin_convertvector((__v4hu)a, __v4qu); > +} > + > __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > { > return __builtin_convertvector((__v8hu)a, __v8qu); > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c > index 4a68a10b089..7d3717d45bc 100644 > --- a/gcc/testsuite/gcc.target/i386/pr107432-6.c > +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c > @@ -8,11 +8,14 @@ > /* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ > /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ > -/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > -/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ > /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } } } */ > -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */ > > #include <x86intrin.h> > > @@ -103,6 +106,11 @@ __v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) > return __builtin_convertvector((__v2hf)a, __v2qi); > } > > +__v4qi mm64_cvtph_epi8_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector((__v4hf)a, __v4qi); > +} > + > __v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) > { > return __builtin_convertvector((__v8hf)a, __v8qi); > @@ -123,6 +131,11 @@ __v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) > return __builtin_convertvector((__v2hf)a, __v2qu); > } > > +__v4qu mm64_cvtph_epu8_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector((__v4hf)a, __v4qu); > +} > + > __v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) > { > return __builtin_convertvector((__v8hf)a, __v8qu); > -- > 2.31.1 > -- BR, Hongtao ^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH 3/3] vect: support direct conversion under x86-64-v3. 2024-05-23 6:37 ` [PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and Hu, Lin1 2024-05-23 6:37 ` [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float Hu, Lin1 2024-05-23 6:37 ` [PATCH 2/3] vect: Support v4hi -> v4qi Hu, Lin1 @ 2024-05-23 6:37 ` Hu, Lin1 2024-05-23 6:42 ` Hongtao Liu 2 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-05-23 6:37 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther gcc/ChangeLog: PR 107432 * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): New function for generate a series of suitable insn. * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): Define new function. * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. gcc/testsuite/ChangeLog: PR 107432 * gcc.target/i386/pr107432-8.c: New test. * gcc.target/i386/pr107432-9.c: Ditto. * gcc.target/i386/pr92645-4.c: Modify test. 
--- gcc/config/i386/i386-expand.cc | 47 +++++++- gcc/config/i386/i386-protos.h | 3 + gcc/config/i386/sse.md | 87 +++++++++++---- gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 +++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - 6 files changed, 304 insertions(+), 29 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) emit_insn (gen_xorv4si3 (value, value, large)); } -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, - machine_mode mode, rtx target, - rtx var, int one_var); - /* Convert an unsigned DImode value into a DFmode, using only SSE. Expects the 64-bit DImode to be supplied in a pair of integral registers. Requires SSE2; will use SSE3 if available. For x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, whose ONE_VAR element is VAR, and other elements are zero. Return true if successful. */ -static bool +bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, rtx target, rtx var, int one_var) { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) return ret; } +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ + +bool +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) +{ + machine_mode out_mode = GET_MODE (output); + machine_mode in_mode = GET_MODE (input); + int len = GET_MODE_SIZE (in_mode); + gcc_assert (len == 16 || len == 32); + machine_mode cvt_mode = (len == 16) ? 
V16QImode : V32QImode; + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); + + struct expand_vec_perm_d d; + d.target = gen_reg_rtx (cvt_mode); + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode); + d.op1 = d.op0; + d.vmode = cvt_mode; + d.nelt = len; + d.testing_p = false; + d.one_operand_p = true; + + /* Init perm. Put the needed bits of input in order and + fill the rest of bits by default. */ + int tot = 0; + for (int i = 0; i < len; ++i) + { + d.perm[i] = i; + if ((i % in_innersize) < out_innersize) + d.perm[tot++] = i; + } + + if (ix86_expand_vec_perm_const_1(&d)) + { + emit_move_insn (output, gen_lowpart (out_mode, d.target)); + return true; + } + + return false; +} + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, bool, rtx_code_label *); extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool ix86_memtag_can_tag_addresses (void); @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, rtx); +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx, + rtx, int); extern bool ix86_extract_perm_from_pool_constant (int*, rtx); /* In i386-c.cc */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f57f36ae380..0b14b3dc1ac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -14373,14 +14373,25 @@ (define_expand 
"avx512bw_<code>v32hiv32qi2_mask_store" (define_mode_iterator PMOV_DST_MODE_2 [V4SI V8HI (V16QI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_DST_MODE_2_AVX2 + [V4SI V8HI V16QI]) (define_mode_attr pmov_suff_2 [(V16QI "wb") (V8HI "dw") (V4SI "qd")]) (define_expand "trunc<ssedoublemodelower><mode>2" - [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand") - (truncate:PMOV_DST_MODE_2 + [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 "nonimmediate_operand") + (truncate:PMOV_DST_MODE_2_AVX2 (match_operand:<ssedoublemode> 1 "register_operand")))] - "TARGET_AVX512VL") + "TARGET_AVX2" +{ + if (!TARGET_AVX512VL + || (<MODE>mode == V16QImode && !TARGET_AVX512BW)) + { + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); + gcc_assert (ok); + DONE; + } +}) (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2" [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m") @@ -14460,6 +14471,7 @@ (define_expand "<avx512>_<code><ssedoublemodelower><mode>2_mask_store" "TARGET_AVX512VL") (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI V8HI]) (define_mode_attr pmov_dst_3_lower [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI "v8qi")]) (define_mode_attr pmov_dst_3 @@ -14472,16 +14484,26 @@ (define_mode_attr pmov_suff_3 (define_expand "trunc<mode><pmov_dst_3_lower>2" [(set (match_operand:<pmov_dst_3> 0 "register_operand") (truncate:<pmov_dst_3> - (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))] - "TARGET_AVX512VL" + (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))] + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V16QImode); + if (TARGET_AVX512VL + && (<MODE>mode != V8HImode || TARGET_AVX512BW)) + { + rtx op0 = gen_reg_rtx (V16QImode); - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 + 
(op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); + + emit_move_insn (operands[0], + lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); + } + else + { + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); + gcc_assert (ok); + } - emit_move_insn (operands[0], - lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); DONE; }) @@ -14853,15 +14875,24 @@ (define_expand "trunc<mode><pmov_dst_4_lower>2" [(set (match_operand:<pmov_dst_4> 0 "register_operand") (truncate:<pmov_dst_4> (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))] - "TARGET_AVX512VL" + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V8HImode); + if (TARGET_AVX512VL) + { + rtx op0 = gen_reg_rtx (V8HImode); - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 + (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); - emit_move_insn (operands[0], - lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); + emit_move_insn (operands[0], + lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); + DONE; + } + else + { + bool ok = ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1]); + gcc_assert (ok); + } DONE; }) @@ -15102,15 +15133,27 @@ (define_expand "truncv2div2si2" [(set (match_operand:V2SI 0 "register_operand") (truncate:V2SI (match_operand:V2DI 1 "register_operand")))] - "TARGET_AVX512VL" + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V4SImode); + if (TARGET_AVX512VL) + { + rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_avx512vl_truncatev2div2si2 - (op0, operands[1], CONST0_RTX (V2SImode))); + emit_insn (gen_avx512vl_truncatev2div2si2 + (op0, operands[1], CONST0_RTX (V2SImode))); - emit_move_insn (operands[0], - lowpart_subreg (V2SImode, op0, V4SImode)); + emit_move_insn (operands[0], + lowpart_subreg (V2SImode, op0, V4SImode)); + } + else + { + rtx tmp = lowpart_subreg (V4SImode, + force_reg (V2DImode, operands[1]), V2DImode); + rtx op0 = 
gen_reg_rtx (V4SImode); + emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2), + GEN_INT (6), GEN_INT (7))); + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); + } DONE; }) diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c b/gcc/testsuite/gcc.target/i386/pr107432-8.c new file mode 100644 index 00000000000..f0d1ab028f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c @@ -0,0 +1,73 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -O2" } */ +/* { dg-final { scan-assembler-times "vshufps" 1 } } */ +/* { dg-final { scan-assembler-times "vpshufb" 15 } } */ +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ +/* { dg-final { scan-assembler-times "vpermq" 5 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) +{ + return __builtin_convertvector((__v4di)a, __v4si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) +{ + return __builtin_convertvector((__v8si)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v4qi 
mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__v16qi mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a) +{ + return __builtin_convertvector((__v16hi)a, __v16qi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c b/gcc/testsuite/gcc.target/i386/pr107432-9.c new file mode 100644 index 00000000000..650d352b945 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c @@ -0,0 +1,121 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */ +#include <x86intrin.h> + +#include "avx-check.h" + +#ifndef TEST +#define TEST avx_test +#endif + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef union +{ + __v2si x; + int a[2]; +} union64i_d; + +typedef union +{ + __v2hi x; + short a[2]; +} union32i_w; + +typedef union +{ + __v4hi x; + short a[4]; +} union64i_w; + +typedef union +{ + __v2qi x; + char a[2]; +} union16i_b; + +typedef union +{ + __v4qi x; + char a[4]; +} union32i_b; + +typedef union +{ + __v8qi x; + char a[8]; +} union64i_b; + +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT) \ +static int \ +__attribute__((noinline, unused)) \ +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v) \ +{ \ + int i; \ + int err = 0; \ + \ + for (i = 0; i < ARRAY_SIZE (u.a); i++) \ + if (u.a[i] != v[i]) \ + { \ + err++; \ + PRINTF ("%i: " FMT " != " FMT "\n", \ + i, v[i], u.a[i]); \ + } \ + return err; \ +} + +CHECK_EXP_LESS128 (union64i_d, int, "%d"); +CHECK_EXP_LESS128 (union32i_w, short, "%d"); +CHECK_EXP_LESS128 
(union64i_w, short, "%d"); +CHECK_EXP_LESS128 (union16i_b, char, "%d"); +CHECK_EXP_LESS128 (union32i_b, char, "%d"); +CHECK_EXP_LESS128 (union64i_b, char, "%d"); + +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, CVT_TYPE) \ +void do_test##INIT_TYPE##CVT_TYPE () \ +{ \ + INPUT_TYPE s; \ + OUTPUT_TYPE r, ref; \ + for (int i = 0; i < ARRAY_SIZE (s.a); i++) \ + { \ + s.a[i] = (i + 23415) * (i + 341); \ + ref.a[i] = (OUTPUT_INNER) s.a[i]; \ + } \ + r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \ + \ + if (check_##OUTPUT_TYPE (r, ref.a)) \ + abort (); \ + return; \ +} + +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si); +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si); +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi); +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi); +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi); +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi); +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi); +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi); +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi); +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi); +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi); +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi); + +void TEST (void) +{ + do_test__v2di__v2si (); + do_test__v2di__v2hi (); + do_test__v2di__v2qi (); + do_test__v4di__v4si (); + do_test__v4di__v4hi (); + do_test__v4di__v4qi (); + do_test__v4si__v4hi (); + do_test__v4si__v4qi (); + do_test__v8si__v8hi (); + do_test__v8si__v8qi (); + do_test__v8hi__v8qi (); + do_test__v16hi__v16qi (); +} diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c index 28a3f9a3527..3aa49a3b654 100644 --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c) a uniform CTOR with a vector promotion to a CTOR on a 
promoted element. */ /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */ -- 2.31.1 ^ permalink raw reply [flat|nested] 33+ messages in thread
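For readers skimming the archive, the kind of narrowing conversion this patch improves can be reproduced with a small standalone test case. This is a sketch for illustration only, not part of the patch; the function name is mine, and the exact instructions emitted (vpmovqd, vpshufb, vpermd, vshufps, ...) depend on the -march level and compiler revision.

```c
/* Narrowing conversion v4di -> v4si via __builtin_convertvector,
   the pattern the new trunc expanders handle.  Truncation is modular:
   each 64-bit lane keeps its low 32 bits, and lane order is preserved.  */
typedef long long v4di __attribute__ ((__vector_size__ (32)));
typedef int v4si __attribute__ ((__vector_size__ (16)));

v4si
trunc_v4di_v4si (v4di a)
{
  return __builtin_convertvector (a, v4si);
}
```

With -march=x86-64-v3 -O2 the patch is meant to expand this through a permute rather than element-wise extracts; with AVX512VL it can remain a single vpmov-style truncation.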
* Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. 2024-05-23 6:37 ` [PATCH 3/3] vect: support direct conversion under x86-64-v3 Hu, Lin1 @ 2024-05-23 6:42 ` Hongtao Liu 2024-05-23 7:17 ` Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Hongtao Liu @ 2024-05-23 6:42 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu, ubizjak, rguenther On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > gcc/ChangeLog: > > PR target/107432 > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > New function to generate a series of suitable insns. > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > Declare new function. > * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. I have some concerns about this patch because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069; let's hold off on this patch for now. > gcc/testsuite/ChangeLog: > > PR target/107432 > * gcc.target/i386/pr107432-8.c: New test. > * gcc.target/i386/pr107432-9.c: Ditto. > * gcc.target/i386/pr92645-4.c: Modify test.
-- BR, Hongtao
* RE: [PATCH 3/3] vect: support direct conversion under x86-64-v3. 2024-05-23 6:42 ` Hongtao Liu @ 2024-05-23 7:17 ` Hu, Lin1 2024-05-23 8:05 ` Hongtao Liu 0 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-05-23 7:17 UTC (permalink / raw) To: Hongtao Liu; +Cc: gcc-patches, Liu, Hongtao, ubizjak, rguenther > -----Original Message----- > From: Hongtao Liu <crazylht@gmail.com> > Sent: Thursday, May 23, 2024 2:42 PM > To: Hu, Lin1 <lin1.hu@intel.com> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > ubizjak@gmail.com; rguenther@suse.de > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. > > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > > > gcc/ChangeLog: > > > > PR target/107432 > > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > > New function to generate a series of suitable insns. > > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > > Declare new function. > > * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. > I have some concerns about this patch because of > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069; let's hold off on this > patch for now. OK, maybe we need to modify ix86_expand_vec_perm_const_1 so that it emits better code, perhaps like clang does (https://godbolt.org/z/rTKPq9oj5). Or we could disable some of the optimizations that go through vpermq. In pr107432-8.c, only 5 tests use vpermq. BRs, Lin > > gcc/testsuite/ChangeLog: > > > > PR target/107432 > > * gcc.target/i386/pr107432-8.c: New test. > > * gcc.target/i386/pr107432-9.c: Ditto. > > * gcc.target/i386/pr92645-4.c: Modify test.
((__vector_size__ (4))); typedef > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > +__attribute__ ((__vector_size__ (8))); > > + > > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) { > > + return __builtin_convertvector((__v2di)a, __v2si); } > > + > > +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) { > > + return __builtin_convertvector((__v4di)a, __v4si); } > > + > > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v2di)a, __v2hi); } > > + > > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) { > > + return __builtin_convertvector((__v4di)a, __v4hi); } > > + > > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v4si)a, __v4hi); } > > + > > +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) { > > + return __builtin_convertvector((__v8si)a, __v8hi); } > > + > > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v2di)a, __v2qi); } > > + > > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) { > > + return __builtin_convertvector((__v4di)a, __v4qi); } > > + > > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v4si)a, __v4qi); } > > + > > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) { > > + return __builtin_convertvector((__v8si)a, __v8qi); } > > + > > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) { > > + return __builtin_convertvector((__v8hi)a, __v8qi); } > > + > > +__v16qi mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a) > > +{ > > + return __builtin_convertvector((__v16hi)a, __v16qi); } > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c > > b/gcc/testsuite/gcc.target/i386/pr107432-9.c > > new file mode 100644 > > index 00000000000..650d352b945 > > --- /dev/null > > +++ 
b/gcc/testsuite/gcc.target/i386/pr107432-9.c > > @@ -0,0 +1,121 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */ > > +#include <x86intrin.h> > > + > > +#include "avx-check.h" > > + > > +#ifndef TEST > > +#define TEST avx_test > > +#endif > > + > > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char > > +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi > > +__attribute__ ((__vector_size__ (8))); > > + > > +typedef union > > +{ > > + __v2si x; > > + int a[2]; > > +} union64i_d; > > + > > +typedef union > > +{ > > + __v2hi x; > > + short a[2]; > > +} union32i_w; > > + > > +typedef union > > +{ > > + __v4hi x; > > + short a[4]; > > +} union64i_w; > > + > > +typedef union > > +{ > > + __v2qi x; > > + char a[2]; > > +} union16i_b; > > + > > +typedef union > > +{ > > + __v4qi x; > > + char a[4]; > > +} union32i_b; > > + > > +typedef union > > +{ > > + __v8qi x; > > + char a[8]; > > +} union64i_b; > > + > > +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT) \ > > +static int \ > > +__attribute__((noinline, unused)) \ > > +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v) \ > > +{ \ > > + int i; \ > > + int err = 0; \ > > + \ > > + for (i = 0; i < ARRAY_SIZE (u.a); i++) \ > > + if (u.a[i] != v[i]) \ > > + { \ > > + err++; \ > > + PRINTF ("%i: " FMT " != " FMT "\n", \ > > + i, v[i], u.a[i]); \ > > + } \ > > + return err; \ > > +} > > + > > +CHECK_EXP_LESS128 (union64i_d, int, "%d"); > > +CHECK_EXP_LESS128 (union32i_w, short, "%d"); > > +CHECK_EXP_LESS128 (union64i_w, short, "%d"); > > +CHECK_EXP_LESS128 (union16i_b, char, "%d"); > > +CHECK_EXP_LESS128 (union32i_b, char, "%d"); > > +CHECK_EXP_LESS128 (union64i_b, char, "%d"); > > + > > +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, > CVT_TYPE) \ > > +void do_test##INIT_TYPE##CVT_TYPE () \ > > +{ \ > > + INPUT_TYPE s; \ > > + OUTPUT_TYPE r, 
ref; \ > > + for (int i = 0; i < ARRAY_SIZE (s.a); i++) \ > > + { \ > > + s.a[i] = (i + 23415) * (i + 341); \ > > + ref.a[i] = (OUTPUT_INNER) s.a[i]; \ > > + } \ > > + r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \ > > + \ > > + if (check_##OUTPUT_TYPE (r, ref.a)) \ > > + abort (); \ > > + return; \ > > +} > > + > > +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si); > > +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si); > > +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi); > > +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi); > > +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi); > > +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi); > > +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi); > > +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi); > > +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi); > > +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi); > > +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi); > > +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi); > > + > > +void TEST (void) > > +{ > > + do_test__v2di__v2si (); > > + do_test__v2di__v2hi (); > > + do_test__v2di__v2qi (); > > + do_test__v4di__v4si (); > > + do_test__v4di__v4hi (); > > + do_test__v4di__v4qi (); > > + do_test__v4si__v4hi (); > > + do_test__v4si__v4qi (); > > + do_test__v8si__v8hi (); > > + do_test__v8si__v8qi (); > > + do_test__v8hi__v8qi (); > > + do_test__v16hi__v16qi (); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c > > b/gcc/testsuite/gcc.target/i386/pr92645-4.c > > index 28a3f9a3527..3aa49a3b654 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c > > +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c > > @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned long n, unsigned c) > > a uniform CTOR with a vector promotion to a CTOR on a promoted > > element. 
*/ > > /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short > > unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */ > > -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } > > } */ > > -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } > > } */ > > -- > > 2.31.1 > > > > > -- > BR, > Hongtao ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. 2024-05-23 7:17 ` Hu, Lin1 @ 2024-05-23 8:05 ` Hongtao Liu 2024-05-29 8:59 ` [PATCH 3/3 v2] " Hu, Lin1 0 siblings, 1 reply; 33+ messages in thread From: Hongtao Liu @ 2024-05-23 8:05 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, Liu, Hongtao, ubizjak, rguenther On Thu, May 23, 2024 at 3:17 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > > -----Original Message----- > > From: Hongtao Liu <crazylht@gmail.com> > > Sent: Thursday, May 23, 2024 2:42 PM > > To: Hu, Lin1 <lin1.hu@intel.com> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>; > > ubizjak@gmail.com; rguenther@suse.de > > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. > > > > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > > > > > gcc/ChangeLog: > > > > > > PR target/107432 > > > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > > > New function to generate a series of suitable insns. > > > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > > > Declare new function. > > > * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. > > I have some concerns about this patch because of > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069; let's hold off on it > > for now. > > OK, maybe we need to modify ix86_expand_vec_perm_const_1 so that it emits better code, like clang does (https://godbolt.org/z/rTKPq9oj5). > Or we could disable some of the vpermq-based optimization; in pr107432-8.c, only 5 tests use vpermq. On second thought, we can go ahead with this patch: for PR115069 there is an alternative that avoids cross-lane truncation, but for this case there is none. Although a cross-lane permutation is not very efficient, it should still be better than the original code. > > BRs, > Lin > > > > gcc/testsuite/ChangeLog: > > > > > > PR target/107432 > > > * gcc.target/i386/pr107432-8.c: New test.
> > > * gcc.target/i386/pr107432-9.c: Ditto. > > > * gcc.target/i386/pr92645-4.c: Modify test. -- BR, Hongtao ^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH 3/3 v2] vect: support direct conversion under x86-64-v3. 2024-05-23 8:05 ` Hongtao Liu @ 2024-05-29 8:59 ` Hu, Lin1 2024-05-30 6:51 ` Hongtao Liu 0 siblings, 1 reply; 33+ messages in thread From: Hu, Lin1 @ 2024-05-29 8:59 UTC (permalink / raw) To: gcc-patches; +Cc: hongtao.liu, ubizjak, rguenther Following Hongtao's suggestion, this version supports some truncations in mmx.md under x86-64-v3 and optimizes ix86_expand_trunc_with_avx2_noavx512f. BRs, Lin gcc/ChangeLog: PR target/107432 * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): New function to generate a series of suitable insns. * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): Declare new function. * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. (ssebytemode): Add V8HI. (PMOV_DST_MODE_2_AVX2): New mode iterator. (PMOV_SRC_MODE_3_AVX2): Ditto. * config/i386/mmx.md (trunc<mode><mmxhalfmodelower>2): Ditto. (avx512vl_trunc<mode><mmxhalfmodelower>2): Ditto. (truncv2si<mode>2): Ditto. (avx512vl_truncv2si<mode>2): Ditto. (mmxbytemode): New mode attr. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-8.c: New test. * gcc.target/i386/pr107432-9.c: Ditto. * gcc.target/i386/pr92645-4.c: Modify test.
--- gcc/config/i386/i386-expand.cc | 44 ++++++- gcc/config/i386/i386-protos.h | 3 + gcc/config/i386/mmx.md | 35 +++++- gcc/config/i386/sse.md | 88 ++++++++++---- gcc/testsuite/gcc.target/i386/pr107432-8.c | 94 +++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-9.c | 129 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - 7 files changed, 363 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..90705803d29 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) emit_insn (gen_xorv4si3 (value, value, large)); } -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, - machine_mode mode, rtx target, - rtx var, int one_var); - /* Convert an unsigned DImode value into a DFmode, using only SSE. Expects the 64-bit DImode to be supplied in a pair of integral registers. Requires SSE2; will use SSE3 if available. For x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, whose ONE_VAR element is VAR, and other elements are zero. Return true if successful. */ -static bool +bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, rtx target, rtx var, int one_var) { @@ -25551,4 +25547,42 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) return ret; } +/* Trunc a vector to a narrow vector, like v4di -> v4si. 
*/ + +void +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode) +{ + machine_mode out_mode = GET_MODE (output); + machine_mode in_mode = GET_MODE (input); + int len = GET_MODE_SIZE (in_mode); + gcc_assert (len == GET_MODE_SIZE (cvt_mode) + && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode) + && (REG_P (input) || SUBREG_P (input))); + scalar_mode inner_out_mode = GET_MODE_INNER (out_mode); + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); + int out_innersize = GET_MODE_SIZE (inner_out_mode); + + struct expand_vec_perm_d d; + d.target = gen_reg_rtx (cvt_mode); + d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode); + d.op1 = d.op0; + d.vmode = cvt_mode; + d.nelt = GET_MODE_NUNITS (cvt_mode); + d.testing_p = false; + d.one_operand_p = true; + + /* Init perm. Put the needed bits of input in order and + fill the rest of bits by default. */ + for (int i = 0; i < d.nelt; ++i) + { + d.perm[i] = i; + if (i < GET_MODE_NUNITS (out_mode)) + d.perm[i] = i * (in_innersize / out_innersize); + } + + bool ok = ix86_expand_vec_perm_const_1(&d); + gcc_assert (ok); + emit_move_insn (output, gen_lowpart (out_mode, d.target)); +} + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dbc861fb1ea..aa826f4864f 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, bool, rtx_code_label *); extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); +extern void ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx, machine_mode); extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool ix86_memtag_can_tag_addresses (void); @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate 
(bool, machine_mode, rtx, rtx); +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx, + rtx, int); extern bool ix86_extract_perm_from_pool_constant (int*, rtx); /* In i386-c.cc */ diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 27b080bfeb6..6f51658f27e 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -112,6 +112,9 @@ (define_mode_attr mmxdoublemode (define_mode_attr mmxhalfmode [(V4HI "V4QI") (V2HI "V2QI")]) +(define_mode_attr mmxbytemode + [(V4HI "V8QI") (V2HI "V4QI")]) + (define_mode_attr mmxhalfmodelower [(V4HI "v4qi") (V2HI "v2qi")]) @@ -4882,7 +4885,20 @@ (define_expand "<insn>v2qiv2hi2" DONE; }) -(define_insn "trunc<mode><mmxhalfmodelower>2" +(define_expand "trunc<mode><mmxhalfmodelower>2" + [(set (match_operand:<mmxhalfmode> 0 "register_operand") + (truncate:<mmxhalfmode> + (match_operand:VI2_32_64 1 "register_operand")))] + "TARGET_AVX2" +{ + if (TARGET_AVX512VL && TARGET_AVX512BW) + emit_insn (gen_avx512vl_trunc<mode><mmxhalfmodelower>2 (operands[0], operands[1])); + else + ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1], <mmxbytemode>mode); + DONE; +}) + +(define_insn "avx512vl_trunc<mode><mmxhalfmodelower>2" [(set (match_operand:<mmxhalfmode> 0 "register_operand" "=v") (truncate:<mmxhalfmode> (match_operand:VI2_32_64 1 "register_operand" "v")))] @@ -4893,7 +4909,22 @@ (define_insn "trunc<mode><mmxhalfmodelower>2" (set_attr "mode" "TI")]) (define_mode_iterator V2QI_V2HI [V2QI V2HI]) -(define_insn "truncv2si<mode>2" +(define_mode_attr v2qi_quad_v2hi_double + [(V2QI "V8QI") (V2HI "V4HI")]) +(define_expand "truncv2si<mode>2" + [(set (match_operand:V2QI_V2HI 0 "register_operand") + (truncate:V2QI_V2HI + (match_operand:V2SI 1 "register_operand")))] + "TARGET_AVX2 && TARGET_MMX_WITH_SSE" +{ + if (TARGET_AVX512VL) + emit_insn (gen_avx512vl_truncv2si<mode>2 (operands[0], operands[1])); + else + ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1], <v2qi_quad_v2hi_double>mode); + 
DONE; +}) + +(define_insn "avx512vl_truncv2si<mode>2" [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v") (truncate:V2QI_V2HI (match_operand:V2SI 1 "register_operand" "v")))] diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f57f36ae380..4c54fc9d4ef 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -835,7 +835,8 @@ (define_mode_attr ssedoublemode (define_mode_attr ssebytemode [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI") - (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")]) + (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI") + (V8HI "V16QI")]) (define_mode_attr sseintconvert [(V32HI "w") (V16HI "w") (V8HI "w") @@ -14373,14 +14374,26 @@ (define_expand "avx512bw_<code>v32hiv32qi2_mask_store" (define_mode_iterator PMOV_DST_MODE_2 [V4SI V8HI (V16QI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_DST_MODE_2_AVX2 + [V4SI V8HI V16QI]) (define_mode_attr pmov_suff_2 [(V16QI "wb") (V8HI "dw") (V4SI "qd")]) (define_expand "trunc<ssedoublemodelower><mode>2" - [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand") - (truncate:PMOV_DST_MODE_2 + [(set (match_operand:PMOV_DST_MODE_2_AVX2 0 "nonimmediate_operand") + (truncate:PMOV_DST_MODE_2_AVX2 (match_operand:<ssedoublemode> 1 "register_operand")))] - "TARGET_AVX512VL") + "TARGET_AVX2" +{ + if (!TARGET_AVX512VL + || (<MODE>mode == V16QImode && !TARGET_AVX512BW)) + { + ix86_expand_trunc_with_avx2_noavx512f (operands[0], + operands[1], + <ssedoublevecmode>mode); + DONE; + } +}) (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2" [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m") @@ -14460,6 +14473,7 @@ (define_expand "<avx512>_<code><ssedoublemodelower><mode>2_mask_store" "TARGET_AVX512VL") (define_mode_iterator PMOV_SRC_MODE_3 [V4DI V2DI V8SI V4SI (V8HI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_SRC_MODE_3_AVX2 [V4DI V2DI V8SI V4SI V8HI]) (define_mode_attr pmov_dst_3_lower [(V4DI "v4qi") (V2DI "v2qi") (V8SI "v8qi") (V4SI "v4qi") (V8HI "v8qi")]) 
(define_mode_attr pmov_dst_3 @@ -14472,16 +14486,26 @@ (define_mode_attr pmov_suff_3 (define_expand "trunc<mode><pmov_dst_3_lower>2" [(set (match_operand:<pmov_dst_3> 0 "register_operand") (truncate:<pmov_dst_3> - (match_operand:PMOV_SRC_MODE_3 1 "register_operand")))] - "TARGET_AVX512VL" + (match_operand:PMOV_SRC_MODE_3_AVX2 1 "register_operand")))] + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V16QImode); + if (TARGET_AVX512VL + && (<MODE>mode != V8HImode || TARGET_AVX512BW)) + { + rtx op0 = gen_reg_rtx (V16QImode); - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>qi2 + (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_3>mode))); - emit_move_insn (operands[0], - lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); + emit_move_insn (operands[0], + lowpart_subreg (<pmov_dst_3>mode, op0, V16QImode)); + } + else + { + ix86_expand_trunc_with_avx2_noavx512f (operands[0], + operands[1], + <ssebytemode>mode); + } DONE; }) @@ -14853,15 +14877,21 @@ (define_expand "trunc<mode><pmov_dst_4_lower>2" [(set (match_operand:<pmov_dst_4> 0 "register_operand") (truncate:<pmov_dst_4> (match_operand:PMOV_SRC_MODE_4 1 "register_operand")))] - "TARGET_AVX512VL" + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V8HImode); + if (TARGET_AVX512VL) + { + rtx op0 = gen_reg_rtx (V8HImode); - emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 - (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); + emit_insn (gen_avx512vl_truncate<mode>v<ssescalarnum>hi2 + (op0, operands[1], CONST0_RTX (<pmov_dst_zeroed_4>mode))); - emit_move_insn (operands[0], - lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); + emit_move_insn (operands[0], + lowpart_subreg (<pmov_dst_4>mode, op0, V8HImode)); + DONE; + } + else + ix86_expand_trunc_with_avx2_noavx512f (operands[0], operands[1], <ssewvecmode>mode); DONE; }) @@ -15102,15 +15132,27 @@ (define_expand "truncv2div2si2" [(set 
(match_operand:V2SI 0 "register_operand") (truncate:V2SI (match_operand:V2DI 1 "register_operand")))] - "TARGET_AVX512VL" + "TARGET_AVX2" { - rtx op0 = gen_reg_rtx (V4SImode); + if (TARGET_AVX512VL) + { + rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_avx512vl_truncatev2div2si2 - (op0, operands[1], CONST0_RTX (V2SImode))); + emit_insn (gen_avx512vl_truncatev2div2si2 + (op0, operands[1], CONST0_RTX (V2SImode))); - emit_move_insn (operands[0], - lowpart_subreg (V2SImode, op0, V4SImode)); + emit_move_insn (operands[0], + lowpart_subreg (V2SImode, op0, V4SImode)); + } + else + { + rtx tmp = lowpart_subreg (V4SImode, + force_reg (V2DImode, operands[1]), V2DImode); + rtx op0 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse_shufps_v4si (op0, tmp, tmp, const0_rtx, GEN_INT (2), + GEN_INT (6), GEN_INT (7))); + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); + } DONE; }) diff --git a/gcc/testsuite/gcc.target/i386/pr107432-8.c b/gcc/testsuite/gcc.target/i386/pr107432-8.c new file mode 100644 index 00000000000..5e1a442159f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-8.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -O2" } */ +/* { dg-final { scan-assembler-times "vshufps" 1 } } */ +/* { dg-final { scan-assembler-times "vpshufb" 18 } } */ +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ +/* { dg-final { scan-assembler-times "vpermq" 5 } } */ +/* { dg-final { scan-assembler-times "vpshuflw" 1 { target { ! 
ia32 } } } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__v2di a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__v4si mm256_cvtepi64_epi32_builtin_convertvector(__v4di a) +{ + return __builtin_convertvector((__v4di)a, __v4si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__v8hi mm256_cvtepi32_epi16_builtin_convertvector(__v8si a) +{ + return __builtin_convertvector((__v8si)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a) +{ + return __builtin_convertvector((__v4hi)a, __v4qi); +} + +__v8qi 
mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__v16qi mm256_cvtepi16_epi8_builtin_convertvector(__v16hi a) +{ + return __builtin_convertvector((__v16hi)a, __v16qi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-9.c b/gcc/testsuite/gcc.target/i386/pr107432-9.c new file mode 100644 index 00000000000..90426c030c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-9.c @@ -0,0 +1,129 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64-v3 -O2 -flax-vector-conversions" } */ +#include <x86intrin.h> + +#include "avx-check.h" + +#ifndef TEST +#define TEST avx_test +#endif + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef union +{ + __v2si x; + int a[2]; +} union64i_d; + +typedef union +{ + __v2hi x; + short a[2]; +} union32i_w; + +typedef union +{ + __v4hi x; + short a[4]; +} union64i_w; + +typedef union +{ + __v2qi x; + char a[2]; +} union16i_b; + +typedef union +{ + __v4qi x; + char a[4]; +} union32i_b; + +typedef union +{ + __v8qi x; + char a[8]; +} union64i_b; + +#define CHECK_EXP_LESS128(UNION_TYPE, VALUE_TYPE, FMT) \ +static int \ +__attribute__((noinline, unused)) \ +check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE * v) \ +{ \ + int i; \ + int err = 0; \ + \ + for (i = 0; i < ARRAY_SIZE (u.a); i++) \ + if (u.a[i] != v[i]) \ + { \ + err++; \ + PRINTF ("%i: " FMT " != " FMT "\n", \ + i, v[i], u.a[i]); \ + } \ + return err; \ +} + +CHECK_EXP_LESS128 (union64i_d, int, "%d"); +CHECK_EXP_LESS128 (union32i_w, short, "%d"); +CHECK_EXP_LESS128 (union64i_w, short, "%d"); +CHECK_EXP_LESS128 (union16i_b, char, "%d"); +CHECK_EXP_LESS128 (union32i_b, char, "%d"); +CHECK_EXP_LESS128 (union64i_b, char, "%d"); + +#define SUBTEST(INPUT_TYPE, OUTPUT_TYPE, OUTPUT_INNER, INIT_TYPE, CVT_TYPE) \ +void 
do_test##INIT_TYPE##CVT_TYPE () \ +{ \ + INPUT_TYPE s; \ + OUTPUT_TYPE r, ref; \ + for (int i = 0; i < ARRAY_SIZE (s.a); i++) \ + { \ + s.a[i] = (i + 23415) * (i + 341); \ + ref.a[i] = (OUTPUT_INNER) s.a[i]; \ + } \ + r.x = __builtin_convertvector((INIT_TYPE)s.x, CVT_TYPE); \ + \ + if (check_##OUTPUT_TYPE (r, ref.a)) \ + abort (); \ + return; \ +} + +SUBTEST(union128i_q, union64i_d, int, __v2di, __v2si); +SUBTEST(union256i_q, union128i_d, int, __v4di, __v4si); +SUBTEST(union128i_q, union32i_w, short, __v2di, __v2hi); +SUBTEST(union256i_q, union64i_w, short, __v4di, __v4hi); +SUBTEST(union64i_d, union32i_w, short, __v2si, __v2hi); +SUBTEST(union128i_d, union64i_w, short, __v4si, __v4hi); +SUBTEST(union256i_d, union128i_w, short, __v8si, __v8hi); +SUBTEST(union128i_q, union16i_b, char, __v2di, __v2qi); +SUBTEST(union256i_q, union32i_b, char, __v4di,__v4qi); +SUBTEST(union64i_d, union16i_b, char, __v2si, __v2qi); +SUBTEST(union128i_d, union32i_b, char, __v4si, __v4qi); +SUBTEST(union256i_d, union64i_b, char, __v8si, __v8qi); +SUBTEST(union32i_w, union16i_b, char, __v2hi, __v2qi); +SUBTEST(union64i_w, union32i_b, char, __v4hi, __v4qi); +SUBTEST(union128i_w, union64i_b, char, __v8hi, __v8qi); +SUBTEST(union256i_w, union128i_b, char, __v16hi, __v16qi); + +void TEST (void) +{ + do_test__v2di__v2si (); + do_test__v2di__v2hi (); + do_test__v2di__v2qi (); + do_test__v4di__v4si (); + do_test__v4di__v4hi (); + do_test__v4di__v4qi (); + do_test__v2si__v2hi (); + do_test__v2si__v2qi (); + do_test__v4si__v4hi (); + do_test__v4si__v4qi (); + do_test__v8si__v8hi (); + do_test__v8si__v8qi (); + do_test__v2hi__v2qi (); + do_test__v4hi__v4qi (); + do_test__v8hi__v8qi (); + do_test__v16hi__v16qi (); +} diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c index 28a3f9a3527..3aa49a3b654 100644 --- a/gcc/testsuite/gcc.target/i386/pr92645-4.c +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c @@ -52,5 +52,3 @@ void f(char *dst, char *src, unsigned 
long n, unsigned c) a uniform CTOR with a vector promotion to a CTOR on a promoted element. */ /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ -/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */ -- 2.31.1 ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 3/3 v2] vect: support direct conversion under x86-64-v3. 2024-05-29 8:59 ` [PATCH 3/3 v2] " Hu, Lin1 @ 2024-05-30 6:51 ` Hongtao Liu 0 siblings, 0 replies; 33+ messages in thread From: Hongtao Liu @ 2024-05-30 6:51 UTC (permalink / raw) To: Hu, Lin1; +Cc: gcc-patches, hongtao.liu, ubizjak, rguenther On Wed, May 29, 2024 at 5:00 PM Hu, Lin1 <lin1.hu@intel.com> wrote: > > According to hongtao's suggestion, I support some trunc in mmx.md under > x86-64-v3, and optimize ix86_expand_trunc_with_avx2_noavx512f. Ok. > > BRs, > Lin > > gcc/ChangeLog: > > PR 107432 > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > New function for generate a series of suitable insn. > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > Define new function. > * config/i386/sse.md: Extend trunc<mode><mode>2 for x86-64-v3. > (ssebytemode) Add V8HI. > (PMOV_DST_MODE_2_AVX2): New mode iterator. > (PMOV_SRC_MODE_3_AVX2): Ditto. > * config/i386/mmx.md > (trunc<mode><mmxhalfmodelower>2): Ditto. > (avx512vl_trunc<mode><mmxhalfmodelower>2): Ditto. > (truncv2si<mode>2): Ditto. > (avx512vl_truncv2si<mode>2): Ditto. > (mmxbytemode): New mode attr. > > gcc/testsuite/ChangeLog: > > PR 107432 > * gcc.target/i386/pr107432-8.c: New test. > * gcc.target/i386/pr107432-9.c: Ditto. > * gcc.target/i386/pr92645-4.c: Modify test. 
> [...] -- BR, Hongtao ^ permalink raw reply [flat|nested] 33+ messages in thread
end of thread, other threads:[~2024-06-26 10:54 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-08  1:38 [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float Hu, Lin1
2024-05-14  2:25 ` Hu, Lin1
2024-05-14 12:23 ` Richard Biener
2024-05-15  2:30 ` Hu, Lin1
2024-05-23  6:37 ` [PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and Hu, Lin1
2024-05-23  6:37 ` [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float Hu, Lin1
2024-05-29  9:40 ` Richard Biener
2024-05-31  8:54 ` Hu, Lin1
2024-05-31 12:41 ` Richard Biener
2024-06-03  8:23 ` Hu, Lin1
2024-06-03  9:02 ` Richard Biener
2024-06-03  9:26 ` Hu, Lin1
2024-06-03  9:30 ` Richard Biener
2024-06-11  6:49 ` [PATCH 1/3 v3] " Hu, Lin1
2024-06-17  1:48 ` Hu, Lin1
2024-06-18 11:44 ` Richard Biener
2024-06-20 11:26 ` Hu, Lin1
2024-06-24 12:33 ` Richard Biener
2024-06-24 14:12 ` Tamar Christina
2024-06-25  2:00 ` Hu, Lin1
2024-06-25  3:28 ` [PATCH 1/3 v4] " Hu, Lin1
2024-06-25 13:30 ` Richard Biener
2024-06-26 10:54 ` [PATCH 1/3 v5] " Hu, Lin1
2024-05-23  6:37 ` [PATCH 2/3] vect: Support v4hi -> v4qi Hu, Lin1
2024-05-27  2:11 ` Hongtao Liu
2024-05-29  8:55 ` [PATCH 2/3 v2] " Hu, Lin1
2024-05-29  9:09 ` Hongtao Liu
2024-05-23  6:37 ` [PATCH 3/3] vect: support direct conversion under x86-64-v3 Hu, Lin1
2024-05-23  6:42 ` Hongtao Liu
2024-05-23  7:17 ` Hu, Lin1
2024-05-23  8:05 ` Hongtao Liu
2024-05-29  8:59 ` [PATCH 3/3 v2] " Hu, Lin1
2024-05-30  6:51 ` Hongtao Liu