public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [i386] Optimize movzwl + vmovd/vmovq to vmovw.
@ 2022-05-09  2:03 liuhongt
  2022-05-09  8:28 ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: liuhongt @ 2022-05-09  2:03 UTC (permalink / raw)
  To: gcc-patches

Similarly optimize movl + vmovq to vmovd.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/104915
	* config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
	pre_reload define_insn_and_split.
	(*vec_setv2di_0_zero_extendhi_1): Ditto.
	(*vec_set<mode>_0_zero_extendsi): Ditto.
	(*vec_setv2di_0_zero_extendsi_1): Ditto.
	(ssewvecmode): New mode attr.
	(ssewvecmodelower): Ditto.
	(ssepackmodelower): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr104915-vmovd.c: New test.
	* gcc.target/i386/pr104915-vmovw.c: New test.
---
 gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
 .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
 .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
 3 files changed, 164 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7b791def542..2ad8a2b46b8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
    (V32HI "V32HI") (V64QI "V64QI")
    (V32QI "V32QI") (V16QI "V16QI")])
 
+;; Mapping of vector modes to a V*HImode of the same size
+(define_mode_attr ssewvecmode
+  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
+   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
+
+(define_mode_attr ssewvecmodelower
+  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
+   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
+
 (define_mode_attr sseintvecmode2
   [(V8DF "XI") (V4DF "OI") (V2DF "TI")
    (V8SF "OI") (V4SF "TI")
@@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
    (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
    (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
 
+(define_mode_attr ssepackmodelower
+  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
+   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
+   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
+
 ;; Mapping of the max integer size for xop rotate immediate constraint
 (define_mode_attr sserotatemax
   [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
@@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
 
+(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
+  [(set (match_operand:VI48_AVX512F 0 "register_operand")
+	(vec_merge:VI48_AVX512F
+	 (vec_duplicate:VI48_AVX512F
+	  (zero_extend:<ssescalarmode>
+	    (match_operand:HI 1 "nonimmediate_operand")))
+	 (match_operand:VI48_AVX512F 2 "const0_operand")
+	 (const_int 1)))]
+  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
+  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
+					      CONST0_RTX (<ssewvecmode>mode),
+					      operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
+  DONE;
+})
+
+(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(vec_concat:V2DI
+	  (zero_extend:DI
+	    (match_operand:HI 1 "nonimmediate_operand"))
+	  (const_int 0)))]
+  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dest = gen_reg_rtx (V8HImode);
+  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (V2DImode, dest, V8HImode));
+  DONE;
+})
+
 (define_insn "avx512fp16_movsh"
   [(set (match_operand:V8HF 0 "register_operand" "=v")
 	(vec_merge:V8HF
@@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
 	   ]
 	   (symbol_ref "true")))])
 
+(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
+  [(set (match_operand:VI8 0 "register_operand")
+	(vec_merge:VI8
+	 (vec_duplicate:VI8
+	  (zero_extend:DI
+	    (match_operand:SI 1 "nonimmediate_operand")))
+	 (match_operand:VI8 2 "const0_operand")
+	 (const_int 1)))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dest = gen_reg_rtx (<ssepackmode>mode);
+  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
+					      CONST0_RTX (<ssepackmode>mode),
+					      operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
+  DONE;
+})
+
+(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(vec_concat:V2DI
+	  (zero_extend:DI
+	    (match_operand:SI 1 "nonimmediate_operand"))
+	  (const_int 0)))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dest = gen_reg_rtx (V4SImode);
+  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (V2DImode, dest, V4SImode));
+  DONE;
+})
+
 (define_insn "sse4_1_insertps"
   [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
 	(unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
new file mode 100644
index 00000000000..913ff8806f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
+/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
+
+#include<immintrin.h>
+
+__m128i
+foo1 (int* p)
+{
+  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
+}
+
+__m256i
+foo3 (int* p)
+{
+  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
+}
+
+__m512i
+foo5 (int* p)
+{
+  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
+			   (unsigned int) ((*(__m32_u *)p)[0]));
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
new file mode 100644
index 00000000000..ac47865d17a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx512fp16 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
+/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
+
+#include<immintrin.h>
+__m128i
+foo (short* p)
+{
+  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
+}
+
+__m128i
+foo1 (short* p)
+{
+  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
+}
+
+__m256i
+foo2 (short* p)
+{
+  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
+			   (unsigned short) ((*(__m16_u *)p)[0]));
+}
+
+__m256i
+foo3 (short* p)
+{
+  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
+}
+
+__m512i
+foo4 (short* p)
+{
+  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
+			   0, 0, 0, 0, 0, 0, 0,
+			   (unsigned short) ((*(__m16_u *)p)[0]));
+}
+
+__m512i
+foo5 (short* p)
+{
+  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
+			   (unsigned short) ((*(__m16_u *)p)[0]));
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] [i386] Optimize movzwl + vmovd/vmovq to vmovw.
  2022-05-09  2:03 [PATCH] [i386] Optimize movzwl + vmovd/vmovq to vmovw liuhongt
@ 2022-05-09  8:28 ` Uros Bizjak
  2022-05-11  3:39   ` Hongtao Liu
  0 siblings, 1 reply; 4+ messages in thread
From: Uros Bizjak @ 2022-05-09  8:28 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches, Hongtao Liu, H. J. Lu

On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Similarly optimize movl + vmovq to vmovd.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/104915
>         * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
>         pre_reload define_insn_and_split.
>         (*vec_setv2di_0_zero_extendhi_1): Ditto.
>         (*vec_set<mode>_0_zero_extendsi): Ditto.
>         (*vec_setv2di_0_zero_extendsi_1): Ditto.
>         (ssewvecmode): New mode attr.
>         (ssewvecmodelower): Ditto.
>         (ssepackmodelower): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr104915-vmovd.c: New test.
>         * gcc.target/i386/pr104915-vmovw.c: New test.

I wonder if these define_insn_and_splits can instead be implemented
via a combine splitter (which has the unfortunate limitation that the
output sequence has to be exactly two instructions, which is true in
your case). A combine splitter is preferred, since it splits immediately
and the resulting insns can be combined further during the combine
pass.

Uros.

> ---
>  gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
>  .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
>  .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
>  3 files changed, 164 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 7b791def542..2ad8a2b46b8 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
>     (V32HI "V32HI") (V64QI "V64QI")
>     (V32QI "V32QI") (V16QI "V16QI")])
>
> +;; Mapping of vector modes to an V*HImode of the same size
> +(define_mode_attr ssewvecmode
> +  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
> +   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
> +
> +(define_mode_attr ssewvecmodelower
> +  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
> +   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
> +
>  (define_mode_attr sseintvecmode2
>    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
>     (V8SF "OI") (V4SF "TI")
> @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
>     (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
>     (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
>
> +(define_mode_attr ssepackmodelower
> +  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
> +   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
> +   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
> +
>  ;; Mapping of the max integer size for xop rotate immediate constraint
>  (define_mode_attr sserotatemax
>    [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
> @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "HF")])
>
> +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
> +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> +       (vec_merge:VI48_AVX512F
> +        (vec_duplicate:VI48_AVX512F
> +         (zero_extend:<ssescalarmode>
> +           (match_operand:HI 1 "nonimmediate_operand")))
> +        (match_operand:VI48_AVX512F 2 "const0_operand")
> +        (const_int 1)))]
> +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
> +  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
> +                                             CONST0_RTX (<ssewvecmode>mode),
> +                                             operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
> +  DONE;
> +})
> +
> +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
> +  [(set (match_operand:V2DI 0 "register_operand")
> +       (vec_concat:V2DI
> +         (zero_extend:DI
> +           (match_operand:HI 1 "nonimmediate_operand"))
> +         (const_int 0)))]
> +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (V8HImode);
> +  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (V2DImode, dest, V8HImode));
> +  DONE;
> +})
> +
>  (define_insn "avx512fp16_movsh"
>    [(set (match_operand:V8HF 0 "register_operand" "=v")
>         (vec_merge:V8HF
> @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
>            ]
>            (symbol_ref "true")))])
>
> +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
> +  [(set (match_operand:VI8 0 "register_operand")
> +       (vec_merge:VI8
> +        (vec_duplicate:VI8
> +         (zero_extend:DI
> +           (match_operand:SI 1 "nonimmediate_operand")))
> +        (match_operand:VI8 2 "const0_operand")
> +        (const_int 1)))]
> +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (<ssepackmode>mode);
> +  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
> +                                             CONST0_RTX (<ssepackmode>mode),
> +                                             operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
> +  DONE;
> +})
> +
> +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
> +  [(set (match_operand:V2DI 0 "register_operand")
> +       (vec_concat:V2DI
> +         (zero_extend:DI
> +           (match_operand:SI 1 "nonimmediate_operand"))
> +         (const_int 0)))]
> +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (V4SImode);
> +  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (V2DImode, dest, V4SImode));
> +  DONE;
> +})
> +
>  (define_insn "sse4_1_insertps"
>    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
>         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> new file mode 100644
> index 00000000000..913ff8806f1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
> +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
> +
> +#include<immintrin.h>
> +
> +__m128i
> +foo1 (int* p)
> +{
> +  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
> +}
> +
> +__m256i
> +foo3 (int* p)
> +{
> +  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
> +}
> +
> +__m512i
> +foo5 (int* p)
> +{
> +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned int) ((*(__m32_u *)p)[0]));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> new file mode 100644
> index 00000000000..ac47865d17a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx512fp16 -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
> +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
> +
> +#include<immintrin.h>
> +__m128i
> +foo (short* p)
> +{
> +  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m128i
> +foo1 (short* p)
> +{
> +  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m256i
> +foo2 (short* p)
> +{
> +  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m256i
> +foo3 (short* p)
> +{
> +  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m512i
> +foo4 (short* p)
> +{
> +  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
> +                          0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m512i
> +foo5 (short* p)
> +{
> +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] [i386] Optimize movzwl + vmovd/vmovq to vmovw.
  2022-05-09  8:28 ` Uros Bizjak
@ 2022-05-11  3:39   ` Hongtao Liu
  2022-05-11  6:37     ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Hongtao Liu @ 2022-05-11  3:39 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: liuhongt, gcc-patches, H. J. Lu

On Mon, May 9, 2022 at 4:28 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Similarly optimize movl + vmovq to vmovd.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR target/104915
> >         * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
> >         pre_reload define_insn_and_split.
> >         (*vec_setv2di_0_zero_extendhi_1): Ditto.
> >         (*vec_set<mode>_0_zero_extendsi): Ditto.
> >         (*vec_setv2di_0_zero_extendsi_1): Ditto.
> >         (ssewvecmode): New mode attr.
> >         (ssewvecmodelower): Ditto.
> >         (ssepackmodelower): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr104915-vmovd.c: New test.
> >         * gcc.target/i386/pr104915-vmovw.c: New test.
>
> I wonder if these define_insn_and_splits can instead be implemented
> via combine splitter (which has the unfortunate limitation that the
> output sequence has to be exactly two instructions, which is true in
> your case). Combine splitter is preferred, since it splits immediately
> and the resulting insns can be combined further during the combine
> pass.

try_combine requires at least 3 insns before it will go into
combine_split_insns; here we have just 2 insns, so the combination fails.

-----cut from combine.cc--------
3545  /* If we were combining three insns and the result is a simple SET
 3546     with no ASM_OPERANDS that wasn't recognized, try to split it into two
 3547     insns.  There are two ways to do this.  It can be split using a
 3548     machine-specific method (like when you have an addition of a large
 3549     constant) or by combine in the function find_split_point.  */
 3550
 3551=>if (i1 && insn_code_number < 0 && GET_CODE (newpat) == SET
 3552      && asm_noperands (newpat) < 0)
-------cut end-------------

>
> Uros.
>
> > ---
> >  gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
> >  .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
> >  .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
> >  3 files changed, 164 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 7b791def542..2ad8a2b46b8 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
> >     (V32HI "V32HI") (V64QI "V64QI")
> >     (V32QI "V32QI") (V16QI "V16QI")])
> >
> > +;; Mapping of vector modes to an V*HImode of the same size
> > +(define_mode_attr ssewvecmode
> > +  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
> > +   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
> > +
> > +(define_mode_attr ssewvecmodelower
> > +  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
> > +   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
> > +
> >  (define_mode_attr sseintvecmode2
> >    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
> >     (V8SF "OI") (V4SF "TI")
> > @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
> >     (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
> >     (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
> >
> > +(define_mode_attr ssepackmodelower
> > +  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
> > +   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
> > +   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
> > +
> >  ;; Mapping of the max integer size for xop rotate immediate constraint
> >  (define_mode_attr sserotatemax
> >    [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
> > @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
> >     (set_attr "prefix" "evex")
> >     (set_attr "mode" "HF")])
> >
> > +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
> > +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> > +       (vec_merge:VI48_AVX512F
> > +        (vec_duplicate:VI48_AVX512F
> > +         (zero_extend:<ssescalarmode>
> > +           (match_operand:HI 1 "nonimmediate_operand")))
> > +        (match_operand:VI48_AVX512F 2 "const0_operand")
> > +        (const_int 1)))]
> > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
> > +  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
> > +                                             CONST0_RTX (<ssewvecmode>mode),
> > +                                             operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
> > +  DONE;
> > +})
> > +
> > +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
> > +  [(set (match_operand:V2DI 0 "register_operand")
> > +       (vec_concat:V2DI
> > +         (zero_extend:DI
> > +           (match_operand:HI 1 "nonimmediate_operand"))
> > +         (const_int 0)))]
> > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (V8HImode);
> > +  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (V2DImode, dest, V8HImode));
> > +  DONE;
> > +})
> > +
> >  (define_insn "avx512fp16_movsh"
> >    [(set (match_operand:V8HF 0 "register_operand" "=v")
> >         (vec_merge:V8HF
> > @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
> >            ]
> >            (symbol_ref "true")))])
> >
> > +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
> > +  [(set (match_operand:VI8 0 "register_operand")
> > +       (vec_merge:VI8
> > +        (vec_duplicate:VI8
> > +         (zero_extend:DI
> > +           (match_operand:SI 1 "nonimmediate_operand")))
> > +        (match_operand:VI8 2 "const0_operand")
> > +        (const_int 1)))]
> > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (<ssepackmode>mode);
> > +  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
> > +                                             CONST0_RTX (<ssepackmode>mode),
> > +                                             operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
> > +  DONE;
> > +})
> > +
> > +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
> > +  [(set (match_operand:V2DI 0 "register_operand")
> > +       (vec_concat:V2DI
> > +         (zero_extend:DI
> > +           (match_operand:SI 1 "nonimmediate_operand"))
> > +         (const_int 0)))]
> > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (V4SImode);
> > +  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (V2DImode, dest, V4SImode));
> > +  DONE;
> > +})
> > +
> >  (define_insn "sse4_1_insertps"
> >    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
> >         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > new file mode 100644
> > index 00000000000..913ff8806f1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-mavx512f -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
> > +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
> > +
> > +#include<immintrin.h>
> > +
> > +__m128i
> > +foo1 (int* p)
> > +{
> > +  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
> > +}
> > +
> > +__m256i
> > +foo3 (int* p)
> > +{
> > +  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
> > +}
> > +
> > +__m512i
> > +foo5 (int* p)
> > +{
> > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned int) ((*(__m32_u *)p)[0]));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > new file mode 100644
> > index 00000000000..ac47865d17a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > @@ -0,0 +1,45 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-mavx512fp16 -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
> > +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
> > +
> > +#include<immintrin.h>
> > +__m128i
> > +foo (short* p)
> > +{
> > +  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m128i
> > +foo1 (short* p)
> > +{
> > +  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m256i
> > +foo2 (short* p)
> > +{
> > +  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m256i
> > +foo3 (short* p)
> > +{
> > +  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m512i
> > +foo4 (short* p)
> > +{
> > +  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
> > +                          0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m512i
> > +foo5 (short* p)
> > +{
> > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > --
> > 2.18.1
> >



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] [i386] Optimize movzwl + vmovd/vmovq to vmovw.
  2022-05-11  3:39   ` Hongtao Liu
@ 2022-05-11  6:37     ` Uros Bizjak
  0 siblings, 0 replies; 4+ messages in thread
From: Uros Bizjak @ 2022-05-11  6:37 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches, H. J. Lu

On Wed, May 11, 2022 at 5:39 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 4:28 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > Similarly optimize movl + vmovq to vmovd.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/104915
> > >         * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
> > >         pre_reload define_insn_and_split.
> > >         (*vec_setv2di_0_zero_extendhi_1): Ditto.
> > >         (*vec_set<mode>_0_zero_extendsi): Ditto.
> > >         (*vec_setv2di_0_zero_extendsi_1): Ditto.
> > >         (ssewvecmode): New mode attr.
> > >         (ssewvecmodelower): Ditto.
> > >         (ssepackmodelower): Ditto.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr104915-vmovd.c: New test.
> > >         * gcc.target/i386/pr104915-vmovw.c: New test.

OK.

Thanks,
Uros.

> >
> > I wonder if these define_insn_and_splits can instead be implemented
> > via a combine splitter (which has the unfortunate limitation that the
> > output sequence has to be exactly two instructions, which is true in
> > your case). A combine splitter is preferred, since it splits immediately
> > and the resulting insns can be combined further during the combine
> > pass.
>
> try_combine only goes into combine_split_insns when at least 3 insns are
> being combined; here we have just 2 insns, so it fails.
>
> -----cut from combine.cc--------
>  3545  /* If we were combining three insns and the result is a simple SET
>  3546     with no ASM_OPERANDS that wasn't recognized, try to split it into two
>  3547     insns.  There are two ways to do this.  It can be split using a
>  3548     machine-specific method (like when you have an addition of a large
>  3549     constant) or by combine in the function find_split_point.  */
>  3550
>  3551=>if (i1 && insn_code_number < 0 && GET_CODE (newpat) == SET
>  3552      && asm_noperands (newpat) < 0)
> -------cut end-------------
>
> >
> > Uros.
> >
> > > ---
> > >  gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
> > >  .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
> > >  .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
> > >  3 files changed, 164 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > >
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 7b791def542..2ad8a2b46b8 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
> > >     (V32HI "V32HI") (V64QI "V64QI")
> > >     (V32QI "V32QI") (V16QI "V16QI")])
> > >
> > > +;; Mapping of vector modes to an V*HImode of the same size
> > > +(define_mode_attr ssewvecmode
> > > +  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
> > > +   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
> > > +
> > > +(define_mode_attr ssewvecmodelower
> > > +  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
> > > +   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
> > > +
> > >  (define_mode_attr sseintvecmode2
> > >    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
> > >     (V8SF "OI") (V4SF "TI")
> > > @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
> > >     (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
> > >     (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
> > >
> > > +(define_mode_attr ssepackmodelower
> > > +  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
> > > +   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
> > > +   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
> > > +
> > >  ;; Mapping of the max integer size for xop rotate immediate constraint
> > >  (define_mode_attr sserotatemax
> > >    [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
> > > @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
> > >     (set_attr "prefix" "evex")
> > >     (set_attr "mode" "HF")])
> > >
> > > +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
> > > +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> > > +       (vec_merge:VI48_AVX512F
> > > +        (vec_duplicate:VI48_AVX512F
> > > +         (zero_extend:<ssescalarmode>
> > > +           (match_operand:HI 1 "nonimmediate_operand")))
> > > +        (match_operand:VI48_AVX512F 2 "const0_operand")
> > > +        (const_int 1)))]
> > > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
> > > +  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
> > > +                                             CONST0_RTX (<ssewvecmode>mode),
> > > +                                             operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
> > > +  DONE;
> > > +})
> > > +
> > > +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
> > > +  [(set (match_operand:V2DI 0 "register_operand")
> > > +       (vec_concat:V2DI
> > > +         (zero_extend:DI
> > > +           (match_operand:HI 1 "nonimmediate_operand"))
> > > +         (const_int 0)))]
> > > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (V8HImode);
> > > +  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (V2DImode, dest, V8HImode));
> > > +  DONE;
> > > +})
> > > +
> > >  (define_insn "avx512fp16_movsh"
> > >    [(set (match_operand:V8HF 0 "register_operand" "=v")
> > >         (vec_merge:V8HF
> > > @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
> > >            ]
> > >            (symbol_ref "true")))])
> > >
> > > +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
> > > +  [(set (match_operand:VI8 0 "register_operand")
> > > +       (vec_merge:VI8
> > > +        (vec_duplicate:VI8
> > > +         (zero_extend:DI
> > > +           (match_operand:SI 1 "nonimmediate_operand")))
> > > +        (match_operand:VI8 2 "const0_operand")
> > > +        (const_int 1)))]
> > > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (<ssepackmode>mode);
> > > +  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
> > > +                                             CONST0_RTX (<ssepackmode>mode),
> > > +                                             operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
> > > +  DONE;
> > > +})
> > > +
> > > +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
> > > +  [(set (match_operand:V2DI 0 "register_operand")
> > > +       (vec_concat:V2DI
> > > +         (zero_extend:DI
> > > +           (match_operand:SI 1 "nonimmediate_operand"))
> > > +         (const_int 0)))]
> > > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (V4SImode);
> > > +  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (V2DImode, dest, V4SImode));
> > > +  DONE;
> > > +})
> > > +
> > >  (define_insn "sse4_1_insertps"
> > >    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
> > >         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > > new file mode 100644
> > > index 00000000000..913ff8806f1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-mavx512f -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
> > > +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
> > > +
> > > +#include<immintrin.h>
> > > +
> > > +__m128i
> > > +foo1 (int* p)
> > > +{
> > > +  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo3 (int* p)
> > > +{
> > > +  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo5 (int* p)
> > > +{
> > > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > > new file mode 100644
> > > index 00000000000..ac47865d17a
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > > @@ -0,0 +1,45 @@
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-mavx512fp16 -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
> > > +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
> > > +
> > > +#include<immintrin.h>
> > > +__m128i
> > > +foo (short* p)
> > > +{
> > > +  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m128i
> > > +foo1 (short* p)
> > > +{
> > > +  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo2 (short* p)
> > > +{
> > > +  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo3 (short* p)
> > > +{
> > > +  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo4 (short* p)
> > > +{
> > > +  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
> > > +                          0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo5 (short* p)
> > > +{
> > > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-05-11  6:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-09  2:03 [PATCH] [i386] Optimize movzwl + vmovd/vmovq to vmovw liuhongt
2022-05-09  8:28 ` Uros Bizjak
2022-05-11  3:39   ` Hongtao Liu
2022-05-11  6:37     ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).