public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [i386] Support avx512 vector shift with vector [PR98434]
@ 2021-05-24  9:49 Hongtao Liu
  2021-06-23  7:23 ` Hongtao Liu
  0 siblings, 1 reply; 7+ messages in thread
From: Hongtao Liu @ 2021-05-24  9:49 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1947 bytes --]

Hi:
  This patch is about to add expanders for vashl<VI12_AVX512BW>,
vlshr<VI12_AVX512BW>,
vashr<VI1_AVX512BW> and vashr<v32hi,v16hi,v4di,v8di>.

Besides there's some assumption in expand_mult_const that mul and
add must be available at the same time, but for i386, addv8qi is
restricted under TARGET_64BIT, but mulv8qi not, that could cause ICE.
So restrict mulv8qi and shiftv8qi under TARGET_64BIT.
  Bootstrap and regtested on x86_64-linux-gnu{-m32,} and
x86_64-linux-gnu{-m32\ -march=cascadelake,-march=cascadelake}

gcc/ChangeLog:

        PR target/98434
        * config/i386/i386-expand.c (ix86_expand_vec_interleave):
        Adjust comments for ix86_expand_vecop_qihi2.
        (ix86_expand_vecmul_qihi): Renamed to ..
        (ix86_expand_vecop_qihi2): Adjust function prototype to
        support shift operation, add static to definition.
        (ix86_expand_vec_shift_qihi_constant): Add static to definition.
        (ix86_expand_vecop_qihi): Call ix86_expand_vecop_qihi2 and
        ix86_expand_vec_shift_qihi_constant.
        * config/i386/i386-protos.h (ix86_expand_vecmul_qihi): Deleted.
        (ix86_expand_vec_shift_qihi_constant): Deleted.
        * config/i386/sse.md (mulv8qi3): Call ix86_expand_vecop_qihi
        directly, add condition TARGET_64BIT.
        (mul<mode>3): Ditto.
        (<insn><mode>3): Ditto.
        (vlshr<mode>3): Extend to support avx512 vlshr.
        (v<insn><mode>3): New expander for
        vashr/vlshr/vashl.
        (v<insn>v8qi3): Ditto.
        (vashrv8hi3<mask_name>): Renamed to ..
        (vashr<mode>3): And extend to support V16QImode for avx512.
        (vashrv16qi3): Deleted.
        (vashrv2di3<mask_name>): Extend expander to support avx512
        instruction.

gcc/testsuite/ChangeLog:

        PR target/98434
        * gcc.target/i386/pr98434-1.c: New test.
        * gcc.target/i386/pr98434-2.c: New test.
        * gcc.target/i386/avx512vl-pr95488-1.c: Adjust testcase.

[-- Attachment #2: 0001-i386-Add-missing-expander-for-vector-vector-shift-PR.patch --]
[-- Type: text/x-patch, Size: 22113 bytes --]

From 4c423a49dd1af5b0808a89c356f94454dde8109f Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Tue, 26 Jan 2021 16:29:32 +0800
Subject: [PATCH] i386: Add missing expander for vector/vector shift [PR98434]

Add expanders for vashl<VI12_AVX512BW>, vlshr<VI12_AVX512BW>,
vashr<VI1_AVX512BW> and vashr<v32hi,v16hi,v4di,v8di>.

Besides there's some assumption in expand_mult_const that mul and
add must be available at the same time, but for i386, addv8qi is
restricted under TARGET_64BIT, but mulv8qi not, that could cause ICE.
So restrict mulv8qi and shiftv8qi under TARGET_64BIT.

gcc/ChangeLog:

	PR target/98434
	* config/i386/i386-expand.c (ix86_expand_vec_interleave):
	Adjust comments for ix86_expand_vecop_qihi2.
	(ix86_expand_vecmul_qihi): Renamed to ..
	(ix86_expand_vecop_qihi2): Adjust function prototype to
	support shift operation, add static to definition.
	(ix86_expand_vec_shift_qihi_constant): Add static to definition.
	(ix86_expand_vecop_qihi): Call ix86_expand_vecop_qihi2 and
	ix86_expand_vec_shift_qihi_constant.
	* config/i386/i386-protos.h (ix86_expand_vecmul_qihi): Deleted.
	(ix86_expand_vec_shift_qihi_constant): Deleted.
	* config/i386/sse.md (mulv8qi3): Call ix86_expand_vecop_qihi
	directly, add condition TARGET_64BIT.
	(mul<mode>3): Ditto.
	(<insn><mode>3): Ditto.
	(vlshr<mode>3): Extend to support avx512 vlshr.
	(v<insn><mode>3): New expander for
	vashr/vlshr/vashl.
	(v<insn>v8qi3): Ditto.
	(vashrv8hi3<mask_name>): Renamed to ..
	(vashr<mode>3): And extend to support V16QImode for avx512.
	(vashrv16qi3): Deleted.
	(vashrv2di3<mask_name>): Extend expander to support avx512
	instruction.

gcc/testsuite/ChangeLog:

	PR target/98434
	* gcc.target/i386/pr98434-1.c: New test.
	* gcc.target/i386/pr98434-2.c: New test.
	* gcc.target/i386/avx512vl-pr95488-1.c: Adjust testcase.
---
 gcc/config/i386/i386-expand.c                 |  73 +++++++---
 gcc/config/i386/i386-protos.h                 |   3 -
 gcc/config/i386/sse.md                        | 111 ++++++++++-----
 .../gcc.target/i386/avx512vl-pr95488-1.c      |   6 +-
 gcc/testsuite/gcc.target/i386/pr98434-1.c     |  64 +++++++++
 gcc/testsuite/gcc.target/i386/pr98434-2.c     | 131 ++++++++++++++++++
 6 files changed, 331 insertions(+), 57 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98434-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98434-2.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 9f3d41955a2..fba6c666769 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -20377,8 +20377,9 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   gcc_assert (ok);
 }
 
-/* Optimize vector MUL generation for V8QI, V16QI and V32QI
-   under TARGET_AVX512BW. i.e. for v16qi a * b, it has
+/* This function is similar as ix86_expand_vecop_qihi,
+   but optimized under AVX512BW by using vpmovwb.
+   For example, optimize vector MUL generation like
 
    vpmovzxbw ymm2, xmm0
    vpmovzxbw ymm3, xmm1
@@ -20388,13 +20389,14 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
    it would take less instructions than ix86_expand_vecop_qihi.
    Return true if success.  */
 
-bool
-ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
 {
   machine_mode himode, qimode = GET_MODE (dest);
   rtx hop1, hop2, hdest;
   rtx (*gen_extend)(rtx, rtx);
   rtx (*gen_truncate)(rtx, rtx);
+  bool uns_p = (code == ASHIFTRT) ? false : true;
 
   /* There's no V64HImode multiplication instruction.  */
   if (qimode == E_V64QImode)
@@ -20415,17 +20417,17 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
     {
     case E_V8QImode:
       himode = V8HImode;
-      gen_extend = gen_zero_extendv8qiv8hi2;
+      gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
       gen_truncate = gen_truncv8hiv8qi2;
       break;
     case E_V16QImode:
       himode = V16HImode;
-      gen_extend = gen_zero_extendv16qiv16hi2;
+      gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
       gen_truncate = gen_truncv16hiv16qi2;
       break;
     case E_V32QImode:
       himode = V32HImode;
-      gen_extend = gen_zero_extendv32qiv32hi2;
+      gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
       gen_truncate = gen_truncv32hiv32qi2;
       break;
     default:
@@ -20437,7 +20439,7 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
   hdest = gen_reg_rtx (himode);
   emit_insn (gen_extend (hop1, op1));
   emit_insn (gen_extend (hop2, op2));
-  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
 						      hop1, hop2)));
   emit_insn (gen_truncate (dest, hdest));
   return true;
@@ -20445,8 +20447,9 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
 
 /* Expand a vector operation shift by constant for a V*QImode in terms of the
    same operation on V*HImode. Return true if success. */
-bool
-ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+static bool
+ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
+				     rtx dest, rtx op1, rtx op2)
 {
   machine_mode qimode, himode;
   HOST_WIDE_INT and_constant, xor_constant;
@@ -20558,6 +20561,16 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   bool uns_p = false;
   int i;
 
+  if (CONST_INT_P (op2)
+      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
+    return;
+
+  if (TARGET_AVX512BW
+      && VECTOR_MODE_P (GET_MODE (op2))
+      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+    return;
+
   switch (qimode)
     {
     case E_V16QImode:
@@ -20579,7 +20592,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       gcc_unreachable ();
     }
 
-  op2_l = op2_h = op2;
   switch (code)
     {
     case MULT:
@@ -20608,17 +20620,46 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       op1_h = gen_reg_rtx (himode);
       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+      /* vashr/vlshr/vashl  */
+      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+	{
+	  rtx tmp = force_reg (qimode, op2);
+	  op2_l = gen_reg_rtx (himode);
+	  op2_h = gen_reg_rtx (himode);
+	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
+	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
+	}
+      else
+	op2_l = op2_h = op2;
+
       full_interleave = true;
       break;
     default:
       gcc_unreachable ();
     }
 
-  /* Perform the operation.  */
-  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
-			       1, OPTAB_DIRECT);
-  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
-			       1, OPTAB_DIRECT);
+  /* Perform vashr/vlshr/vashl.  */
+  if (code != MULT
+      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+    {
+      res_l = gen_reg_rtx (himode);
+      res_h = gen_reg_rtx (himode);
+      emit_insn (gen_rtx_SET (res_l,
+			      simplify_gen_binary (code, himode,
+						   op1_l, op2_l)));
+      emit_insn (gen_rtx_SET (res_h,
+			      simplify_gen_binary (code, himode,
+						   op1_h, op2_h)));
+    }
+  /* Performance mult/ashr/lshr/ashl.  */
+  else
+    {
+      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
+				   1, OPTAB_DIRECT);
+      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
+				   1, OPTAB_DIRECT);
+    }
+
   gcc_assert (res_l && res_h);
 
   /* Merge the data back into the right place.  */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7782cf1163f..81c5b998af2 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -207,10 +207,7 @@ extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_round_sse4 (rtx, rtx);
 
-extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
 extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
-extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx);
-
 extern rtx ix86_split_stack_guard (void);
 
 extern void ix86_move_vector_high_sse_to_mmx (rtx);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a4503ddcb73..851a2c11c17 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -400,6 +400,10 @@ (define_mode_iterator VI1_AVX512
 (define_mode_iterator VI1_AVX512F
   [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI])
 
+(define_mode_iterator VI12_256_512
+  [V64QI (V32QI "TARGET_AVX512VL")
+   V32HI (V16HI "TARGET_AVX512VL")])
+
 (define_mode_iterator VI2_AVX2
   [(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI])
 
@@ -11775,9 +11779,9 @@ (define_expand "mulv8qi3"
   [(set (match_operand:V8QI 0 "register_operand")
 	(mult:V8QI (match_operand:V8QI 1 "register_operand")
 		   (match_operand:V8QI 2 "register_operand")))]
-  "TARGET_AVX512VL && TARGET_AVX512BW"
+  "TARGET_AVX512VL && TARGET_AVX512BW && TARGET_64BIT"
 {
-  gcc_assert (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]));
+  ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
   DONE;
 })
 
@@ -11787,8 +11791,6 @@ (define_expand "mul<mode>3"
 		       (match_operand:VI1_AVX512 2 "register_operand")))]
   "TARGET_SSE2"
 {
-  if (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]))
-    DONE;
   ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
   DONE;
 })
@@ -20216,12 +20218,20 @@ (define_expand "vlshr<mode>3"
 	(lshiftrt:VI12_128
 	  (match_operand:VI12_128 1 "register_operand")
 	  (match_operand:VI12_128 2 "nonimmediate_operand")))]
-  "TARGET_XOP"
+  "TARGET_XOP || (TARGET_AVX512BW && TARGET_AVX512VL)"
 {
-  rtx neg = gen_reg_rtx (<MODE>mode);
-  emit_insn (gen_neg<mode>2 (neg, operands[2]));
-  emit_insn (gen_xop_shl<mode>3 (operands[0], operands[1], neg));
-  DONE;
+  if (TARGET_XOP)
+    {
+      rtx neg = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_neg<mode>2 (neg, operands[2]));
+      emit_insn (gen_xop_shl<mode>3 (operands[0], operands[1], neg));
+      DONE;
+    }
+    else if (<MODE>mode == V16QImode)
+    {
+      ix86_expand_vecop_qihi (LSHIFTRT, operands[0], operands[1], operands[2]);
+      DONE;
+    }
 })
 
 (define_expand "vlshr<mode>3"
@@ -20240,6 +20250,31 @@ (define_expand "vlshr<mode>3"
     }
 })
 
+(define_expand "v<insn><mode>3"
+  [(set (match_operand:VI12_256_512 0 "register_operand")
+	(any_shift:VI12_256_512
+	  (match_operand:VI12_256_512 1 "register_operand")
+	  (match_operand:VI12_256_512 2 "nonimmediate_operand")))]
+  "TARGET_AVX512BW"
+{
+  if (<MODE>mode == V32QImode || <MODE>mode == V64QImode)
+    {
+      ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
+      DONE;
+    }
+})
+
+(define_expand "v<insn>v8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+	(any_shift:V8QI
+	  (match_operand:V8QI 1 "register_operand")
+	  (match_operand:V8QI 2 "nonimmediate_operand")))]
+  "TARGET_AVX512BW && TARGET_AVX512VL && TARGET_64BIT"
+{
+  ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
+  DONE;
+})
+
 (define_expand "vlshr<mode>3"
   [(set (match_operand:VI48_512 0 "register_operand")
 	(lshiftrt:VI48_512
@@ -20254,33 +20289,32 @@ (define_expand "vlshr<mode>3"
 	  (match_operand:VI48_256 2 "nonimmediate_operand")))]
   "TARGET_AVX2")
 
-(define_expand "vashrv8hi3<mask_name>"
-  [(set (match_operand:V8HI 0 "register_operand")
-	(ashiftrt:V8HI
-	  (match_operand:V8HI 1 "register_operand")
-	  (match_operand:V8HI 2 "nonimmediate_operand")))]
+(define_expand "vashr<mode>3"
+  [(set (match_operand:VI8_256_512 0 "register_operand")
+	(ashiftrt:VI8_256_512
+	  (match_operand:VI8_256_512 1 "register_operand")
+	  (match_operand:VI8_256_512 2 "nonimmediate_operand")))]
+  "TARGET_AVX512F")
+
+(define_expand "vashr<mode>3"
+  [(set (match_operand:VI12_128 0 "register_operand")
+	(ashiftrt:VI12_128
+	  (match_operand:VI12_128 1 "register_operand")
+	  (match_operand:VI12_128 2 "nonimmediate_operand")))]
   "TARGET_XOP || (TARGET_AVX512BW && TARGET_AVX512VL)"
 {
   if (TARGET_XOP)
     {
-      rtx neg = gen_reg_rtx (V8HImode);
-      emit_insn (gen_negv8hi2 (neg, operands[2]));
-      emit_insn (gen_xop_shav8hi3 (operands[0], operands[1], neg));
+      rtx neg = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_neg<mode>2 (neg, operands[2]));
+      emit_insn (gen_xop_sha<mode>3 (operands[0], operands[1], neg));
+      DONE;
+    }
+  else if(<MODE>mode == V16QImode)
+    {
+      ix86_expand_vecop_qihi (ASHIFTRT, operands[0],operands[1], operands[2]);
       DONE;
     }
-})
-
-(define_expand "vashrv16qi3"
-  [(set (match_operand:V16QI 0 "register_operand")
-	(ashiftrt:V16QI
-	  (match_operand:V16QI 1 "register_operand")
-	  (match_operand:V16QI 2 "nonimmediate_operand")))]
-  "TARGET_XOP"
-{
-   rtx neg = gen_reg_rtx (V16QImode);
-   emit_insn (gen_negv16qi2 (neg, operands[2]));
-   emit_insn (gen_xop_shav16qi3 (operands[0], operands[1], neg));
-   DONE;
 })
 
 (define_expand "vashrv2di3<mask_name>"
@@ -20331,10 +20365,18 @@ (define_expand "vashl<mode>3"
 	(ashift:VI12_128
 	  (match_operand:VI12_128 1 "register_operand")
 	  (match_operand:VI12_128 2 "nonimmediate_operand")))]
-  "TARGET_XOP"
+  "TARGET_XOP || (TARGET_AVX512BW && TARGET_AVX512VL)"
 {
-  emit_insn (gen_xop_sha<mode>3 (operands[0], operands[1], operands[2]));
-  DONE;
+  if (TARGET_XOP)
+  {
+    emit_insn (gen_xop_sha<mode>3 (operands[0], operands[1], operands[2]));
+    DONE;
+  }
+  else if (<MODE>mode == V16QImode)
+  {
+    ix86_expand_vecop_qihi (ASHIFT, operands[0], operands[1], operands[2]);
+    DONE;
+  }
 })
 
 (define_expand "vashl<mode>3"
@@ -20438,8 +20480,7 @@ (define_expand "<insn><mode>3"
       gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
       emit_insn (gen (operands[0], operands[1], tmp));
     }
-  else if (!ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0],
-						operands[1], operands[2]))
+  else
     ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
 })
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
index b3674fbd04f..dc684a167c8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
@@ -1,10 +1,10 @@
 /* PR target/pr95488  */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512bw -mavx512vl" }  */
-/* { dg-final { scan-assembler-times "vpmovzxbw" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
-/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 } } */
-/* { dg-final { scan-assembler-times "vpmovwb" 4 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 4 { target { ! ia32 } } } } */
 
 typedef char v16qi __attribute__ ((vector_size (16)));
 typedef char v8qi __attribute__ ((vector_size (8)));
diff --git a/gcc/testsuite/gcc.target/i386/pr98434-1.c b/gcc/testsuite/gcc.target/i386/pr98434-1.c
new file mode 100644
index 00000000000..773d3b8ad60
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98434-1.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {vpsravw[\t ]*%xmm} 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times {vpsrlvw[\t ]*%ymm} 2 } } */
+/* { dg-final { scan-assembler-times {vpsllvw[\t ]*%zmm} 2 } } */
+/* { dg-final { scan-assembler-times {vpsllvq[\t ]*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {vpsravq[\t ]*%ymm} 1 } } */
+/* { dg-final { scan-assembler-times {vpsrlvq[\t ]*%zmm} 1 } } */
+
+int n;
+
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef long long v2di __attribute__((vector_size (16)));
+typedef long long v4di __attribute__((vector_size (32)));
+typedef long long v8di __attribute__((vector_size (64)));
+typedef unsigned char v8uqi __attribute__((vector_size (8)));
+typedef unsigned char v16uqi __attribute__((vector_size (16)));
+typedef unsigned char v32uqi __attribute__((vector_size (32)));
+typedef unsigned short v8uhi __attribute__((vector_size (16)));
+typedef unsigned short v16uhi __attribute__((vector_size (32)));
+typedef unsigned short v32uhi __attribute__((vector_size (64)));
+typedef unsigned long long v2udi __attribute__((vector_size (16)));
+typedef unsigned long long v4udi __attribute__((vector_size (32)));
+typedef unsigned long long v8udi __attribute__((vector_size (64)));
+
+#define FOO(TYPE, OP, NAME)		\
+  __attribute__((noipa)) TYPE		\
+  foo_##TYPE##_##NAME (TYPE a, TYPE b)	\
+  {					\
+    return a OP b;			\
+  }					\
+
+FOO (v8qi, <<, vashl);
+FOO (v8qi, >>, vashr);
+FOO (v8uqi, >>, vlshr);
+FOO (v16qi, <<, vashl);
+FOO (v16qi, >>, vashr);
+FOO (v16uqi, >>, vlshr);
+FOO (v32qi, <<, vashl);
+FOO (v32qi, >>, vashr);
+FOO (v32uqi, >>, vlshr);
+FOO (v8hi, <<, vashl);
+FOO (v8hi, >>, vashr);
+FOO (v8uhi, >>, vlshr);
+FOO (v16hi, <<, vashl);
+FOO (v16hi, >>, vashr);
+FOO (v16uhi, >>, vlshr);
+FOO (v32hi, <<, vashl);
+FOO (v32hi, >>, vashr);
+FOO (v32uhi, >>, vlshr);
+FOO (v2di, <<, vashl);
+FOO (v2di, >>, vashr);
+FOO (v2udi, >>, vlshr);
+FOO (v4di, <<, vashl);
+FOO (v4di, >>, vashr);
+FOO (v4udi, >>, vlshr);
+FOO (v8di, <<, vashl);
+FOO (v8di, >>, vashr);
+FOO (v8udi, >>, vlshr);
diff --git a/gcc/testsuite/gcc.target/i386/pr98434-2.c b/gcc/testsuite/gcc.target/i386/pr98434-2.c
new file mode 100644
index 00000000000..f3ad5cd2759
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98434-2.c
@@ -0,0 +1,131 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mprefer-vector-width=512 -mavx512vl -mavx512bw" } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+
+#include "pr98434-1.c"
+void test (void);
+#define DO_TEST test
+#define AVX512VL
+#define AVX512BW
+#include "avx512-check.h"
+
+
+typedef char int8;
+typedef unsigned char uint8;
+typedef short int16;
+typedef unsigned short uint16;
+typedef long long int64;
+typedef unsigned long long uint64;
+
+#define F_EMULATE(TYPE, SIZE, OP, NAME)		\
+  __attribute__((noipa, optimize("-fno-tree-vectorize"))) void	\
+  emulate_##SIZE##_##TYPE##_##NAME (TYPE *a,	\
+				    TYPE *b,	\
+				    TYPE *c)	\
+  {						\
+    int i;					\
+    for (i = 0; i < SIZE; i++)			\
+      {						\
+	a[i] = b[i] OP c[i];			\
+      }						\
+  }
+
+F_EMULATE (int8,  8, <<, vashl);
+F_EMULATE (int8,  8,  >>, vashr);
+F_EMULATE (uint8,  8, >>, vlshr);
+F_EMULATE (int8,  16, <<, vashl);
+F_EMULATE (int8,  16, >>, vashr);
+F_EMULATE (uint8,  16, >>, vlshr);
+F_EMULATE (int8,  32, <<, vashl);
+F_EMULATE (int8,  32, >>, vashr);
+F_EMULATE (uint8,  32, >>, vlshr);
+F_EMULATE (int16,  8, <<, vashl);
+F_EMULATE (int16,  8, >>, vashr);
+F_EMULATE (uint16, 8, >>, vlshr);
+F_EMULATE (int16,  16, <<, vashl);
+F_EMULATE (int16,  16, >>, vashr);
+F_EMULATE (uint16, 16, >>, vlshr);
+F_EMULATE (int16,  32, <<, vashl);
+F_EMULATE (int16,  32, >>, vashr);
+F_EMULATE (uint16, 32, >>, vlshr);
+F_EMULATE (int64,  2, <<, vashl);
+F_EMULATE (int64,  2, >>, vashr);
+F_EMULATE (uint64,  2, >>, vlshr);
+F_EMULATE (int64,  4, <<, vashl);
+F_EMULATE (int64,  4, >>, vashr);
+F_EMULATE (uint64,  4, >>, vlshr);
+F_EMULATE (int64,  8, <<, vashl);
+F_EMULATE (int64,  8, >>, vashr);
+F_EMULATE (uint64,  8, >>, vlshr);
+
+#define VSHIFT(VTYPE, NAME, src1, src2)	\
+  foo_##VTYPE##_##NAME (src1, src2)
+
+#define EMULATE(SIZE, TYPE, NAME, dst, src1, src2)	\
+  emulate_##SIZE##_##TYPE##_##NAME (dst, src1, src2)
+
+#define F_TEST_SHIFT(VTYPE, VTYPEU, TYPE, TYPEU, SIZE)    \
+  __attribute__((noipa, optimize("-fno-tree-vectorize"))) void \
+  test_##VTYPE ()\
+  {\
+    TYPE src1[SIZE], src2[SIZE], ref[SIZE];		\
+    TYPEU usrc1[SIZE], usrc2[SIZE], uref[SIZE];			\
+    VTYPE dst;	     \
+    VTYPEU udst;     \
+    int i;\
+    for (i = 0; i < SIZE; i++)\
+    {\
+      dst[i] = ref[i] = -i; \
+      src1[i] = -(i + SIZE);			\
+      src2[i] = i % 8;			\
+      udst[i] = uref[i] = i;			\
+      usrc1[i] = (i + SIZE);			\
+      usrc2[i] = (i % 8);			\
+    }\
+    EMULATE(SIZE, TYPE, vashl, ref, src1, src2);	\
+    dst = VSHIFT(VTYPE, vashl, *((VTYPE* )&src1[0]), *((VTYPE*) &src2[0])); \
+    for (i = 0; i < SIZE; i++)\
+    {\
+      if(dst[i] != ref[i]) __builtin_abort();\
+    }\
+    EMULATE(SIZE, TYPE, vashr, ref, src1, src2);	\
+    dst = VSHIFT(VTYPE, vashr, *((VTYPE* )&src1[0]), *((VTYPE*) &src2[0])); \
+    for (i = 0; i < SIZE; i++)\
+    {\
+      if(dst[i] != ref[i]) __builtin_abort();\
+    }\
+    EMULATE(SIZE, TYPEU, vlshr, uref, usrc1, usrc2);	\
+    udst = VSHIFT(VTYPEU, vlshr, *((VTYPEU* )&usrc1[0]), *((VTYPEU*) &usrc2[0])); \
+    for (i = 0; i < SIZE; i++)\
+    {\
+      if(udst[i] != uref[i]) __builtin_abort();\
+    }\
+  }
+
+F_TEST_SHIFT (v8qi, v8uqi, int8, uint8, 8);
+F_TEST_SHIFT (v16qi, v16uqi, int8, uint8, 16);
+F_TEST_SHIFT (v32qi, v32uqi, int8, uint8, 32);
+F_TEST_SHIFT (v8hi, v8uhi, int16, uint16, 8);
+F_TEST_SHIFT (v16hi, v16uhi, int16, uint16, 16);
+F_TEST_SHIFT (v32hi, v32uhi, int16, uint16, 32);
+F_TEST_SHIFT (v2di, v2udi, int64, uint64, 2);
+F_TEST_SHIFT (v4di, v4udi, int64, uint64, 4);
+F_TEST_SHIFT (v8di, v8udi, int64, uint64, 8);
+
+
+void
+test (void)
+{
+  test_v8qi ();
+  test_v16qi ();
+  test_v32qi ();
+  test_v8hi ();
+  test_v16hi ();
+  test_v32hi ();
+  test_v2di ();
+  test_v4di ();
+  test_v8di ();
+}
+
+
-- 
2.18.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-06-24  5:00 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-24  9:49 [PATCH] [i386] Support avx512 vector shift with vector [PR98434] Hongtao Liu
2021-06-23  7:23 ` Hongtao Liu
2021-06-23  7:25   ` Hongtao Liu
2021-06-23  7:53   ` Richard Biener
2021-06-23  8:01     ` Jakub Jelinek
2021-06-23  9:07       ` Richard Biener
2021-06-24  5:05         ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).