* [PATCH] AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod*
@ 2011-10-14 14:57 Jakub Jelinek
  2011-10-14 16:32 ` Richard Henderson
From: Jakub Jelinek @ 2011-10-14 14:57 UTC (permalink / raw)
  To: Richard Henderson, Uros Bizjak; +Cc: gcc-patches

Hi!

This patch improves the code generated for SSE4.1 and even more so
for AVX2 on the attached testcases.  SSE4.1 has pmuldq (where SSE2
only had pmuludq), so it can handle signed widening multiplies fine
too.
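
For illustration, these are the kinds of loops the new expanders cover
(a minimal sketch mirroring f9 () and f13 () from the sse2-mul-1.c
testcase below; the function names here are invented).  With -msse4.1
or -mavx2 the first maps to the vec_widen_smult_{hi,lo}_* patterns,
the second to the <s>dot_prod* patterns:

#define N 512
static int c2[N], c3[N];
static long long e1[N];

/* Widening multiply: int -> long long.  The signed case needs
   pmuldq (SSE4.1); SSE2 alone only has the unsigned pmuludq.  */
void
widen_mul (void)
{
  int i;
  for (i = 0; i < N; ++i)
    e1[i] = (long long) c2[i] * (long long) c3[i];
}

/* Signed widening dot product, int -> long long accumulator.  */
long long
sdot (void)
{
  int i;
  long long r = 0;
  for (i = 0; i < N; ++i)
    r += (long long) c2[i] * (long long) c3[i];
  return r;
}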

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2011-10-14  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (vec_widen_smult_hi_v8hi,
	vec_widen_smult_lo_v8hi, vec_widen_umult_hi_v8hi,
	vec_widen_umult_lo_v8hi): Macroize using VI2_AVX2
	mode iterator and any_extend code iterator.
	(vec_widen_<s>mult_hi_v8si, vec_widen_<s>mult_lo_v8si): New
	expanders.
	(vec_widen_smult_hi_v4si, vec_widen_smult_lo_v4si): Enable
	also for TARGET_SSE4_1 using pmuldq insn.
	(sdot_prodv8hi): Macroize using VI2_AVX2 iterator.
	(sse2_sse4_1): New code attr.
	(udot_prodv4si): Macroize using any_extend code iterator.
	(<s>dot_prodv8si): New expander.

	* gcc.target/i386/sse2-mul-1.c: New test.
	* gcc.target/i386/sse4_1-mul-1.c: New test.
	* gcc.target/i386/avx-mul-1.c: New test.
	* gcc.target/i386/xop-mul-1.c: New test.
	* gcc.target/i386/avx2-mul-1.c: New test.

--- gcc/config/i386/sse.md.jj	2011-10-14 08:38:47.000000000 +0200
+++ gcc/config/i386/sse.md	2011-10-14 13:05:58.000000000 +0200
@@ -5507,83 +5507,100 @@ (define_insn_and_split "mul<mode>3"
   DONE;
 })
 
-(define_expand "vec_widen_smult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_hi_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_smult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_lo_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_umult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_hi_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  rtx t1, t2, t3, t4, rperm[8], vperm;
+  int i;
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V8SImode);
+  t2 = gen_reg_rtx (V8SImode);
+  t3 = gen_reg_rtx (V8SImode);
+  t4 = gen_reg_rtx (V8SImode);
+  /* This would be 2 insns shorter if
+     rperm[i] = GEN_INT (((~i & 1) << 2) + i / 2);
+     had been used instead (neither vpsrlq insn would be needed),
+     but vec_widen_*mult_hi_* is usually used together with
+     vec_widen_*mult_lo_* and by writing it this way the load
+     of the constant and the two vpermd instructions (cross-lane)
+     can be CSEd together.  */
+  for (i = 0; i < 8; ++i)
+    rperm[i] = GEN_INT (((i & 1) << 2) + i / 2);
+  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+  vperm = force_reg (V8SImode, vperm);
+  emit_insn (gen_avx2_permvarv8si (t1, vperm, operands[1]));
+  emit_insn (gen_avx2_permvarv8si (t2, vperm, operands[2]));
+  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t3),
+			    gen_lowpart (V4DImode, t1), GEN_INT (32)));
+  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t4),
+			    gen_lowpart (V4DImode, t2), GEN_INT (32)));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
   DONE;
 })
 
-(define_expand "vec_widen_umult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_lo_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
+  rtx t1, t2, rperm[8], vperm;
+  int i;
 
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
-
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V8SImode);
+  t2 = gen_reg_rtx (V8SImode);
+  for (i = 0; i < 8; ++i)
+    rperm[i] = GEN_INT (((i & 1) << 2) + i / 2);
+  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+  vperm = force_reg (V8SImode, vperm);
+  emit_insn (gen_avx2_permvarv8si (t1, vperm, operands[1]));
+  emit_insn (gen_avx2_permvarv8si (t2, vperm, operands[2]));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5591,24 +5608,28 @@ (define_expand "vec_widen_smult_hi_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5616,24 +5637,28 @@ (define_expand "vec_widen_smult_lo_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5675,30 +5700,35 @@ (define_expand "vec_widen_umult_lo_v4si"
   DONE;
 })
 
-(define_expand "sdot_prodv8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")
-   (match_operand:V4SI 3 "register_operand" "")]
+(define_expand "sdot_prod<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (match_operand:VI2_AVX2 1 "register_operand" "")
+   (match_operand:VI2_AVX2 2 "register_operand" "")
+   (match_operand:<sseunpackmode> 3 "register_operand" "")]
   "TARGET_SSE2"
 {
-  rtx t = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2]));
-  emit_insn (gen_addv4si3 (operands[0], operands[3], t));
+  rtx t = gen_reg_rtx (<sseunpackmode>mode);
+  emit_insn (gen_<sse2_avx2>_pmaddwd (t, operands[1], operands[2]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_PLUS (<sseunpackmode>mode,
+					operands[3], t)));
   DONE;
 })
 
-(define_expand "udot_prodv4si"
+(define_code_attr sse2_sse4_1
+   [(zero_extend "sse2") (sign_extend "sse4_1")])
+
+(define_expand "<s>dot_prodv4si"
   [(match_operand:V2DI 0 "register_operand" "")
-   (match_operand:V4SI 1 "register_operand" "")
+   (any_extend:V2DI (match_operand:V4SI 1 "register_operand" ""))
    (match_operand:V4SI 2 "register_operand" "")
    (match_operand:V2DI 3 "register_operand" "")]
-  "TARGET_SSE2"
+  "<CODE> == ZERO_EXTEND ? TARGET_SSE2 : TARGET_SSE4_1"
 {
   rtx t1, t2, t3, t4;
 
   t1 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t1, operands[1], operands[2]));
   emit_insn (gen_addv2di3 (t1, t1, operands[3]));
 
   t2 = gen_reg_rtx (V4SImode);
@@ -5711,12 +5741,41 @@ (define_expand "udot_prodv4si"
 				 GEN_INT (32)));
 
   t4 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t4, t2, t3));
 
   emit_insn (gen_addv2di3 (operands[0], t1, t4));
   DONE;
 })
 
+(define_expand "<s>dot_prodv8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "register_operand" ""))
+   (match_operand:V8SI 2 "register_operand" "")
+   (match_operand:V4DI 3 "register_operand" "")]
+  "TARGET_AVX2"
+{
+  rtx t1, t2, t3, t4;
+
+  t1 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_addv4di3 (t1, t1, operands[3]));
+
+  t2 = gen_reg_rtx (V8SImode);
+  t3 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t2),
+				 gen_lowpart (V2TImode, operands[1]),
+				 GEN_INT (32)));
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t3),
+				 gen_lowpart (V2TImode, operands[2]),
+				 GEN_INT (32)));
+
+  t4 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t4, t2, t3));
+
+  emit_insn (gen_addv4di3 (operands[0], t1, t4));
+  DONE;
+})
+
 (define_insn "ashr<mode>3"
   [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x")
 	(ashiftrt:VI24_AVX2
--- gcc/testsuite/gcc.target/i386/sse2-mul-1.c.jj	2011-10-14 10:39:57.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-mul-1.c	2011-10-14 13:32:53.000000000 +0200
@@ -0,0 +1,209 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O3 -msse2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+#include <stdlib.h>
+
+#define N 512
+static short a1[N], a2[N], a3[N];
+static unsigned short b1[N], b2[N], b3[N];
+static int c1[N], c2[N], c3[N];
+static unsigned int d1[N], d2[N], d3[N];
+static long long e1[N], e2[N], e3[N];
+static unsigned long long g1[N], g2[N], g3[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    a1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    b1[i] = b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = c2[i] * c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = d2[i] * d3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = e2[i] * e3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = g2[i] * g3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = (unsigned int) b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = (long long) c2[i] * (long long) c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = (unsigned long long) d2[i] * (unsigned long long) d3[i];
+}
+
+__attribute__((noinline, noclone)) int
+f11 (void)
+{
+  int i, r = 0;
+  for (i = 0; i < N; ++i)
+    r += a2[i] * a3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned int
+f12 (void)
+{
+  int i;
+  unsigned r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned int) b2[i] * b3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) long long
+f13 (void)
+{
+  int i;
+  long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (long long) c2[i] * (long long) c3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+f14 (void)
+{
+  int i;
+  unsigned long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+  return r;
+}
+
+static void
+TEST (void)
+{
+  int i;
+  int s1 = 0;
+  unsigned int s2 = 0;
+  long long s3 = 0;
+  unsigned long long s4 = 0;
+  for (i = 0; i < N; ++i)
+    {
+      asm volatile ("" : : "r" (&s1) : "memory");
+      asm volatile ("" : : "r" (&s2) : "memory");
+      asm volatile ("" : : "r" (&s3) : "memory");
+      asm volatile ("" : : "r" (&s4) : "memory");
+      b2[i] = (int) random ();
+      b3[i] = (int) random ();
+      a2[i] = b2[i];
+      a3[i] = b3[i];
+      d2[i] = (((int) random ()) << 16) | b2[i];
+      d3[i] = (((int) random ()) << 16) | b3[i];
+      c2[i] = d2[i];
+      c3[i] = d3[i];
+      s1 += a2[i] * a3[i];
+      s2 += (unsigned int) b2[i] * b3[i];
+      s3 += (long long) c2[i] * (long long) c3[i];
+      s4 += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+    }
+  f1 ();
+  f2 ();
+  f3 ();
+  f4 ();
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (a1[i] != (short) (a2[i] * a3[i]))
+	abort ();
+      if (b1[i] != (unsigned short) (b2[i] * b3[i]))
+	abort ();
+      if (c1[i] != c2[i] * c3[i])
+	abort ();
+      if (d1[i] != d2[i] * d3[i])
+	abort ();
+      if (e1[i] != e2[i] * e3[i])
+	abort ();
+      if (g1[i] != g2[i] * g3[i])
+	abort ();
+    }
+  f7 ();
+  f8 ();
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (c1[i] != a2[i] * a3[i])
+	abort ();
+      if (d1[i] != b2[i] * b3[i])
+	abort ();
+      if (e1[i] != (long long) c2[i] * (long long) c3[i])
+	abort ();
+      if (g1[i] != (unsigned long long) d2[i] * (unsigned long long) d3[i])
+	abort ();
+    }
+  if (f11 () != s1 || f12 () != s2 || f13 () != s3 || f14 () != s4)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c.jj	2011-10-14 10:40:46.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c	2011-10-14 10:41:27.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O3 -msse4.1" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx-mul-1.c.jj	2011-10-14 10:42:35.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-mul-1.c	2011-10-14 10:42:56.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/xop-mul-1.c.jj	2011-10-14 10:44:11.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/xop-mul-1.c	2011-10-14 10:44:25.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O3 -mxop" } */
+
+#ifndef CHECK_H
+#define CHECK_H "xop-check.h"
+#endif
+
+#ifndef TEST
+#define TEST xop_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx2-mul-1.c.jj	2011-10-14 10:43:28.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-mul-1.c	2011-10-14 10:43:45.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+#include "sse2-mul-1.c"

	Jakub


* Re: [PATCH] AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod*
  2011-10-14 14:57 [PATCH] AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod* Jakub Jelinek
@ 2011-10-14 16:32 ` Richard Henderson
  2011-10-14 19:08   ` [PATCH] AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod* (take 2) Jakub Jelinek
From: Richard Henderson @ 2011-10-14 16:32 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On 10/14/2011 07:18 AM, Jakub Jelinek wrote:
> +  /* This would be 2 insns shorter if
> +     rperm[i] = GEN_INT (((~i & 1) << 2) + i / 2);
> +     had been used instead (neither vpsrlq insn would be needed),
> +     but vec_widen_*mult_hi_* is usually used together with
> +     vec_widen_*mult_lo_* and by writing it this way the load
> +     of the constant and the two vpermd instructions (cross-lane)
> +     can be CSEd together.  */
> +  for (i = 0; i < 8; ++i)
> +    rperm[i] = GEN_INT (((i & 1) << 2) + i / 2);
> +  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
> +  vperm = force_reg (V8SImode, vperm);
> +  emit_insn (gen_avx2_permvarv8si (t1, vperm, operands[1]));
> +  emit_insn (gen_avx2_permvarv8si (t2, vperm, operands[2]));
> +  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t3),
> +			    gen_lowpart (V4DImode, t1), GEN_INT (32)));
> +  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t4),
> +			    gen_lowpart (V4DImode, t2), GEN_INT (32)));
> +  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));

So what you're doing here is the low-part permutation:

	0 4 1 5 2 6 3 7

followed by a shift to get

	4 . 5 . 6 . 7 .

But you need to load a 256-bit constant from memory to get it.

I wonder if it wouldn't be better to use VPERMQ to handle the lane change:

	0   2   1   3
	0 1 4 5 2 3 6 7

shared between the hi/lo, and a VPSHUFD to handle the in-lane ordering:

	0 0 1 1 2 2 3 3
	4 4 5 5 6 6 7 7

In the end we get 2+(2+2)=6 insns as setup prior to the VPMULDQs, as compared
to your 1+2+(0+2)=5 insns, but no need to wait for the constant load.  Of 
course, if the constant load gets hoisted out of the loop, yours will likely
win on throughput.
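
For concreteness, here are both lowerings of the lo-half case written
as AVX2 intrinsics (an illustrative sketch only -- the expanders emit
the equivalent RTL, and these helper names are invented):

#include <immintrin.h>

/* Constant-load variant: one cross-lane vpermd (shared between the
   hi and lo expansions), then vpmuldq on dwords 0,2,4,6.  */
__m256i
widen_smult_lo_vpermd (__m256i a, __m256i b)
{
  const __m256i perm = _mm256_setr_epi32 (0, 4, 1, 5, 2, 6, 3, 7);
  __m256i t1 = _mm256_permutevar8x32_epi32 (a, perm);
  __m256i t2 = _mm256_permutevar8x32_epi32 (b, perm);
  /* The hi case additionally does _mm256_srli_epi64 (t, 32).  */
  return _mm256_mul_epi32 (t1, t2);   /* a0*b0 a1*b1 a2*b2 a3*b3 */
}

/* Constant-free variant: vpermq for the lane change, vpshufd for
   the in-lane ordering.  */
__m256i
widen_smult_lo_vpermq (__m256i a, __m256i b)
{
  __m256i t1 = _mm256_permute4x64_epi64 (a, 0xd8); /* 0 1 4 5 2 3 6 7 */
  __m256i t2 = _mm256_permute4x64_epi64 (b, 0xd8);
  t1 = _mm256_shuffle_epi32 (t1, 0x50);            /* 0 0 1 1 2 2 3 3 */
  t2 = _mm256_shuffle_epi32 (t2, 0x50);
  return _mm256_mul_epi32 (t1, t2);
}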

Thoughts, Uros and those looking in from Intel?

Otherwise it looks ok.


r~


* [PATCH] AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod* (take 2)
  2011-10-14 16:32 ` Richard Henderson
@ 2011-10-14 19:08   ` Jakub Jelinek
  2011-10-14 19:42     ` Richard Henderson
From: Jakub Jelinek @ 2011-10-14 19:08 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Uros Bizjak, gcc-patches

On Fri, Oct 14, 2011 at 09:21:15AM -0700, Richard Henderson wrote:
> So what you're doing here is the low-part permutation:
> 
> 	0 4 1 5 2 6 3 7
> 
> followed by a shift to get
> 
> 	4 . 5 . 6 . 7 .
> 
> But you need to load a 256-bit constant from memory to get it.

Right.

> I wonder if it wouldn't be better to use VPERMQ to handle the lane change:
> 
> 	0   2   1   3
> 	0 1 4 5 2 3 6 7
> 
> shared between the hi/lo, and a VPSHUFD to handle the in-lane ordering:
> 
> 	0 0 1 1 2 2 3 3
> 	4 4 5 5 6 6 7 7
> 
> In the end we get 2+(2+2)=6 insns as setup prior to the VPMULDQs, as compared
> to your 1+2+(0+2)=5 insns, but no need to wait for the constant load.  Of 
> course, if the constant load gets hoisted out of the loop, yours will likely
> win on throughput.

But hoisting the constant load increases register pressure, so the
original variant is a win only for simpler loops and might be worse
otherwise.

So here is the alternative patch that does it with VPERMQ+VPSHUFD.
Tested using the avx2-mul-1.c testcase with Intel's SDE.
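
For reference, the shuffle immediates in the new expanders below
decode like this (a small sketch; the SHUF helper is invented, not
from the patch):

/* Four 2-bit selector fields, element 0 in the low bits.  */
#define SHUF(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))

int vpermq_ctrl = SHUF (0, 2, 1, 3); /* 0xd8: qwords 0,2,1,3, i.e.
					dwords 0 1 4 5 2 3 6 7 */
int pshufd_lo   = SHUF (0, 0, 1, 1); /* 0x50: dwords 0,0,1,1 per lane */
int pshufd_hi   = SHUF (2, 2, 3, 3); /* 0xfa: dwords 2,2,3,3 per lane */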

2011-10-14  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (vec_widen_smult_hi_v8hi,
	vec_widen_smult_lo_v8hi, vec_widen_umult_hi_v8hi,
	vec_widen_umult_lo_v8hi): Macroize using VI2_AVX2
	mode iterator and any_extend code iterator.
	(vec_widen_<s>mult_hi_v8si, vec_widen_<s>mult_lo_v8si): New
	expanders.
	(vec_widen_smult_hi_v4si, vec_widen_smult_lo_v4si): Enable
	also for TARGET_SSE4_1 using pmuldq insn.
	(sdot_prodv8hi): Macroize using VI2_AVX2 iterator.
	(sse2_sse4_1): New code attr.
	(udot_prodv4si): Macroize using any_extend code iterator.
	(<s>dot_prodv8si): New expander.

	* gcc.target/i386/sse2-mul-1.c: New test.
	* gcc.target/i386/sse4_1-mul-1.c: New test.
	* gcc.target/i386/avx-mul-1.c: New test.
	* gcc.target/i386/xop-mul-1.c: New test.
	* gcc.target/i386/avx2-mul-1.c: New test.

--- gcc/config/i386/sse.md.jj	2011-10-14 08:38:47.000000000 +0200
+++ gcc/config/i386/sse.md	2011-10-14 13:05:58.000000000 +0200
@@ -5507,83 +5507,97 @@ (define_insn_and_split "mul<mode>3"
   DONE;
 })
 
-(define_expand "vec_widen_smult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_hi_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_smult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_lo_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_umult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_hi_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  rtx t1, t2, t3, t4;
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V4DImode);
+  t2 = gen_reg_rtx (V4DImode);
+  t3 = gen_reg_rtx (V8SImode);
+  t4 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
+				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
+  emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
+				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
   DONE;
 })
 
-(define_expand "vec_widen_umult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_lo_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
+  rtx t1, t2, t3, t4;
 
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
-
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V4DImode);
+  t2 = gen_reg_rtx (V4DImode);
+  t3 = gen_reg_rtx (V8SImode);
+  t4 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
+				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
+  emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
+				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
   DONE;
 })
 
@@ -5591,24 +5605,28 @@ (define_expand "vec_widen_smult_hi_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5616,24 +5634,28 @@ (define_expand "vec_widen_smult_lo_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5675,30 +5697,35 @@ (define_expand "vec_widen_umult_lo_v4si"
   DONE;
 })
 
-(define_expand "sdot_prodv8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")
-   (match_operand:V4SI 3 "register_operand" "")]
+(define_expand "sdot_prod<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (match_operand:VI2_AVX2 1 "register_operand" "")
+   (match_operand:VI2_AVX2 2 "register_operand" "")
+   (match_operand:<sseunpackmode> 3 "register_operand" "")]
   "TARGET_SSE2"
 {
-  rtx t = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2]));
-  emit_insn (gen_addv4si3 (operands[0], operands[3], t));
+  rtx t = gen_reg_rtx (<sseunpackmode>mode);
+  emit_insn (gen_<sse2_avx2>_pmaddwd (t, operands[1], operands[2]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_PLUS (<sseunpackmode>mode,
+					operands[3], t)));
   DONE;
 })
 
-(define_expand "udot_prodv4si"
+(define_code_attr sse2_sse4_1
+   [(zero_extend "sse2") (sign_extend "sse4_1")])
+
+(define_expand "<s>dot_prodv4si"
   [(match_operand:V2DI 0 "register_operand" "")
-   (match_operand:V4SI 1 "register_operand" "")
+   (any_extend:V2DI (match_operand:V4SI 1 "register_operand" ""))
    (match_operand:V4SI 2 "register_operand" "")
    (match_operand:V2DI 3 "register_operand" "")]
-  "TARGET_SSE2"
+  "<CODE> == ZERO_EXTEND ? TARGET_SSE2 : TARGET_SSE4_1"
 {
   rtx t1, t2, t3, t4;
 
   t1 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t1, operands[1], operands[2]));
   emit_insn (gen_addv2di3 (t1, t1, operands[3]));
 
   t2 = gen_reg_rtx (V4SImode);
@@ -5711,12 +5738,41 @@ (define_expand "udot_prodv4si"
 				 GEN_INT (32)));
 
   t4 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t4, t2, t3));
 
   emit_insn (gen_addv2di3 (operands[0], t1, t4));
   DONE;
 })
 
+(define_expand "<s>dot_prodv8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "register_operand" ""))
+   (match_operand:V8SI 2 "register_operand" "")
+   (match_operand:V4DI 3 "register_operand" "")]
+  "TARGET_AVX2"
+{
+  rtx t1, t2, t3, t4;
+
+  t1 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_addv4di3 (t1, t1, operands[3]));
+
+  t2 = gen_reg_rtx (V8SImode);
+  t3 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t2),
+				 gen_lowpart (V2TImode, operands[1]),
+				 GEN_INT (32)));
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t3),
+				 gen_lowpart (V2TImode, operands[2]),
+				 GEN_INT (32)));
+
+  t4 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t4, t2, t3));
+
+  emit_insn (gen_addv4di3 (operands[0], t1, t4));
+  DONE;
+})
+
 (define_insn "ashr<mode>3"
   [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x")
 	(ashiftrt:VI24_AVX2
--- gcc/testsuite/gcc.target/i386/sse2-mul-1.c.jj	2011-10-14 10:39:57.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-mul-1.c	2011-10-14 13:32:53.000000000 +0200
@@ -0,0 +1,209 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O3 -msse2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+#include <stdlib.h>
+
+#define N 512
+static short a1[N], a2[N], a3[N];
+static unsigned short b1[N], b2[N], b3[N];
+static int c1[N], c2[N], c3[N];
+static unsigned int d1[N], d2[N], d3[N];
+static long long e1[N], e2[N], e3[N];
+static unsigned long long g1[N], g2[N], g3[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    a1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    b1[i] = b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = c2[i] * c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = d2[i] * d3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = e2[i] * e3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = g2[i] * g3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = (unsigned int) b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = (long long) c2[i] * (long long) c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = (unsigned long long) d2[i] * (unsigned long long) d3[i];
+}
+
+__attribute__((noinline, noclone)) int
+f11 (void)
+{
+  int i, r = 0;
+  for (i = 0; i < N; ++i)
+    r += a2[i] * a3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned int
+f12 (void)
+{
+  int i;
+  unsigned r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned int) b2[i] * b3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) long long
+f13 (void)
+{
+  int i;
+  long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (long long) c2[i] * (long long) c3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+f14 (void)
+{
+  int i;
+  unsigned long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+  return r;
+}
+
+static void
+TEST (void)
+{
+  int i;
+  int s1 = 0;
+  unsigned int s2 = 0;
+  long long s3 = 0;
+  unsigned long long s4 = 0;
+  for (i = 0; i < N; ++i)
+    {
+      asm volatile ("" : : "r" (&s1) : "memory");
+      asm volatile ("" : : "r" (&s2) : "memory");
+      asm volatile ("" : : "r" (&s3) : "memory");
+      asm volatile ("" : : "r" (&s4) : "memory");
+      b2[i] = (int) random ();
+      b3[i] = (int) random ();
+      a2[i] = b2[i];
+      a3[i] = b3[i];
+      d2[i] = (((int) random ()) << 16) | b2[i];
+      d3[i] = (((int) random ()) << 16) | b3[i];
+      c2[i] = d2[i];
+      c3[i] = d3[i];
+      s1 += a2[i] * a3[i];
+      s2 += (unsigned int) b2[i] * b3[i];
+      s3 += (long long) c2[i] * (long long) c3[i];
+      s4 += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+    }
+  f1 ();
+  f2 ();
+  f3 ();
+  f4 ();
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (a1[i] != (short) (a2[i] * a3[i]))
+	abort ();
+      if (b1[i] != (unsigned short) (b2[i] * b3[i]))
+	abort ();
+      if (c1[i] != c2[i] * c3[i])
+	abort ();
+      if (d1[i] != d2[i] * d3[i])
+	abort ();
+      if (e1[i] != e2[i] * e3[i])
+	abort ();
+      if (g1[i] != g2[i] * g3[i])
+	abort ();
+    }
+  f7 ();
+  f8 ();
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (c1[i] != a2[i] * a3[i])
+	abort ();
+      if (d1[i] != b2[i] * b3[i])
+	abort ();
+      if (e1[i] != (long long) c2[i] * (long long) c3[i])
+	abort ();
+      if (g1[i] != (unsigned long long) d2[i] * (unsigned long long) d3[i])
+	abort ();
+    }
+  if (f11 () != s1 || f12 () != s2 || f13 () != s3 || f14 () != s4)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c.jj	2011-10-14 10:40:46.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c	2011-10-14 10:41:27.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O3 -msse4.1" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx-mul-1.c.jj	2011-10-14 10:42:35.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-mul-1.c	2011-10-14 10:42:56.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/xop-mul-1.c.jj	2011-10-14 10:44:11.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/xop-mul-1.c	2011-10-14 10:44:25.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O3 -mxop" } */
+
+#ifndef CHECK_H
+#define CHECK_H "xop-check.h"
+#endif
+
+#ifndef TEST
+#define TEST xop_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx2-mul-1.c.jj	2011-10-14 10:43:28.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-mul-1.c	2011-10-14 10:43:45.000000000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+#include "sse2-mul-1.c"


	Jakub


* Re: [PATCH] AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod* (take 2)
  2011-10-14 19:08   ` [PATCH] AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod* (take 2) Jakub Jelinek
@ 2011-10-14 19:42     ` Richard Henderson
From: Richard Henderson @ 2011-10-14 19:42 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On 10/14/2011 11:34 AM, Jakub Jelinek wrote:
> 2011-10-14  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* config/i386/sse.md (vec_widen_smult_hi_v8hi,
> 	vec_widen_smult_lo_v8hi, vec_widen_umult_hi_v8hi,
> 	vec_widen_umult_lo_v8hi): Macroize using VI2_AVX2
> 	mode iterator and any_extend code iterator.
> 	(vec_widen_<s>mult_hi_v8si, vec_widen_<s>mult_lo_v8si): New
> 	expanders.
> 	(vec_widen_smult_hi_v4si, vec_widen_smult_lo_v4si): Enable
> 	also for TARGET_SSE4_1 using pmuldq insn.
> 	(sdot_prodv8hi): Macroize using VI2_AVX2 iterator.
> 	(sse2_sse4_1): New code attr.
> 	(udot_prodv4si): Macroize using any_extend code iterator.
> 	(<s>dot_prodv8si): New expander.
> 
> 	* gcc.target/i386/sse2-mul-1.c: New test.
> 	* gcc.target/i386/sse4_1-mul-1.c: New test.
> 	* gcc.target/i386/avx-mul-1.c: New test.
> 	* gcc.target/i386/xop-mul-1.c: New test.
> 	* gcc.target/i386/avx2-mul-1.c: New test.

Ok.


r~
