public inbox for gcc-patches@gcc.gnu.org
* [PATCH] Improve AVX512 sse movcc (PR target/88547)
@ 2018-12-19 23:20 Jakub Jelinek
  2018-12-20  7:42 ` Uros Bizjak
  0 siblings, 1 reply; 3+ messages in thread
From: Jakub Jelinek @ 2018-12-19 23:20 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 12485 bytes --]

Hi!

If one vcond argument is an all-ones (non-bool) vector and the other one is
all zeros, we can use the vpmovm2? insns for AVX512{DQ,BW} (sometimes + VL).
While if op_true is all ones and op_false is all zeros we emit large code
that the combiner often optimizes to that vpmovm2?, if the arguments are
swapped we emit vpxor + vpternlog + a masked move (blend), while we could
just invert the mask with knot* and use vpmovm2?.
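
For illustration (this standalone example is not part of the patch and the
function name is made up), a single comparison whose result feeds an
all-ones/all-zeros selection is enough to hit the new path; with
-O2 -mavx512vl -mavx512bw it should now expand to vpmovm2b, possibly
preceded by a knot of the mask, rather than vpxor + vpternlog + a masked
blend:

/* Hypothetical 128-bit variant of what pr88547-1.c covers.  */
typedef signed char v16qi __attribute__ ((vector_size (16)));

v16qi
sel (v16qi x, v16qi y)
{
  return x >= y;	/* -1 where true, 0 where false.  */
}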

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?  The patch is large, but it is mostly reindentation; the attachment
contains a diff -ubpd variant of the i386.c changes to make it more
readable.

2018-12-19  Jakub Jelinek  <jakub@redhat.com>

	PR target/88547
	* config/i386/i386.c (ix86_expand_sse_movcc): For maskcmp, try to
	emit vpmovm2? instruction perhaps after knot?.  Reorganize code
	so that it doesn't have to test !maskcmp in almost every conditional.

	* gcc.target/i386/pr88547-1.c: New test.

--- gcc/config/i386/i386.c.jj	2018-12-18 19:40:27.698946295 +0100
+++ gcc/config/i386/i386.c	2018-12-19 17:14:24.948218640 +0100
@@ -23593,33 +23593,117 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       cmp = gen_rtx_SUBREG (mode, cmp, 0);
     }
 
-  if (vector_all_ones_operand (op_true, mode)
-      && rtx_equal_p (op_false, CONST0_RTX (mode))
-      && !maskcmp)
+  if (maskcmp)
+    {
+      rtx (*gen) (rtx, rtx) = NULL;
+      if ((op_true == CONST0_RTX (mode)
+	   && vector_all_ones_operand (op_false, mode))
+	  || (op_false == CONST0_RTX (mode)
+	      && vector_all_ones_operand (op_true, mode)))
+	switch (mode)
+	  {
+	  case E_V64QImode:
+	    if (TARGET_AVX512BW)
+	      gen = gen_avx512bw_cvtmask2bv64qi;
+	    break;
+	  case E_V32QImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2bv32qi;
+	    break;
+	  case E_V16QImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2bv16qi;
+	    break;
+	  case E_V32HImode:
+	    if (TARGET_AVX512BW)
+	      gen = gen_avx512bw_cvtmask2wv32hi;
+	    break;
+	  case E_V16HImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2wv16hi;
+	    break;
+	  case E_V8HImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2wv8hi;
+	    break;
+	  case E_V16SImode:
+	    if (TARGET_AVX512DQ)
+	      gen = gen_avx512f_cvtmask2dv16si;
+	    break;
+	  case E_V8SImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2dv8si;
+	    break;
+	  case E_V4SImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2dv4si;
+	    break;
+	  case E_V8DImode:
+	    if (TARGET_AVX512DQ)
+	      gen = gen_avx512f_cvtmask2qv8di;
+	    break;
+	  case E_V4DImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2qv4di;
+	    break;
+	  case E_V2DImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2qv2di;
+	    break;
+	  default:
+	    break;
+	  }
+      if (gen && SCALAR_INT_MODE_P (cmpmode))
+	{
+	  cmp = force_reg (cmpmode, cmp);
+	  if (op_true == CONST0_RTX (mode))
+	    {
+	      rtx (*gen_not) (rtx, rtx);
+	      switch (cmpmode)
+		{
+		case E_QImode: gen_not = gen_knotqi; break;
+		case E_HImode: gen_not = gen_knothi; break;
+		case E_SImode: gen_not = gen_knotsi; break;
+		case E_DImode: gen_not = gen_knotdi; break;
+		default: gcc_unreachable ();
+		}
+	      rtx n = gen_reg_rtx (cmpmode);
+	      emit_insn (gen_not (n, cmp));
+	      cmp = n;
+	    }
+	  emit_insn (gen (dest, cmp));
+	  return;
+	}
+    }
+  else if (vector_all_ones_operand (op_true, mode)
+	   && op_false == CONST0_RTX (mode))
     {
       emit_insn (gen_rtx_SET (dest, cmp));
+      return;
     }
-  else if (op_false == CONST0_RTX (mode) && !maskcmp)
+  else if (op_false == CONST0_RTX (mode))
     {
       op_true = force_reg (mode, op_true);
       x = gen_rtx_AND (mode, cmp, op_true);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (op_true == CONST0_RTX (mode) && !maskcmp)
+  else if (op_true == CONST0_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_NOT (mode, cmp);
       x = gen_rtx_AND (mode, x, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
-	   && !maskcmp)
+  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_IOR (mode, cmp, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (TARGET_XOP && !maskcmp)
+  else if (TARGET_XOP)
     {
       op_true = force_reg (mode, op_true);
 
@@ -23629,127 +23713,126 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
 							  op_true,
 							  op_false)));
+      return;
     }
-  else
-    {
-      rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
-      rtx d = dest;
 
-      if (!vector_operand (op_true, mode))
-	op_true = force_reg (mode, op_true);
+  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+  rtx d = dest;
 
-      op_false = force_reg (mode, op_false);
+  if (!vector_operand (op_true, mode))
+    op_true = force_reg (mode, op_true);
 
-      switch (mode)
-	{
-	case E_V4SFmode:
-	  if (TARGET_SSE4_1)
-	    gen = gen_sse4_1_blendvps;
-	  break;
-	case E_V2DFmode:
-	  if (TARGET_SSE4_1)
-	    gen = gen_sse4_1_blendvpd;
-	  break;
-	case E_SFmode:
-	  if (TARGET_SSE4_1)
-	    {
-	      gen = gen_sse4_1_blendvss;
-	      op_true = force_reg (mode, op_true);
-	    }
-	  break;
-	case E_DFmode:
-	  if (TARGET_SSE4_1)
-	    {
-	      gen = gen_sse4_1_blendvsd;
-	      op_true = force_reg (mode, op_true);
-	    }
-	  break;
-	case E_V16QImode:
-	case E_V8HImode:
-	case E_V4SImode:
-	case E_V2DImode:
-	  if (TARGET_SSE4_1)
-	    {
-	      gen = gen_sse4_1_pblendvb;
-	      if (mode != V16QImode)
-		d = gen_reg_rtx (V16QImode);
-	      op_false = gen_lowpart (V16QImode, op_false);
-	      op_true = gen_lowpart (V16QImode, op_true);
-	      cmp = gen_lowpart (V16QImode, cmp);
-	    }
-	  break;
-	case E_V8SFmode:
-	  if (TARGET_AVX)
-	    gen = gen_avx_blendvps256;
-	  break;
-	case E_V4DFmode:
-	  if (TARGET_AVX)
-	    gen = gen_avx_blendvpd256;
-	  break;
-	case E_V32QImode:
-	case E_V16HImode:
-	case E_V8SImode:
-	case E_V4DImode:
-	  if (TARGET_AVX2)
-	    {
-	      gen = gen_avx2_pblendvb;
-	      if (mode != V32QImode)
-		d = gen_reg_rtx (V32QImode);
-	      op_false = gen_lowpart (V32QImode, op_false);
-	      op_true = gen_lowpart (V32QImode, op_true);
-	      cmp = gen_lowpart (V32QImode, cmp);
-	    }
-	  break;
-
-	case E_V64QImode:
-	  gen = gen_avx512bw_blendmv64qi;
-	  break;
-	case E_V32HImode:
-	  gen = gen_avx512bw_blendmv32hi;
-	  break;
-	case E_V16SImode:
-	  gen = gen_avx512f_blendmv16si;
-	  break;
-	case E_V8DImode:
-	  gen = gen_avx512f_blendmv8di;
-	  break;
-	case E_V8DFmode:
-	  gen = gen_avx512f_blendmv8df;
-	  break;
-	case E_V16SFmode:
-	  gen = gen_avx512f_blendmv16sf;
-	  break;
-
-	default:
-	  break;
-	}
+  op_false = force_reg (mode, op_false);
 
-      if (gen != NULL)
+  switch (mode)
+    {
+    case E_V4SFmode:
+      if (TARGET_SSE4_1)
+	gen = gen_sse4_1_blendvps;
+      break;
+    case E_V2DFmode:
+      if (TARGET_SSE4_1)
+	gen = gen_sse4_1_blendvpd;
+      break;
+    case E_SFmode:
+      if (TARGET_SSE4_1)
 	{
-	  emit_insn (gen (d, op_false, op_true, cmp));
-	  if (d != dest)
-	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+	  gen = gen_sse4_1_blendvss;
+	  op_true = force_reg (mode, op_true);
 	}
-      else
+      break;
+    case E_DFmode:
+      if (TARGET_SSE4_1)
 	{
+	  gen = gen_sse4_1_blendvsd;
 	  op_true = force_reg (mode, op_true);
+	}
+      break;
+    case E_V16QImode:
+    case E_V8HImode:
+    case E_V4SImode:
+    case E_V2DImode:
+      if (TARGET_SSE4_1)
+	{
+	  gen = gen_sse4_1_pblendvb;
+	  if (mode != V16QImode)
+	    d = gen_reg_rtx (V16QImode);
+	  op_false = gen_lowpart (V16QImode, op_false);
+	  op_true = gen_lowpart (V16QImode, op_true);
+	  cmp = gen_lowpart (V16QImode, cmp);
+	}
+      break;
+    case E_V8SFmode:
+      if (TARGET_AVX)
+	gen = gen_avx_blendvps256;
+      break;
+    case E_V4DFmode:
+      if (TARGET_AVX)
+	gen = gen_avx_blendvpd256;
+      break;
+    case E_V32QImode:
+    case E_V16HImode:
+    case E_V8SImode:
+    case E_V4DImode:
+      if (TARGET_AVX2)
+	{
+	  gen = gen_avx2_pblendvb;
+	  if (mode != V32QImode)
+	    d = gen_reg_rtx (V32QImode);
+	  op_false = gen_lowpart (V32QImode, op_false);
+	  op_true = gen_lowpart (V32QImode, op_true);
+	  cmp = gen_lowpart (V32QImode, cmp);
+	}
+      break;
+
+    case E_V64QImode:
+      gen = gen_avx512bw_blendmv64qi;
+      break;
+    case E_V32HImode:
+      gen = gen_avx512bw_blendmv32hi;
+      break;
+    case E_V16SImode:
+      gen = gen_avx512f_blendmv16si;
+      break;
+    case E_V8DImode:
+      gen = gen_avx512f_blendmv8di;
+      break;
+    case E_V8DFmode:
+      gen = gen_avx512f_blendmv8df;
+      break;
+    case E_V16SFmode:
+      gen = gen_avx512f_blendmv16sf;
+      break;
+
+    default:
+      break;
+    }
+
+  if (gen != NULL)
+    {
+      emit_insn (gen (d, op_false, op_true, cmp));
+      if (d != dest)
+	emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+    }
+  else
+    {
+      op_true = force_reg (mode, op_true);
 
-	  t2 = gen_reg_rtx (mode);
-	  if (optimize)
-	    t3 = gen_reg_rtx (mode);
-	  else
-	    t3 = dest;
-
-	  x = gen_rtx_AND (mode, op_true, cmp);
-	  emit_insn (gen_rtx_SET (t2, x));
-
-	  x = gen_rtx_NOT (mode, cmp);
-	  x = gen_rtx_AND (mode, x, op_false);
-	  emit_insn (gen_rtx_SET (t3, x));
+      t2 = gen_reg_rtx (mode);
+      if (optimize)
+	t3 = gen_reg_rtx (mode);
+      else
+	t3 = dest;
 
-	  x = gen_rtx_IOR (mode, t3, t2);
-	  emit_insn (gen_rtx_SET (dest, x));
-	}
+      x = gen_rtx_AND (mode, op_true, cmp);
+      emit_insn (gen_rtx_SET (t2, x));
+
+      x = gen_rtx_NOT (mode, cmp);
+      x = gen_rtx_AND (mode, x, op_false);
+      emit_insn (gen_rtx_SET (t3, x));
+
+      x = gen_rtx_IOR (mode, t3, t2);
+      emit_insn (gen_rtx_SET (dest, x));
     }
 }
 
--- gcc/testsuite/gcc.target/i386/pr88547-1.c.jj	2018-12-19 17:20:16.396518647 +0100
+++ gcc/testsuite/gcc.target/i386/pr88547-1.c	2018-12-19 17:28:54.818103169 +0100
@@ -0,0 +1,117 @@
+/* PR target/88547 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512vl -mavx512bw -mavx512dq" } */
+/* { dg-final { scan-assembler-not "vpternlog" } } */
+/* { dg-final { scan-assembler-times "vpmovm2b\[\t  ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovm2w\[\t  ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovm2d\[\t  ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovm2q\[\t  ]" 4 } } */
+/* { dg-final { scan-assembler-times "knotb\[\t  ]" 4 } } */
+/* { dg-final { scan-assembler-times "knotw\[\t  ]" 4 } } */
+/* { dg-final { scan-assembler-times "knotd\[\t  ]" 2 } } */
+/* { dg-final { scan-assembler-times "knotq\[\t  ]" 2 } } */
+
+typedef signed char v16qi __attribute__((vector_size(64)));
+typedef unsigned char v16uqi __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(64)));
+typedef unsigned short v8uhi __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(64)));
+typedef unsigned v4usi __attribute__((vector_size(64)));
+typedef long long v2di __attribute__((vector_size(64)));
+typedef unsigned long long v2udi __attribute__((vector_size(64)));
+
+v16qi
+f1 (v16qi x, v16qi y)
+{
+  return x <= y;
+}
+
+v16uqi
+f2 (v16uqi x, v16uqi y)
+{
+  return x <= y;
+}
+
+v16qi
+f3 (v16qi x, v16qi y)
+{
+  return x >= y;
+}
+
+v16uqi
+f4 (v16uqi x, v16uqi y)
+{
+  return x >= y;
+}
+
+v8hi
+f5 (v8hi x, v8hi y)
+{
+  return x <= y;
+}
+
+v8uhi
+f6 (v8uhi x, v8uhi y)
+{
+  return x <= y;
+}
+
+v8hi
+f7 (v8hi x, v8hi y)
+{
+  return x >= y;
+}
+
+v8uhi
+f8 (v8uhi x, v8uhi y)
+{
+  return x >= y;
+}
+
+v4si
+f9 (v4si x, v4si y)
+{
+  return x <= y;
+}
+
+v4usi
+f10 (v4usi x, v4usi y)
+{
+  return x <= y;
+}
+
+v4si
+f11 (v4si x, v4si y)
+{
+  return x >= y;
+}
+
+v4usi
+f12 (v4usi x, v4usi y)
+{
+  return x >= y;
+}
+
+v2di
+f13 (v2di x, v2di y)
+{
+  return x <= y;
+}
+
+v2udi
+f14 (v2udi x, v2udi y)
+{
+  return x <= y;
+}
+
+v2di
+f15 (v2di x, v2di y)
+{
+  return x >= y;
+}
+
+v2udi
+f16 (v2udi x, v2udi y)
+{
+  return x >= y;
+}

	Jakub

[-- Attachment #2: 2 --]
[-- Type: text/plain, Size: 4317 bytes --]

--- gcc/config/i386/i386.c.jj	2018-12-18 19:40:27.698946295 +0100
+++ gcc/config/i386/i386.c	2018-12-19 17:14:24.948218640 +0100
@@ -23593,33 +23593,117 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       cmp = gen_rtx_SUBREG (mode, cmp, 0);
     }
 
-  if (vector_all_ones_operand (op_true, mode)
-      && rtx_equal_p (op_false, CONST0_RTX (mode))
-      && !maskcmp)
+  if (maskcmp)
+    {
+      rtx (*gen) (rtx, rtx) = NULL;
+      if ((op_true == CONST0_RTX (mode)
+	   && vector_all_ones_operand (op_false, mode))
+	  || (op_false == CONST0_RTX (mode)
+	      && vector_all_ones_operand (op_true, mode)))
+	switch (mode)
+	  {
+	  case E_V64QImode:
+	    if (TARGET_AVX512BW)
+	      gen = gen_avx512bw_cvtmask2bv64qi;
+	    break;
+	  case E_V32QImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2bv32qi;
+	    break;
+	  case E_V16QImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2bv16qi;
+	    break;
+	  case E_V32HImode:
+	    if (TARGET_AVX512BW)
+	      gen = gen_avx512bw_cvtmask2wv32hi;
+	    break;
+	  case E_V16HImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2wv16hi;
+	    break;
+	  case E_V8HImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512BW)
+	      gen = gen_avx512vl_cvtmask2wv8hi;
+	    break;
+	  case E_V16SImode:
+	    if (TARGET_AVX512DQ)
+	      gen = gen_avx512f_cvtmask2dv16si;
+	    break;
+	  case E_V8SImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2dv8si;
+	    break;
+	  case E_V4SImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2dv4si;
+	    break;
+	  case E_V8DImode:
+	    if (TARGET_AVX512DQ)
+	      gen = gen_avx512f_cvtmask2qv8di;
+	    break;
+	  case E_V4DImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2qv4di;
+	    break;
+	  case E_V2DImode:
+	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
+	      gen = gen_avx512vl_cvtmask2qv2di;
+	    break;
+          default:
+	    break;
+	  }
+      if (gen && SCALAR_INT_MODE_P (cmpmode))
+	{
+	  cmp = force_reg (cmpmode, cmp);
+	  if (op_true == CONST0_RTX (mode))
+	    {
+	      rtx (*gen_not) (rtx, rtx);
+	      switch (cmpmode)
+		{
+		case E_QImode: gen_not = gen_knotqi; break;
+		case E_HImode: gen_not = gen_knothi; break;
+		case E_SImode: gen_not = gen_knotsi; break;
+		case E_DImode: gen_not = gen_knotdi; break;
+		default: gcc_unreachable ();
+		}
+	      rtx n = gen_reg_rtx (cmpmode);
+	      emit_insn (gen_not (n, cmp));
+	      cmp = n;
+	    }
+	  emit_insn (gen (dest, cmp));
+	  return;
+	}
+    }
+  else if (vector_all_ones_operand (op_true, mode)
+	   && op_false == CONST0_RTX (mode))
     {
       emit_insn (gen_rtx_SET (dest, cmp));
+      return;
     }
-  else if (op_false == CONST0_RTX (mode) && !maskcmp)
+  else if (op_false == CONST0_RTX (mode))
     {
       op_true = force_reg (mode, op_true);
       x = gen_rtx_AND (mode, cmp, op_true);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (op_true == CONST0_RTX (mode) && !maskcmp)
+  else if (op_true == CONST0_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_NOT (mode, cmp);
       x = gen_rtx_AND (mode, x, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
-	   && !maskcmp)
+  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_IOR (mode, cmp, op_false);
       emit_insn (gen_rtx_SET (dest, x));
+      return;
     }
-  else if (TARGET_XOP && !maskcmp)
+  else if (TARGET_XOP)
     {
       op_true = force_reg (mode, op_true);
 
@@ -23629,9 +23713,9 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
       emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
 							  op_true,
 							  op_false)));
+      return;
     }
-  else
-    {
+
       rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
       rtx d = dest;
 
@@ -23750,7 +23834,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
 	  x = gen_rtx_IOR (mode, t3, t2);
 	  emit_insn (gen_rtx_SET (dest, x));
 	}
-    }
 }
 
 /* Expand a floating-point conditional move.  Return true if successful.  */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] Improve AVX512 sse movcc (PR target/88547)
  2018-12-19 23:20 [PATCH] Improve AVX512 sse movcc (PR target/88547) Jakub Jelinek
@ 2018-12-20  7:42 ` Uros Bizjak
  2018-12-20  7:49   ` Jakub Jelinek
  0 siblings, 1 reply; 3+ messages in thread
From: Uros Bizjak @ 2018-12-20  7:42 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Thu, Dec 20, 2018 at 12:20 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> If one vcond argument is an all-ones (non-bool) vector and the other one is
> all zeros, we can use the vpmovm2? insns for AVX512{DQ,BW} (sometimes + VL).
> While if op_true is all ones and op_false is all zeros we emit large code
> that the combiner often optimizes to that vpmovm2?, if the arguments are
> swapped we emit vpxor + vpternlog + a masked move (blend), while we could
> just invert the mask with knot* and use vpmovm2?.
>
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> trunk?  The patch is large, but it is mostly reindentation; the attachment
> contains a diff -ubpd variant of the i386.c changes to make it more
> readable.
>
> 2018-12-19  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/88547
>         * config/i386/i386.c (ix86_expand_sse_movcc): For maskcmp, try to
>         emit vpmovm2? instruction perhaps after knot?.  Reorganize code
>         so that it doesn't have to test !maskcmp in almost every conditional.
>
>         * gcc.target/i386/pr88547-1.c: New test.

LGTM, under the assumption that interunit moves from mask regs to xmm regs are fast.

Thanks,
Uros.

> --- gcc/config/i386/i386.c.jj   2018-12-18 19:40:27.698946295 +0100
> +++ gcc/config/i386/i386.c      2018-12-19 17:14:24.948218640 +0100
> @@ -23593,33 +23593,117 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
>        cmp = gen_rtx_SUBREG (mode, cmp, 0);
>      }
>
> -  if (vector_all_ones_operand (op_true, mode)
> -      && rtx_equal_p (op_false, CONST0_RTX (mode))
> -      && !maskcmp)
> +  if (maskcmp)
> +    {
> +      rtx (*gen) (rtx, rtx) = NULL;
> +      if ((op_true == CONST0_RTX (mode)
> +          && vector_all_ones_operand (op_false, mode))
> +         || (op_false == CONST0_RTX (mode)
> +             && vector_all_ones_operand (op_true, mode)))
> +       switch (mode)
> +         {
> +         case E_V64QImode:
> +           if (TARGET_AVX512BW)
> +             gen = gen_avx512bw_cvtmask2bv64qi;
> +           break;
> +         case E_V32QImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512BW)
> +             gen = gen_avx512vl_cvtmask2bv32qi;
> +           break;
> +         case E_V16QImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512BW)
> +             gen = gen_avx512vl_cvtmask2bv16qi;
> +           break;
> +         case E_V32HImode:
> +           if (TARGET_AVX512BW)
> +             gen = gen_avx512bw_cvtmask2wv32hi;
> +           break;
> +         case E_V16HImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512BW)
> +             gen = gen_avx512vl_cvtmask2wv16hi;
> +           break;
> +         case E_V8HImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512BW)
> +             gen = gen_avx512vl_cvtmask2wv8hi;
> +           break;
> +         case E_V16SImode:
> +           if (TARGET_AVX512DQ)
> +             gen = gen_avx512f_cvtmask2dv16si;
> +           break;
> +         case E_V8SImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512DQ)
> +             gen = gen_avx512vl_cvtmask2dv8si;
> +           break;
> +         case E_V4SImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512DQ)
> +             gen = gen_avx512vl_cvtmask2dv4si;
> +           break;
> +         case E_V8DImode:
> +           if (TARGET_AVX512DQ)
> +             gen = gen_avx512f_cvtmask2qv8di;
> +           break;
> +         case E_V4DImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512DQ)
> +             gen = gen_avx512vl_cvtmask2qv4di;
> +           break;
> +         case E_V2DImode:
> +           if (TARGET_AVX512VL && TARGET_AVX512DQ)
> +             gen = gen_avx512vl_cvtmask2qv2di;
> +           break;
> +         default:
> +           break;
> +         }
> +      if (gen && SCALAR_INT_MODE_P (cmpmode))
> +       {
> +         cmp = force_reg (cmpmode, cmp);
> +         if (op_true == CONST0_RTX (mode))
> +           {
> +             rtx (*gen_not) (rtx, rtx);
> +             switch (cmpmode)
> +               {
> +               case E_QImode: gen_not = gen_knotqi; break;
> +               case E_HImode: gen_not = gen_knothi; break;
> +               case E_SImode: gen_not = gen_knotsi; break;
> +               case E_DImode: gen_not = gen_knotdi; break;
> +               default: gcc_unreachable ();
> +               }
> +             rtx n = gen_reg_rtx (cmpmode);
> +             emit_insn (gen_not (n, cmp));
> +             cmp = n;
> +           }
> +         emit_insn (gen (dest, cmp));
> +         return;
> +       }
> +    }
> +  else if (vector_all_ones_operand (op_true, mode)
> +          && op_false == CONST0_RTX (mode))
>      {
>        emit_insn (gen_rtx_SET (dest, cmp));
> +      return;
>      }
> -  else if (op_false == CONST0_RTX (mode) && !maskcmp)
> +  else if (op_false == CONST0_RTX (mode))
>      {
>        op_true = force_reg (mode, op_true);
>        x = gen_rtx_AND (mode, cmp, op_true);
>        emit_insn (gen_rtx_SET (dest, x));
> +      return;
>      }
> -  else if (op_true == CONST0_RTX (mode) && !maskcmp)
> +  else if (op_true == CONST0_RTX (mode))
>      {
>        op_false = force_reg (mode, op_false);
>        x = gen_rtx_NOT (mode, cmp);
>        x = gen_rtx_AND (mode, x, op_false);
>        emit_insn (gen_rtx_SET (dest, x));
> +      return;
>      }
> -  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
> -          && !maskcmp)
> +  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
>      {
>        op_false = force_reg (mode, op_false);
>        x = gen_rtx_IOR (mode, cmp, op_false);
>        emit_insn (gen_rtx_SET (dest, x));
> +      return;
>      }
> -  else if (TARGET_XOP && !maskcmp)
> +  else if (TARGET_XOP)
>      {
>        op_true = force_reg (mode, op_true);
>
> @@ -23629,127 +23713,126 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
>        emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
>                                                           op_true,
>                                                           op_false)));
> +      return;
>      }
> -  else
> -    {
> -      rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
> -      rtx d = dest;
>
> -      if (!vector_operand (op_true, mode))
> -       op_true = force_reg (mode, op_true);
> +  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
> +  rtx d = dest;
>
> -      op_false = force_reg (mode, op_false);
> +  if (!vector_operand (op_true, mode))
> +    op_true = force_reg (mode, op_true);
>
> -      switch (mode)
> -       {
> -       case E_V4SFmode:
> -         if (TARGET_SSE4_1)
> -           gen = gen_sse4_1_blendvps;
> -         break;
> -       case E_V2DFmode:
> -         if (TARGET_SSE4_1)
> -           gen = gen_sse4_1_blendvpd;
> -         break;
> -       case E_SFmode:
> -         if (TARGET_SSE4_1)
> -           {
> -             gen = gen_sse4_1_blendvss;
> -             op_true = force_reg (mode, op_true);
> -           }
> -         break;
> -       case E_DFmode:
> -         if (TARGET_SSE4_1)
> -           {
> -             gen = gen_sse4_1_blendvsd;
> -             op_true = force_reg (mode, op_true);
> -           }
> -         break;
> -       case E_V16QImode:
> -       case E_V8HImode:
> -       case E_V4SImode:
> -       case E_V2DImode:
> -         if (TARGET_SSE4_1)
> -           {
> -             gen = gen_sse4_1_pblendvb;
> -             if (mode != V16QImode)
> -               d = gen_reg_rtx (V16QImode);
> -             op_false = gen_lowpart (V16QImode, op_false);
> -             op_true = gen_lowpart (V16QImode, op_true);
> -             cmp = gen_lowpart (V16QImode, cmp);
> -           }
> -         break;
> -       case E_V8SFmode:
> -         if (TARGET_AVX)
> -           gen = gen_avx_blendvps256;
> -         break;
> -       case E_V4DFmode:
> -         if (TARGET_AVX)
> -           gen = gen_avx_blendvpd256;
> -         break;
> -       case E_V32QImode:
> -       case E_V16HImode:
> -       case E_V8SImode:
> -       case E_V4DImode:
> -         if (TARGET_AVX2)
> -           {
> -             gen = gen_avx2_pblendvb;
> -             if (mode != V32QImode)
> -               d = gen_reg_rtx (V32QImode);
> -             op_false = gen_lowpart (V32QImode, op_false);
> -             op_true = gen_lowpart (V32QImode, op_true);
> -             cmp = gen_lowpart (V32QImode, cmp);
> -           }
> -         break;
> -
> -       case E_V64QImode:
> -         gen = gen_avx512bw_blendmv64qi;
> -         break;
> -       case E_V32HImode:
> -         gen = gen_avx512bw_blendmv32hi;
> -         break;
> -       case E_V16SImode:
> -         gen = gen_avx512f_blendmv16si;
> -         break;
> -       case E_V8DImode:
> -         gen = gen_avx512f_blendmv8di;
> -         break;
> -       case E_V8DFmode:
> -         gen = gen_avx512f_blendmv8df;
> -         break;
> -       case E_V16SFmode:
> -         gen = gen_avx512f_blendmv16sf;
> -         break;
> -
> -       default:
> -         break;
> -       }
> +  op_false = force_reg (mode, op_false);
>
> -      if (gen != NULL)
> +  switch (mode)
> +    {
> +    case E_V4SFmode:
> +      if (TARGET_SSE4_1)
> +       gen = gen_sse4_1_blendvps;
> +      break;
> +    case E_V2DFmode:
> +      if (TARGET_SSE4_1)
> +       gen = gen_sse4_1_blendvpd;
> +      break;
> +    case E_SFmode:
> +      if (TARGET_SSE4_1)
>         {
> -         emit_insn (gen (d, op_false, op_true, cmp));
> -         if (d != dest)
> -           emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
> +         gen = gen_sse4_1_blendvss;
> +         op_true = force_reg (mode, op_true);
>         }
> -      else
> +      break;
> +    case E_DFmode:
> +      if (TARGET_SSE4_1)
>         {
> +         gen = gen_sse4_1_blendvsd;
>           op_true = force_reg (mode, op_true);
> +       }
> +      break;
> +    case E_V16QImode:
> +    case E_V8HImode:
> +    case E_V4SImode:
> +    case E_V2DImode:
> +      if (TARGET_SSE4_1)
> +       {
> +         gen = gen_sse4_1_pblendvb;
> +         if (mode != V16QImode)
> +           d = gen_reg_rtx (V16QImode);
> +         op_false = gen_lowpart (V16QImode, op_false);
> +         op_true = gen_lowpart (V16QImode, op_true);
> +         cmp = gen_lowpart (V16QImode, cmp);
> +       }
> +      break;
> +    case E_V8SFmode:
> +      if (TARGET_AVX)
> +       gen = gen_avx_blendvps256;
> +      break;
> +    case E_V4DFmode:
> +      if (TARGET_AVX)
> +       gen = gen_avx_blendvpd256;
> +      break;
> +    case E_V32QImode:
> +    case E_V16HImode:
> +    case E_V8SImode:
> +    case E_V4DImode:
> +      if (TARGET_AVX2)
> +       {
> +         gen = gen_avx2_pblendvb;
> +         if (mode != V32QImode)
> +           d = gen_reg_rtx (V32QImode);
> +         op_false = gen_lowpart (V32QImode, op_false);
> +         op_true = gen_lowpart (V32QImode, op_true);
> +         cmp = gen_lowpart (V32QImode, cmp);
> +       }
> +      break;
> +
> +    case E_V64QImode:
> +      gen = gen_avx512bw_blendmv64qi;
> +      break;
> +    case E_V32HImode:
> +      gen = gen_avx512bw_blendmv32hi;
> +      break;
> +    case E_V16SImode:
> +      gen = gen_avx512f_blendmv16si;
> +      break;
> +    case E_V8DImode:
> +      gen = gen_avx512f_blendmv8di;
> +      break;
> +    case E_V8DFmode:
> +      gen = gen_avx512f_blendmv8df;
> +      break;
> +    case E_V16SFmode:
> +      gen = gen_avx512f_blendmv16sf;
> +      break;
> +
> +    default:
> +      break;
> +    }
> +
> +  if (gen != NULL)
> +    {
> +      emit_insn (gen (d, op_false, op_true, cmp));
> +      if (d != dest)
> +       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
> +    }
> +  else
> +    {
> +      op_true = force_reg (mode, op_true);
>
> -         t2 = gen_reg_rtx (mode);
> -         if (optimize)
> -           t3 = gen_reg_rtx (mode);
> -         else
> -           t3 = dest;
> -
> -         x = gen_rtx_AND (mode, op_true, cmp);
> -         emit_insn (gen_rtx_SET (t2, x));
> -
> -         x = gen_rtx_NOT (mode, cmp);
> -         x = gen_rtx_AND (mode, x, op_false);
> -         emit_insn (gen_rtx_SET (t3, x));
> +      t2 = gen_reg_rtx (mode);
> +      if (optimize)
> +       t3 = gen_reg_rtx (mode);
> +      else
> +       t3 = dest;
>
> -         x = gen_rtx_IOR (mode, t3, t2);
> -         emit_insn (gen_rtx_SET (dest, x));
> -       }
> +      x = gen_rtx_AND (mode, op_true, cmp);
> +      emit_insn (gen_rtx_SET (t2, x));
> +
> +      x = gen_rtx_NOT (mode, cmp);
> +      x = gen_rtx_AND (mode, x, op_false);
> +      emit_insn (gen_rtx_SET (t3, x));
> +
> +      x = gen_rtx_IOR (mode, t3, t2);
> +      emit_insn (gen_rtx_SET (dest, x));
>      }
>  }
>
> --- gcc/testsuite/gcc.target/i386/pr88547-1.c.jj        2018-12-19 17:20:16.396518647 +0100
> +++ gcc/testsuite/gcc.target/i386/pr88547-1.c   2018-12-19 17:28:54.818103169 +0100
> @@ -0,0 +1,117 @@
> +/* PR target/88547 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512vl -mavx512bw -mavx512dq" } */
> +/* { dg-final { scan-assembler-not "vpternlog" } } */
> +/* { dg-final { scan-assembler-times "vpmovm2b\[\t  ]" 4 } } */
> +/* { dg-final { scan-assembler-times "vpmovm2w\[\t  ]" 4 } } */
> +/* { dg-final { scan-assembler-times "vpmovm2d\[\t  ]" 4 } } */
> +/* { dg-final { scan-assembler-times "vpmovm2q\[\t  ]" 4 } } */
> +/* { dg-final { scan-assembler-times "knotb\[\t  ]" 4 } } */
> +/* { dg-final { scan-assembler-times "knotw\[\t  ]" 4 } } */
> +/* { dg-final { scan-assembler-times "knotd\[\t  ]" 2 } } */
> +/* { dg-final { scan-assembler-times "knotq\[\t  ]" 2 } } */
> +
> +typedef signed char v16qi __attribute__((vector_size(64)));
> +typedef unsigned char v16uqi __attribute__((vector_size(64)));
> +typedef short v8hi __attribute__((vector_size(64)));
> +typedef unsigned short v8uhi __attribute__((vector_size(64)));
> +typedef int v4si __attribute__((vector_size(64)));
> +typedef unsigned v4usi __attribute__((vector_size(64)));
> +typedef long long v2di __attribute__((vector_size(64)));
> +typedef unsigned long long v2udi __attribute__((vector_size(64)));
> +
> +v16qi
> +f1 (v16qi x, v16qi y)
> +{
> +  return x <= y;
> +}
> +
> +v16uqi
> +f2 (v16uqi x, v16uqi y)
> +{
> +  return x <= y;
> +}
> +
> +v16qi
> +f3 (v16qi x, v16qi y)
> +{
> +  return x >= y;
> +}
> +
> +v16uqi
> +f4 (v16uqi x, v16uqi y)
> +{
> +  return x >= y;
> +}
> +
> +v8hi
> +f5 (v8hi x, v8hi y)
> +{
> +  return x <= y;
> +}
> +
> +v8uhi
> +f6 (v8uhi x, v8uhi y)
> +{
> +  return x <= y;
> +}
> +
> +v8hi
> +f7 (v8hi x, v8hi y)
> +{
> +  return x >= y;
> +}
> +
> +v8uhi
> +f8 (v8uhi x, v8uhi y)
> +{
> +  return x >= y;
> +}
> +
> +v4si
> +f9 (v4si x, v4si y)
> +{
> +  return x <= y;
> +}
> +
> +v4usi
> +f10 (v4usi x, v4usi y)
> +{
> +  return x <= y;
> +}
> +
> +v4si
> +f11 (v4si x, v4si y)
> +{
> +  return x >= y;
> +}
> +
> +v4usi
> +f12 (v4usi x, v4usi y)
> +{
> +  return x >= y;
> +}
> +
> +v2di
> +f13 (v2di x, v2di y)
> +{
> +  return x <= y;
> +}
> +
> +v2udi
> +f14 (v2udi x, v2udi y)
> +{
> +  return x <= y;
> +}
> +
> +v2di
> +f15 (v2di x, v2di y)
> +{
> +  return x >= y;
> +}
> +
> +v2udi
> +f16 (v2udi x, v2udi y)
> +{
> +  return x >= y;
> +}
>
>         Jakub

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] Improve AVX512 sse movcc (PR target/88547)
  2018-12-20  7:42 ` Uros Bizjak
@ 2018-12-20  7:49   ` Jakub Jelinek
  0 siblings, 0 replies; 3+ messages in thread
From: Jakub Jelinek @ 2018-12-20  7:49 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

On Thu, Dec 20, 2018 at 08:42:05AM +0100, Uros Bizjak wrote:
> > If one vcond argument is an all-ones (non-bool) vector and the other one is
> > all zeros, we can use the vpmovm2? insns for AVX512{DQ,BW} (sometimes + VL).
> > While if op_true is all ones and op_false is all zeros we emit large code
> > that the combiner often optimizes to that vpmovm2?, if the arguments are
> > swapped we emit vpxor + vpternlog + a masked move (blend), while we could
> > just invert the mask with knot* and use vpmovm2?.
> >
> > Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> > trunk?  The patch is large, but it is mostly reindentation; the attachment
> > contains a diff -ubpd variant of the i386.c changes to make it more
> > readable.
> >
> > 2018-12-19  Jakub Jelinek  <jakub@redhat.com>
> >
> >         PR target/88547
> >         * config/i386/i386.c (ix86_expand_sse_movcc): For maskcmp, try to
> >         emit vpmovm2? instruction perhaps after knot?.  Reorganize code
> >         so that it doesn't have to test !maskcmp in almost every conditional.
> >
> >         * gcc.target/i386/pr88547-1.c: New test.
> 
> LGTM, under assumption that interunit moves from mask reg to xmm regs are fast.

In a simple benchmark (calling these functions in a tight loop on i9-7960X)
the performance is the same, just shorter sequences.
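
For reference, a sketch of the kind of loop involved (the real harness was
not posted; the iteration count, initial values and the use of f1 below are
illustrative only):

/* Hypothetical micro-benchmark sketch, not the code actually measured.
   Link together with pr88547-1.c and compile everything with
   -O2 -mavx512vl -mavx512bw -mavx512dq.  */
typedef signed char v16qi __attribute__ ((vector_size (64)));

extern v16qi f1 (v16qi, v16qi);

volatile v16qi sink;

int
main (void)
{
  v16qi a = { 0 }, b = { 1 };
  for (long i = 0; i < 100000000L; i++)
    sink = f1 (a, b);	/* compare + mask-to-vector dominates the loop.  */
  return 0;
}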

	Jakub

^ permalink raw reply	[flat|nested] 3+ messages in thread

