public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-3108] Optimize (a & b) | (c & ~b) to vpternlog instruction.
@ 2021-08-24  9:45 hongtao Liu
  0 siblings, 0 replies; only message in thread
From: hongtao Liu @ 2021-08-24  9:45 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:6ddb30f941a44bd528904558673ab35394565f08

commit r12-3108-g6ddb30f941a44bd528904558673ab35394565f08
Author: liuhongt <hongtao.liu@intel.com>
Date:   Fri Aug 20 15:30:40 2021 +0800

    Optimize (a & b) | (c & ~b) to vpternlog instruction.
    
    Also optimize below 3 forms to vpternlog, op1, op2, op3 are
    register_operand or unary_p as (not reg)
    
    A: (any_logic (any_logic op1 op2) op3)
    B: (any_logic (any_logic op1 op2) (any_logic op3 op4)) op3/op4 should
    be equal to op1/op2
    C: (any_logic (any_logic (any_logic:op1 op2) op3) op4) op3/op4 should
    be equal to op1/op2
    
    gcc/ChangeLog:
    
            PR target/101989
            * config/i386/i386.c (ix86_rtx_costs): Define cost for
            UNSPEC_VTERNLOG.
            * config/i386/i386.h (STRIP_UNARY): New macro.
            * config/i386/predicates.md (reg_or_notreg_operand): New
            predicate.
            * config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn.
            (*<avx512>_vternlog<mode>_1): New pre_reload
            define_insn_and_split.
            (*<avx512>_vternlog<mode>_2): Ditto.
            (*<avx512>_vternlog<mode>_3): Ditto.
            (any_logic1,any_logic2): New code iterator.
            (logic_op): New code attribute.
            (ternlogsuffix): Extend to VNxDF and VNxSF.
    
    gcc/testsuite/ChangeLog:
    
            PR target/101989
            * gcc.target/i386/pr101989-1.c: New test.
            * gcc.target/i386/pr101989-2.c: New test.
            * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.

Diff:
---
 gcc/config/i386/i386.c                             |   5 +
 gcc/config/i386/i386.h                             |   2 +
 gcc/config/i386/predicates.md                      |   7 +
 gcc/config/i386/sse.md                             | 234 +++++++++++++++++++++
 .../i386/avx512bw-shiftqihi-constant-1.c           |   4 +-
 gcc/testsuite/gcc.target/i386/pr101989-1.c         |  51 +++++
 gcc/testsuite/gcc.target/i386/pr101989-2.c         | 102 +++++++++
 7 files changed, 403 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5bff131f6d9..ebec8668758 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -20542,6 +20542,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
     case UNSPEC:
       if (XINT (x, 1) == UNSPEC_TP)
 	*total = 0;
+      else if (XINT(x, 1) == UNSPEC_VTERNLOG)
+	{
+	  *total = cost->sse_op;
+	  return true;
+	}
       return false;
 
     case VEC_SELECT:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 11ac8d01100..65114229c86 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1716,6 +1716,8 @@ typedef struct ix86_args {
 
 #define LEGITIMATE_PIC_OPERAND_P(X) legitimate_pic_operand_p (X)
 
+#define STRIP_UNARY(X) (UNARY_P (X) ? XEXP (X, 0) : X)
+
 #define SYMBOLIC_CONST(X)	\
   (GET_CODE (X) == SYMBOL_REF						\
    || GET_CODE (X) == LABEL_REF						\
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 9321f332ef9..df5acb425d4 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1044,6 +1044,13 @@
 	    (ior (match_test "op == const1_rtx")
 		 (match_test "op == constm1_rtx")))))
 
+;; True for registers, or (not: registers).  Used to optimize 3-operand
+;; bitwise operation.
+(define_predicate "reg_or_notreg_operand"
+  (ior (match_operand 0 "register_operand")
+       (and (match_code "not")
+	    (match_test "register_operand (XEXP (op, 0), mode)"))))
+
 ;; True if OP is acceptable as operand of DImode shift expander.
 (define_predicate "shiftdi_operand"
   (if_then_else (match_test "TARGET_64BIT")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 95f95823ea3..25ca9a5c29c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -933,7 +933,9 @@
 ;; Mapping of vector modes to VPTERNLOG suffix
 (define_mode_attr ternlogsuffix
   [(V8DI "q") (V4DI "q") (V2DI "q")
+   (V8DF "q") (V4DF "q") (V2DF "q")
    (V16SI "d") (V8SI "d") (V4SI "d")
+   (V16SF "d") (V8SF "d") (V4SF "d")
    (V32HI "d") (V16HI "d") (V8HI "d")
    (V64QI "d") (V32QI "d") (V16QI "d")])
 
@@ -10041,6 +10043,238 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_vternlog<mode>_all"
+  [(set (match_operand:V 0 "register_operand" "=v")
+	(unspec:V
+	  [(match_operand:V 1 "register_operand" "0")
+	   (match_operand:V 2 "register_operand" "v")
+	   (match_operand:V 3 "nonimmediate_operand" "vm")
+	   (match_operand:SI 4 "const_0_to_255_operand")]
+	  UNSPEC_VTERNLOG))]
+  "TARGET_AVX512F"
+  "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; There must be lots of other combinations like
+;;
+;; (any_logic:V
+;;   (any_logic:V op1 op2)
+;;   (any_logic:V op1 op3))
+;;
+;; (any_logic:V
+;;   (any_logic:V
+;;     (any_logic:V op1, op2)
+;;     op3)
+;;   op1)
+;;
+;; and so on.
+
+(define_code_iterator any_logic1 [and ior xor])
+(define_code_iterator any_logic2 [and ior xor])
+(define_code_attr logic_op [(and "&") (ior "|") (xor "^")])
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_1"
+  [(set (match_operand:V 0 "register_operand")
+	(any_logic:V
+	  (any_logic1:V
+	    (match_operand:V 1 "reg_or_notreg_operand")
+	    (match_operand:V 2 "reg_or_notreg_operand"))
+	  (any_logic2:V
+	    (match_operand:V 3 "reg_or_notreg_operand")
+	    (match_operand:V 4 "reg_or_notreg_operand"))))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+   && ix86_pre_reload_split ()
+   && (rtx_equal_p (STRIP_UNARY (operands[1]),
+		    STRIP_UNARY (operands[4]))
+       || rtx_equal_p (STRIP_UNARY (operands[2]),
+		       STRIP_UNARY (operands[4]))
+       || rtx_equal_p (STRIP_UNARY (operands[1]),
+		       STRIP_UNARY (operands[3]))
+       || rtx_equal_p (STRIP_UNARY (operands[2]),
+		       STRIP_UNARY (operands[3])))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 6)
+	   (match_dup 2)
+	   (match_dup 1)
+	   (match_dup 5)]
+	  UNSPEC_VTERNLOG))]
+{
+  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
+  int reg6 = 0xF0;
+  int reg2 = 0xCC;
+  int reg1 = 0xAA;
+  int reg3 = 0;
+  int reg4 = 0;
+  int reg_mask, tmp1, tmp2;
+  if (rtx_equal_p (STRIP_UNARY (operands[1]),
+		   STRIP_UNARY (operands[4])))
+    {
+      reg4 = reg1;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (STRIP_UNARY (operands[2]),
+		       STRIP_UNARY (operands[4])))
+    {
+      reg4 = reg2;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (STRIP_UNARY (operands[1]),
+			STRIP_UNARY (operands[3])))
+    {
+      reg4 = reg6;
+      reg3 = reg1;
+      operands[6] = operands[4];
+    }
+  else
+    {
+      reg4 = reg6;
+      reg3 = reg2;
+      operands[6] = operands[4];
+    }
+
+  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
+
+  tmp1 = reg1 <any_logic1:logic_op> reg2;
+  tmp2 = reg3 <any_logic2:logic_op> reg4;
+  reg_mask = tmp1  <any_logic:logic_op> tmp2;
+  reg_mask &= 0xFF;
+
+  operands[1] = STRIP_UNARY (operands[1]);
+  operands[2] = STRIP_UNARY (operands[2]);
+  operands[6] = STRIP_UNARY (operands[6]);
+  operands[5] = GEN_INT (reg_mask);
+})
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_2"
+  [(set (match_operand:V 0 "register_operand")
+	(any_logic:V
+	  (any_logic1:V
+	    (any_logic2:V
+	      (match_operand:V 1 "reg_or_notreg_operand")
+	      (match_operand:V 2 "reg_or_notreg_operand"))
+	    (match_operand:V 3 "reg_or_notreg_operand"))
+	  (match_operand:V 4 "reg_or_notreg_operand")))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+   && ix86_pre_reload_split ()
+   && (rtx_equal_p (STRIP_UNARY (operands[1]),
+		    STRIP_UNARY (operands[4]))
+       || rtx_equal_p (STRIP_UNARY (operands[2]),
+		       STRIP_UNARY (operands[4]))
+       || rtx_equal_p (STRIP_UNARY (operands[1]),
+		       STRIP_UNARY (operands[3]))
+       || rtx_equal_p (STRIP_UNARY (operands[2]),
+		       STRIP_UNARY (operands[3])))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 6)
+	   (match_dup 2)
+	   (match_dup 1)
+	   (match_dup 5)]
+	  UNSPEC_VTERNLOG))]
+{
+  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
+  int reg6 = 0xF0;
+  int reg2 = 0xCC;
+  int reg1 = 0xAA;
+  int reg3 = 0;
+  int reg4 = 0;
+  int reg_mask, tmp1, tmp2;
+  if (rtx_equal_p (STRIP_UNARY (operands[1]),
+		   STRIP_UNARY (operands[4])))
+    {
+      reg4 = reg1;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (STRIP_UNARY (operands[2]),
+		       STRIP_UNARY (operands[4])))
+    {
+      reg4 = reg2;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (STRIP_UNARY (operands[1]),
+			STRIP_UNARY (operands[3])))
+    {
+      reg4 = reg6;
+      reg3 = reg1;
+      operands[6] = operands[4];
+    }
+  else
+    {
+      reg4 = reg6;
+      reg3 = reg2;
+      operands[6] = operands[4];
+    }
+
+  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
+
+  tmp1 = reg1 <any_logic2:logic_op> reg2;
+  tmp2 = tmp1 <any_logic1:logic_op> reg3;
+  reg_mask = tmp2 <any_logic:logic_op> reg4;
+  reg_mask &= 0xFF;
+
+  operands[1] = STRIP_UNARY (operands[1]);
+  operands[2] = STRIP_UNARY (operands[2]);
+  operands[6] = STRIP_UNARY (operands[6]);
+  operands[5] = GEN_INT (reg_mask);
+})
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_3"
+  [(set (match_operand:V 0 "register_operand")
+	(any_logic:V
+	  (any_logic1:V
+	    (match_operand:V 1 "reg_or_notreg_operand")
+	    (match_operand:V 2 "reg_or_notreg_operand"))
+	  (match_operand:V 3 "reg_or_notreg_operand")))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 3)
+	   (match_dup 2)
+	   (match_dup 1)
+	   (match_dup 4)]
+	  UNSPEC_VTERNLOG))]
+{
+  /* VPTERNLOGD reg3, reg2, reg1, imm8.  */
+  int reg3 = 0xF0;
+  int reg2 = 0xCC;
+  int reg1 = 0xAA;
+  int reg_mask, tmp1;
+
+  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+
+  tmp1 = reg1 <any_logic1:logic_op> reg2;
+  reg_mask = tmp1 <any_logic:logic_op> reg3;
+  reg_mask &= 0xFF;
+
+  operands[1] = STRIP_UNARY (operands[1]);
+  operands[2] = STRIP_UNARY (operands[2]);
+  operands[3] = STRIP_UNARY (operands[3]);
+  operands[4] = GEN_INT (reg_mask);
+})
+
+
 (define_insn "<avx512>_vternlog<mode>_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
 	(vec_merge:VI48_AVX512VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
index 78bf5d33689..fbc3de08119 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
@@ -1,7 +1,8 @@
 /* PR target/95524 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512bw" } */
-/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } }  */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } }  */
+/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } }  */
 typedef char v64qi  __attribute__ ((vector_size (64)));
 typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
 
@@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a)
   return a >> 2;
 }
 /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
-/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
 /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
 
 __attribute__((noipa)) v64qi
diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c
new file mode 100644
index 00000000000..594093ecdde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "vpternlog" 6 } } */
+/* { dg-final { scan-assembler-not "vpxor" } } */
+/* { dg-final { scan-assembler-not "vpor" } } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+
+#include<immintrin.h>
+__m256d
+__attribute__((noipa, target("avx512vl")))
+copysign2_pd(__m256d from, __m256d to) {
+  __m256i a = _mm256_castpd_si256(from);
+  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
+  /* (avx_signbit & from) | (~avx_signbit & to)  */
+  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
+}
+
+__m256i
+__attribute__((noipa, target("avx512vl")))
+foo (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & ~src1) | (src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo1 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (src3 & ~src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo2 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (~src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo3 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (~src2 & src1) | (src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo4 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return src3 & src2 ^ src1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c
new file mode 100644
index 00000000000..9d9759a8e1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c
@@ -0,0 +1,102 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+#include "pr101989-1.c"
+__m256d
+avx2_copysign2_pd (__m256d from, __m256d to) {
+  __m256i a = _mm256_castpd_si256(from);
+  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
+  /* (avx_signbit & from) | (~avx_signbit & to)  */
+  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
+}
+
+__m256i
+avx2_foo (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & ~src1) | (src3 & src1);
+}
+
+__m256i
+avx2_foo1 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (src3 & ~src1);
+}
+
+__m256i
+avx2_foo2 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (~src3 & src1);
+}
+
+__m256i
+avx2_foo3 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (~src2 & src1) | (src3 & src1);
+}
+
+__m256i
+avx2_foo4 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return src3 & src2 ^ src1;
+}
+
+
+void
+test_256 (void)
+{
+  union256i_q q1, q2, q3, res2, exp2;
+  union256d d1, d2, res1, exp1;
+  int i, sign = 1;
+
+  for (i = 0; i < 4; i++)
+    {
+      d1.a[i] = 12.34 * (i + 2000) * sign;
+      d2.a[i] = 56.78 * (i - 30) * sign;
+      q1.a[i] = 12 * (i + 2000) * sign;
+      q2.a[i] = 56 * (i - 30) * sign;
+      q3.a[i] = 90 * (i + 40) * sign;
+      res1.a[i] = DEFAULT_VALUE;
+      exp1.a[i] = DEFAULT_VALUE;
+      res2.a[i] = exp2.a[i] = -1;
+      sign = -sign;
+    }
+
+  exp1.x = avx2_copysign2_pd (d1.x, d2.x);
+  res1.x = copysign2_pd (d1.x, d2.x);
+  if (UNION_CHECK (256, d) (res1, exp1.a))
+    abort ();
+
+  exp2.x = avx2_foo1 (q1.x, q2.x, q3.x);
+  res2.x = foo1 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo2 (q1.x, q2.x, q3.x);
+  res2.x = foo2 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo3 (q1.x, q2.x, q3.x);
+  res2.x = foo3 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo4 (q1.x, q2.x, q3.x);
+  res2.x = foo4 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo (q1.x, q2.x, q3.x);
+  res2.x = foo (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+}
+
+static void
+test_128 ()
+{}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-08-24  9:45 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-24  9:45 [gcc r12-3108] Optimize (a & b) | (c & ~b) to vpternlog instruction hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).