public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction.
@ 2021-08-24  1:36 liuhongt
  2021-08-24  9:53 ` Hongtao Liu
  0 siblings, 1 reply; 4+ messages in thread
From: liuhongt @ 2021-08-24  1:36 UTC (permalink / raw)
  To: gcc-patches

Also optimize the 3 forms below to vpternlog, where each opN is either a
register_operand or a unary (not reg).

A: (any_logic (any_logic op1 op2) op3)
B: (any_logic (any_logic op1 op2) (any_logic op3 op4)), where op3 or op4
must be equal to op1 or op2
C: (any_logic (any_logic (any_logic op1 op2) op3) op4), where op3 or op4
must be equal to op1 or op2

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.

gcc/ChangeLog:

	PR target/101989
	* config/i386/i386-protos.h
	(ix86_strip_reg_or_notreg_operand): New declaration.
	* config/i386/i386.c (ix86_rtx_costs): Define cost for
	UNSPEC_VTERNLOG.
	(ix86_strip_reg_or_notreg_operand): New function.
	* config/i386/predicates.md (reg_or_notreg_operand): New
	predicate.
	* config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn.
	(*<avx512>_vpternlog<mode>_1): New pre_reload
	define_insn_and_split.
	(*<avx512>_vpternlog<mode>_2): Ditto.
	(*<avx512>_vpternlog<mode>_3): Ditto.
	(any_logic1,any_logic2): New code iterator.
	(logic_op): New code attribute.
	(ternlogsuffix): Extend to VNxDF and VNxSF.

gcc/testsuite/ChangeLog:

	PR target/101989
	* gcc.target/i386/pr101989-1.c: New test.
	* gcc.target/i386/pr101989-2.c: New test.
	* gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.
---
 gcc/config/i386/i386-protos.h                 |   1 +
 gcc/config/i386/i386.c                        |  13 +
 gcc/config/i386/predicates.md                 |   7 +
 gcc/config/i386/sse.md                        | 234 ++++++++++++++++++
 .../i386/avx512bw-shiftqihi-constant-1.c      |   4 +-
 gcc/testsuite/gcc.target/i386/pr101989-1.c    |  51 ++++
 gcc/testsuite/gcc.target/i386/pr101989-2.c    | 102 ++++++++
 7 files changed, 410 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 2fd13074c81..2bdaadcf4f3 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int);
 extern int standard_sse_constant_p (rtx, machine_mode);
 extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);
 extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);
+extern rtx ix86_strip_reg_or_notreg_operand (rtx);
 extern bool ix86_pre_reload_split (void);
 extern bool symbolic_reference_mentioned_p (rtx);
 extern bool extended_reg_mentioned_p (rtx);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 46844fab08f..a69225ccc81 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
   return true;
 }
 
+/* Return the operand of OP when OP is a unary rtx such as (not reg),
+   otherwise return OP itself.  */
+rtx
+ix86_strip_reg_or_notreg_operand (rtx op)
+{
+  return UNARY_P (op) ? XEXP (op, 0) : op;
+}
+
 /* Predicate for pre-reload splitters with associated instructions,
    which can match any time before the split1 pass (usually combine),
    then are unconditionally split in that pass and should not be
@@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
     case UNSPEC:
       if (XINT (x, 1) == UNSPEC_TP)
 	*total = 0;
+      else if (XINT(x, 1) == UNSPEC_VTERNLOG)
+	{
+	  *total = cost->sse_op;
+	  return true;
+	}
       return false;
 
     case VEC_SELECT:
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 9321f332ef9..df5acb425d4 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand"
 	    (ior (match_test "op == const1_rtx")
 		 (match_test "op == constm1_rtx")))))
 
+;; True for registers, or (not: registers).  Used to optimize 3-operand
+;; bitwise operation.
+(define_predicate "reg_or_notreg_operand"
+  (ior (match_operand 0 "register_operand")
+       (and (match_code "not")
+	    (match_test "register_operand (XEXP (op, 0), mode)"))))
+
 ;; True if OP is acceptable as operand of DImode shift expander.
 (define_predicate "shiftdi_operand"
   (if_then_else (match_test "TARGET_64BIT")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 13889687793..0acd749d21c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -933,7 +933,9 @@ (define_mode_attr iptr
 ;; Mapping of vector modes to VPTERNLOG suffix
 (define_mode_attr ternlogsuffix
   [(V8DI "q") (V4DI "q") (V2DI "q")
+   (V8DF "q") (V4DF "q") (V2DF "q")
    (V16SI "d") (V8SI "d") (V4SI "d")
+   (V16SF "d") (V8SF "d") (V4SF "d")
    (V32HI "d") (V16HI "d") (V8HI "d")
    (V64QI "d") (V32QI "d") (V16QI "d")])
 
@@ -10041,6 +10043,238 @@ (define_insn "<avx512>_vternlog<mode><sd_maskz_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_vternlog<mode>_all"
+  [(set (match_operand:V 0 "register_operand" "=v")
+	(unspec:V
+	  [(match_operand:V 1 "register_operand" "0")
+	   (match_operand:V 2 "register_operand" "v")
+	   (match_operand:V 3 "nonimmediate_operand" "vm")
+	   (match_operand:SI 4 "const_0_to_255_operand")]
+	  UNSPEC_VTERNLOG))]
+  "TARGET_AVX512F && (<MODE_SIZE> == 64 || TARGET_AVX512VL)" ; xmm/ymm need VL
+  "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; There must be lots of other combinations like
+;;
+;; (any_logic:V
+;;   (any_logic:V op1 op2)
+;;   (any_logic:V op1 op3))
+;;
+;; (any_logic:V
+;;   (any_logic:V
+;;     (any_logic:V op1, op2)
+;;     op3)
+;;   op1)
+;;
+;; and so on.
+
+(define_code_iterator any_logic1 [and ior xor])
+(define_code_iterator any_logic2 [and ior xor])
+(define_code_attr logic_op [(and "&") (ior "|") (xor "^")])
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_1"
+  [(set (match_operand:V 0 "register_operand")
+	(any_logic:V
+	  (any_logic1:V
+	    (match_operand:V 1 "reg_or_notreg_operand")
+	    (match_operand:V 2 "reg_or_notreg_operand"))
+	  (any_logic2:V
+	    (match_operand:V 3 "reg_or_notreg_operand")
+	    (match_operand:V 4 "reg_or_notreg_operand"))))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+   && ix86_pre_reload_split ()
+   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+		    ix86_strip_reg_or_notreg_operand (operands[4]))
+       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
+		       ix86_strip_reg_or_notreg_operand (operands[4]))
+       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+		       ix86_strip_reg_or_notreg_operand (operands[3]))
+       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
+		       ix86_strip_reg_or_notreg_operand (operands[3])))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 6)
+	   (match_dup 2)
+	   (match_dup 1)
+	   (match_dup 5)]
+	  UNSPEC_VTERNLOG))]
+{
+  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
+  int reg6 = 0xF0;
+  int reg2 = 0xCC;
+  int reg1 = 0xAA;
+  int reg3 = 0;
+  int reg4 = 0;
+  int reg_mask, tmp1, tmp2;
+  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+		   ix86_strip_reg_or_notreg_operand (operands[4])))
+    {
+      reg4 = reg1;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
+		       ix86_strip_reg_or_notreg_operand (operands[4])))
+    {
+      reg4 = reg2;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+			ix86_strip_reg_or_notreg_operand (operands[3])))
+    {
+      reg4 = reg6;
+      reg3 = reg1;
+      operands[6] = operands[4];
+    }
+  else
+    {
+      reg4 = reg6;
+      reg3 = reg2;
+      operands[6] = operands[4];
+    }
+
+  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
+
+  tmp1 = reg1 <any_logic1:logic_op> reg2;
+  tmp2 = reg3 <any_logic2:logic_op> reg4;
+  reg_mask = tmp1  <any_logic:logic_op> tmp2;
+  reg_mask &= 0xFF;
+
+  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
+  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
+  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
+  operands[5] = GEN_INT (reg_mask);
+})
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_2"
+  [(set (match_operand:V 0 "register_operand")
+	(any_logic:V
+	  (any_logic1:V
+	    (any_logic2:V
+	      (match_operand:V 1 "reg_or_notreg_operand")
+	      (match_operand:V 2 "reg_or_notreg_operand"))
+	    (match_operand:V 3 "reg_or_notreg_operand"))
+	  (match_operand:V 4 "reg_or_notreg_operand")))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+   && ix86_pre_reload_split ()
+   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+		    ix86_strip_reg_or_notreg_operand (operands[4]))
+       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
+		       ix86_strip_reg_or_notreg_operand (operands[4]))
+       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+		       ix86_strip_reg_or_notreg_operand (operands[3]))
+       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
+		       ix86_strip_reg_or_notreg_operand (operands[3])))"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 6)
+	   (match_dup 2)
+	   (match_dup 1)
+	   (match_dup 5)]
+	  UNSPEC_VTERNLOG))]
+{
+  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
+  int reg6 = 0xF0;
+  int reg2 = 0xCC;
+  int reg1 = 0xAA;
+  int reg3 = 0;
+  int reg4 = 0;
+  int reg_mask, tmp1, tmp2;
+  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+		   ix86_strip_reg_or_notreg_operand (operands[4])))
+    {
+      reg4 = reg1;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
+		       ix86_strip_reg_or_notreg_operand (operands[4])))
+    {
+      reg4 = reg2;
+      reg3 = reg6;
+      operands[6] = operands[3];
+    }
+  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
+			ix86_strip_reg_or_notreg_operand (operands[3])))
+    {
+      reg4 = reg6;
+      reg3 = reg1;
+      operands[6] = operands[4];
+    }
+  else
+    {
+      reg4 = reg6;
+      reg3 = reg2;
+      operands[6] = operands[4];
+    }
+
+  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
+
+  tmp1 = reg1 <any_logic2:logic_op> reg2;
+  tmp2 = tmp1 <any_logic1:logic_op> reg3;
+  reg_mask = tmp2 <any_logic:logic_op> reg4;
+  reg_mask &= 0xFF;
+
+  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
+  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
+  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
+  operands[5] = GEN_INT (reg_mask);
+})
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_3"
+  [(set (match_operand:V 0 "register_operand")
+	(any_logic:V
+	  (any_logic1:V
+	    (match_operand:V 1 "reg_or_notreg_operand")
+	    (match_operand:V 2 "reg_or_notreg_operand"))
+	  (match_operand:V 3 "reg_or_notreg_operand")))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 3)
+	   (match_dup 2)
+	   (match_dup 1)
+	   (match_dup 4)]
+	  UNSPEC_VTERNLOG))]
+{
+  /* VPTERNLOGD reg3, reg2, reg1, imm8.  */
+  int reg3 = 0xF0;
+  int reg2 = 0xCC;
+  int reg1 = 0xAA;
+  int reg_mask, tmp1;
+
+  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+
+  tmp1 = reg1 <any_logic1:logic_op> reg2;
+  reg_mask = tmp1 <any_logic:logic_op> reg3;
+  reg_mask &= 0xFF;
+
+  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
+  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
+  operands[3] = ix86_strip_reg_or_notreg_operand (operands[3]);
+  operands[4] = GEN_INT (reg_mask);
+})
+
+
 (define_insn "<avx512>_vternlog<mode>_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
 	(vec_merge:VI48_AVX512VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
index 78bf5d33689..fbc3de08119 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
@@ -1,7 +1,8 @@
 /* PR target/95524 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512bw" } */
-/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } }  */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } }  */
+/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } }  */
 typedef char v64qi  __attribute__ ((vector_size (64)));
 typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
 
@@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a)
   return a >> 2;
 }
 /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
-/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
 /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
 
 __attribute__((noipa)) v64qi
diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c
new file mode 100644
index 00000000000..594093ecdde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "vpternlog" 6 } } */
+/* { dg-final { scan-assembler-not "vpxor" } } */
+/* { dg-final { scan-assembler-not "vpor" } } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+
+#include<immintrin.h>
+__m256d
+__attribute__((noipa, target("avx512vl")))
+copysign2_pd(__m256d from, __m256d to) {
+  __m256i a = _mm256_castpd_si256(from);
+  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
+  /* (avx_signbit & from) | (~avx_signbit & to)  */
+  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
+}
+
+__m256i
+__attribute__((noipa, target("avx512vl")))
+foo (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & ~src1) | (src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo1 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (src3 & ~src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo2 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (~src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo3 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (~src2 & src1) | (src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo4 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return src3 & src2 ^ src1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c
new file mode 100644
index 00000000000..9d9759a8e1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c
@@ -0,0 +1,102 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+#include "pr101989-1.c"
+__m256d
+avx2_copysign2_pd (__m256d from, __m256d to) {
+  __m256i a = _mm256_castpd_si256(from);
+  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
+  /* (avx_signbit & from) | (~avx_signbit & to)  */
+  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
+}
+
+__m256i
+avx2_foo (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & ~src1) | (src3 & src1);
+}
+
+__m256i
+avx2_foo1 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (src3 & ~src1);
+}
+
+__m256i
+avx2_foo2 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (src2 & src1) | (~src3 & src1);
+}
+
+__m256i
+avx2_foo3 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return (~src2 & src1) | (src3 & src1);
+}
+
+__m256i
+avx2_foo4 (__m256i src1, __m256i src2, __m256i src3)
+{
+  return src3 & src2 ^ src1;
+}
+
+
+void
+test_256 (void)
+{
+  union256i_q q1, q2, q3, res2, exp2;
+  union256d d1, d2, res1, exp1;
+  int i, sign = 1;
+
+  for (i = 0; i < 4; i++)
+    {
+      d1.a[i] = 12.34 * (i + 2000) * sign;
+      d2.a[i] = 56.78 * (i - 30) * sign;
+      q1.a[i] = 12 * (i + 2000) * sign;
+      q2.a[i] = 56 * (i - 30) * sign;
+      q3.a[i] = 90 * (i + 40) * sign;
+      res1.a[i] = DEFAULT_VALUE;
+      exp1.a[i] = DEFAULT_VALUE;
+      res2.a[i] = exp2.a[i] = -1;
+      sign = -sign;
+    }
+
+  exp1.x = avx2_copysign2_pd (d1.x, d2.x);
+  res1.x = copysign2_pd (d1.x, d2.x);
+  if (UNION_CHECK (256, d) (res1, exp1.a))
+    abort ();
+
+  exp2.x = avx2_foo1 (q1.x, q2.x, q3.x);
+  res2.x = foo1 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo2 (q1.x, q2.x, q3.x);
+  res2.x = foo2 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo3 (q1.x, q2.x, q3.x);
+  res2.x = foo3 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo4 (q1.x, q2.x, q3.x);
+  res2.x = foo4 (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+
+  exp2.x = avx2_foo (q1.x, q2.x, q3.x);
+  res2.x = foo (q1.x, q2.x, q3.x);
+  if (UNION_CHECK (256, i_q) (res2, exp2.a))
+    abort ();
+}
+
+static void
+test_128 ()
+{}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction.
  2021-08-24  1:36 [PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction liuhongt
@ 2021-08-24  9:53 ` Hongtao Liu
  2021-08-24 13:10   ` Bernhard Reutner-Fischer
  0 siblings, 1 reply; 4+ messages in thread
From: Hongtao Liu @ 2021-08-24  9:53 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches, H. J. Lu

On Tue, Aug 24, 2021 at 9:36 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Also optimize below 3 forms to vpternlog, op1, op2, op3 are
> register_operand or unary_p as (not reg)
>
> A: (any_logic (any_logic op1 op2) op3)
> B: (any_logic (any_logic op1 op2) (any_logic op3 op4)) op3/op4 should
> be equal to op1/op2
> C: (any_logic (any_logic (any_logic:op1 op2) op3) op4) op3/op4 should
> be equal to op1/op2
>
>   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>
> gcc/ChangeLog:
>
>         PR target/101989
>         * config/i386/i386-protos.h
>         (ix86_strip_reg_or_notreg_operand): New declare.
>         * config/i386/i386.c (ix86_rtx_costs): Define cost for
>         UNSPEC_VTERNLOG.
>         (ix86_strip_reg_or_notreg_operand): New function.
Pushed to trunk after changing ix86_strip_reg_or_notreg_operand to a macro;
a function call seems too inefficient for simply stripping a unary operator.
>         * config/i386/predicates.md (reg_or_notreg_operand): New
>         predicate.
>         * config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn.
>         (*<avx512>_vternlog<mode>_1): New pre_reload
>         define_insn_and_split.
>         (*<avx512>_vternlog<mode>_2): Ditto.
>         (*<avx512>_vternlog<mode>_3): Ditto.
>         (any_logic1,any_logic2): New code iterator.
>         (logic_op): New code attribute.
>         (ternlogsuffix): Extend to VNxDF and VNxSF.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/101989
>         * gcc.target/i386/pr101989-1.c: New test.
>         * gcc.target/i386/pr101989-2.c: New test.
>         * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.
> ---
>  gcc/config/i386/i386-protos.h                 |   1 +
>  gcc/config/i386/i386.c                        |  13 +
>  gcc/config/i386/predicates.md                 |   7 +
>  gcc/config/i386/sse.md                        | 234 ++++++++++++++++++
>  .../i386/avx512bw-shiftqihi-constant-1.c      |   4 +-
>  gcc/testsuite/gcc.target/i386/pr101989-1.c    |  51 ++++
>  gcc/testsuite/gcc.target/i386/pr101989-2.c    | 102 ++++++++
>  7 files changed, 410 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 2fd13074c81..2bdaadcf4f3 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int);
>  extern int standard_sse_constant_p (rtx, machine_mode);
>  extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);
>  extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);
> +extern rtx ix86_strip_reg_or_notreg_operand (rtx);
>  extern bool ix86_pre_reload_split (void);
>  extern bool symbolic_reference_mentioned_p (rtx);
>  extern bool extended_reg_mentioned_p (rtx);
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 46844fab08f..a69225ccc81 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
>    return true;
>  }
>
> +/* Returns true if INSN can be transformed from a memory load
> +   to a supported FP constant load.  */
> +rtx
> +ix86_strip_reg_or_notreg_operand (rtx op)
> +{
> +  return UNARY_P (op) ? XEXP (op, 0) : op;
> +}
> +
>  /* Predicate for pre-reload splitters with associated instructions,
>     which can match any time before the split1 pass (usually combine),
>     then are unconditionally split in that pass and should not be
> @@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
>      case UNSPEC:
>        if (XINT (x, 1) == UNSPEC_TP)
>         *total = 0;
> +      else if (XINT(x, 1) == UNSPEC_VTERNLOG)
> +       {
> +         *total = cost->sse_op;
> +         return true;
> +       }
>        return false;
>
>      case VEC_SELECT:
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index 9321f332ef9..df5acb425d4 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand"
>             (ior (match_test "op == const1_rtx")
>                  (match_test "op == constm1_rtx")))))
>
> +;; True for registers, or (not: registers).  Used to optimize 3-operand
> +;; bitwise operation.
> +(define_predicate "reg_or_notreg_operand"
> +  (ior (match_operand 0 "register_operand")
> +       (and (match_code "not")
> +           (match_test "register_operand (XEXP (op, 0), mode)"))))
> +
>  ;; True if OP is acceptable as operand of DImode shift expander.
>  (define_predicate "shiftdi_operand"
>    (if_then_else (match_test "TARGET_64BIT")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 13889687793..0acd749d21c 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -933,7 +933,9 @@ (define_mode_attr iptr
>  ;; Mapping of vector modes to VPTERNLOG suffix
>  (define_mode_attr ternlogsuffix
>    [(V8DI "q") (V4DI "q") (V2DI "q")
> +   (V8DF "q") (V4DF "q") (V2DF "q")
>     (V16SI "d") (V8SI "d") (V4SI "d")
> +   (V16SF "d") (V8SF "d") (V4SF "d")
>     (V32HI "d") (V16HI "d") (V8HI "d")
>     (V64QI "d") (V32QI "d") (V16QI "d")])
>
> @@ -10041,6 +10043,238 @@ (define_insn "<avx512>_vternlog<mode><sd_maskz_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn "*<avx512>_vternlog<mode>_all"
> +  [(set (match_operand:V 0 "register_operand" "=v")
> +       (unspec:V
> +         [(match_operand:V 1 "register_operand" "0")
> +          (match_operand:V 2 "register_operand" "v")
> +          (match_operand:V 3 "nonimmediate_operand" "vm")
> +          (match_operand:SI 4 "const_0_to_255_operand")]
> +         UNSPEC_VTERNLOG))]
> +  "TARGET_AVX512F"
> +  "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
> +  [(set_attr "type" "sselog")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +;; There must be lots of other combinations like
> +;;
> +;; (any_logic:V
> +;;   (any_logic:V op1 op2)
> +;;   (any_logic:V op1 op3))
> +;;
> +;; (any_logic:V
> +;;   (any_logic:V
> +;;     (any_logic:V op1, op2)
> +;;     op3)
> +;;   op1)
> +;;
> +;; and so on.
> +
> +(define_code_iterator any_logic1 [and ior xor])
> +(define_code_iterator any_logic2 [and ior xor])
> +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")])
> +
> +(define_insn_and_split "*<avx512>_vpternlog<mode>_1"
> +  [(set (match_operand:V 0 "register_operand")
> +       (any_logic:V
> +         (any_logic1:V
> +           (match_operand:V 1 "reg_or_notreg_operand")
> +           (match_operand:V 2 "reg_or_notreg_operand"))
> +         (any_logic2:V
> +           (match_operand:V 3 "reg_or_notreg_operand")
> +           (match_operand:V 4 "reg_or_notreg_operand"))))]
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> +   && ix86_pre_reload_split ()
> +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (unspec:V
> +         [(match_dup 6)
> +          (match_dup 2)
> +          (match_dup 1)
> +          (match_dup 5)]
> +         UNSPEC_VTERNLOG))]
> +{
> +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> +  int reg6 = 0xF0;
> +  int reg2 = 0xCC;
> +  int reg1 = 0xAA;
> +  int reg3 = 0;
> +  int reg4 = 0;
> +  int reg_mask, tmp1, tmp2;
> +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> +    {
> +      reg4 = reg1;
> +      reg3 = reg6;
> +      operands[6] = operands[3];
> +    }
> +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> +    {
> +      reg4 = reg2;
> +      reg3 = reg6;
> +      operands[6] = operands[3];
> +    }
> +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> +    {
> +      reg4 = reg6;
> +      reg3 = reg1;
> +      operands[6] = operands[4];
> +    }
> +  else
> +    {
> +      reg4 = reg6;
> +      reg3 = reg2;
> +      operands[6] = operands[4];
> +    }
> +
> +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> +
> +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> +  tmp2 = reg3 <any_logic2:logic_op> reg4;
> +  reg_mask = tmp1  <any_logic:logic_op> tmp2;
> +  reg_mask &= 0xFF;
> +
> +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> +  operands[5] = GEN_INT (reg_mask);
> +})
> +
> +(define_insn_and_split "*<avx512>_vpternlog<mode>_2"
> +  [(set (match_operand:V 0 "register_operand")
> +       (any_logic:V
> +         (any_logic1:V
> +           (any_logic2:V
> +             (match_operand:V 1 "reg_or_notreg_operand")
> +             (match_operand:V 2 "reg_or_notreg_operand"))
> +           (match_operand:V 3 "reg_or_notreg_operand"))
> +         (match_operand:V 4 "reg_or_notreg_operand")))]
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> +   && ix86_pre_reload_split ()
> +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (unspec:V
> +         [(match_dup 6)
> +          (match_dup 2)
> +          (match_dup 1)
> +          (match_dup 5)]
> +         UNSPEC_VTERNLOG))]
> +{
> +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> +  int reg6 = 0xF0;
> +  int reg2 = 0xCC;
> +  int reg1 = 0xAA;
> +  int reg3 = 0;
> +  int reg4 = 0;
> +  int reg_mask, tmp1, tmp2;
> +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> +    {
> +      reg4 = reg1;
> +      reg3 = reg6;
> +      operands[6] = operands[3];
> +    }
> +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> +    {
> +      reg4 = reg2;
> +      reg3 = reg6;
> +      operands[6] = operands[3];
> +    }
> +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> +    {
> +      reg4 = reg6;
> +      reg3 = reg1;
> +      operands[6] = operands[4];
> +    }
> +  else
> +    {
> +      reg4 = reg6;
> +      reg3 = reg2;
> +      operands[6] = operands[4];
> +    }
> +
> +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> +
> +  tmp1 = reg1 <any_logic2:logic_op> reg2;
> +  tmp2 = tmp1 <any_logic1:logic_op> reg3;
> +  reg_mask = tmp2 <any_logic:logic_op> reg4;
> +  reg_mask &= 0xFF;
> +
> +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> +  operands[5] = GEN_INT (reg_mask);
> +})
> +
> +(define_insn_and_split "*<avx512>_vpternlog<mode>_3"
> +  [(set (match_operand:V 0 "register_operand")
> +       (any_logic:V
> +         (any_logic1:V
> +           (match_operand:V 1 "reg_or_notreg_operand")
> +           (match_operand:V 2 "reg_or_notreg_operand"))
> +         (match_operand:V 3 "reg_or_notreg_operand")))]
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> +   && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (unspec:V
> +         [(match_dup 3)
> +          (match_dup 2)
> +          (match_dup 1)
> +          (match_dup 4)]
> +         UNSPEC_VTERNLOG))]
> +{
> +  /* VPTERNLOGD reg3, reg2, reg1, imm8.  */
> +  int reg3 = 0xF0;
> +  int reg2 = 0xCC;
> +  int reg1 = 0xAA;
> +  int reg_mask, tmp1;
> +
> +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> +
> +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> +  reg_mask = tmp1 <any_logic:logic_op> reg3;
> +  reg_mask &= 0xFF;
> +
> +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> +  operands[3] = ix86_strip_reg_or_notreg_operand (operands[3]);
> +  operands[4] = GEN_INT (reg_mask);
> +})
> +
> +
>  (define_insn "<avx512>_vternlog<mode>_mask"
>    [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
>         (vec_merge:VI48_AVX512VL
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> index 78bf5d33689..fbc3de08119 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> @@ -1,7 +1,8 @@
>  /* PR target/95524 */
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -mavx512bw" } */
> -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } }  */
> +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } }  */
> +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } }  */
>  typedef char v64qi  __attribute__ ((vector_size (64)));
>  typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
>
> @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a)
>    return a >> 2;
>  }
>  /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
> -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
>  /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
>
>  __attribute__((noipa)) v64qi
> diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> new file mode 100644
> index 00000000000..594093ecdde
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */
> +/* { dg-final { scan-assembler-not "vpxor" } } */
> +/* { dg-final { scan-assembler-not "vpor" } } */
> +/* { dg-final { scan-assembler-not "vpand" } } */
> +
> +#include<immintrin.h>
> +__m256d
> +__attribute__((noipa, target("avx512vl")))
> +copysign2_pd(__m256d from, __m256d to) {
> +  __m256i a = _mm256_castpd_si256(from);
> +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> +}
> +
> +__m256i
> +__attribute__((noipa, target("avx512vl")))
> +foo (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (src2 & ~src1) | (src3 & src1);
> +}
> +
> +__m256i
> +__attribute__ ((noipa, target("avx512vl")))
> +foo1 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (src2 & src1) | (src3 & ~src1);
> +}
> +
> +__m256i
> +__attribute__ ((noipa, target("avx512vl")))
> +foo2 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (src2 & src1) | (~src3 & src1);
> +}
> +
> +__m256i
> +__attribute__ ((noipa, target("avx512vl")))
> +foo3 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (~src2 & src1) | (src3 & src1);
> +}
> +
> +__m256i
> +__attribute__ ((noipa, target("avx512vl")))
> +foo4 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return src3 & src2 ^ src1;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> new file mode 100644
> index 00000000000..9d9759a8e1d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> @@ -0,0 +1,102 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
> +/* { dg-require-effective-target avx512vl } */
> +
> +#define AVX512VL
> +
> +#include "avx512f-helper.h"
> +
> +#include "pr101989-1.c"
> +__m256d
> +avx2_copysign2_pd (__m256d from, __m256d to) {
> +  __m256i a = _mm256_castpd_si256(from);
> +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> +}
> +
> +__m256i
> +avx2_foo (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (src2 & ~src1) | (src3 & src1);
> +}
> +
> +__m256i
> +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (src2 & src1) | (src3 & ~src1);
> +}
> +
> +__m256i
> +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (src2 & src1) | (~src3 & src1);
> +}
> +
> +__m256i
> +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return (~src2 & src1) | (src3 & src1);
> +}
> +
> +__m256i
> +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3)
> +{
> +  return src3 & src2 ^ src1;
> +}
> +
> +
> +void
> +test_256 (void)
> +{
> +  union256i_q q1, q2, q3, res2, exp2;
> +  union256d d1, d2, res1, exp1;
> +  int i, sign = 1;
> +
> +  for (i = 0; i < 4; i++)
> +    {
> +      d1.a[i] = 12.34 * (i + 2000) * sign;
> +      d2.a[i] = 56.78 * (i - 30) * sign;
> +      q1.a[i] = 12 * (i + 2000) * sign;
> +      q2.a[i] = 56 * (i - 30) * sign;
> +      q3.a[i] = 90 * (i + 40) * sign;
> +      res1.a[i] = DEFAULT_VALUE;
> +      exp1.a[i] = DEFAULT_VALUE;
> +      res2.a[i] = exp2.a[i] = -1;
> +      sign = -sign;
> +    }
> +
> +  exp1.x = avx2_copysign2_pd (d1.x, d2.x);
> +  res1.x = copysign2_pd (d1.x, d2.x);
> +  if (UNION_CHECK (256, d) (res1, exp1.a))
> +    abort ();
> +
> +  exp2.x = avx2_foo1 (q1.x, q2.x, q3.x);
> +  res2.x = foo1 (q1.x, q2.x, q3.x);
> +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> +    abort ();
> +
> +  exp2.x = avx2_foo2 (q1.x, q2.x, q3.x);
> +  res2.x = foo2 (q1.x, q2.x, q3.x);
> +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> +    abort ();
> +
> +  exp2.x = avx2_foo3 (q1.x, q2.x, q3.x);
> +  res2.x = foo3 (q1.x, q2.x, q3.x);
> +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> +    abort ();
> +
> +  exp2.x = avx2_foo4 (q1.x, q2.x, q3.x);
> +  res2.x = foo4 (q1.x, q2.x, q3.x);
> +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> +    abort ();
> +
> +  exp2.x = avx2_foo (q1.x, q2.x, q3.x);
> +  res2.x = foo (q1.x, q2.x, q3.x);
> +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> +    abort ();
> +}
> +
> +static void
> +test_128 ()
> +{}
> --
> 2.18.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction.
  2021-08-24  9:53 ` Hongtao Liu
@ 2021-08-24 13:10   ` Bernhard Reutner-Fischer
  2021-08-25  1:23     ` Hongtao Liu
  0 siblings, 1 reply; 4+ messages in thread
From: Bernhard Reutner-Fischer @ 2021-08-24 13:10 UTC (permalink / raw)
  To: Hongtao Liu via Gcc-patches; +Cc: rep.dot.nop, Hongtao Liu, liuhongt

On Tue, 24 Aug 2021 17:53:27 +0800
Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:

> On Tue, Aug 24, 2021 at 9:36 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Also optimize below 3 forms to vpternlog, op1, op2, op3 are
> > register_operand or unary_p as (not reg)

> > gcc/ChangeLog:
> >
> >         PR target/101989
> >         * config/i386/i386-protos.h
> >         (ix86_strip_reg_or_notreg_operand): New declare.

"New declaration."

> >         * config/i386/i386.c (ix86_rtx_costs): Define cost for
> >         UNSPEC_VTERNLOG.

I do not find many mentions of VTERNLOG in the docs I have here.
Is there a P missing, as in vPternlog?

> >         (ix86_strip_reg_or_notreg_operand): New function.  
> Push to trunk by changing ix86_strip_reg_or_notreg_operand to macro,
> function call seems too inefficient for the simple strip unary.
> >         * config/i386/predicates.md (reg_or_notreg_operand): New
> >         predicate.
> >         * config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn.
> >         (*<avx512>_vternlog<mode>_1): New pre_reload
> >         define_insn_and_split.
> >         (*<avx512>_vternlog<mode>_2): Ditto.
> >         (*<avx512>_vternlog<mode>_3): Ditto.

At least the above 3 insn_and_split patterns do have a 'p' in their names
in the md (vpternlog), unlike these ChangeLog entries.
Thanks,
> >         (any_logic1,any_logic2): New code iterator.
> >         (logic_op): New code attribute.
> >         (ternlogsuffix): Extend to VNxDF and VNxSF.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/101989
> >         * gcc.target/i386/pr101989-1.c: New test.
> >         * gcc.target/i386/pr101989-2.c: New test.
> >         * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.
> > ---
> >  gcc/config/i386/i386-protos.h                 |   1 +
> >  gcc/config/i386/i386.c                        |  13 +
> >  gcc/config/i386/predicates.md                 |   7 +
> >  gcc/config/i386/sse.md                        | 234 ++++++++++++++++++
> >  .../i386/avx512bw-shiftqihi-constant-1.c      |   4 +-
> >  gcc/testsuite/gcc.target/i386/pr101989-1.c    |  51 ++++
> >  gcc/testsuite/gcc.target/i386/pr101989-2.c    | 102 ++++++++
> >  7 files changed, 410 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c
> >
> > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > index 2fd13074c81..2bdaadcf4f3 100644
> > --- a/gcc/config/i386/i386-protos.h
> > +++ b/gcc/config/i386/i386-protos.h
> > @@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int);
> >  extern int standard_sse_constant_p (rtx, machine_mode);
> >  extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);
> >  extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);
> > +extern rtx ix86_strip_reg_or_notreg_operand (rtx);
> >  extern bool ix86_pre_reload_split (void);
> >  extern bool symbolic_reference_mentioned_p (rtx);
> >  extern bool extended_reg_mentioned_p (rtx);
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 46844fab08f..a69225ccc81 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
> >    return true;
> >  }
> >
> > +/* If OP is a unary operation (e.g. (not reg)), return its operand;
> > +   otherwise return OP unchanged.  */
> > +rtx
> > +ix86_strip_reg_or_notreg_operand (rtx op)
> > +{
> > +  return UNARY_P (op) ? XEXP (op, 0) : op;
> > +}
> > +
> >  /* Predicate for pre-reload splitters with associated instructions,
> >     which can match any time before the split1 pass (usually combine),
> >     then are unconditionally split in that pass and should not be
> > @@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
> >      case UNSPEC:
> >        if (XINT (x, 1) == UNSPEC_TP)
> >         *total = 0;
> > +      else if (XINT(x, 1) == UNSPEC_VTERNLOG)
> > +       {
> > +         *total = cost->sse_op;
> > +         return true;
> > +       }
> >        return false;
> >
> >      case VEC_SELECT:
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index 9321f332ef9..df5acb425d4 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand"
> >             (ior (match_test "op == const1_rtx")
> >                  (match_test "op == constm1_rtx")))))
> >
> > +;; True for registers, or (not: registers).  Used to optimize 3-operand
> > +;; bitwise operation.
> > +(define_predicate "reg_or_notreg_operand"
> > +  (ior (match_operand 0 "register_operand")
> > +       (and (match_code "not")
> > +           (match_test "register_operand (XEXP (op, 0), mode)"))))
> > +
> >  ;; True if OP is acceptable as operand of DImode shift expander.
> >  (define_predicate "shiftdi_operand"
> >    (if_then_else (match_test "TARGET_64BIT")
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 13889687793..0acd749d21c 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -933,7 +933,9 @@ (define_mode_attr iptr
> >  ;; Mapping of vector modes to VPTERNLOG suffix
> >  (define_mode_attr ternlogsuffix
> >    [(V8DI "q") (V4DI "q") (V2DI "q")
> > +   (V8DF "q") (V4DF "q") (V2DF "q")
> >     (V16SI "d") (V8SI "d") (V4SI "d")
> > +   (V16SF "d") (V8SF "d") (V4SF "d")
> >     (V32HI "d") (V16HI "d") (V8HI "d")
> >     (V64QI "d") (V32QI "d") (V16QI "d")])
> >
> > @@ -10041,6 +10043,238 @@ (define_insn "<avx512>_vternlog<mode><sd_maskz_name>"
> >     (set_attr "prefix" "evex")
> >     (set_attr "mode" "<sseinsnmode>")])
> >
> > +(define_insn "*<avx512>_vternlog<mode>_all"
> > +  [(set (match_operand:V 0 "register_operand" "=v")
> > +       (unspec:V
> > +         [(match_operand:V 1 "register_operand" "0")
> > +          (match_operand:V 2 "register_operand" "v")
> > +          (match_operand:V 3 "nonimmediate_operand" "vm")
> > +          (match_operand:SI 4 "const_0_to_255_operand")]
> > +         UNSPEC_VTERNLOG))]
> > +  "TARGET_AVX512F"
> > +  "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
> > +  [(set_attr "type" "sselog")
> > +   (set_attr "prefix" "evex")
> > +   (set_attr "mode" "<sseinsnmode>")])
> > +
> > +;; There must be lots of other combinations like
> > +;;
> > +;; (any_logic:V
> > +;;   (any_logic:V op1 op2)
> > +;;   (any_logic:V op1 op3))
> > +;;
> > +;; (any_logic:V
> > +;;   (any_logic:V
> > +;;     (any_logic:V op1, op2)
> > +;;     op3)
> > +;;   op1)
> > +;;
> > +;; and so on.
> > +
> > +(define_code_iterator any_logic1 [and ior xor])
> > +(define_code_iterator any_logic2 [and ior xor])
> > +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")])
> > +
> > +(define_insn_and_split "*<avx512>_vpternlog<mode>_1"
> > +  [(set (match_operand:V 0 "register_operand")
> > +       (any_logic:V
> > +         (any_logic1:V
> > +           (match_operand:V 1 "reg_or_notreg_operand")
> > +           (match_operand:V 2 "reg_or_notreg_operand"))
> > +         (any_logic2:V
> > +           (match_operand:V 3 "reg_or_notreg_operand")
> > +           (match_operand:V 4 "reg_or_notreg_operand"))))]
> > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > +   && ix86_pre_reload_split ()
> > +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> > +  "#"
> > +  "&& 1"
> > +  [(set (match_dup 0)
> > +       (unspec:V
> > +         [(match_dup 6)
> > +          (match_dup 2)
> > +          (match_dup 1)
> > +          (match_dup 5)]
> > +         UNSPEC_VTERNLOG))]
> > +{
> > +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> > +  int reg6 = 0xF0;
> > +  int reg2 = 0xCC;
> > +  int reg1 = 0xAA;
> > +  int reg3 = 0;
> > +  int reg4 = 0;
> > +  int reg_mask, tmp1, tmp2;
> > +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> > +    {
> > +      reg4 = reg1;
> > +      reg3 = reg6;
> > +      operands[6] = operands[3];
> > +    }
> > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> > +    {
> > +      reg4 = reg2;
> > +      reg3 = reg6;
> > +      operands[6] = operands[3];
> > +    }
> > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> > +    {
> > +      reg4 = reg6;
> > +      reg3 = reg1;
> > +      operands[6] = operands[4];
> > +    }
> > +  else
> > +    {
> > +      reg4 = reg6;
> > +      reg3 = reg2;
> > +      operands[6] = operands[4];
> > +    }
> > +
> > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> > +
> > +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> > +  tmp2 = reg3 <any_logic2:logic_op> reg4;
> > +  reg_mask = tmp1  <any_logic:logic_op> tmp2;
> > +  reg_mask &= 0xFF;
> > +
> > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> > +  operands[5] = GEN_INT (reg_mask);
> > +})
> > +
> > +(define_insn_and_split "*<avx512>_vpternlog<mode>_2"
> > +  [(set (match_operand:V 0 "register_operand")
> > +       (any_logic:V
> > +         (any_logic1:V
> > +           (any_logic2:V
> > +             (match_operand:V 1 "reg_or_notreg_operand")
> > +             (match_operand:V 2 "reg_or_notreg_operand"))
> > +           (match_operand:V 3 "reg_or_notreg_operand"))
> > +         (match_operand:V 4 "reg_or_notreg_operand")))]
> > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > +   && ix86_pre_reload_split ()
> > +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> > +  "#"
> > +  "&& 1"
> > +  [(set (match_dup 0)
> > +       (unspec:V
> > +         [(match_dup 6)
> > +          (match_dup 2)
> > +          (match_dup 1)
> > +          (match_dup 5)]
> > +         UNSPEC_VTERNLOG))]
> > +{
> > +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> > +  int reg6 = 0xF0;
> > +  int reg2 = 0xCC;
> > +  int reg1 = 0xAA;
> > +  int reg3 = 0;
> > +  int reg4 = 0;
> > +  int reg_mask, tmp1, tmp2;
> > +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> > +    {
> > +      reg4 = reg1;
> > +      reg3 = reg6;
> > +      operands[6] = operands[3];
> > +    }
> > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> > +    {
> > +      reg4 = reg2;
> > +      reg3 = reg6;
> > +      operands[6] = operands[3];
> > +    }
> > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> > +    {
> > +      reg4 = reg6;
> > +      reg3 = reg1;
> > +      operands[6] = operands[4];
> > +    }
> > +  else
> > +    {
> > +      reg4 = reg6;
> > +      reg3 = reg2;
> > +      operands[6] = operands[4];
> > +    }
> > +
> > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> > +
> > +  tmp1 = reg1 <any_logic2:logic_op> reg2;
> > +  tmp2 = tmp1 <any_logic1:logic_op> reg3;
> > +  reg_mask = tmp2 <any_logic:logic_op> reg4;
> > +  reg_mask &= 0xFF;
> > +
> > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> > +  operands[5] = GEN_INT (reg_mask);
> > +})
> > +
> > +(define_insn_and_split "*<avx512>_vpternlog<mode>_3"
> > +  [(set (match_operand:V 0 "register_operand")
> > +       (any_logic:V
> > +         (any_logic1:V
> > +           (match_operand:V 1 "reg_or_notreg_operand")
> > +           (match_operand:V 2 "reg_or_notreg_operand"))
> > +         (match_operand:V 3 "reg_or_notreg_operand")))]
> > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > +   && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(set (match_dup 0)
> > +       (unspec:V
> > +         [(match_dup 3)
> > +          (match_dup 2)
> > +          (match_dup 1)
> > +          (match_dup 4)]
> > +         UNSPEC_VTERNLOG))]
> > +{
> > +  /* VPTERNLOGD reg3, reg2, reg1, imm8.  */
> > +  int reg3 = 0xF0;
> > +  int reg2 = 0xCC;
> > +  int reg1 = 0xAA;
> > +  int reg_mask, tmp1;
> > +
> > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > +
> > +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> > +  reg_mask = tmp1 <any_logic:logic_op> reg3;
> > +  reg_mask &= 0xFF;
> > +
> > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > +  operands[3] = ix86_strip_reg_or_notreg_operand (operands[3]);
> > +  operands[4] = GEN_INT (reg_mask);
> > +})
> > +
> > +
> >  (define_insn "<avx512>_vternlog<mode>_mask"
> >    [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> >         (vec_merge:VI48_AVX512VL
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > index 78bf5d33689..fbc3de08119 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > @@ -1,7 +1,8 @@
> >  /* PR target/95524 */
> >  /* { dg-do compile } */
> >  /* { dg-options "-O2 -mavx512bw" } */
> > -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } }  */
> > +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } }  */
> > +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } }  */
> >  typedef char v64qi  __attribute__ ((vector_size (64)));
> >  typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
> >
> > @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a)
> >    return a >> 2;
> >  }
> >  /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
> > -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
> >  /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
> >
> >  __attribute__((noipa)) v64qi
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> > new file mode 100644
> > index 00000000000..594093ecdde
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> > @@ -0,0 +1,51 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */
> > +/* { dg-final { scan-assembler-not "vpxor" } } */
> > +/* { dg-final { scan-assembler-not "vpor" } } */
> > +/* { dg-final { scan-assembler-not "vpand" } } */
> > +
> > +#include<immintrin.h>
> > +__m256d
> > +__attribute__((noipa, target("avx512vl")))
> > +copysign2_pd(__m256d from, __m256d to) {
> > +  __m256i a = _mm256_castpd_si256(from);
> > +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> > +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> > +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> > +}
> > +
> > +__m256i
> > +__attribute__((noipa, target("avx512vl")))
> > +foo (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (src2 & ~src1) | (src3 & src1);
> > +}
> > +
> > +__m256i
> > +__attribute__ ((noipa, target("avx512vl")))
> > +foo1 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (src2 & src1) | (src3 & ~src1);
> > +}
> > +
> > +__m256i
> > +__attribute__ ((noipa, target("avx512vl")))
> > +foo2 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (src2 & src1) | (~src3 & src1);
> > +}
> > +
> > +__m256i
> > +__attribute__ ((noipa, target("avx512vl")))
> > +foo3 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (~src2 & src1) | (src3 & src1);
> > +}
> > +
> > +__m256i
> > +__attribute__ ((noipa, target("avx512vl")))
> > +foo4 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return src3 & src2 ^ src1;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> > new file mode 100644
> > index 00000000000..9d9759a8e1d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> > @@ -0,0 +1,102 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
> > +/* { dg-require-effective-target avx512vl } */
> > +
> > +#define AVX512VL
> > +
> > +#include "avx512f-helper.h"
> > +
> > +#include "pr101989-1.c"
> > +__m256d
> > +avx2_copysign2_pd (__m256d from, __m256d to) {
> > +  __m256i a = _mm256_castpd_si256(from);
> > +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> > +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> > +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> > +}
> > +
> > +__m256i
> > +avx2_foo (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (src2 & ~src1) | (src3 & src1);
> > +}
> > +
> > +__m256i
> > +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (src2 & src1) | (src3 & ~src1);
> > +}
> > +
> > +__m256i
> > +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (src2 & src1) | (~src3 & src1);
> > +}
> > +
> > +__m256i
> > +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return (~src2 & src1) | (src3 & src1);
> > +}
> > +
> > +__m256i
> > +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3)
> > +{
> > +  return src3 & src2 ^ src1;
> > +}
> > +
> > +
> > +void
> > +test_256 (void)
> > +{
> > +  union256i_q q1, q2, q3, res2, exp2;
> > +  union256d d1, d2, res1, exp1;
> > +  int i, sign = 1;
> > +
> > +  for (i = 0; i < 4; i++)
> > +    {
> > +      d1.a[i] = 12.34 * (i + 2000) * sign;
> > +      d2.a[i] = 56.78 * (i - 30) * sign;
> > +      q1.a[i] = 12 * (i + 2000) * sign;
> > +      q2.a[i] = 56 * (i - 30) * sign;
> > +      q3.a[i] = 90 * (i + 40) * sign;
> > +      res1.a[i] = DEFAULT_VALUE;
> > +      exp1.a[i] = DEFAULT_VALUE;
> > +      res2.a[i] = exp2.a[i] = -1;
> > +      sign = -sign;
> > +    }
> > +
> > +  exp1.x = avx2_copysign2_pd (d1.x, d2.x);
> > +  res1.x = copysign2_pd (d1.x, d2.x);
> > +  if (UNION_CHECK (256, d) (res1, exp1.a))
> > +    abort ();
> > +
> > +  exp2.x = avx2_foo1 (q1.x, q2.x, q3.x);
> > +  res2.x = foo1 (q1.x, q2.x, q3.x);
> > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > +    abort ();
> > +
> > +  exp2.x = avx2_foo2 (q1.x, q2.x, q3.x);
> > +  res2.x = foo2 (q1.x, q2.x, q3.x);
> > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > +    abort ();
> > +
> > +  exp2.x = avx2_foo3 (q1.x, q2.x, q3.x);
> > +  res2.x = foo3 (q1.x, q2.x, q3.x);
> > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > +    abort ();
> > +
> > +  exp2.x = avx2_foo4 (q1.x, q2.x, q3.x);
> > +  res2.x = foo4 (q1.x, q2.x, q3.x);
> > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > +    abort ();
> > +
> > +  exp2.x = avx2_foo (q1.x, q2.x, q3.x);
> > +  res2.x = foo (q1.x, q2.x, q3.x);
> > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > +    abort ();
> > +}
> > +
> > +static void
> > +test_128 ()
> > +{}
> > --
> > 2.18.1
> >  
> 
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction.
  2021-08-24 13:10   ` Bernhard Reutner-Fischer
@ 2021-08-25  1:23     ` Hongtao Liu
  0 siblings, 0 replies; 4+ messages in thread
From: Hongtao Liu @ 2021-08-25  1:23 UTC (permalink / raw)
  To: Bernhard Reutner-Fischer; +Cc: Hongtao Liu via Gcc-patches, liuhongt

On Tue, Aug 24, 2021 at 9:11 PM Bernhard Reutner-Fischer
<rep.dot.nop@gmail.com> wrote:
>
> On Tue, 24 Aug 2021 17:53:27 +0800
> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> > On Tue, Aug 24, 2021 at 9:36 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > Also optimize below 3 forms to vpternlog, op1, op2, op3 are
> > > register_operand or unary_p as (not reg)
>
> > > gcc/ChangeLog:
> > >
> > >         PR target/101989
> > >         * config/i386/i386-protos.h
> > >         (ix86_strip_reg_or_notreg_operand): New declare.
>
> "New declaration."
>
> > >         * config/i386/i386.c (ix86_rtx_costs): Define cost for
> > >         UNSPEC_VTERNLOG.
>
> I do not see a considerable amount of VTERNLOG in the docs I have here.
> Is there a P missing in vPternlog?
The output assembly is vpternlog, while the internal pattern name was
originally vternlog (it is not clear why it is not called vpternlog; perhaps
it is an abbreviation of "vector ternary logic").  I added the new
define_insn_and_split patterns simply to stay in line with the original name.
>
> > >         (ix86_strip_reg_or_notreg_operand): New function.
> > Pushed to trunk with ix86_strip_reg_or_notreg_operand changed to a macro;
> > a function call seems too inefficient for simply stripping a unary operation.
> > >         * config/i386/predicates.md (reg_or_notreg_operand): New
> > >         predicate.
> > >         * config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn.
> > >         (*<avx512>_vternlog<mode>_1): New pre_reload
> > >         define_insn_and_split.
> > >         (*<avx512>_vternlog<mode>_2): Ditto.
> > >         (*<avx512>_vternlog<mode>_3): Ditto.
>
> At least the above 3 insn_and_splits do have a 'p' in the md.
> thanks,
> > >         (any_logic1,any_logic2): New code iterator.
> > >         (logic_op): New code attribute.
> > >         (ternlogsuffix): Extend to VNxDF and VNxSF.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/101989
> > >         * gcc.target/i386/pr101989-1.c: New test.
> > >         * gcc.target/i386/pr101989-2.c: New test.
> > >         * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.
> > > ---
> > >  gcc/config/i386/i386-protos.h                 |   1 +
> > >  gcc/config/i386/i386.c                        |  13 +
> > >  gcc/config/i386/predicates.md                 |   7 +
> > >  gcc/config/i386/sse.md                        | 234 ++++++++++++++++++
> > >  .../i386/avx512bw-shiftqihi-constant-1.c      |   4 +-
> > >  gcc/testsuite/gcc.target/i386/pr101989-1.c    |  51 ++++
> > >  gcc/testsuite/gcc.target/i386/pr101989-2.c    | 102 ++++++++
> > >  7 files changed, 410 insertions(+), 2 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c
> > >
> > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > > index 2fd13074c81..2bdaadcf4f3 100644
> > > --- a/gcc/config/i386/i386-protos.h
> > > +++ b/gcc/config/i386/i386-protos.h
> > > @@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int);
> > >  extern int standard_sse_constant_p (rtx, machine_mode);
> > >  extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);
> > >  extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);
> > > +extern rtx ix86_strip_reg_or_notreg_operand (rtx);
> > >  extern bool ix86_pre_reload_split (void);
> > >  extern bool symbolic_reference_mentioned_p (rtx);
> > >  extern bool extended_reg_mentioned_p (rtx);
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index 46844fab08f..a69225ccc81 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
> > >    return true;
> > >  }
> > >
> > > +/* Strip an outer unary operation from OP: if OP is a unary rtx such as
> > > +   (not reg), return its operand; otherwise return OP unchanged.  */
> > > +rtx
> > > +ix86_strip_reg_or_notreg_operand (rtx op)
> > > +{
> > > +  return UNARY_P (op) ? XEXP (op, 0) : op;
> > > +}
> > > +
> > >  /* Predicate for pre-reload splitters with associated instructions,
> > >     which can match any time before the split1 pass (usually combine),
> > >     then are unconditionally split in that pass and should not be
> > > @@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
> > >      case UNSPEC:
> > >        if (XINT (x, 1) == UNSPEC_TP)
> > >         *total = 0;
> > > +      else if (XINT(x, 1) == UNSPEC_VTERNLOG)
> > > +       {
> > > +         *total = cost->sse_op;
> > > +         return true;
> > > +       }
> > >        return false;
> > >
> > >      case VEC_SELECT:
> > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > > index 9321f332ef9..df5acb425d4 100644
> > > --- a/gcc/config/i386/predicates.md
> > > +++ b/gcc/config/i386/predicates.md
> > > @@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand"
> > >             (ior (match_test "op == const1_rtx")
> > >                  (match_test "op == constm1_rtx")))))
> > >
> > > +;; True for a register, or for (not reg).  Used to optimize 3-operand
> > > +;; bitwise operations.
> > > +(define_predicate "reg_or_notreg_operand"
> > > +  (ior (match_operand 0 "register_operand")
> > > +       (and (match_code "not")
> > > +           (match_test "register_operand (XEXP (op, 0), mode)"))))
> > > +
> > >  ;; True if OP is acceptable as operand of DImode shift expander.
> > >  (define_predicate "shiftdi_operand"
> > >    (if_then_else (match_test "TARGET_64BIT")
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 13889687793..0acd749d21c 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -933,7 +933,9 @@ (define_mode_attr iptr
> > >  ;; Mapping of vector modes to VPTERNLOG suffix
> > >  (define_mode_attr ternlogsuffix
> > >    [(V8DI "q") (V4DI "q") (V2DI "q")
> > > +   (V8DF "q") (V4DF "q") (V2DF "q")
> > >     (V16SI "d") (V8SI "d") (V4SI "d")
> > > +   (V16SF "d") (V8SF "d") (V4SF "d")
> > >     (V32HI "d") (V16HI "d") (V8HI "d")
> > >     (V64QI "d") (V32QI "d") (V16QI "d")])
> > >
> > > @@ -10041,6 +10043,238 @@ (define_insn "<avx512>_vternlog<mode><sd_maskz_name>"
> > >     (set_attr "prefix" "evex")
> > >     (set_attr "mode" "<sseinsnmode>")])
> > >
> > > +(define_insn "*<avx512>_vternlog<mode>_all"
> > > +  [(set (match_operand:V 0 "register_operand" "=v")
> > > +       (unspec:V
> > > +         [(match_operand:V 1 "register_operand" "0")
> > > +          (match_operand:V 2 "register_operand" "v")
> > > +          (match_operand:V 3 "nonimmediate_operand" "vm")
> > > +          (match_operand:SI 4 "const_0_to_255_operand")]
> > > +         UNSPEC_VTERNLOG))]
> > > +  "TARGET_AVX512F"
> > > +  "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
> > > +  [(set_attr "type" "sselog")
> > > +   (set_attr "prefix" "evex")
> > > +   (set_attr "mode" "<sseinsnmode>")])
> > > +
> > > +;; There must be lots of other combinations like
> > > +;;
> > > +;; (any_logic:V
> > > +;;   (any_logic:V op1 op2)
> > > +;;   (any_logic:V op1 op3))
> > > +;;
> > > +;; (any_logic:V
> > > +;;   (any_logic:V
> > > +;;     (any_logic:V op1, op2)
> > > +;;     op3)
> > > +;;   op1)
> > > +;;
> > > +;; and so on.
> > > +
> > > +(define_code_iterator any_logic1 [and ior xor])
> > > +(define_code_iterator any_logic2 [and ior xor])
> > > +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")])
> > > +
> > > +(define_insn_and_split "*<avx512>_vpternlog<mode>_1"
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +       (any_logic:V
> > > +         (any_logic1:V
> > > +           (match_operand:V 1 "reg_or_notreg_operand")
> > > +           (match_operand:V 2 "reg_or_notreg_operand"))
> > > +         (any_logic2:V
> > > +           (match_operand:V 3 "reg_or_notreg_operand")
> > > +           (match_operand:V 4 "reg_or_notreg_operand"))))]
> > > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > > +   && ix86_pre_reload_split ()
> > > +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(set (match_dup 0)
> > > +       (unspec:V
> > > +         [(match_dup 6)
> > > +          (match_dup 2)
> > > +          (match_dup 1)
> > > +          (match_dup 5)]
> > > +         UNSPEC_VTERNLOG))]
> > > +{
> > > +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> > > +  int reg6 = 0xF0;
> > > +  int reg2 = 0xCC;
> > > +  int reg1 = 0xAA;
> > > +  int reg3 = 0;
> > > +  int reg4 = 0;
> > > +  int reg_mask, tmp1, tmp2;
> > > +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg1;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg2;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg1;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +  else
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg2;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +
> > > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > > +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> > > +
> > > +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> > > +  tmp2 = reg3 <any_logic2:logic_op> reg4;
> > > +  reg_mask = tmp1  <any_logic:logic_op> tmp2;
> > > +  reg_mask &= 0xFF;
> > > +
> > > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > > +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> > > +  operands[5] = GEN_INT (reg_mask);
> > > +})
> > > +
> > > +(define_insn_and_split "*<avx512>_vpternlog<mode>_2"
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +       (any_logic:V
> > > +         (any_logic1:V
> > > +           (any_logic2:V
> > > +             (match_operand:V 1 "reg_or_notreg_operand")
> > > +             (match_operand:V 2 "reg_or_notreg_operand"))
> > > +           (match_operand:V 3 "reg_or_notreg_operand"))
> > > +         (match_operand:V 4 "reg_or_notreg_operand")))]
> > > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > > +   && ix86_pre_reload_split ()
> > > +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(set (match_dup 0)
> > > +       (unspec:V
> > > +         [(match_dup 6)
> > > +          (match_dup 2)
> > > +          (match_dup 1)
> > > +          (match_dup 5)]
> > > +         UNSPEC_VTERNLOG))]
> > > +{
> > > +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> > > +  int reg6 = 0xF0;
> > > +  int reg2 = 0xCC;
> > > +  int reg1 = 0xAA;
> > > +  int reg3 = 0;
> > > +  int reg4 = 0;
> > > +  int reg_mask, tmp1, tmp2;
> > > +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg1;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg2;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg1;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +  else
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg2;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +
> > > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > > +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> > > +
> > > +  tmp1 = reg1 <any_logic2:logic_op> reg2;
> > > +  tmp2 = tmp1 <any_logic1:logic_op> reg3;
> > > +  reg_mask = tmp2 <any_logic:logic_op> reg4;
> > > +  reg_mask &= 0xFF;
> > > +
> > > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > > +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> > > +  operands[5] = GEN_INT (reg_mask);
> > > +})
> > > +
> > > +(define_insn_and_split "*<avx512>_vpternlog<mode>_3"
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +       (any_logic:V
> > > +         (any_logic1:V
> > > +           (match_operand:V 1 "reg_or_notreg_operand")
> > > +           (match_operand:V 2 "reg_or_notreg_operand"))
> > > +         (match_operand:V 3 "reg_or_notreg_operand")))]
> > > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > > +   && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(set (match_dup 0)
> > > +       (unspec:V
> > > +         [(match_dup 3)
> > > +          (match_dup 2)
> > > +          (match_dup 1)
> > > +          (match_dup 4)]
> > > +         UNSPEC_VTERNLOG))]
> > > +{
> > > +  /* VPTERNLOGD reg3, reg2, reg1, imm8.  */
> > > +  int reg3 = 0xF0;
> > > +  int reg2 = 0xCC;
> > > +  int reg1 = 0xAA;
> > > +  int reg_mask, tmp1;
> > > +
> > > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > > +
> > > +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> > > +  reg_mask = tmp1 <any_logic:logic_op> reg3;
> > > +  reg_mask &= 0xFF;
> > > +
> > > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > > +  operands[3] = ix86_strip_reg_or_notreg_operand (operands[3]);
> > > +  operands[4] = GEN_INT (reg_mask);
> > > +})
> > > +
> > > +
> > >  (define_insn "<avx512>_vternlog<mode>_mask"
> > >    [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> > >         (vec_merge:VI48_AVX512VL
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > > index 78bf5d33689..fbc3de08119 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > > @@ -1,7 +1,8 @@
> > >  /* PR target/95524 */
> > >  /* { dg-do compile } */
> > >  /* { dg-options "-O2 -mavx512bw" } */
> > > -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } }  */
> > > +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } }  */
> > > +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } }  */
> > >  typedef char v64qi  __attribute__ ((vector_size (64)));
> > >  typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
> > >
> > > @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a)
> > >    return a >> 2;
> > >  }
> > >  /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
> > >
> > >  __attribute__((noipa)) v64qi
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> > > new file mode 100644
> > > index 00000000000..594093ecdde
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> > > @@ -0,0 +1,51 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2" } */
> > > +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */
> > > +/* { dg-final { scan-assembler-not "vpxor" } } */
> > > +/* { dg-final { scan-assembler-not "vpor" } } */
> > > +/* { dg-final { scan-assembler-not "vpand" } } */
> > > +
> > > +#include<immintrin.h>
> > > +__m256d
> > > +__attribute__((noipa, target("avx512vl")))
> > > +copysign2_pd(__m256d from, __m256d to) {
> > > +  __m256i a = _mm256_castpd_si256(from);
> > > +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> > > +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> > > +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__((noipa, target("avx512vl")))
> > > +foo (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & ~src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo1 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (src3 & ~src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo2 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (~src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo3 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (~src2 & src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo4 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return src3 & src2 ^ src1;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> > > new file mode 100644
> > > index 00000000000..9d9759a8e1d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> > > @@ -0,0 +1,102 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
> > > +/* { dg-require-effective-target avx512vl } */
> > > +
> > > +#define AVX512VL
> > > +
> > > +#include "avx512f-helper.h"
> > > +
> > > +#include "pr101989-1.c"
> > > +__m256d
> > > +avx2_copysign2_pd (__m256d from, __m256d to) {
> > > +  __m256i a = _mm256_castpd_si256(from);
> > > +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> > > +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> > > +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & ~src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (src3 & ~src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (~src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (~src2 & src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return src3 & src2 ^ src1;
> > > +}
> > > +
> > > +
> > > +void
> > > +test_256 (void)
> > > +{
> > > +  union256i_q q1, q2, q3, res2, exp2;
> > > +  union256d d1, d2, res1, exp1;
> > > +  int i, sign = 1;
> > > +
> > > +  for (i = 0; i < 4; i++)
> > > +    {
> > > +      d1.a[i] = 12.34 * (i + 2000) * sign;
> > > +      d2.a[i] = 56.78 * (i - 30) * sign;
> > > +      q1.a[i] = 12 * (i + 2000) * sign;
> > > +      q2.a[i] = 56 * (i - 30) * sign;
> > > +      q3.a[i] = 90 * (i + 40) * sign;
> > > +      res1.a[i] = DEFAULT_VALUE;
> > > +      exp1.a[i] = DEFAULT_VALUE;
> > > +      res2.a[i] = exp2.a[i] = -1;
> > > +      sign = -sign;
> > > +    }
> > > +
> > > +  exp1.x = avx2_copysign2_pd (d1.x, d2.x);
> > > +  res1.x = copysign2_pd (d1.x, d2.x);
> > > +  if (UNION_CHECK (256, d) (res1, exp1.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo1 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo1 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo2 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo2 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo3 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo3 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo4 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo4 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo (q1.x, q2.x, q3.x);
> > > +  res2.x = foo (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +}
> > > +
> > > +static void
> > > +test_128 ()
> > > +{}
> > > --
> > > 2.18.1
> > >
> >
> >
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-08-25  1:18 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-24  1:36 [PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction liuhongt
2021-08-24  9:53 ` Hongtao Liu
2021-08-24 13:10   ` Bernhard Reutner-Fischer
2021-08-25  1:23     ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).