public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Replace invariant ternlog operands
@ 2023-07-25 18:11 Yan Simonaytes
  2023-07-27  3:00 ` Liu, Hongtao
  0 siblings, 1 reply; 4+ messages in thread
From: Yan Simonaytes @ 2023-07-25 18:11 UTC (permalink / raw)
  To: gcc-patches; +Cc: Hongtao Liu, Uros Bizjak, Yan Simonaytes

Sometimes GCC generates a ternlog with three operands even though the result is
invariant with respect to some of them (i.e. they do not affect the output).
For example:

vpternlogq	$252, %zmm2, %zmm1, %zmm0

In this case the zmm2 register isn't used by ternlog (252 = 0xfc depends only
on zmm0 and zmm1), so we should replace zmm2 with zmm0 or zmm1:

vpternlogq	$252, %zmm0, %zmm1, %zmm0

When the third operand of ternlog is memory and the result is invariant with
respect to both other operands, we should first load the memory operand into a
register and then use that register in place of the first and second operands.
So instead of

vpternlogq	$85, (%rdi), %zmm1, %zmm0

Should emit

vmovdqa64	(%rdi), %zmm0
vpternlogq	$85, %zmm0, %zmm0, %zmm0

gcc/ChangeLog:

        * config/i386/i386.cc (ternlog_invariant_operand_mask): New helper
	function for replacing invariant operands.
        (reduce_ternlog_operands): Likewise.
        * config/i386/i386-protos.h (ternlog_invariant_operand_mask): Prototype here.
        (reduce_ternlog_operands): Likewise.
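        * config/i386/sse.md: New define_splits for UNSPEC_VTERNLOG that use
	ternlog_invariant_operand_mask and reduce_ternlog_operands.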

gcc/testsuite/ChangeLog:

        * gcc.target/i386/reduce-ternlog-operands-1.c: New test.
        * gcc.target/i386/reduce-ternlog-operands-2.c: New test.
---
 gcc/config/i386/i386-protos.h                 |  2 +
 gcc/config/i386/i386.cc                       | 45 +++++++++++++++++++
 gcc/config/i386/sse.md                        | 43 ++++++++++++++++++
 .../i386/reduce-ternlog-operands-1.c          | 20 +++++++++
 .../i386/reduce-ternlog-operands-2.c          | 11 +++++
 5 files changed, 121 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-2.c

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 27fe73ca65c..49398ef9936 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -57,6 +57,8 @@ extern int standard_80387_constant_p (rtx);
 extern const char *standard_80387_constant_opcode (rtx);
 extern rtx standard_80387_constant_rtx (int);
 extern int standard_sse_constant_p (rtx, machine_mode);
+extern int ternlog_invariant_operand_mask (rtx *operands);
+extern void reduce_ternlog_operands (rtx *operands);
 extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);
 extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);
 extern bool ix86_pre_reload_split (void);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f0d6167e667..140de478571 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -5070,6 +5070,51 @@ ix86_check_no_addr_space (rtx insn)
     }
   return true;
 }
+
+/* Return mask of invariant operands:
+   bit number     0 1 2
+   operand number 1 2 3.  */
+
+int
+ternlog_invariant_operand_mask (rtx *operands)
+{
+  int mask = 0;
+  int imm8 = XINT (operands[4], 0);
+
+  if (((imm8 >> 4) & 0xF) == (imm8 & 0xF))
+    mask |= 1;
+  if (((imm8 >> 2) & 0x33) == (imm8 & 0x33))
+    mask |= (1 << 1);
+  if (((imm8 >> 1) & 0x55) == (imm8 & 0x55))
+    mask |= (1 << 2);
+
+  return mask;
+}
+
+/* Replace one of the unused operators with the one used.  */
+
+void
+reduce_ternlog_operands (rtx *operands)
+{
+  int mask = ternlog_invariant_operand_mask (operands);
+
+  if (mask & 1) /* the first operand is invariant.  */
+    operands[1] = operands[2];
+
+  if (mask & 2) /* the second operand is invariant.  */
+    operands[2] = operands[1];
+
+  if (mask & 4)	/* the third operand is invariant.  */
+   operands[3] = operands[1];
+  else if (!MEM_P (operands[3]))
+    {
+      if (mask & 1) /* the first operand is invariant.  */
+	operands[1] = operands[3];
+      if (mask & 2) /* the second operands is invariant.  */
+	operands[2] = operands[3];
+    }
+}
+
 \f
 /* Initialize the table of extra 80387 mathematical constants.  */
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a2099373123..f88d82b315c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12625,6 +12625,49 @@
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
+;; If the first and the second operands of ternlog are invariant and
+;; the third operand is memory
+;; then we should add load third operand from memory to register and
+;; replace first and second operands with this register
+(define_split
+  [(set (match_operand:V 0 "register_operand")
+	(unspec:V
+	  [(match_operand:V 1 "register_operand")
+	   (match_operand:V 2 "register_operand")
+	   (match_operand:V 3 "memory_operand")
+	   (match_operand:SI 4 "const_0_to_255_operand")]
+	  UNSPEC_VTERNLOG))]
+  "ternlog_invariant_operand_mask (operands) == 3 && !reload_completed"
+  [(set (match_dup 0)
+	(match_dup 3))
+   (set (match_dup 0)
+	(unspec:V
+	  [(match_dup 0)
+	   (match_dup 0)
+	   (match_dup 0)
+	   (match_dup 4)]
+	  UNSPEC_VTERNLOG))])
+
+;; Replace invariant ternlog operands with used operands
+;; (except for the case discussed in the previous define_split)
+(define_split
+  [(set (match_operand:V 0 "register_operand")
+	(unspec:V
+	  [(match_operand:V 1 "register_operand")
+	   (match_operand:V 2 "register_operand")
+	   (match_operand:V 3 "nonimmediate_operand")
+	   (match_operand:SI 4 "const_0_to_255_operand")]
+	  UNSPEC_VTERNLOG))]
+  "ternlog_invariant_operand_mask (operands) != 0 && !reload_completed"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)]
+	  UNSPEC_VTERNLOG))]
+  "reduce_ternlog_operands (operands);")
+
 ;; There must be lots of other combinations like
 ;;
 ;; (any_logic:V
diff --git a/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-1.c b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-1.c
new file mode 100644
index 00000000000..a7063df9dcb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times {vmovdqa*} "4" } } */
+
+#include <immintrin.h>
+
+__m512i f(__m512i* a, __m512i* b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], 119);
+}
+
+__m512i g(__m512i* a, __m512i* b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], 250);
+}
+
+__m512i h(__m512i* a, __m512i* b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], 252);
+}
diff --git a/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-2.c b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-2.c
new file mode 100644
index 00000000000..b44986cc259
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler "vpternlog.*0.*0.*0" } } */
+
+#include <immintrin.h>
+
+__m512i f(__m512i a, __m512i b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a, b, c[0], 0x55);
+}
+
-- 
2.34.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH] Replace invariant ternlog operands
  2023-07-25 18:11 [PATCH] Replace invariant ternlog operands Yan Simonaytes
@ 2023-07-27  3:00 ` Liu, Hongtao
  2023-08-03 17:30   ` Alexander Monakov
  0 siblings, 1 reply; 4+ messages in thread
From: Liu, Hongtao @ 2023-07-27  3:00 UTC (permalink / raw)
  To: Yan Simonaytes, gcc-patches; +Cc: Uros Bizjak



> -----Original Message-----
> From: Yan Simonaytes <simonaytes.yan@ispras.ru>
> Sent: Wednesday, July 26, 2023 2:11 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Liu, Hongtao <hongtao.liu@intel.com>; Uros Bizjak <ubizjak@gmail.com>;
> Yan Simonaytes <simonaytes.yan@ispras.ru>
> Subject: [PATCH] Replace invariant ternlog operands
> 
> Sometimes GCC generates ternlog with three operands, but some of them are
> invariant.
> For example:
> 
> vpternlogq	$252, %zmm2, %zmm1, %zmm0
> 
> In this case zmm1 register isnt used by ternlog.
> So should replace zmm1 with zmm0 or zmm2:
> 
> vpternlogq	$252, %zmm0, %zmm1, %zmm0
> 
> When the third operand of ternlog is memory and both others are invariant
> should add load instruction from this memory to register and replace the first
> and the second operands to this register.
> So insted of
> 
> vpternlogq	$85, (%rdi), %zmm1, %zmm0
> 
> Should emit
> 
> vmovdqa64	(%rdi), %zmm0
> vpternlogq	$85, %zmm0, %zmm0, %zmm0
> 
> gcc/ChangeLog:
> 
>         * config/i386/i386.cc (ternlog_invariant_operand_mask): New helper
> 	function for replacing invariant operands.
>         (reduce_ternlog_operands): Likewise.
>         * config/i386/i386-protos.h (ternlog_invariant_operand_mask):
> Prototype here.
>         (reduce_ternlog_operands): Likewise.
>         * config/i386/sse.md:
> 
> gcc/testsuite/ChangeLog:
> 
>         * gcc.target/i386/reduce-ternlog-operands-1.c: New test.
>         * gcc.target/i386/reduce-ternlog-operands-2.c: New test.
> ---
>  gcc/config/i386/i386-protos.h                 |  2 +
>  gcc/config/i386/i386.cc                       | 45 +++++++++++++++++++
>  gcc/config/i386/sse.md                        | 43 ++++++++++++++++++
>  .../i386/reduce-ternlog-operands-1.c          | 20 +++++++++
>  .../i386/reduce-ternlog-operands-2.c          | 11 +++++
>  5 files changed, 121 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-
> 1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-
> 2.c
> 
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 27fe73ca65c..49398ef9936 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -57,6 +57,8 @@ extern int standard_80387_constant_p (rtx);  extern
> const char *standard_80387_constant_opcode (rtx);  extern rtx
> standard_80387_constant_rtx (int);  extern int standard_sse_constant_p (rtx,
> machine_mode);
> +extern int ternlog_invariant_operand_mask (rtx *operands); extern void
> +reduce_ternlog_operands (rtx *operands);
>  extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);  extern
> bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);  extern
> bool ix86_pre_reload_split (void); diff --git a/gcc/config/i386/i386.cc
> b/gcc/config/i386/i386.cc index f0d6167e667..140de478571 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -5070,6 +5070,51 @@ ix86_check_no_addr_space (rtx insn)
>      }
>    return true;
>  }
> +
> +/* Return mask of invariant operands:
> +   bit number     0 1 2
> +   operand number 1 2 3.  */
> +
> +int
> +ternlog_invariant_operand_mask (rtx *operands) {
> +  int mask = 0;
> +  int imm8 = XINT (operands[4], 0);
> +
> +  if (((imm8 >> 4) & 0xF) == (imm8 & 0xF))
> +    mask |= 1;
> +  if (((imm8 >> 2) & 0x33) == (imm8 & 0x33))
> +    mask |= (1 << 1);
> +  if (((imm8 >> 1) & 0x55) == (imm8 & 0x55))
> +    mask |= (1 << 2);
> +
> +  return mask;
> +}
> +
> +/* Replace one of the unused operators with the one used.  */
> +
> +void
> +reduce_ternlog_operands (rtx *operands) {
> +  int mask = ternlog_invariant_operand_mask (operands);
> +
> +  if (mask & 1) /* the first operand is invariant.  */
> +    operands[1] = operands[2];
> +
> +  if (mask & 2) /* the second operand is invariant.  */
> +    operands[2] = operands[1];
> +
> +  if (mask & 4)	/* the third operand is invariant.  */
> +   operands[3] = operands[1];
> +  else if (!MEM_P (operands[3]))
> +    {
> +      if (mask & 1) /* the first operand is invariant.  */
> +	operands[1] = operands[3];
> +      if (mask & 2) /* the second operands is invariant.  */
> +	operands[2] = operands[3];
> +    }
> +}
> +
> 
> 
> 
>  /* Initialize the table of extra 80387 mathematical constants.  */
> 
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index
> a2099373123..f88d82b315c 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -12625,6 +12625,49 @@
>  		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>  		      (const_string "*")))])
> 
> +;; If the first and the second operands of ternlog are invariant and ;;
> +the third operand is memory ;; then we should add load third operand
> +from memory to register and ;; replace first and second operands with
> +this register (define_split
> +  [(set (match_operand:V 0 "register_operand")
> +	(unspec:V
> +	  [(match_operand:V 1 "register_operand")
> +	   (match_operand:V 2 "register_operand")
> +	   (match_operand:V 3 "memory_operand")
> +	   (match_operand:SI 4 "const_0_to_255_operand")]
> +	  UNSPEC_VTERNLOG))]
> +  "ternlog_invariant_operand_mask (operands) == 3 && !reload_completed"
Maybe better with "!reload_completed  && ternlog_invariant_operand_mask (operands) == 3"
> +  [(set (match_dup 0)
> +	(match_dup 3))
> +   (set (match_dup 0)
> +	(unspec:V
> +	  [(match_dup 0)
> +	   (match_dup 0)
> +	   (match_dup 0)
> +	   (match_dup 4)]
> +	  UNSPEC_VTERNLOG))])
> +
> +;; Replace invariant ternlog operands with used operands ;; (except for
> +the case discussed in the previous define_split) (define_split
> +  [(set (match_operand:V 0 "register_operand")
> +	(unspec:V
> +	  [(match_operand:V 1 "register_operand")
> +	   (match_operand:V 2 "register_operand")
> +	   (match_operand:V 3 "nonimmediate_operand")
> +	   (match_operand:SI 4 "const_0_to_255_operand")]
> +	  UNSPEC_VTERNLOG))]
> +  "ternlog_invariant_operand_mask (operands) != 0 && !reload_completed"
Ditto.
> +  [(set (match_dup 0)
> +	(unspec:V
> +	  [(match_dup 1)
> +	   (match_dup 2)
> +	   (match_dup 3)
> +	   (match_dup 4)]
> +	  UNSPEC_VTERNLOG))]
> +  "reduce_ternlog_operands (operands);")
> +
Others LGTM.
>  ;; There must be lots of other combinations like  ;;  ;; (any_logic:V diff --git
> a/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-1.c
> b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-1.c
> new file mode 100644
> index 00000000000..a7063df9dcb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-1.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {vmovdqa*} "4" } } */
> +
> +#include <immintrin.h>
> +
> +__m512i f(__m512i* a, __m512i* b, __m512i* c) {
> +	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], 119); }
> +
> +__m512i g(__m512i* a, __m512i* b, __m512i* c) {
> +	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], 250); }
> +
> +__m512i h(__m512i* a, __m512i* b, __m512i* c) {
> +	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], 252); }
> diff --git a/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-2.c
> b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-2.c
> new file mode 100644
> index 00000000000..b44986cc259
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/reduce-ternlog-operands-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler "vpternlog.*0.*0.*0" } } */
> +
> +#include <immintrin.h>
> +
> +__m512i f(__m512i a, __m512i b, __m512i* c) {
> +	return _mm512_ternarylogic_epi64 (a, b, c[0], 0x55); }
> +
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH] Replace invariant ternlog operands
  2023-07-27  3:00 ` Liu, Hongtao
@ 2023-08-03 17:30   ` Alexander Monakov
  2023-08-04  0:43     ` Hongtao Liu
  0 siblings, 1 reply; 4+ messages in thread
From: Alexander Monakov @ 2023-08-03 17:30 UTC (permalink / raw)
  To: Liu, Hongtao; +Cc: Yan Simonaytes, gcc-patches, Uros Bizjak


On Thu, 27 Jul 2023, Liu, Hongtao via Gcc-patches wrote:

> > +;; If the first and the second operands of ternlog are invariant and ;;
> > +the third operand is memory ;; then we should add load third operand
> > +from memory to register and ;; replace first and second operands with
> > +this register (define_split
> > +  [(set (match_operand:V 0 "register_operand")
> > +	(unspec:V
> > +	  [(match_operand:V 1 "register_operand")
> > +	   (match_operand:V 2 "register_operand")
> > +	   (match_operand:V 3 "memory_operand")
> > +	   (match_operand:SI 4 "const_0_to_255_operand")]
> > +	  UNSPEC_VTERNLOG))]
> > +  "ternlog_invariant_operand_mask (operands) == 3 && !reload_completed"
> Maybe better with "!reload_completed  && ternlog_invariant_operand_mask (operands) == 3"

I made this change (in both places), plus some style TLC. Ok to apply?

From d24304a9efd049e8db6df5ac78de8ca2d941a3c7 Mon Sep 17 00:00:00 2001
From: Yan Simonaytes <simonaytes.yan@ispras.ru>
Date: Tue, 25 Jul 2023 20:43:19 +0300
Subject: [PATCH] Eliminate irrelevant operands of VPTERNLOG

As mentioned in PR 110202, GCC may be presented with input where the control
word of the VPTERNLOG intrinsic implies that some of its operands do not
affect the result.  In that case, we can eliminate the irrelevant operands
of the instruction by substituting any other operand in their place.
This removes false dependencies.

For instance, instead of (252 = 0xfc = _MM_TERNLOG_A | _MM_TERNLOG_B)

	vpternlogq	$252, %zmm2, %zmm1, %zmm0

emit

	vpternlogq	$252, %zmm0, %zmm1, %zmm0

When VPTERNLOG is invariant w.r.t. the first and second operands, and the
third operand is memory, load memory into the output operand first, i.e.
instead of (85 = 0x55 = ~_MM_TERNLOG_C)

	vpternlogq	$85, (%rdi), %zmm1, %zmm0

emit

	vmovdqa64	(%rdi), %zmm0
	vpternlogq	$85, %zmm0, %zmm0, %zmm0

gcc/ChangeLog:

	* config/i386/i386-protos.h (vpternlog_irrelevant_operand_mask):
        Declare.
	(substitute_vpternlog_operands): Declare.
	* config/i386/i386.cc (vpternlog_irrelevant_operand_mask): New
        helper.
	(substitute_vpternlog_operands): New function.  Use them...
	* config/i386/sse.md: ... here in new VPTERNLOG define_splits.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/invariant-ternlog-1.c: New test.
	* gcc.target/i386/invariant-ternlog-2.c: New test.
---
 gcc/config/i386/i386-protos.h                 |  3 ++
 gcc/config/i386/i386.cc                       | 43 +++++++++++++++++++
 gcc/config/i386/sse.md                        | 42 ++++++++++++++++++
 .../gcc.target/i386/invariant-ternlog-1.c     | 21 +++++++++
 .../gcc.target/i386/invariant-ternlog-2.c     | 12 ++++++
 5 files changed, 121 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 27fe73ca65..12e6ff0ebc 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -70,6 +70,9 @@ extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
 extern int avx_vpermilp_parallel (rtx par, machine_mode mode);
 extern int avx_vperm2f128_parallel (rtx par, machine_mode mode);
 
+extern int vpternlog_irrelevant_operand_mask (rtx[]);
+extern void substitute_vpternlog_operands (rtx[]);
+
 extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
 extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx,
 				       rtx, rtx, rtx, rtx, bool);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 32851a514a..9a7c1135a0 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19420,6 +19420,49 @@ avx_vperm2f128_parallel (rtx par, machine_mode mode)
   return mask + 1;
 }
 \f
+/* Return a mask of VPTERNLOG operands that do not affect output.  */
+
+int
+vpternlog_irrelevant_operand_mask (rtx *operands)
+{
+  int mask = 0;
+  int imm8 = XINT (operands[4], 0);
+
+  if (((imm8 >> 4) & 0x0F) == (imm8 & 0x0F))
+    mask |= 1;
+  if (((imm8 >> 2) & 0x33) == (imm8 & 0x33))
+    mask |= 2;
+  if (((imm8 >> 1) & 0x55) == (imm8 & 0x55))
+    mask |= 4;
+
+  return mask;
+}
+
+/* Eliminate false dependencies on operands that do not affect output
+   by substituting other operands of a VPTERNLOG.  */
+
+void
+substitute_vpternlog_operands (rtx *operands)
+{
+  int mask = vpternlog_irrelevant_operand_mask (operands);
+
+  if (mask & 1) /* The first operand is irrelevant.  */
+    operands[1] = operands[2];
+
+  if (mask & 2) /* The second operand is irrelevant.  */
+    operands[2] = operands[1];
+
+  if (mask & 4) /* The third operand is irrelevant.  */
+    operands[3] = operands[1];
+  else if (REG_P (operands[3]))
+    {
+      if (mask & 1)
+	operands[1] = operands[3];
+      if (mask & 2)
+	operands[2] = operands[3];
+    }
+}
+\f
 /* Return a register priority for hard reg REGNO.  */
 static int
 ix86_register_priority (int hard_regno)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f793258b6c..1e2ec4bedc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12627,6 +12627,48 @@ (define_insn "*<avx512>_vternlog<mode>_all"
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_string "*")))])
 
+;; When VPTERNLOG happens to be invariant w.r.t first and second operands,
+;; and the third operand is memory, eliminate false dependencies by loading
+;; memory into the output operand first.
+(define_split
+  [(set (match_operand:V 0 "register_operand")
+	(unspec:V
+	  [(match_operand:V 1 "register_operand")
+	   (match_operand:V 2 "register_operand")
+	   (match_operand:V 3 "memory_operand")
+	   (match_operand:SI 4 "const_0_to_255_operand")]
+	  UNSPEC_VTERNLOG))]
+  "!reload_completed && vpternlog_irrelevant_operand_mask (operands) == 3"
+  [(set (match_dup 0)
+	(match_dup 3))
+   (set (match_dup 0)
+	(unspec:V
+	  [(match_dup 0)
+	   (match_dup 0)
+	   (match_dup 0)
+	   (match_dup 4)]
+	  UNSPEC_VTERNLOG))])
+
+;; Eliminate false dependencies when VPTERNLOG is invariant w.r.t any
+;; of input operands (except the case handled in the above split).
+(define_split
+  [(set (match_operand:V 0 "register_operand")
+	(unspec:V
+	  [(match_operand:V 1 "register_operand")
+	   (match_operand:V 2 "register_operand")
+	   (match_operand:V 3 "nonimmediate_operand")
+	   (match_operand:SI 4 "const_0_to_255_operand")]
+	  UNSPEC_VTERNLOG))]
+  "!reload_completed && vpternlog_irrelevant_operand_mask (operands) != 0"
+  [(set (match_dup 0)
+	(unspec:V
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)]
+	  UNSPEC_VTERNLOG))]
+  "substitute_vpternlog_operands (operands);")
+
 ;; There must be lots of other combinations like
 ;;
 ;; (any_logic:V
diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
new file mode 100644
index 0000000000..21051c6bba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
+/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 } } */
+
+#include <immintrin.h>
+
+__m512i f(__m512i* a, __m512i* b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_B | ~_MM_TERNLOG_C);
+}
+
+__m512i g(__m512i* a, __m512i* b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | ~_MM_TERNLOG_C);
+}
+
+__m512i h(__m512i* a, __m512i* b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | ~_MM_TERNLOG_B);
+}
diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
new file mode 100644
index 0000000000..d70bbb0239
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times "vmovdqa" 1 } } */
+/* { dg-final { scan-assembler "vpternlog.*zmm0.*zmm0.*zmm0" } } */
+
+#include <immintrin.h>
+
+__m512i f(__m512i* a, __m512i* b, __m512i* c)
+{
+	return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_C);
+}
+
-- 
2.39.2


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Replace invariant ternlog operands
  2023-08-03 17:30   ` Alexander Monakov
@ 2023-08-04  0:43     ` Hongtao Liu
  0 siblings, 0 replies; 4+ messages in thread
From: Hongtao Liu @ 2023-08-04  0:43 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Liu, Hongtao, Yan Simonaytes, gcc-patches, Uros Bizjak

On Fri, Aug 4, 2023 at 1:30 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
>
> On Thu, 27 Jul 2023, Liu, Hongtao via Gcc-patches wrote:
>
> > > +;; If the first and the second operands of ternlog are invariant and ;;
> > > +the third operand is memory ;; then we should add load third operand
> > > +from memory to register and ;; replace first and second operands with
> > > +this register (define_split
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +   (unspec:V
> > > +     [(match_operand:V 1 "register_operand")
> > > +      (match_operand:V 2 "register_operand")
> > > +      (match_operand:V 3 "memory_operand")
> > > +      (match_operand:SI 4 "const_0_to_255_operand")]
> > > +     UNSPEC_VTERNLOG))]
> > > +  "ternlog_invariant_operand_mask (operands) == 3 && !reload_completed"
> > Maybe better with "!reload_completed  && ternlog_invariant_operand_mask (operands) == 3"
>
> I made this change (in both places), plus some style TLC. Ok to apply?
Ok.
>
> From d24304a9efd049e8db6df5ac78de8ca2d941a3c7 Mon Sep 17 00:00:00 2001
> From: Yan Simonaytes <simonaytes.yan@ispras.ru>
> Date: Tue, 25 Jul 2023 20:43:19 +0300
> Subject: [PATCH] Eliminate irrelevant operands of VPTERNLOG
>
> As mentioned in PR 110202, GCC may be presented with input where control
> word of the VPTERNLOG intrinsic implies that some of its operands do not
> affect the result.  In that case, we can eliminate irrelevant operands
> of the instruction by substituting any other operand in their place.
> This removes false dependencies.
>
> For instance, instead of (252 = 0xfc = _MM_TERNLOG_A | _MM_TERNLOG_B)
>
>         vpternlogq      $252, %zmm2, %zmm1, %zmm0
>
> emit
>
>         vpternlogq      $252, %zmm0, %zmm1, %zmm0
>
> When VPTERNLOG is invariant w.r.t first and second operands, and the
> third operand is memory, load memory into the output operand first, i.e.
> instead of (85 = 0x55 = ~_MM_TERNLOG_C)
>
>         vpternlogq      $85, (%rdi), %zmm1, %zmm0
>
> emit
>
>         vmovdqa64       (%rdi), %zmm0
>         vpternlogq      $85, %zmm0, %zmm0, %zmm0
>
> gcc/ChangeLog:
>
>         * config/i386/i386-protos.h (vpternlog_irrelevant_operand_mask):
>         Declare.
>         (substitute_vpternlog_operands): Declare.
>         * config/i386/i386.cc (vpternlog_irrelevant_operand_mask): New
>         helper.
>         (substitute_vpternlog_operands): New function.  Use them...
>         * config/i386/sse.md: ... here in new VPTERNLOG define_splits.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/invariant-ternlog-1.c: New test.
>         * gcc.target/i386/invariant-ternlog-2.c: New test.
> ---
>  gcc/config/i386/i386-protos.h                 |  3 ++
>  gcc/config/i386/i386.cc                       | 43 +++++++++++++++++++
>  gcc/config/i386/sse.md                        | 42 ++++++++++++++++++
>  .../gcc.target/i386/invariant-ternlog-1.c     | 21 +++++++++
>  .../gcc.target/i386/invariant-ternlog-2.c     | 12 ++++++
>  5 files changed, 121 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 27fe73ca65..12e6ff0ebc 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -70,6 +70,9 @@ extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
>  extern int avx_vpermilp_parallel (rtx par, machine_mode mode);
>  extern int avx_vperm2f128_parallel (rtx par, machine_mode mode);
>
> +extern int vpternlog_irrelevant_operand_mask (rtx[]);
> +extern void substitute_vpternlog_operands (rtx[]);
> +
>  extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
>  extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx,
>                                        rtx, rtx, rtx, rtx, bool);
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 32851a514a..9a7c1135a0 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -19420,6 +19420,49 @@ avx_vperm2f128_parallel (rtx par, machine_mode mode)
>    return mask + 1;
>  }
>
> +/* Return a mask of VPTERNLOG operands that do not affect output.  */
> +
> +int
> +vpternlog_irrelevant_operand_mask (rtx *operands)
> +{
> +  int mask = 0;
> +  int imm8 = XINT (operands[4], 0);
> +
> +  if (((imm8 >> 4) & 0x0F) == (imm8 & 0x0F))
> +    mask |= 1;
> +  if (((imm8 >> 2) & 0x33) == (imm8 & 0x33))
> +    mask |= 2;
> +  if (((imm8 >> 1) & 0x55) == (imm8 & 0x55))
> +    mask |= 4;
> +
> +  return mask;
> +}
> +
> +/* Eliminate false dependencies on operands that do not affect output
> +   by substituting other operands of a VPTERNLOG.  */
> +
> +void
> +substitute_vpternlog_operands (rtx *operands)
> +{
> +  int mask = vpternlog_irrelevant_operand_mask (operands);
> +
> +  if (mask & 1) /* The first operand is irrelevant.  */
> +    operands[1] = operands[2];
> +
> +  if (mask & 2) /* The second operand is irrelevant.  */
> +    operands[2] = operands[1];
> +
> +  if (mask & 4) /* The third operand is irrelevant.  */
> +    operands[3] = operands[1];
> +  else if (REG_P (operands[3]))
> +    {
> +      if (mask & 1)
> +       operands[1] = operands[3];
> +      if (mask & 2)
> +       operands[2] = operands[3];
> +    }
> +}
> +
>  /* Return a register priority for hard reg REGNO.  */
>  static int
>  ix86_register_priority (int hard_regno)
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index f793258b6c..1e2ec4bedc 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -12627,6 +12627,48 @@ (define_insn "*<avx512>_vternlog<mode>_all"
>                       (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>                       (const_string "*")))])
>
> +;; When VPTERNLOG happens to be invariant w.r.t first and second operands,
> +;; and the third operand is memory, eliminate false dependencies by loading
> +;; memory into the output operand first.
> +(define_split
> +  [(set (match_operand:V 0 "register_operand")
> +       (unspec:V
> +         [(match_operand:V 1 "register_operand")
> +          (match_operand:V 2 "register_operand")
> +          (match_operand:V 3 "memory_operand")
> +          (match_operand:SI 4 "const_0_to_255_operand")]
> +         UNSPEC_VTERNLOG))]
> +  "!reload_completed && vpternlog_irrelevant_operand_mask (operands) == 3"
> +  [(set (match_dup 0)
> +       (match_dup 3))
> +   (set (match_dup 0)
> +       (unspec:V
> +         [(match_dup 0)
> +          (match_dup 0)
> +          (match_dup 0)
> +          (match_dup 4)]
> +         UNSPEC_VTERNLOG))])
> +
> +;; Eliminate false dependencies when VPTERNLOG is invariant w.r.t any
> +;; of input operands (except the case handled in the above split).
> +(define_split
> +  [(set (match_operand:V 0 "register_operand")
> +       (unspec:V
> +         [(match_operand:V 1 "register_operand")
> +          (match_operand:V 2 "register_operand")
> +          (match_operand:V 3 "nonimmediate_operand")
> +          (match_operand:SI 4 "const_0_to_255_operand")]
> +         UNSPEC_VTERNLOG))]
> +  "!reload_completed && vpternlog_irrelevant_operand_mask (operands) != 0"
> +  [(set (match_dup 0)
> +       (unspec:V
> +         [(match_dup 1)
> +          (match_dup 2)
> +          (match_dup 3)
> +          (match_dup 4)]
> +         UNSPEC_VTERNLOG))]
> +  "substitute_vpternlog_operands (operands);")
> +
>  ;; There must be lots of other combinations like
>  ;;
>  ;; (any_logic:V
> diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
> new file mode 100644
> index 0000000000..21051c6bba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
> +/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 } } */
> +
> +#include <immintrin.h>
> +
> +__m512i f(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_B | ~_MM_TERNLOG_C);
> +}
> +
> +__m512i g(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | ~_MM_TERNLOG_C);
> +}
> +
> +__m512i h(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | ~_MM_TERNLOG_B);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
> new file mode 100644
> index 0000000000..d70bbb0239
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vmovdqa" 1 } } */
> +/* { dg-final { scan-assembler "vpternlog.*zmm0.*zmm0.*zmm0" } } */
> +
> +#include <immintrin.h>
> +
> +__m512i f(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_C);
> +}
> +
> --
> 2.39.2
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-08-04  0:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-25 18:11 [PATCH] Replace invariant ternlog operands Yan Simonaytes
2023-07-27  3:00 ` Liu, Hongtao
2023-08-03 17:30   ` Alexander Monakov
2023-08-04  0:43     ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).