* [PATCH] Break false dependence for vpternlog by inserting vpxor.
@ 2023-07-04 2:50 liuhongt
2023-07-06 15:46 ` simonaytes.yan
0 siblings, 1 reply; 7+ messages in thread
From: liuhongt @ 2023-07-04 2:50 UTC (permalink / raw)
To: gcc-patches; +Cc: crazylht, hjl.tools
vpternlog is also used for optimizations that do not need any valid
input operand; in that case, the destination is used as an input of the
instruction, which creates a false dependence.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
gcc/ChangeLog:
PR target/110438
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov<mode>_constm1_pternlog): New
define_insn.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to
define_insn_and_split to avoid false dependence.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog): New
define_insn.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110438.c: New test.
---
gcc/config/i386/predicates.md | 8 ++-
gcc/config/i386/sse.md | 69 +++++++++++++++++++-----
gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++++++++++
3 files changed, 94 insertions(+), 13 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index fb07707dcba..df0d9e20def 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
return false;
})
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
(define_predicate "vector_all_ones_operand"
(and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+ (ior (match_operand 0 "vector_all_ones_operand")
+ (match_operand 0 "float_vector_all_ones_operand")
+ (match_test "op == constm1_rtx")))
+
/* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit. */
(define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 812cfca4b92..93cdd844026 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,28 @@ (define_insn "mov<mode>_internal"
]
(symbol_ref "true")))])
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+ [(set (match_operand:VMOVE 0 "register_operand")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+ "TARGET_AVX512F && reload_completed
+ && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel
+ [(set (match_dup 0) (match_dup 1))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*vmov<mode>_constm1_pternlog"
+ [(set (match_operand:VMOVE 0 "register_operand" "=v")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
+ (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512VL || <MODE_SIZE> == 64"
+ "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+ [(set_attr "type" "sselog1")
+ (set_attr "prefix" "evex")])
+
;; If mem_addr points to a memory region with less than whole vector size bytes
;; of accessible memory and k is a mask that would prevent reading the inaccessible
;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -9336,7 +9358,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
operands[3] = CONST0_RTX (<MODE>mode);
}")
-(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
+(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
(match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9345,12 +9367,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
"TARGET_AVX512F"
"@
vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
- vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ #"
+ "&& !TARGET_AVX512DQ && reload_completed"
+ [(set (match_dup 0) (match_dup 4))
+ (parallel
+ [(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+ (match_dup 2)
+ (match_dup 3)
+ (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[4] = CONST0_RTX (<MODE>mode);"
[(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog"
+ [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+ (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
+ (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512F && !TARGET_AVX512DQ"
+ "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ [(set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "extendv2sfv2df2"
[(set (match_operand:V2DF 0 "register_operand")
(float_extend:V2DF
@@ -17164,32 +17209,32 @@ (define_expand "one_cmpl<mode>2"
if (!TARGET_AVX512F)
operands[2] = force_reg (<MODE>mode, operands[2]);
+ else
+ operands[1] = force_reg (<MODE>mode, operands[1]);
})
(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
- [(set (match_operand:VI 0 "register_operand" "=v,v")
- (xor:VI (match_operand:VI 1 "nonimmediate_operand" "v,m")
- (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
+ [(set (match_operand:VI 0 "register_operand" "=v")
+ (xor:VI (match_operand:VI 1 "register_operand" "v")
+ (match_operand:VI 2 "vector_all_ones_operand" "BC")))]
"TARGET_AVX512F
&& (!<mask_applied>
|| <ssescalarmode>mode == SImode
|| <ssescalarmode>mode == DImode)"
{
+ /* Use vpternlog 0x55, %1, %1, %0 instead of
+ vpternlog 0x55, %1, %0, %0 to avoid false dependence on %0. */
if (TARGET_AVX512VL)
- return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
+ return "vpternlog<ternlogsuffix>\t{$0x55, %1, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %1, 0x55}";
else
- return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
+ return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g1, %g0<mask_operand3>|%g0<mask_operand3>, %g1, %g1, 0x55}";
}
[(set_attr "type" "sselog")
(set_attr "prefix" "evex")
(set (attr "mode")
(if_then_else (match_test "TARGET_AVX512VL")
(const_string "<sseinsnmode>")
- (const_string "XI")))
- (set (attr "enabled")
- (if_then_else (eq_attr "alternative" "1")
- (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
- (const_int 1)))])
+ (const_string "XI")))])
(define_expand "<sse2_avx2>_andnot<mode>3"
[(set (match_operand:VI_AVX2 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
new file mode 100644
index 00000000000..11b8cc59fd2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110438.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
+/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
+/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
+
+
+#include <immintrin.h>
+
+__m512i g(void)
+{
+ return (__m512i){ 0 } - 1;
+}
+
+__m512i g1(__m512i* a)
+{
+ return ~(*a);
+}
+
+void
+foo (int* a, int* __restrict b)
+{
+ for (int i = 0; i != 16; i++)
+ {
+ if (b[i])
+ a[i] = -1;
+ else
+ a[i] = 0;
+ }
+}
--
2.39.1.388.g2fc9e9ca3c
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor.
2023-07-04 2:50 [PATCH] Break false dependence for vpternlog by inserting vpxor liuhongt
@ 2023-07-06 15:46 ` simonaytes.yan
2023-07-07 6:50 ` Hongtao Liu
0 siblings, 1 reply; 7+ messages in thread
From: simonaytes.yan @ 2023-07-06 15:46 UTC (permalink / raw)
To: liuhongt; +Cc: gcc-patches, crazylht, hjl.tools
> +; False dependency happens on destination register which is not really
> +; used when moving all ones to vector register
> +(define_split
> + [(set (match_operand:VMOVE 0 "register_operand")
> + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> + "TARGET_AVX512F && reload_completed
> + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))"
> + [(set (match_dup 0) (match_dup 2))
> + (parallel
> + [(set (match_dup 0) (match_dup 1))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> + "operands[2] = CONST0_RTX (<MODE>mode);")
I think we shouldn't emit PXOR when optimizing for size. So the
define_split should be changed to:
define_split
[(set (match_operand:VMOVE 0 "register_operand")
(match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
"TARGET_AVX512F && reload_completed
&& (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
&& optimize_insn_for_speed_p ()"
[(set (match_dup 0) (match_dup 2))
(parallel
[(set (match_dup 0) (match_dup 1))
(unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
"operands[2] = CONST0_RTX (<MODE>mode);")
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor.
2023-07-06 15:46 ` simonaytes.yan
@ 2023-07-07 6:50 ` Hongtao Liu
2023-07-10 1:17 ` [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' liuhongt
0 siblings, 1 reply; 7+ messages in thread
From: Hongtao Liu @ 2023-07-07 6:50 UTC (permalink / raw)
To: simonaytes.yan; +Cc: liuhongt, gcc-patches, hjl.tools
On Thu, Jul 6, 2023 at 11:46 PM <simonaytes.yan@ispras.ru> wrote:
>
> > +; False dependency happens on destination register which is not really
> > +; used when moving all ones to vector register
> > +(define_split
> > + [(set (match_operand:VMOVE 0 "register_operand")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> > + "TARGET_AVX512F && reload_completed
> > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))"
> > + [(set (match_dup 0) (match_dup 2))
> > + (parallel
> > + [(set (match_dup 0) (match_dup 1))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > + "operands[2] = CONST0_RTX (<MODE>mode);")
>
> I think we shouldnt emit PXOR when optimizing for size. So should change
> define_split:
> define_split
> [(set (match_operand:VMOVE 0 "register_operand")
> (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> "TARGET_AVX512F && reload_completed
> && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
> && optimize_insn_for_speed_p ()"
> [(set (match_dup 0) (match_dup 2))
> (parallel
> [(set (match_dup 0) (match_dup 1))
> (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> "operands[2] = CONST0_RTX (<MODE>mode);")
Yes, will do. I'm still working on breaking the false dependence for
pternlog in the newly added patterns *iornot<mode>3, *xnor<mode>3 and
*<nlogic><mode>3.
Will repost the patch when it's done.
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
2023-07-07 6:50 ` Hongtao Liu
@ 2023-07-10 1:17 ` liuhongt
2023-07-10 16:23 ` Alexander Monakov
0 siblings, 1 reply; 7+ messages in thread
From: liuhongt @ 2023-07-10 1:17 UTC (permalink / raw)
To: gcc-patches; +Cc: simonaytes.yan
False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
gcc/ChangeLog:
PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New
define_insn.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to
define_insn_and_split to avoid false dependence.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto.
(<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot<mode>3): Ditto.
(iornot<mode>3): Ditto.
(*<nlogic><mode>3): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110438.c: New test.
---
gcc/config/i386/predicates.md | 8 +-
gcc/config/i386/sse.md | 113 ++++++++++++++++++++---
gcc/testsuite/gcc.target/i386/pr110438.c | 30 ++++++
3 files changed, 135 insertions(+), 16 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
return false;
})
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
(define_predicate "vector_all_ones_operand"
(and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+ (ior (match_operand 0 "vector_all_ones_operand")
+ (match_operand 0 "float_vector_all_ones_operand")
+ (match_test "op == constm1_rtx")))
+
/* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit. */
(define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..56920a3e1d3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
]
(symbol_ref "true")))])
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+ [(set (match_operand:VMOVE 0 "register_operand")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+ "TARGET_AVX512F && reload_completed
+ && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
+ && optimize_function_for_speed_p (cfun)"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel
+ [(set (match_dup 0) (match_dup 1))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
+ [(set (match_operand:VMOVE 0 "register_operand" "=v")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
+ (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512VL || <MODE_SIZE> == 64"
+ "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+ [(set_attr "type" "sselog1")
+ (set_attr "prefix" "evex")])
+
;; If mem_addr points to a memory region with less than whole vector size bytes
;; of accessible memory and k is a mask that would prevent reading the inaccessible
;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
operands[3] = CONST0_RTX (<MODE>mode);
}")
-(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
+(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
(match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
"@
vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ "&& !TARGET_AVX512DQ && reload_completed
+ && optimize_function_for_speed_p (cfun)"
+ [(set (match_dup 0) (match_dup 4))
+ (parallel
+ [(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+ (match_dup 2)
+ (match_dup 3)
+ (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[4] = CONST0_RTX (<MODE>mode);"
[(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
+ [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+ (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
+ (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512F && !TARGET_AVX512DQ"
+ "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ [(set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "extendv2sfv2df2"
[(set (match_operand:V2DF 0 "register_operand")
(float_extend:V2DF
@@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
operands[2] = force_reg (<MODE>mode, operands[2]);
})
-(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
- [(set (match_operand:VI 0 "register_operand" "=v,v")
- (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
- (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
+(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
+ [(set (match_operand:VI 0 "register_operand" "=v,v,v")
+ (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
+ (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
"TARGET_AVX512F
&& (!<mask_applied>
|| <ssescalarmode>mode == SImode
|| <ssescalarmode>mode == DImode)"
{
+ if (!<mask_applied> && which_alternative
+ && optimize_function_for_speed_p (cfun))
+ return "#";
+
if (TARGET_AVX512VL)
return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
else
return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
}
+ "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
+ && optimize_function_for_speed_p (cfun)"
+ [(set (match_dup 0) (match_dup 3))
+ (parallel
+ [(set (match_dup 0)
+ (xor:VI (match_dup 1) (match_dup 2)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[3] = CONST0_RTX (<MODE>mode);"
[(set_attr "type" "sselog")
(set_attr "prefix" "evex")
(set (attr "mode")
@@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_int 1)))])
+(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
+ [(set (match_operand:VI 0 "register_operand" "=v,v")
+ (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br")
+ (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
+ (unspec [(match_operand:VI 3 "register_operand" "0,0")]
+ UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512F"
+{
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
+ else
+ return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
+}
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "evex")
+ (set (attr "mode")
+ (if_then_else (match_test "TARGET_AVX512VL")
+ (const_string "<sseinsnmode>")
+ (const_string "XI")))
+ (set (attr "enabled")
+ (if_then_else (eq_attr "alternative" "0")
+ (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+ (const_int 1)))])
+
(define_split
[(set (match_operand:VI48_AVX512F 0 "register_operand")
(vec_duplicate:VI48_AVX512F
@@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
[(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
(and:VI
(not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
- (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
+ (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
"TARGET_SSE
&& (register_operand (operands[1], <MODE>mode)
|| register_operand (operands[2], <MODE>mode))"
@@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
(ior:VI
(not:VI
- (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
- (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
+ (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr"))
+ (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "2,3")
+ (if_then_else (eq_attr "alternative" "0,1")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
@@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v")
(not:VI
(xor:VI
- (match_operand:VI 1 "bcst_vector_operand" "%v,v")
- (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
+ (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "1")
+ (if_then_else (eq_attr "alternative" "0")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
@@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
(define_insn "*<nlogic><mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v")
(andor:VI
- (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
- (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
+ (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "1")
+ (if_then_else (eq_attr "alternative" "0")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
new file mode 100644
index 00000000000..11b8cc59fd2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110438.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
+/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
+/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
+
+
+#include <immintrin.h>
+
+__m512i g(void)
+{
+ return (__m512i){ 0 } - 1;
+}
+
+__m512i g1(__m512i* a)
+{
+ return ~(*a);
+}
+
+void
+foo (int* a, int* __restrict b)
+{
+ for (int i = 0; i != 16; i++)
+ {
+ if (b[i])
+ a[i] = -1;
+ else
+ a[i] = 0;
+ }
+}
--
2.39.1.388.g2fc9e9ca3c
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
2023-07-10 1:17 ` [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' liuhongt
@ 2023-07-10 16:23 ` Alexander Monakov
2023-07-11 0:03 ` Hongtao Liu
0 siblings, 1 reply; 7+ messages in thread
From: Alexander Monakov @ 2023-07-10 16:23 UTC (permalink / raw)
To: liuhongt; +Cc: gcc-patches, simonaytes.yan
On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:
> False dependency happens when destination is only updated by
> pternlog. There is no false dependency when destination is also used
> in source. So either a pxor should be inserted, or input operand
> should be set with constraint '0'.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ready to push to trunk.
Shouldn't this patch also remove uses of vpternlog in
standard_sse_constant_opcode?
A couple more questions below:
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
> ]
> (symbol_ref "true")))])
>
> +; False dependency happens on destination register which is not really
> +; used when moving all ones to vector register
> +(define_split
> + [(set (match_operand:VMOVE 0 "register_operand")
> + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> + "TARGET_AVX512F && reload_completed
> + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
> + && optimize_function_for_speed_p (cfun)"
Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
Doesn't it work here as well?
> + [(set (match_dup 0) (match_dup 2))
> + (parallel
> + [(set (match_dup 0) (match_dup 1))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> + "operands[2] = CONST0_RTX (<MODE>mode);")
> +
> +(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
> + [(set (match_operand:VMOVE 0 "register_operand" "=v")
> + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
> + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> + "TARGET_AVX512VL || <MODE_SIZE> == 64"
> + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> + [(set_attr "type" "sselog1")
> + (set_attr "prefix" "evex")])
> +
> ;; If mem_addr points to a memory region with less than whole vector size bytes
> ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
> operands[3] = CONST0_RTX (<MODE>mode);
> }")
>
> -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
> (vec_merge:VI48_AVX512VL
> (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> "@
> vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
> vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> + "&& !TARGET_AVX512DQ && reload_completed
> + && optimize_function_for_speed_p (cfun)"
> + [(set (match_dup 0) (match_dup 4))
> + (parallel
> + [(set (match_dup 0)
> + (vec_merge:VI48_AVX512VL
> + (match_dup 2)
> + (match_dup 3)
> + (match_dup 1)))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> + "operands[4] = CONST0_RTX (<MODE>mode);"
> [(set_attr "isa" "avx512dq,*")
> (set_attr "length_immediate" "0,1")
> (set_attr "prefix" "evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
> + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> + (vec_merge:VI48_AVX512VL
> + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> + (match_operand:VI48_AVX512VL 3 "const0_operand")
> + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
> + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> + "TARGET_AVX512F && !TARGET_AVX512DQ"
> + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> + [(set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> (define_expand "extendv2sfv2df2"
> [(set (match_operand:V2DF 0 "register_operand")
> (float_extend:V2DF
> @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
> operands[2] = force_reg (<MODE>mode, operands[2]);
> })
>
> -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> - [(set (match_operand:VI 0 "register_operand" "=v,v")
> - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
> - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
> +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
> + [(set (match_operand:VI 0 "register_operand" "=v,v,v")
> + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
> + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
> "TARGET_AVX512F
> && (!<mask_applied>
> || <ssescalarmode>mode == SImode
> || <ssescalarmode>mode == DImode)"
> {
> + if (!<mask_applied> && which_alternative
> + && optimize_function_for_speed_p (cfun))
> + return "#";
> +
> if (TARGET_AVX512VL)
> return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> else
> return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> }
> + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
> + && optimize_function_for_speed_p (cfun)"
> + [(set (match_dup 0) (match_dup 3))
> + (parallel
> + [(set (match_dup 0)
> + (xor:VI (match_dup 1) (match_dup 2)))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> + "operands[3] = CONST0_RTX (<MODE>mode);"
Perhaps I'm misreading this, but this seems to result in
vpxor zmm0, zmm0
vpternlog zmm0, zmm0, [mem], 0x55
while in the PR the agreement was to emit
vmovdq? zmm0, [mem]
vpternlog zmm0, zmm0, zmm0, 0x55
when the source is in memory, because the former has three uops in fused domain?
> [(set_attr "type" "sselog")
> (set_attr "prefix" "evex")
> (set (attr "mode")
> @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_int 1)))])
>
> +(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br")
> + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
> + (unspec [(match_operand:VI 3 "register_operand" "0,0")]
> + UNSPEC_INSN_FALSE_DEP)]
> + "TARGET_AVX512F"
> +{
> + if (TARGET_AVX512VL)
> + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> + else
> + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> +}
> + [(set_attr "type" "sselog")
> + (set_attr "prefix" "evex")
> + (set (attr "mode")
> + (if_then_else (match_test "TARGET_AVX512VL")
> + (const_string "<sseinsnmode>")
> + (const_string "XI")))
> + (set (attr "enabled")
> + (if_then_else (eq_attr "alternative" "0")
> + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> + (const_int 1)))])
> +
> (define_split
> [(set (match_operand:VI48_AVX512F 0 "register_operand")
> (vec_duplicate:VI48_AVX512F
> @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
> [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
> (and:VI
> (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
> - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
> + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
> "TARGET_SSE
> && (register_operand (operands[1], <MODE>mode)
> || register_operand (operands[2], <MODE>mode))"
> @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
> [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
> (ior:VI
> (not:VI
> - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr"))
> + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))]
> "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> && (register_operand (operands[1], <MODE>mode)
> @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
> (const_string "<sseinsnmode>")
> (const_string "XI")))
> (set (attr "enabled")
> - (if_then_else (eq_attr "alternative" "2,3")
> + (if_then_else (eq_attr "alternative" "0,1")
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_string "*")))])
>
> @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
> [(set (match_operand:VI 0 "register_operand" "=v,v")
> (not:VI
> (xor:VI
> - (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> + (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
> + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
> "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> && (register_operand (operands[1], <MODE>mode)
> @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
> (const_string "<sseinsnmode>")
> (const_string "XI")))
> (set (attr "enabled")
> - (if_then_else (eq_attr "alternative" "1")
> + (if_then_else (eq_attr "alternative" "0")
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_string "*")))])
>
> @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> (define_insn "*<nlogic><mode>3"
> [(set (match_operand:VI 0 "register_operand" "=v,v")
> (andor:VI
> - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
> "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> && (register_operand (operands[1], <MODE>mode)
> @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
> (const_string "<sseinsnmode>")
> (const_string "XI")))
> (set (attr "enabled")
> - (if_then_else (eq_attr "alternative" "1")
> + (if_then_else (eq_attr "alternative" "0")
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_string "*")))])
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
> new file mode 100644
> index 00000000000..11b8cc59fd2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr110438.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
> +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
> +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
> +
> +
> +#include <immintrin.h>
> +
> +__m512i g(void)
> +{
> + return (__m512i){ 0 } - 1;
> +}
> +
> +__m512i g1(__m512i* a)
> +{
> + return ~(*a);
> +}
> +
> +void
> +foo (int* a, int* __restrict b)
> +{
> + for (int i = 0; i != 16; i++)
> + {
> + if (b[i])
> + a[i] = -1;
> + else
> + a[i] = 0;
> + }
> +}
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
2023-07-10 16:23 ` Alexander Monakov
@ 2023-07-11 0:03 ` Hongtao Liu
2023-07-11 4:01 ` [PATCH v2] " liuhongt
0 siblings, 1 reply; 7+ messages in thread
From: Hongtao Liu @ 2023-07-11 0:03 UTC (permalink / raw)
To: Alexander Monakov; +Cc: liuhongt, gcc-patches, simonaytes.yan
On Tue, Jul 11, 2023 at 12:24 AM Alexander Monakov via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:
>
> > False dependency happens when destination is only updated by
> > pternlog. There is no false dependency when destination is also used
> > in source. So either a pxor should be inserted, or input operand
> > should be set with constraint '0'.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ready to push to trunk.
>
> Shouldn't this patch also remove uses of vpternlog in
> standard_sse_constant_opcode?
It's still needed when !optimize_function_for_speed_p (cfun).
>
> A couple more questions below:
>
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
> > ]
> > (symbol_ref "true")))])
> >
> > +; False dependency happens on destination register which is not really
> > +; used when moving all ones to vector register
> > +(define_split
> > + [(set (match_operand:VMOVE 0 "register_operand")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> > + "TARGET_AVX512F && reload_completed
> > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
> > + && optimize_function_for_speed_p (cfun)"
>
> Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
> Doesn't it work here as well?
I was just aligning with the lzcnt/popcnt case. The difference between
optimize_insn_for_speed_p and optimize_function_for_speed_p is that the
former also considers !crtl->maybe_hot_insn_p, while the latter just returns
!optimize_function_for_size_p (cfun). It looks like
optimize_insn_for_speed_p () is more reasonable for a single insn.
350 optimize_insn_for_size_p (void)
351 {
352 enum optimize_size_level ret = optimize_function_for_size_p (cfun);
353 if (ret < OPTIMIZE_SIZE_BALANCED && !crtl->maybe_hot_insn_p)
354 ret = OPTIMIZE_SIZE_BALANCED;
355 return ret;
>
> > + [(set (match_dup 0) (match_dup 2))
> > + (parallel
> > + [(set (match_dup 0) (match_dup 1))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > + "operands[2] = CONST0_RTX (<MODE>mode);")
> > +
> > +(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
> > + [(set (match_operand:VMOVE 0 "register_operand" "=v")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
> > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> > + "TARGET_AVX512VL || <MODE_SIZE> == 64"
> > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> > + [(set_attr "type" "sselog1")
> > + (set_attr "prefix" "evex")])
> > +
> > ;; If mem_addr points to a memory region with less than whole vector size bytes
> > ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> > @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
> > operands[3] = CONST0_RTX (<MODE>mode);
> > }")
> >
> > -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> > +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> > [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
> > (vec_merge:VI48_AVX512VL
> > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> > "@
> > vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
> > vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> > + "&& !TARGET_AVX512DQ && reload_completed
> > + && optimize_function_for_speed_p (cfun)"
> > + [(set (match_dup 0) (match_dup 4))
> > + (parallel
> > + [(set (match_dup 0)
> > + (vec_merge:VI48_AVX512VL
> > + (match_dup 2)
> > + (match_dup 3)
> > + (match_dup 1)))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > + "operands[4] = CONST0_RTX (<MODE>mode);"
> > [(set_attr "isa" "avx512dq,*")
> > (set_attr "length_immediate" "0,1")
> > (set_attr "prefix" "evex")
> > (set_attr "mode" "<sseinsnmode>")])
> >
> > +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
> > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> > + (vec_merge:VI48_AVX512VL
> > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > + (match_operand:VI48_AVX512VL 3 "const0_operand")
> > + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
> > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> > + "TARGET_AVX512F && !TARGET_AVX512DQ"
> > + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> > + [(set_attr "length_immediate" "1")
> > + (set_attr "prefix" "evex")
> > + (set_attr "mode" "<sseinsnmode>")])
> > +
> > (define_expand "extendv2sfv2df2"
> > [(set (match_operand:V2DF 0 "register_operand")
> > (float_extend:V2DF
> > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
> > operands[2] = force_reg (<MODE>mode, operands[2]);
> > })
> >
> > -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > - [(set (match_operand:VI 0 "register_operand" "=v,v")
> > - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
> > - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
> > +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > + [(set (match_operand:VI 0 "register_operand" "=v,v,v")
> > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
> > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
> > "TARGET_AVX512F
> > && (!<mask_applied>
> > || <ssescalarmode>mode == SImode
> > || <ssescalarmode>mode == DImode)"
> > {
> > + if (!<mask_applied> && which_alternative
> > + && optimize_function_for_speed_p (cfun))
> > + return "#";
> > +
> > if (TARGET_AVX512VL)
> > return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> > else
> > return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> > }
> > + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
> > + && optimize_function_for_speed_p (cfun)"
> > + [(set (match_dup 0) (match_dup 3))
> > + (parallel
> > + [(set (match_dup 0)
> > + (xor:VI (match_dup 1) (match_dup 2)))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > + "operands[3] = CONST0_RTX (<MODE>mode);"
>
> Perhaps I'm misreading this, but this seems to result in
>
> vpxor zmm0, zmm0
> vpternlog zmm0, zmm0, [mem], 0x55
>
I thought the first alternative (v,0,BC) would handle that, but it looks like
it does not. I'll adjust the splitter to explicitly load operands[1] into
operands[0] when operands[1] is in memory.
> while in the PR the agreement was to emit
>
> vmovdq? zmm0, [mem]
> vpternlog zmm0, zmm0, zmm0, 0x55
>
> when the source is in memory, because the former has three uops in fused domain?
>
> > [(set_attr "type" "sselog")
>
> > (set_attr "prefix" "evex")
> > (set (attr "mode")
> > @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_int 1)))])
> >
> > +(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
> > + [(set (match_operand:VI 0 "register_operand" "=v,v")
> > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br")
> > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
> > + (unspec [(match_operand:VI 3 "register_operand" "0,0")]
> > + UNSPEC_INSN_FALSE_DEP)]
> > + "TARGET_AVX512F"
> > +{
> > + if (TARGET_AVX512VL)
> > + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> > + else
> > + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> > +}
> > + [(set_attr "type" "sselog")
> > + (set_attr "prefix" "evex")
> > + (set (attr "mode")
> > + (if_then_else (match_test "TARGET_AVX512VL")
> > + (const_string "<sseinsnmode>")
> > + (const_string "XI")))
> > + (set (attr "enabled")
> > + (if_then_else (eq_attr "alternative" "0")
> > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > + (const_int 1)))])
> > +
> > (define_split
> > [(set (match_operand:VI48_AVX512F 0 "register_operand")
> > (vec_duplicate:VI48_AVX512F
> > @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
> > (and:VI
> > (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
> > - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
> > + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
> > "TARGET_SSE
> > && (register_operand (operands[1], <MODE>mode)
> > || register_operand (operands[2], <MODE>mode))"
> > @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
> > (ior:VI
> > (not:VI
> > - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> > - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> > + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr"))
> > + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))]
> > "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> > || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> > && (register_operand (operands[1], <MODE>mode)
> > @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
> > (const_string "<sseinsnmode>")
> > (const_string "XI")))
> > (set (attr "enabled")
> > - (if_then_else (eq_attr "alternative" "2,3")
> > + (if_then_else (eq_attr "alternative" "0,1")
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_string "*")))])
> >
> > @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=v,v")
> > (not:VI
> > (xor:VI
> > - (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> > - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > + (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
> > + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
> > "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> > || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> > && (register_operand (operands[1], <MODE>mode)
> > @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
> > (const_string "<sseinsnmode>")
> > (const_string "XI")))
> > (set (attr "enabled")
> > - (if_then_else (eq_attr "alternative" "1")
> > + (if_then_else (eq_attr "alternative" "0")
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_string "*")))])
> >
> > @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> > (define_insn "*<nlogic><mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=v,v")
> > (andor:VI
> > - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> > - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
> > + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
> > "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> > || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> > && (register_operand (operands[1], <MODE>mode)
> > @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
> > (const_string "<sseinsnmode>")
> > (const_string "XI")))
> > (set (attr "enabled")
> > - (if_then_else (eq_attr "alternative" "1")
> > + (if_then_else (eq_attr "alternative" "0")
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_string "*")))])
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
> > new file mode 100644
> > index 00000000000..11b8cc59fd2
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr110438.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
> > +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
> > +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
> > +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
> > +
> > +
> > +#include <immintrin.h>
> > +
> > +__m512i g(void)
> > +{
> > + return (__m512i){ 0 } - 1;
> > +}
> > +
> > +__m512i g1(__m512i* a)
> > +{
> > + return ~(*a);
> > +}
> > +
> > +void
> > +foo (int* a, int* __restrict b)
> > +{
> > + for (int i = 0; i != 16; i++)
> > + {
> > + if (b[i])
> > + a[i] = -1;
> > + else
> > + a[i] = 0;
> > + }
> > +}
> >
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
2023-07-11 0:03 ` Hongtao Liu
@ 2023-07-11 4:01 ` liuhongt
0 siblings, 0 replies; 7+ messages in thread
From: liuhongt @ 2023-07-11 4:01 UTC (permalink / raw)
To: gcc-patches; +Cc: amonakov
Here's the updated patch:
1. Use optimize_insn_for_speed_p instead of optimize_function_for_speed_p.
2. Explicitly move memory to the dest register to avoid false dependence in the one_cmpl pattern.
False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.
gcc/ChangeLog:
PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New
define_insn.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to
define_insn_and_split to avoid false dependence.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto.
(<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot<mode>3): Ditto.
(*iornot<mode>3): Ditto.
(*xnor<mode>3): Ditto.
(*<nlogic><mode>3): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110438.c: New test.
* gcc.target/i386/pr100711-6.c: Adjust testcase.
---
gcc/config/i386/predicates.md | 8 +-
gcc/config/i386/sse.md | 145 ++++++++++++++++++---
gcc/testsuite/gcc.target/i386/pr100711-6.c | 2 +-
gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++++
4 files changed, 168 insertions(+), 17 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
return false;
})
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
(define_predicate "vector_all_ones_operand"
(and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+ (ior (match_operand 0 "vector_all_ones_operand")
+ (match_operand 0 "float_vector_all_ones_operand")
+ (match_test "op == constm1_rtx")))
+
/* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit. */
(define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..05485b1792d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
]
(symbol_ref "true")))])
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+ [(set (match_operand:VMOVE 0 "register_operand")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+ "TARGET_AVX512F && reload_completed
+ && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
+ && optimize_insn_for_speed_p ()"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel
+ [(set (match_dup 0) (match_dup 1))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
+ [(set (match_operand:VMOVE 0 "register_operand" "=v")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
+ (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512VL || <MODE_SIZE> == 64"
+ "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+ [(set_attr "type" "sselog1")
+ (set_attr "prefix" "evex")])
+
;; If mem_addr points to a memory region with less than whole vector size bytes
;; of accessible memory and k is a mask that would prevent reading the inaccessible
;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
operands[3] = CONST0_RTX (<MODE>mode);
}")
-(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
+(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
(match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
"@
vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ "&& !TARGET_AVX512DQ && reload_completed
+ && optimize_function_for_speed_p (cfun)"
+ [(set (match_dup 0) (match_dup 4))
+ (parallel
+ [(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+ (match_dup 2)
+ (match_dup 3)
+ (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[4] = CONST0_RTX (<MODE>mode);"
[(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
+ [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+ (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
+ (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512F && !TARGET_AVX512DQ"
+ "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ [(set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "extendv2sfv2df2"
[(set (match_operand:V2DF 0 "register_operand")
(float_extend:V2DF
@@ -17166,14 +17213,82 @@ (define_expand "one_cmpl<mode>2"
operands[2] = force_reg (<MODE>mode, operands[2]);
})
-(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
- [(set (match_operand:VI 0 "register_operand" "=v,v")
- (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
- (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
+(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
+ [(set (match_operand:VI 0 "register_operand" "=v,v,v")
+ (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
+ (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
"TARGET_AVX512F
&& (!<mask_applied>
|| <ssescalarmode>mode == SImode
|| <ssescalarmode>mode == DImode)"
+{
+ if (!<mask_applied> && which_alternative
+ && optimize_insn_for_speed_p ())
+ return "#";
+
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
+ else
+ return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
+}
+ "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
+ && optimize_insn_for_speed_p ()"
+ [(set (match_dup 0) (match_dup 3))
+ (parallel
+ [(set (match_dup 0)
+ (xor:VI (match_dup 1) (match_dup 2)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+{
+ if (MEM_P (operands[1]))
+ {
+ operands[3] = operands[1];
+ operands[1] = operands[0];
+ }
+ else
+ {
+ if (GET_MODE_SIZE (<ssescalarmode>mode) < 4)
+ {
+ if (<MODE_SIZE> == 64 ? TARGET_AVX512BW
+ : (TARGET_AVX512BW && TARGET_AVX512VL)
+ || !EXT_REX_SSE_REG_P (operands[0]))
+ {
+ operands[3] = operands[1];
+ operands[1] = operands[0];
+ }
+ else
+ operands[3] = CONST0_RTX (<MODE>mode);
+ }
+ else
+ {
+ if (<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || !EXT_REX_SSE_REG_P (operands[0]))
+ {
+ operands[3] = operands[1];
+ operands[1] = operands[0];
+ }
+ else
+ operands[3] = CONST0_RTX (<MODE>mode);
+ }
+ }
+}
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "evex")
+ (set (attr "mode")
+ (if_then_else (match_test "TARGET_AVX512VL")
+ (const_string "<sseinsnmode>")
+ (const_string "XI")))
+ (set (attr "enabled")
+ (if_then_else (eq_attr "alternative" "1")
+ (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+ (const_int 1)))])
+
+(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
+ [(set (match_operand:VI 0 "register_operand" "=v,v,v")
+ (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
+ (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))
+ (unspec [(match_operand:VI 3 "register_operand" "0,0,0")]
+ UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512F"
{
if (TARGET_AVX512VL)
return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
@@ -17226,7 +17341,7 @@ (define_insn "*andnot<mode>3"
[(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
(and:VI
(not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
- (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
+ (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
"TARGET_SSE
&& (register_operand (operands[1], <MODE>mode)
|| register_operand (operands[2], <MODE>mode))"
@@ -17685,8 +17800,8 @@ (define_insn "*iornot<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
(ior:VI
(not:VI
- (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
- (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
+ (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr"))
+ (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17710,7 +17825,7 @@ (define_insn "*iornot<mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "2,3")
+ (if_then_else (eq_attr "alternative" "0,1")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
@@ -17718,8 +17833,8 @@ (define_insn "*xnor<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v")
(not:VI
(xor:VI
- (match_operand:VI 1 "bcst_vector_operand" "%v,v")
- (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
+ (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17738,7 +17853,7 @@ (define_insn "*xnor<mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "1")
+ (if_then_else (eq_attr "alternative" "0")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
@@ -17749,8 +17864,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
(define_insn "*<nlogic><mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v")
(andor:VI
- (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
- (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
+ (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17769,7 +17884,7 @@ (define_insn "*<nlogic><mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "1")
+ (if_then_else (eq_attr "alternative" "0")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-6.c b/gcc/testsuite/gcc.target/i386/pr100711-6.c
index 7142a98f537..808507471c9 100644
--- a/gcc/testsuite/gcc.target/i386/pr100711-6.c
+++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -Os" } */
typedef int v16si __attribute__ ((vector_size (64)));
typedef long long v8di __attribute__((vector_size (64)));
diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
new file mode 100644
index 00000000000..11b8cc59fd2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110438.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
+/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
+/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
+
+
+#include <immintrin.h>
+
+__m512i g(void)
+{
+ return (__m512i){ 0 } - 1;
+}
+
+__m512i g1(__m512i* a)
+{
+ return ~(*a);
+}
+
+void
+foo (int* a, int* __restrict b)
+{
+ for (int i = 0; i != 16; i++)
+ {
+ if (b[i])
+ a[i] = -1;
+ else
+ a[i] = 0;
+ }
+}
--
2.39.1.388.g2fc9e9ca3c
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2023-07-11 4:03 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-04 2:50 [PATCH] Break false dependence for vpternlog by inserting vpxor liuhongt
2023-07-06 15:46 ` simonaytes.yan
2023-07-07 6:50 ` Hongtao Liu
2023-07-10 1:17 ` [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' liuhongt
2023-07-10 16:23 ` Alexander Monakov
2023-07-11 0:03 ` Hongtao Liu
2023-07-11 4:01 ` [PATCH v2] " liuhongt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).