* [PATCH] Break false dependence for vpternlog by inserting vpxor. @ 2023-07-04 2:50 liuhongt 2023-07-06 15:46 ` simonaytes.yan 0 siblings, 1 reply; 7+ messages in thread From: liuhongt @ 2023-07-04 2:50 UTC (permalink / raw) To: gcc-patches; +Cc: crazylht, hjl.tools vpternlog is also used for optimization which doesn't need any valid input operand, in that case, the destination is used as input in the instruction and that creates a false dependence. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/110438 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov<mode>_constm1_pternlog): New define_insn. (*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to define_insn_and_split to avoid false dependence. (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog): New define_insn. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. --- gcc/config/i386/predicates.md | 8 ++- gcc/config/i386/sse.md | 69 +++++++++++++++++++----- gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++++++++++ 3 files changed, 94 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index fb07707dcba..df0d9e20def 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand" return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. 
*/ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. */ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 812cfca4b92..93cdd844026 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,28 @@ (define_insn "mov<mode>_internal" ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (<MODE>mode);") + +(define_insn "*vmov<mode>_constm1_pternlog" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || <MODE_SIZE> == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9358,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>" operands[3] = CONST0_RTX (<MODE>mode); }") -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" 
+(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9345,12 +9367,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" "TARGET_AVX512F" "@ vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} - vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + #" + "&& !TARGET_AVX512DQ && reload_completed" + [(set (match_dup 0) (match_dup 4)) + (parallel + [(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (<MODE>mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog" + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") + (match_operand:VI48_AVX512VL 3 "const0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F && !TARGET_AVX512DQ" + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "extendv2sfv2df2" [(set (match_operand:V2DF 0 "register_operand") (float_extend:V2DF @@ -17164,32 +17209,32 @@ (define_expand "one_cmpl<mode>2" if (!TARGET_AVX512F) operands[2] = force_reg (<MODE>mode, operands[2]); + else + operands[1] = force_reg (<MODE>mode, operands[1]); }) (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" - [(set (match_operand:VI 0 "register_operand" "=v,v") - (xor:VI (match_operand:VI 1 "nonimmediate_operand" "v,m") - 
(match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] + [(set (match_operand:VI 0 "register_operand" "=v") + (xor:VI (match_operand:VI 1 "register_operand" "v") + (match_operand:VI 2 "vector_all_ones_operand" "BC")))] "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode == SImode || <ssescalarmode>mode == DImode)" { + /* Use vpternlog 0x55, %1, %1, %0 instead of + vpternlog 0x55, %1, %0, %0 to avoid false dependence on %0. */ if (TARGET_AVX512VL) - return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %1, 0x55}"; else - return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g1, %g0<mask_operand3>|%g0<mask_operand3>, %g1, %g1, 0x55}"; } [(set_attr "type" "sselog") (set_attr "prefix" "evex") (set (attr "mode") (if_then_else (match_test "TARGET_AVX512VL") (const_string "<sseinsnmode>") - (const_string "XI"))) - (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") - (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") - (const_int 1)))]) + (const_string "XI")))]) (define_expand "<sse2_avx2>_andnot<mode>3" [(set (match_operand:VI_AVX2 0 "register_operand") diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c new file mode 100644 index 00000000000..11b8cc59fd2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110438.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ + + +#include <immintrin.h> + +__m512i g(void) +{ + return (__m512i){ 0 } - 1; +} + +__m512i g1(__m512i* a) 
+{ + return ~(*a); +} + +void +foo (int* a, int* __restrict b) +{ + for (int i = 0; i != 16; i++) + { + if (b[i]) + a[i] = -1; + else + a[i] = 0; + } +} -- 2.39.1.388.g2fc9e9ca3c ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor. 2023-07-04 2:50 [PATCH] Break false dependence for vpternlog by inserting vpxor liuhongt @ 2023-07-06 15:46 ` simonaytes.yan 2023-07-07 6:50 ` Hongtao Liu 0 siblings, 1 reply; 7+ messages in thread From: simonaytes.yan @ 2023-07-06 15:46 UTC (permalink / raw) To: liuhongt; +Cc: gcc-patches, crazylht, hjl.tools > +; False dependency happens on destination register which is not really > +; used when moving all ones to vector register > +(define_split > + [(set (match_operand:VMOVE 0 "register_operand") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > + "TARGET_AVX512F && reload_completed > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))" > + [(set (match_dup 0) (match_dup 2)) > + (parallel > + [(set (match_dup 0) (match_dup 1)) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[2] = CONST0_RTX (<MODE>mode);") I think we shouldn't emit PXOR when optimizing for size. So the define_split should be changed to: (define_split [(set (match_operand:VMOVE 0 "register_operand") (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] "TARGET_AVX512F && reload_completed && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) && optimize_insn_for_speed_p ()" [(set (match_dup 0) (match_dup 2)) (parallel [(set (match_dup 0) (match_dup 1)) (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] "operands[2] = CONST0_RTX (<MODE>mode);") ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor. 2023-07-06 15:46 ` simonaytes.yan @ 2023-07-07 6:50 ` Hongtao Liu 2023-07-10 1:17 ` [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' liuhongt 0 siblings, 1 reply; 7+ messages in thread From: Hongtao Liu @ 2023-07-07 6:50 UTC (permalink / raw) To: simonaytes.yan; +Cc: liuhongt, gcc-patches, hjl.tools On Thu, Jul 6, 2023 at 11:46 PM <simonaytes.yan@ispras.ru> wrote: > > > +; False dependency happens on destination register which is not really > > +; used when moving all ones to vector register > > +(define_split > > + [(set (match_operand:VMOVE 0 "register_operand") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > > + "TARGET_AVX512F && reload_completed > > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))" > > + [(set (match_dup 0) (match_dup 2)) > > + (parallel > > + [(set (match_dup 0) (match_dup 1)) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[2] = CONST0_RTX (<MODE>mode);") > > I think we shouldnt emit PXOR when optimizing for size. So should change > define_split: > define_split > [(set (match_operand:VMOVE 0 "register_operand") > (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > "TARGET_AVX512F && reload_completed > && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) > && optimize_insn_for_speed_p ()" > [(set (match_dup 0) (match_dup 2)) > (parallel > [(set (match_dup 0) (match_dup 1)) > (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > "operands[2] = CONST0_RTX (<MODE>mode);") Yes, will do. I'm still working on breaking the false dependence for pternlog in the newly added patterns *iornot<mode>3, *xnor<mode>3 and *<nlogic><mode>3. Will repost the patch when it's done. -- BR, Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' 2023-07-07 6:50 ` Hongtao Liu @ 2023-07-10 1:17 ` liuhongt 2023-07-10 16:23 ` Alexander Monakov 0 siblings, 1 reply; 7+ messages in thread From: liuhongt @ 2023-07-10 1:17 UTC (permalink / raw) To: gcc-patches; +Cc: simonaytes.yan False dependency happens when destination is only updated by pternlog. There is no false dependency when destination is also used in source. So either a pxor should be inserted, or input operand should be set with constraint '0'. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/110438 PR target/110202 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New define_insn. (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep): Ditto. (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep): Ditto. (*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to define_insn_and_split to avoid false dependence. (*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto. (<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint of operands 1 to '0' to avoid false dependence. (*andnot<mode>3): Ditto. (iornot<mode>3): Ditto. (*<nlogic><mode>3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. 
--- gcc/config/i386/predicates.md | 8 +- gcc/config/i386/sse.md | 113 ++++++++++++++++++++--- gcc/testsuite/gcc.target/i386/pr110438.c | 30 ++++++ 3 files changed, 135 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 7ddbe01a6f9..37d20c6303a 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand" return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. 
*/ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 418c337a775..56920a3e1d3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal" ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (<MODE>mode);") + +(define_insn "*vmov<mode>_constm1_pternlog_false_dep" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || <MODE_SIZE> == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>" operands[3] = CONST0_RTX (<MODE>mode); }") -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9346,11 +9369,35 @@ (define_insn 
"*<avx512>_cvtmask2<ssemodesuffix><mode>" "@ vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + "&& !TARGET_AVX512DQ && reload_completed + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 4)) + (parallel + [(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (<MODE>mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep" + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") + (match_operand:VI48_AVX512VL 3 "const0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F && !TARGET_AVX512DQ" + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "extendv2sfv2df2" [(set (match_operand:V2DF 0 "register_operand") (float_extend:V2DF @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2" operands[2] = force_reg (<MODE>mode, operands[2]); }) -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" - [(set (match_operand:VI 0 "register_operand" "=v,v") - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>" + [(set (match_operand:VI 0 "register_operand" "=v,v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] 
"TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode == SImode || <ssescalarmode>mode == DImode)" { + if (!<mask_applied> && which_alternative + && optimize_function_for_speed_p (cfun)) + return "#"; + if (TARGET_AVX512VL) return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; else return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; } + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied> + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 3)) + (parallel + [(set (match_dup 0) + (xor:VI (match_dup 1) (match_dup 2))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[3] = CONST0_RTX (<MODE>mode);" [(set_attr "type" "sselog") (set_attr "prefix" "evex") (set (attr "mode") @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_int 1)))]) +(define_insn "*one_cmpl<mode>2_pternlog_false_dep" + [(set (match_operand:VI 0 "register_operand" "=v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC"))) + (unspec [(match_operand:VI 3 "register_operand" "0,0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F" +{ + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; + else + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "<sseinsnmode>") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "0") + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") + (const_int 1)))]) + (define_split [(set (match_operand:VI48_AVX512F 0 "register_operand") (vec_duplicate:VI48_AVX512F @@ 
-17226,7 +17309,7 @@ (define_insn "*andnot<mode>3" [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") (and:VI (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br")) - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))] "TARGET_SSE && (register_operand (operands[1], <MODE>mode) || register_operand (operands[2], <MODE>mode))" @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3" [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") (ior:VI (not:VI - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr")) + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3" (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "2,3") + (if_then_else (eq_attr "alternative" "0,1") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3" [(set (match_operand:VI 0 "register_operand" "=v,v") (not:VI (xor:VI - (match_operand:VI 1 "bcst_vector_operand" "%v,v") - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (match_operand:VI 1 "bcst_vector_operand" "%0, 0") + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3" (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + (if_then_else (eq_attr "alternative" "0") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17749,8 +17832,8 
@@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) (define_insn "*<nlogic><mode>3" [(set (match_operand:VI 0 "register_operand" "=v,v") (andor:VI - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0")) + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3" (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + (if_then_else (eq_attr "alternative" "0") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c new file mode 100644 index 00000000000..11b8cc59fd2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110438.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ + + +#include <immintrin.h> + +__m512i g(void) +{ + return (__m512i){ 0 } - 1; +} + +__m512i g1(__m512i* a) +{ + return ~(*a); +} + +void +foo (int* a, int* __restrict b) +{ + for (int i = 0; i != 16; i++) + { + if (b[i]) + a[i] = -1; + else + a[i] = 0; + } +} -- 2.39.1.388.g2fc9e9ca3c ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' 2023-07-10 1:17 ` [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' liuhongt @ 2023-07-10 16:23 ` Alexander Monakov 2023-07-11 0:03 ` Hongtao Liu 0 siblings, 1 reply; 7+ messages in thread From: Alexander Monakov @ 2023-07-10 16:23 UTC (permalink / raw) To: liuhongt; +Cc: gcc-patches, simonaytes.yan On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote: > False dependency happens when destination is only updated by > pternlog. There is no false dependency when destination is also used > in source. So either a pxor should be inserted, or input operand > should be set with constraint '0'. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ready to push to trunk. Shouldn't this patch also remove uses of vpternlog in standard_sse_constant_opcode? A couple more questions below: > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal" > ] > (symbol_ref "true")))]) > > +; False dependency happens on destination register which is not really > +; used when moving all ones to vector register > +(define_split > + [(set (match_operand:VMOVE 0 "register_operand") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > + "TARGET_AVX512F && reload_completed > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) > + && optimize_function_for_speed_p (cfun)" Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate. Doesn't it work here as well? 
> + [(set (match_dup 0) (match_dup 2)) > + (parallel > + [(set (match_dup 0) (match_dup 1)) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[2] = CONST0_RTX (<MODE>mode);") > + > +(define_insn "*vmov<mode>_constm1_pternlog_false_dep" > + [(set (match_operand:VMOVE 0 "register_operand" "=v") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>")) > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512VL || <MODE_SIZE> == 64" > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" > + [(set_attr "type" "sselog1") > + (set_attr "prefix" "evex")]) > + > ;; If mem_addr points to a memory region with less than whole vector size bytes > ;; of accessible memory and k is a mask that would prevent reading the inaccessible > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd > @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>" > operands[3] = CONST0_RTX (<MODE>mode); > }") > > -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" > +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" > [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") > (vec_merge:VI48_AVX512VL > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" > "@ > vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} > vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" > + "&& !TARGET_AVX512DQ && reload_completed > + && optimize_function_for_speed_p (cfun)" > + [(set (match_dup 0) (match_dup 4)) > + (parallel > + [(set (match_dup 0) > + (vec_merge:VI48_AVX512VL > + (match_dup 2) > + (match_dup 3) > + (match_dup 1))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[4] = CONST0_RTX (<MODE>mode);" > [(set_attr "isa" "avx512dq,*") > (set_attr "length_immediate" "0,1") > (set_attr "prefix" "evex") > (set_attr "mode" 
"<sseinsnmode>")]) > > +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep" > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > + (vec_merge:VI48_AVX512VL > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > + (match_operand:VI48_AVX512VL 3 "const0_operand") > + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512F && !TARGET_AVX512DQ" > + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" > + [(set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_expand "extendv2sfv2df2" > [(set (match_operand:V2DF 0 "register_operand") > (float_extend:V2DF > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2" > operands[2] = force_reg (<MODE>mode, operands[2]); > }) > > -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" > - [(set (match_operand:VI 0 "register_operand" "=v,v") > - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") > - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] > +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>" > + [(set (match_operand:VI 0 "register_operand" "=v,v,v") > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] > "TARGET_AVX512F > && (!<mask_applied> > || <ssescalarmode>mode == SImode > || <ssescalarmode>mode == DImode)" > { > + if (!<mask_applied> && which_alternative > + && optimize_function_for_speed_p (cfun)) > + return "#"; > + > if (TARGET_AVX512VL) > return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; > else > return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; > } > + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied> > + && 
optimize_function_for_speed_p (cfun)" > + [(set (match_dup 0) (match_dup 3)) > + (parallel > + [(set (match_dup 0) > + (xor:VI (match_dup 1) (match_dup 2))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[3] = CONST0_RTX (<MODE>mode);" Perhaps I'm misreading this, but this seems to result in vpxor zmm0, zmm0 vpternlog zmm0, zmm0, [mem], 0x55 while in the PR the agreement was to emit vmovdq? zmm0, [mem] vpternlog zmm0, zmm0, zmm0, 0x55 when the source is in memory, because the former has three uops in fused domain? > [(set_attr "type" "sselog") > (set_attr "prefix" "evex") > (set (attr "mode") > @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_int 1)))]) > > +(define_insn "*one_cmpl<mode>2_pternlog_false_dep" > + [(set (match_operand:VI 0 "register_operand" "=v,v") > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br") > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC"))) > + (unspec [(match_operand:VI 3 "register_operand" "0,0")] > + UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512F" > +{ > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; > + else > + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "0") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_int 1)))]) > + > (define_split > [(set (match_operand:VI48_AVX512F 0 "register_operand") > (vec_duplicate:VI48_AVX512F > @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3" > [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") > (and:VI > (not:VI (match_operand:VI 1 
"bcst_vector_operand" "0,x,v,m,Br")) > - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] > + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))] > "TARGET_SSE > && (register_operand (operands[1], <MODE>mode) > || register_operand (operands[2], <MODE>mode))" > @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3" > [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") > (ior:VI > (not:VI > - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) > - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] > + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr")) > + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))] > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && (register_operand (operands[1], <MODE>mode) > @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3" > (const_string "<sseinsnmode>") > (const_string "XI"))) > (set (attr "enabled") > - (if_then_else (eq_attr "alternative" "2,3") > + (if_then_else (eq_attr "alternative" "0,1") > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_string "*")))]) > > @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3" > [(set (match_operand:VI 0 "register_operand" "=v,v") > (not:VI > (xor:VI > - (match_operand:VI 1 "bcst_vector_operand" "%v,v") > - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > + (match_operand:VI 1 "bcst_vector_operand" "%0, 0") > + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))] > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && (register_operand (operands[1], <MODE>mode) > @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3" > (const_string "<sseinsnmode>") > (const_string "XI"))) > (set (attr "enabled") > - (if_then_else (eq_attr "alternative" "1") > + (if_then_else (eq_attr "alternative" "0") > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_string "*")))]) > > @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior 
"0x77")]) > (define_insn "*<nlogic><mode>3" > [(set (match_operand:VI 0 "register_operand" "=v,v") > (andor:VI > - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0")) > + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))] > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > && (register_operand (operands[1], <MODE>mode) > @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3" > (const_string "<sseinsnmode>") > (const_string "XI"))) > (set (attr "enabled") > - (if_then_else (eq_attr "alternative" "1") > + (if_then_else (eq_attr "alternative" "0") > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_string "*")))]) > > diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c > new file mode 100644 > index 00000000000..11b8cc59fd2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr110438.c > @@ -0,0 +1,30 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ > +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ > +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ > +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ > + > + > +#include <immintrin.h> > + > +__m512i g(void) > +{ > + return (__m512i){ 0 } - 1; > +} > + > +__m512i g1(__m512i* a) > +{ > + return ~(*a); > +} > + > +void > +foo (int* a, int* __restrict b) > +{ > + for (int i = 0; i != 16; i++) > + { > + if (b[i]) > + a[i] = -1; > + else > + a[i] = 0; > + } > +} > ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' 2023-07-10 16:23 ` Alexander Monakov @ 2023-07-11 0:03 ` Hongtao Liu 2023-07-11 4:01 ` [PATCH v2] " liuhongt 0 siblings, 1 reply; 7+ messages in thread From: Hongtao Liu @ 2023-07-11 0:03 UTC (permalink / raw) To: Alexander Monakov; +Cc: liuhongt, gcc-patches, simonaytes.yan On Tue, Jul 11, 2023 at 12:24 AM Alexander Monakov via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > > On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote: > > > False dependency happens when destination is only updated by > > pternlog. There is no false dependency when destination is also used > > in source. So either a pxor should be inserted, or input operand > > should be set with constraint '0'. > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ready to push to trunk. > > Shouldn't this patch also remove uses of vpternlog in > standard_sse_constant_opcode? It's still needed when !optimize_function_for_speed_p (cfun). > > A couple more questions below: > > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal" > > ] > > (symbol_ref "true")))]) > > > > +; False dependency happens on destination register which is not really > > +; used when moving all ones to vector register > > +(define_split > > + [(set (match_operand:VMOVE 0 "register_operand") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > > + "TARGET_AVX512F && reload_completed > > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) > > + && optimize_function_for_speed_p (cfun)" > > Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate. > Doesn't it work here as well? 
I just aligned it with the lzcnt/popcnt case. The difference between optimize_insn_for_speed_p and optimize_function_for_speed_p is that the former also considers !crtl->maybe_hot_insn_p, while the latter just returns !optimize_function_for_size_p (cfun). It looks like optimize_insn_for_speed_p() is more reasonable for a single insn. 350optimize_insn_for_size_p (void) 351{ 352 enum optimize_size_level ret = optimize_function_for_size_p (cfun); 353 if (ret < OPTIMIZE_SIZE_BALANCED && !crtl->maybe_hot_insn_p) 354 ret = OPTIMIZE_SIZE_BALANCED; 355 return ret; > > > + [(set (match_dup 0) (match_dup 2)) > > + (parallel > > + [(set (match_dup 0) (match_dup 1)) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[2] = CONST0_RTX (<MODE>mode);") > > + > > +(define_insn "*vmov<mode>_constm1_pternlog_false_dep" > > + [(set (match_operand:VMOVE 0 "register_operand" "=v") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>")) > > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] > > + "TARGET_AVX512VL || <MODE_SIZE> == 64" > > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" > > + [(set_attr "type" "sselog1") > > + (set_attr "prefix" "evex")]) > > + > > ;; If mem_addr points to a memory region with less than whole vector size bytes > > ;; of accessible memory and k is a mask that would prevent reading the inaccessible > > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd > > @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>" > > operands[3] = CONST0_RTX (<MODE>mode); > > }") > > > > -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" > > +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" > > [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") > > (vec_merge:VI48_AVX512VL > > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > > @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" > > "@ >
> vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} > > vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" > > + "&& !TARGET_AVX512DQ && reload_completed > > + && optimize_function_for_speed_p (cfun)" > > + [(set (match_dup 0) (match_dup 4)) > > + (parallel > > + [(set (match_dup 0) > > + (vec_merge:VI48_AVX512VL > > + (match_dup 2) > > + (match_dup 3) > > + (match_dup 1))) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[4] = CONST0_RTX (<MODE>mode);" > > [(set_attr "isa" "avx512dq,*") > > (set_attr "length_immediate" "0,1") > > (set_attr "prefix" "evex") > > (set_attr "mode" "<sseinsnmode>")]) > > > > +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep" > > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > > + (vec_merge:VI48_AVX512VL > > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > > + (match_operand:VI48_AVX512VL 3 "const0_operand") > > + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) > > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] > > + "TARGET_AVX512F && !TARGET_AVX512DQ" > > + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" > > + [(set_attr "length_immediate" "1") > > + (set_attr "prefix" "evex") > > + (set_attr "mode" "<sseinsnmode>")]) > > + > > (define_expand "extendv2sfv2df2" > > [(set (match_operand:V2DF 0 "register_operand") > > (float_extend:V2DF > > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2" > > operands[2] = force_reg (<MODE>mode, operands[2]); > > }) > > > > -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" > > - [(set (match_operand:VI 0 "register_operand" "=v,v") > > - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") > > - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] > > +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>" > > + [(set (match_operand:VI 0 "register_operand" "=v,v,v") > > 
+ (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") > > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] > > "TARGET_AVX512F > > && (!<mask_applied> > > || <ssescalarmode>mode == SImode > > || <ssescalarmode>mode == DImode)" > > { > > + if (!<mask_applied> && which_alternative > > + && optimize_function_for_speed_p (cfun)) > > + return "#"; > > + > > if (TARGET_AVX512VL) > > return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; > > else > > return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; > > } > > + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied> > > + && optimize_function_for_speed_p (cfun)" > > + [(set (match_dup 0) (match_dup 3)) > > + (parallel > > + [(set (match_dup 0) > > + (xor:VI (match_dup 1) (match_dup 2))) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[3] = CONST0_RTX (<MODE>mode);" > > Perhaps I'm misreading this, but this seems to result in > > vpxor zmm0, zmm0 > vpternlog zmm0, zmm0, [mem], 0x55 > I thought the first alternative (v,0,BC) would handle that, but it looks like it does not. I'll adjust the splitter to explicitly put operands[1] into operands[0] when it's in memory. > while in the PR the agreement was to emit > > vmovdq? zmm0, [mem] > vpternlog zmm0, zmm0, zmm0, 0x55 > > when the source is in memory, because the former has three uops in fused domain?
> > > [(set_attr "type" "sselog") > > > (set_attr "prefix" "evex") > > (set (attr "mode") > > @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" > > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > > (const_int 1)))]) > > > > +(define_insn "*one_cmpl<mode>2_pternlog_false_dep" > > + [(set (match_operand:VI 0 "register_operand" "=v,v") > > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br") > > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC"))) > > + (unspec [(match_operand:VI 3 "register_operand" "0,0")] > > + UNSPEC_INSN_FALSE_DEP)] > > + "TARGET_AVX512F" > > +{ > > + if (TARGET_AVX512VL) > > + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; > > + else > > + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; > > +} > > + [(set_attr "type" "sselog") > > + (set_attr "prefix" "evex") > > + (set (attr "mode") > > + (if_then_else (match_test "TARGET_AVX512VL") > > + (const_string "<sseinsnmode>") > > + (const_string "XI"))) > > + (set (attr "enabled") > > + (if_then_else (eq_attr "alternative" "0") > > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > > + (const_int 1)))]) > > + > > (define_split > > [(set (match_operand:VI48_AVX512F 0 "register_operand") > > (vec_duplicate:VI48_AVX512F > > @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3" > > [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") > > (and:VI > > (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br")) > > - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] > > + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))] > > "TARGET_SSE > > && (register_operand (operands[1], <MODE>mode) > > || register_operand (operands[2], <MODE>mode))" > > @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3" > > [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") > > (ior:VI > > (not:VI > > - 
(match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) > > - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] > > + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr")) > > + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))] > > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > > && (register_operand (operands[1], <MODE>mode) > > @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3" > > (const_string "<sseinsnmode>") > > (const_string "XI"))) > > (set (attr "enabled") > > - (if_then_else (eq_attr "alternative" "2,3") > > + (if_then_else (eq_attr "alternative" "0,1") > > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > > (const_string "*")))]) > > > > @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3" > > [(set (match_operand:VI 0 "register_operand" "=v,v") > > (not:VI > > (xor:VI > > - (match_operand:VI 1 "bcst_vector_operand" "%v,v") > > - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > > + (match_operand:VI 1 "bcst_vector_operand" "%0, 0") > > + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))] > > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > > && (register_operand (operands[1], <MODE>mode) > > @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3" > > (const_string "<sseinsnmode>") > > (const_string "XI"))) > > (set (attr "enabled") > > - (if_then_else (eq_attr "alternative" "1") > > + (if_then_else (eq_attr "alternative" "0") > > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > > (const_string "*")))]) > > > > @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) > > (define_insn "*<nlogic><mode>3" > > [(set (match_operand:VI 0 "register_operand" "=v,v") > > (andor:VI > > - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > > - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > > + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0")) > > + (not:VI (match_operand:VI 2 
"bcst_vector_operand" "m,vBr"))))] > > "(<MODE_SIZE> == 64 || TARGET_AVX512VL > > || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > > && (register_operand (operands[1], <MODE>mode) > > @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3" > > (const_string "<sseinsnmode>") > > (const_string "XI"))) > > (set (attr "enabled") > > - (if_then_else (eq_attr "alternative" "1") > > + (if_then_else (eq_attr "alternative" "0") > > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > > (const_string "*")))]) > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c > > new file mode 100644 > > index 00000000000..11b8cc59fd2 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr110438.c > > @@ -0,0 +1,30 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ > > +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ > > +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ > > +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ > > + > > + > > +#include <immintrin.h> > > + > > +__m512i g(void) > > +{ > > + return (__m512i){ 0 } - 1; > > +} > > + > > +__m512i g1(__m512i* a) > > +{ > > + return ~(*a); > > +} > > + > > +void > > +foo (int* a, int* __restrict b) > > +{ > > + for (int i = 0; i != 16; i++) > > + { > > + if (b[i]) > > + a[i] = -1; > > + else > > + a[i] = 0; > > + } > > +} > > -- BR, Hongtao ^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' 2023-07-11 0:03 ` Hongtao Liu @ 2023-07-11 4:01 ` liuhongt 0 siblings, 0 replies; 7+ messages in thread From: liuhongt @ 2023-07-11 4:01 UTC (permalink / raw) To: gcc-patches; +Cc: amonakov Here's the updated patch. 1. Use optimize_insn_for_speed_p instead of optimize_function_for_speed_p. 2. Explicitly move the memory operand to the destination register to avoid false dependence in the one_cmpl pattern. False dependency happens when the destination is only updated by pternlog. There is no false dependency when the destination is also used as a source. So either a pxor should be inserted, or the input operand should be given constraint '0'. gcc/ChangeLog: PR target/110438 PR target/110202 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New define_insn. (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep): Ditto. (*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep): Ditto. (*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to define_insn_and_split to avoid false dependence. (*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto. (<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint of operand 1 to '0' to avoid false dependence. (*andnot<mode>3): Ditto. (*iornot<mode>3): Ditto. (*xnor<mode>3): Ditto. (*<nlogic><mode>3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. * gcc.target/i386/pr100711-6.c: Adjust testcase.
--- gcc/config/i386/predicates.md | 8 +- gcc/config/i386/sse.md | 145 ++++++++++++++++++--- gcc/testsuite/gcc.target/i386/pr100711-6.c | 2 +- gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++++ 4 files changed, 168 insertions(+), 17 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 7ddbe01a6f9..37d20c6303a 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand" return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. 
*/ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 418c337a775..05485b1792d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal" ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0])) + && optimize_insn_for_speed_p ()" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (<MODE>mode);") + +(define_insn "*vmov<mode>_constm1_pternlog_false_dep" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || <MODE_SIZE> == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>" operands[3] = CONST0_RTX (<MODE>mode); }") -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9346,11 +9369,35 @@ (define_insn 
"*<avx512>_cvtmask2<ssemodesuffix><mode>" "@ vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + "&& !TARGET_AVX512DQ && reload_completed + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 4)) + (parallel + [(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (<MODE>mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep" + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") + (match_operand:VI48_AVX512VL 3 "const0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk"))) + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F && !TARGET_AVX512DQ" + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "extendv2sfv2df2" [(set (match_operand:V2DF 0 "register_operand") (float_extend:V2DF @@ -17166,14 +17213,82 @@ (define_expand "one_cmpl<mode>2" operands[2] = force_reg (<MODE>mode, operands[2]); }) -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" - [(set (match_operand:VI 0 "register_operand" "=v,v") - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>" + [(set (match_operand:VI 0 "register_operand" "=v,v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] 
"TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode == SImode || <ssescalarmode>mode == DImode)" +{ + if (!<mask_applied> && which_alternative + && optimize_insn_for_speed_p ()) + return "#"; + + if (TARGET_AVX512VL) + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; + else + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}"; +} + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied> + && optimize_insn_for_speed_p ()" + [(set (match_dup 0) (match_dup 3)) + (parallel + [(set (match_dup 0) + (xor:VI (match_dup 1) (match_dup 2))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] +{ + if (MEM_P (operands[1])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + { + if (GET_MODE_SIZE (<ssescalarmode>mode) < 4) + { + if (<MODE_SIZE> == 64 ? TARGET_AVX512BW + : (TARGET_AVX512BW && TARGET_AVX512VL) + || !EXT_REX_SSE_REG_P (operands[0])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + operands[3] = CONST0_RTX (<MODE>mode); + } + else + { + if (<MODE_SIZE> == 64 || TARGET_AVX512VL + || !EXT_REX_SSE_REG_P (operands[0])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + operands[3] = CONST0_RTX (<MODE>mode); + } + } +} + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "<sseinsnmode>") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") + (const_int 1)))]) + +(define_insn "*one_cmpl<mode>2_pternlog_false_dep" + [(set (match_operand:VI 0 "register_operand" "=v,v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC"))) + (unspec [(match_operand:VI 3 "register_operand" "0,0,0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F" { 
if (TARGET_AVX512VL) return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}"; @@ -17226,7 +17341,7 @@ (define_insn "*andnot<mode>3" [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") (and:VI (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br")) - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))] "TARGET_SSE && (register_operand (operands[1], <MODE>mode) || register_operand (operands[2], <MODE>mode))" @@ -17685,8 +17800,8 @@ (define_insn "*iornot<mode>3" [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") (ior:VI (not:VI - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr")) + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17710,7 +17825,7 @@ (define_insn "*iornot<mode>3" (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "2,3") + (if_then_else (eq_attr "alternative" "0,1") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17718,8 +17833,8 @@ (define_insn "*xnor<mode>3" [(set (match_operand:VI 0 "register_operand" "=v,v") (not:VI (xor:VI - (match_operand:VI 1 "bcst_vector_operand" "%v,v") - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (match_operand:VI 1 "bcst_vector_operand" "%0, 0") + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17738,7 +17853,7 @@ (define_insn "*xnor<mode>3" (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + 
(if_then_else (eq_attr "alternative" "0") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17749,8 +17864,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) (define_insn "*<nlogic><mode>3" [(set (match_operand:VI 0 "register_operand" "=v,v") (andor:VI - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0")) + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))] "(<MODE_SIZE> == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], <MODE>mode) @@ -17769,7 +17884,7 @@ (define_insn "*<nlogic><mode>3" (const_string "<sseinsnmode>") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + (if_then_else (eq_attr "alternative" "0") (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") (const_string "*")))]) diff --git a/gcc/testsuite/gcc.target/i386/pr100711-6.c b/gcc/testsuite/gcc.target/i386/pr100711-6.c index 7142a98f537..808507471c9 100644 --- a/gcc/testsuite/gcc.target/i386/pr100711-6.c +++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -Os" } */ typedef int v16si __attribute__ ((vector_size (64))); typedef long long v8di __attribute__((vector_size (64))); diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c new file mode 100644 index 00000000000..11b8cc59fd2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110438.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ +/* { dg-final { scan-assembler-times 
{constm1_pternlog} "1" } } */ +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ + + +#include <immintrin.h> + +__m512i g(void) +{ + return (__m512i){ 0 } - 1; +} + +__m512i g1(__m512i* a) +{ + return ~(*a); +} + +void +foo (int* a, int* __restrict b) +{ + for (int i = 0; i != 16; i++) + { + if (b[i]) + a[i] = -1; + else + a[i] = 0; + } +} -- 2.39.1.388.g2fc9e9ca3c ^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2023-07-11 4:03 UTC | newest] Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-07-04 2:50 [PATCH] Break false dependence for vpternlog by inserting vpxor liuhongt 2023-07-06 15:46 ` simonaytes.yan 2023-07-07 6:50 ` Hongtao Liu 2023-07-10 1:17 ` [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' liuhongt 2023-07-10 16:23 ` Alexander Monakov 2023-07-11 0:03 ` Hongtao Liu 2023-07-11 4:01 ` [PATCH v2] " liuhongt
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).