From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2078) id C8E893858C2C; Tue, 24 Aug 2021 09:45:45 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C8E893858C2C MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: hongtao Liu To: gcc-cvs@gcc.gnu.org Subject: [gcc r12-3108] Optimize (a & b) | (c & ~b) to vpternlog instruction. X-Act-Checkin: gcc X-Git-Author: liuhongt X-Git-Refname: refs/heads/master X-Git-Oldrev: 8571ff0ae0922bee292161c7fd61dd127d26a4ed X-Git-Newrev: 6ddb30f941a44bd528904558673ab35394565f08 Message-Id: <20210824094545.C8E893858C2C@sourceware.org> Date: Tue, 24 Aug 2021 09:45:45 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 24 Aug 2021 09:45:45 -0000 https://gcc.gnu.org/g:6ddb30f941a44bd528904558673ab35394565f08 commit r12-3108-g6ddb30f941a44bd528904558673ab35394565f08 Author: liuhongt Date: Fri Aug 20 15:30:40 2021 +0800 Optimize (a & b) | (c & ~b) to vpternlog instruction. Also optimize below 3 forms to vpternlog, op1, op2, op3 are register_operand or unary_p as (not reg) A: (any_logic (any_logic op1 op2) op3) B: (any_logic (any_logic op1 op2) (any_logic op3 op4)) op3/op4 should be equal to op1/op2 C: (any_logic (any_logic (any_logic:op1 op2) op3) op4) op3/op4 should be equal to op1/op2 gcc/ChangeLog: PR target/101989 * config/i386/i386.c (ix86_rtx_costs): Define cost for UNSPEC_VTERNLOG. * config/i386/i386.h (STRIP_UNARY): New macro. * config/i386/predicates.md (reg_or_notreg_operand): New predicate. * config/i386/sse.md (*_vternlog_all): New define_insn. (*_vternlog_1): New pre_reload define_insn_and_split. (*_vternlog_2): Ditto. (*_vternlog_3): Ditto. (any_logic1,any_logic2): New code iterator. (logic_op): New code attribute. (ternlogsuffix): Extend to VNxDF and VNxSF. 
gcc/testsuite/ChangeLog: PR target/101989 * gcc.target/i386/pr101989-1.c: New test. * gcc.target/i386/pr101989-2.c: New test. * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase. Diff: --- gcc/config/i386/i386.c | 5 + gcc/config/i386/i386.h | 2 + gcc/config/i386/predicates.md | 7 + gcc/config/i386/sse.md | 234 +++++++++++++++++++++ .../i386/avx512bw-shiftqihi-constant-1.c | 4 +- gcc/testsuite/gcc.target/i386/pr101989-1.c | 51 +++++ gcc/testsuite/gcc.target/i386/pr101989-2.c | 102 +++++++++ 7 files changed, 403 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5bff131f6d9..ebec8668758 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20542,6 +20542,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case UNSPEC: if (XINT (x, 1) == UNSPEC_TP) *total = 0; + else if (XINT(x, 1) == UNSPEC_VTERNLOG) + { + *total = cost->sse_op; + return true; + } return false; case VEC_SELECT: diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 11ac8d01100..65114229c86 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1716,6 +1716,8 @@ typedef struct ix86_args { #define LEGITIMATE_PIC_OPERAND_P(X) legitimate_pic_operand_p (X) +#define STRIP_UNARY(X) (UNARY_P (X) ? XEXP (X, 0) : X) + #define SYMBOLIC_CONST(X) \ (GET_CODE (X) == SYMBOL_REF \ || GET_CODE (X) == LABEL_REF \ diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 9321f332ef9..df5acb425d4 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1044,6 +1044,13 @@ (ior (match_test "op == const1_rtx") (match_test "op == constm1_rtx"))))) +;; True for registers, or (not: registers). Used to optimize 3-operand +;; bitwise operation. 
+(define_predicate "reg_or_notreg_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "not") + (match_test "register_operand (XEXP (op, 0), mode)")))) + ;; True if OP is acceptable as operand of DImode shift expander. (define_predicate "shiftdi_operand" (if_then_else (match_test "TARGET_64BIT") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 95f95823ea3..25ca9a5c29c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -933,7 +933,9 @@ ;; Mapping of vector modes to VPTERNLOG suffix (define_mode_attr ternlogsuffix [(V8DI "q") (V4DI "q") (V2DI "q") + (V8DF "q") (V4DF "q") (V2DF "q") (V16SI "d") (V8SI "d") (V4SI "d") + (V16SF "d") (V8SF "d") (V4SF "d") (V32HI "d") (V16HI "d") (V8HI "d") (V64QI "d") (V32QI "d") (V16QI "d")]) @@ -10041,6 +10043,238 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_vternlog<mode>_all" + [(set (match_operand:V 0 "register_operand" "=v") + (unspec:V + [(match_operand:V 1 "register_operand" "0") + (match_operand:V 2 "register_operand" "v") + (match_operand:V 3 "nonimmediate_operand" "vm") + (match_operand:SI 4 "const_0_to_255_operand")] + UNSPEC_VTERNLOG))] + "TARGET_AVX512F" + "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +;; There must be lots of other combinations like +;; +;; (any_logic:V +;; (any_logic:V op1 op2) +;; (any_logic:V op1 op3)) +;; +;; (any_logic:V +;; (any_logic:V +;; (any_logic:V op1, op2) +;; op3) +;; op1) +;; +;; and so on.
+ +(define_code_iterator any_logic1 [and ior xor]) +(define_code_iterator any_logic2 [and ior xor]) +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")]) + +(define_insn_and_split "*<avx512>_vpternlog<mode>_1" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (any_logic2:V + (match_operand:V 3 "reg_or_notreg_operand") + (match_operand:V 4 "reg_or_notreg_operand"))))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. */ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ?
~reg4 : reg4; + + tmp1 = reg1 <any_logic1:logic_op> reg2; + tmp2 = reg3 <any_logic2:logic_op> reg4; + reg_mask = tmp1 <any_logic:logic_op> tmp2; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[6] = STRIP_UNARY (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*<avx512>_vpternlog<mode>_2" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (any_logic2:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")) + (match_operand:V 4 "reg_or_notreg_operand")))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. */ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ?
~reg4 : reg4; + + tmp1 = reg1 <any_logic2:logic_op> reg2; + tmp2 = tmp1 <any_logic1:logic_op> reg3; + reg_mask = tmp2 <any_logic:logic_op> reg4; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[6] = STRIP_UNARY (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*<avx512>_vpternlog<mode>_3" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 3) + (match_dup 2) + (match_dup 1) + (match_dup 4)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg3, reg2, reg1, imm8. */ + int reg3 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg_mask, tmp1; + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + + tmp1 = reg1 <any_logic1:logic_op> reg2; + reg_mask = tmp1 <any_logic:logic_op> reg3; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[3] = STRIP_UNARY (operands[3]); + operands[4] = GEN_INT (reg_mask); +}) + + (define_insn "<avx512>_vternlog<mode>_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c index 78bf5d33689..fbc3de08119 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c @@ -1,7 +1,8 @@ /* PR target/95524 */ /* { dg-do compile } */ /* { dg-options "-O2 -mavx512bw" } */ -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } } */ +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } } */ +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } } */ typedef char v64qi
__attribute__ ((vector_size (64))); typedef unsigned char v64uqi __attribute__ ((vector_size (64))); @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a) return a >> 2; } /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */ -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */ /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */ __attribute__((noipa)) v64qi diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c new file mode 100644 index 00000000000..594093ecdde --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */ +/* { dg-final { scan-assembler-not "vpxor" } } */ +/* { dg-final { scan-assembler-not "vpor" } } */ +/* { dg-final { scan-assembler-not "vpand" } } */ + +#include <immintrin.h> +__m256d +__attribute__((noipa, target("avx512vl"))) +copysign2_pd(__m256d from, __m256d to) { + __m256i a = _mm256_castpd_si256(from); + __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); + /* (avx_signbit & from) | (~avx_signbit & to) */ + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to)); +} + +__m256i +__attribute__((noipa, target("avx512vl"))) +foo (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & ~src1) | (src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo1 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (src3 & ~src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo2 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (~src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo3 (__m256i src1, __m256i src2, __m256i src3) +{ + return (~src2 & src1) | (src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo4 (__m256i src1, __m256i src2, __m256i src3) +{ +
return src3 & src2 ^ src1; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c new file mode 100644 index 00000000000..9d9759a8e1d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c @@ -0,0 +1,102 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL + +#include "avx512f-helper.h" + +#include "pr101989-1.c" +__m256d +avx2_copysign2_pd (__m256d from, __m256d to) { + __m256i a = _mm256_castpd_si256(from); + __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); + /* (avx_signbit & from) | (~avx_signbit & to) */ + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to)); +} + +__m256i +avx2_foo (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & ~src1) | (src3 & src1); +} + +__m256i +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (src3 & ~src1); +} + +__m256i +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (~src3 & src1); +} + +__m256i +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3) +{ + return (~src2 & src1) | (src3 & src1); +} + +__m256i +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3) +{ + return src3 & src2 ^ src1; +} + + +void +test_256 (void) +{ + union256i_q q1, q2, q3, res2, exp2; + union256d d1, d2, res1, exp1; + int i, sign = 1; + + for (i = 0; i < 4; i++) + { + d1.a[i] = 12.34 * (i + 2000) * sign; + d2.a[i] = 56.78 * (i - 30) * sign; + q1.a[i] = 12 * (i + 2000) * sign; + q2.a[i] = 56 * (i - 30) * sign; + q3.a[i] = 90 * (i + 40) * sign; + res1.a[i] = DEFAULT_VALUE; + exp1.a[i] = DEFAULT_VALUE; + res2.a[i] = exp2.a[i] = -1; + sign = -sign; + } + + exp1.x = avx2_copysign2_pd (d1.x, d2.x); + res1.x = copysign2_pd (d1.x, d2.x); + if (UNION_CHECK (256, d) (res1, exp1.a)) + abort (); + + exp2.x = avx2_foo1 (q1.x, q2.x, q3.x); + res2.x = foo1 (q1.x, q2.x, q3.x); 
+ if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo2 (q1.x, q2.x, q3.x); + res2.x = foo2 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo3 (q1.x, q2.x, q3.x); + res2.x = foo3 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo4 (q1.x, q2.x, q3.x); + res2.x = foo4 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo (q1.x, q2.x, q3.x); + res2.x = foo (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); +} + +static void +test_128 () +{}