From b8f49299e3d23f927a659cd394e3099e3291a76f Mon Sep 17 00:00:00 2001
From: liuhongt
Date: Wed, 8 Jul 2020 17:14:36 +0800
Subject: [PATCH] Optimize memory broadcast for constant vector under AVX512.

For a constant vector whose elements all have the same value, there is no
need to put the whole vector in the constant pool; put only the element
there and use an embedded broadcast instead.

2020-07-09  Hongtao Liu

gcc/ChangeLog:

	PR target/87767
	* config/i386/i386-features.c
	(replace_constant_pool_with_broadcast): New function.
	(constant_pool_broadcast): Ditto.
	(class pass_constant_pool_broadcast): New pass.
	(make_pass_constant_pool_broadcast): New function.
	* config/i386/i386-passes.def: Insert new pass after combine.
	* config/i386/i386-protos.h
	(make_pass_constant_pool_broadcast): Declare.
	* config/i386/sse.md (*avx512dq_mul<mode>3_bcst,
	*avx512f_mul<mode>3_bcst): New define_insn.

gcc/testsuite/ChangeLog:

	PR target/87767
	* gcc.target/i386/avx2-broadcast-pr87767-1.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-1.c: New test.
	* gcc.target/i386/avx512f-broadcast-pr87767-2.c: New test.
	* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: New test.
	* gcc.target/i386/pr92865-1.c: Adjust testcase.
---
 gcc/config/i386/i386-features.c               | 150 ++++++++++++++++++
 gcc/config/i386/i386-passes.def               |   1 +
 gcc/config/i386/i386-protos.h                 |   1 +
 gcc/config/i386/sse.md                        |  24 +++
 .../i386/avx2-broadcast-pr87767-1.c           |  40 +++++
 .../i386/avx512f-broadcast-pr87767-1.c        |  66 ++++++++
 .../i386/avx512f-broadcast-pr87767-2.c        |  54 +++++++
 .../i386/avx512vl-broadcast-pr87767-1.c       |  40 +++++
 gcc/testsuite/gcc.target/i386/pr92865-1.c     |   9 +-
 9 files changed, 381 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c

diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 535fc7e981d..8f81d101382 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -2379,6 +2379,156 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
+/* Replace a one-value const vector that is referenced through a constant
+   pool SYMBOL_REF in the pattern of INSN with an embedded broadcast of its
+   single element, i.e. transform
+
+     vpaddq .LC0(%rip), %zmm0, %zmm0
+     ret
+  .LC0:
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+
+    to
+
+     vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
+     ret
+  .LC0:
+    .quad 3
+
+   INSN is left unchanged if no insn pattern accepts the broadcast form.  */
+static void
+replace_constant_pool_with_broadcast (rtx_insn *insn)
+{
+  subrtx_ptr_iterator::array_type array;
+  FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
+    {
+      rtx *loc = *iter;
+      rtx x = *loc;
+
+      /* Only memory references into the constant pool are candidates.  */
+      if (!MEM_P (x)
+	  || GET_CODE (XEXP (x, 0)) != SYMBOL_REF
+	  || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
+	continue;
+
+      machine_mode mode = GET_MODE (x);
+      if (!VECTOR_MODE_P (mode))
+	return;
+
+      /* The pool entry is not necessarily a CONST_VECTOR, and there could
+	 be some rtx like
+	 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+	 but with "*.LC1" referring to a V2DI constant vector.  Reject both
+	 before using XVECEXP on the constant.  */
+      rtx constant = get_pool_constant (XEXP (x, 0));
+      if (GET_CODE (constant) != CONST_VECTOR
+	  || GET_MODE (constant) != mode)
+	return;
+
+      /* Only handle one-value const vector.  */
+      rtx first = XVECEXP (constant, 0, 0);
+      for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
+	if (!rtx_equal_p (XVECEXP (constant, 0, i), first))
+	  return;
+
+      /* force_const_mem returns NULL_RTX when the target refuses to put
+	 the element in the constant pool.  */
+      rtx broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
+      if (!broadcast_mem)
+	return;
+
+      /* validate_change re-recognizes INSN with the new operand and
+	 restores the original one if there's no corresponding pattern.  */
+      rtx vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
+      validate_change (insn, loc, vec_dup, 0);
+
+      /* At most 1 memory_operand in an insn.  */
+      return;
+    }
+}
+
+/* For const vector having one duplicated value, there's no need to put
+   whole vector in the constant pool when target supports embedded
+   broadcast.  Rewrite every such reference in the current function.  */
+static unsigned int
+constant_pool_broadcast (void)
+{
+  timevar_push (TV_MACH_DEP);
+  rtx_insn *insn;
+
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (!INSN_P (insn))
+	continue;
+
+      /* Insns may appear inside a SEQUENCE.  Only rewrite the patterns of
+	 insns, not any notes that may be attached.  */
+      if (rtx_sequence *seq = dyn_cast <rtx_sequence *> (PATTERN (insn)))
+	{
+	  int i, n = seq->len ();
+	  for (i = 0; i < n; ++i)
+	    {
+	      rtx_insn *subinsn = seq->insn (i);
+	      if (INSN_P (subinsn))
+		replace_constant_pool_with_broadcast (subinsn);
+	    }
+	}
+      else
+	replace_constant_pool_with_broadcast (insn);
+    }
+  timevar_pop (TV_MACH_DEP);
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_constant_pool_broadcast =
+{
+  RTL_PASS, /* type */
+  "cpb", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_constant_pool_broadcast : public rtl_opt_pass
+{
+public:
+  pass_constant_pool_broadcast (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return TARGET_AVX512F;
+    }
+
+  virtual unsigned int execute (function *)
+    {
+      return constant_pool_broadcast ();
+    }
+}; // class pass_constant_pool_broadcast
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_constant_pool_broadcast (gcc::context *ctxt)
+{
+  return new pass_constant_pool_broadcast (ctxt);
+}
+
 /* This compares the priority of target features in function DECL1
    and DECL2.  It returns positive value if DECL1 is higher priority,
    negative value if DECL2 is higher priority and 0 if they are the
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index d83c7b956b1..07ecf8e790f 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -33,3 +33,4 @@ along with GCC; see the file COPYING3.  If not see
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
 
   INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+  INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7c2ce618f3f..6c6909b41dd 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -386,3 +386,4 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
   (gcc::context *);
 extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
+extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 431571a4bc1..fbfb459c5bf 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12127,6 +12127,18 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*avx512dq_mul<mode>3_bcst"
+  [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v")
+	(mult:VI8_AVX512VL
+	  (vec_duplicate:VI8_AVX512VL
+	    (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+	  (match_operand:VI8_AVX512VL 2 "register_operand" "v")))]
+  "TARGET_AVX512DQ"
+  "vpmullq\t{%1<avx512bcst>, %2, %0|%0, %2, %1<avx512bcst>}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "mul<mode>3"
   [(set (match_operand:VI4_AVX512F 0 "register_operand")
 	(mult:VI4_AVX512F
@@ -12167,6 +12179,18 @@
    (set_attr "btver2_decode" "vector,vector,vector")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*avx512f_mul<mode>3_bcst"
+  [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
+	(mult:VI4_AVX512VL
+	  (vec_duplicate:VI4_AVX512VL
+	    (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+	  (match_operand:VI4_AVX512VL 2 "register_operand" "v")))]
+  "TARGET_AVX512F"
+  "vpmulld\t{%1<avx512bcst>, %2, %0|%0, %2, %1<avx512bcst>}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "mul<mode>3"
   [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand")
 	(mult:VI8_AVX2_AVX512F
diff --git a/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c
new file mode 100644
index 00000000000..800ef1f957e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c
@@ -0,0 +1,40 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-not "\\\{1to\[248\]\\\}" } } */
+/* { dg-final { scan-assembler-not "\\\{1to16\\\}" } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef long long v4di __attribute__ ((vector_size (32)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+
+#define FOO(VTYPE, OP_NAME, OP) \
+VTYPE \
+  __attribute__ ((noipa)) \
+foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+  return a OP 101; \
+}
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
new file mode 100644
index 00000000000..21249bc0cf9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
@@ -0,0 +1,66 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
+/* { dg-final { scan-assembler-times "vpadd\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vpadd\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vpadd\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vpadd\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vpmul\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vpmul\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vpmul\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vpmul\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to16\\\}" 1 } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef long long v4di __attribute__ ((vector_size (32)));
+typedef long long v8di __attribute__ ((vector_size (64)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef float v16sf __attribute__ ((vector_size (64)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+typedef double v8df __attribute__ ((vector_size (64)));
+
+#define FOO(VTYPE, OP_NAME, OP) \
+VTYPE \
+  __attribute__ ((noipa)) \
+foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+  return a OP 101; \
+}
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v16si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v8di, add, +);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v16sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+FOO (v8df, add, +);
+
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v16si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v8di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v16sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
+FOO (v8df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c
new file mode 100644
index 00000000000..938346743c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c
@@ -0,0 +1,54 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
+
+#include <stdlib.h>
+#include <string.h>
+#include "avx512f-broadcast-pr87767-1.c"
+
+#define TEST(VTYPE, TYPE, N, OP_NAME, OP) \
+  do \
+    { \
+      TYPE exp[N], src[N]; \
+      VTYPE arg, res; \
+      for (int i = 0; i < N; i++) \
+	src[i] = i * i * 107; \
+      memcpy (&arg, src, sizeof (arg)); \
+      res = foo_##OP_NAME##_##VTYPE (arg); \
+      for (int i = 0; i < N; i++) \
+	exp[i] = src[i] OP 101; \
+      for (int j = 0; j < N; j++) \
+	if (res[j] != exp[j]) \
+	  abort (); \
+    } \
+  while (0)
+
+int
+main (void)
+{
+  TEST (v4si, int, 4, add, +);
+  TEST (v8si, int, 8, add, +);
+  TEST (v16si, int, 16, add, +);
+  TEST (v2di, long long, 2, add, +);
+  TEST (v4di, long long, 4, add, +);
+  TEST (v8di, long long, 8, add, +);
+  TEST (v4sf, float, 4, add, +);
+  TEST (v8sf, float, 8, add, +);
+  TEST (v16sf, float, 16, add, +);
+  TEST (v2df, double, 2, add, +);
+  TEST (v4df, double, 4, add, +);
+  TEST (v8df, double, 8, add, +);
+
+  TEST (v4si, int, 4, mul, *);
+  TEST (v8si, int, 8, mul, *);
+  TEST (v16si, int, 16, mul, *);
+  TEST (v2di, long long, 2, mul, *);
+  TEST (v4di, long long, 4, mul, *);
+  TEST (v8di, long long, 8, mul, *);
+  TEST (v4sf, float, 4, mul, *);
+  TEST (v8sf, float, 8, mul, *);
+  TEST (v16sf, float, 16, mul, *);
+  TEST (v2df, double, 2, mul, *);
+  TEST (v4df, double, 4, mul, *);
+  TEST (v8df, double, 8, mul, *);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
new file mode 100644
index 00000000000..ec159a68158
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
@@ -0,0 +1,40 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-not "\\\{1to\[248\]\\\}" } } */
+/* { dg-final { scan-assembler-not "\\\{1to16\\\}" } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef long long v4di __attribute__ ((vector_size (32)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+
+#define FOO(VTYPE, OP_NAME, OP) \
+VTYPE \
+  __attribute__ ((noipa)) \
+foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+  return a OP 101; \
+}
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/pr92865-1.c b/gcc/testsuite/gcc.target/i386/pr92865-1.c
index 49b5778a067..a37487d9af7 100644
--- a/gcc/testsuite/gcc.target/i386/pr92865-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr92865-1.c
@@ -3,10 +3,11 @@
 /* { dg-options "-Ofast -mavx512f -mavx512bw -mxop" } */
 /* { dg-final { scan-assembler-times "vpcmp\[bwdq\]\[\t ]" 4 } } */
 /* { dg-final { scan-assembler-times "vpcmpu\[bwdq\]\[\t ]" 4 } } */
-/* { dg-final { scan-assembler-times "vmovdq\[au\]8\[\t ]" 4 } } */
-/* { dg-final { scan-assembler-times "vmovdq\[au\]16\[\t ]" 4 } } *
-/* { dg-final { scan-assembler-times "vmovdq\[au\]32\[\t ]" 4 } } */
-/* { dg-final { scan-assembler-times "vmovdq\[au\]64\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]8\[\t ]" 2 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]16\[\t ]" 2 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]32\[\t ]" 2 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]64\[\t ]" 2 } } */
+/* { dg-final { scan-assembler-times "vpbroadcast\[bwqd\]\[\t ]" 16 } } */
 
 extern char arraysb[64];
 extern short arraysw[32];
-- 
2.18.1