* [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
@ 2017-11-28 15:25 Richard Biener
2017-12-05 20:18 ` Jeff Law
2018-01-05 9:01 ` Richard Biener
0 siblings, 2 replies; 8+ messages in thread
From: Richard Biener @ 2017-11-28 15:25 UTC (permalink / raw)
To: gcc-patches; +Cc: Jan Hubicka, ubizjak, Martin Jambor
The following adds a new target hook, targetm.vectorize.split_reduction,
which allows the target to specify a preferred mode in which to perform
the final reduction, using either vector shifts or scalar extractions.
Until that mode is reached the vector reduction result is reduced by
recursively combining low and high parts. This avoids lane-crossing
operations when doing AVX256 on Zen and Bulldozer and also speeds up
things on Haswell (I verified a ~20% speedup on Broadwell).
Thus the patch implements the target hook on x86 to _always_ prefer
SSE modes for the final reduction.
For the testcase in the bugzilla
int sumint(const int arr[]) {
arr = __builtin_assume_aligned(arr, 64);
int sum=0;
for (int i=0 ; i<1024 ; i++)
sum+=arr[i];
return sum;
}
this changes -O3 -mavx512f code from
sumint:
.LFB0:
.cfi_startproc
vpxord %zmm0, %zmm0, %zmm0
leaq 4096(%rdi), %rax
.p2align 4,,10
.p2align 3
.L2:
vpaddd (%rdi), %zmm0, %zmm0
addq $64, %rdi
cmpq %rdi, %rax
jne .L2
vpxord %zmm1, %zmm1, %zmm1
vshufi32x4 $78, %zmm1, %zmm0, %zmm2
vpaddd %zmm2, %zmm0, %zmm0
vmovdqa64 .LC0(%rip), %zmm2
vpermi2d %zmm1, %zmm0, %zmm2
vpaddd %zmm2, %zmm0, %zmm0
vmovdqa64 .LC1(%rip), %zmm2
vpermi2d %zmm1, %zmm0, %zmm2
vpaddd %zmm2, %zmm0, %zmm0
vmovdqa64 .LC2(%rip), %zmm2
vpermi2d %zmm1, %zmm0, %zmm2
vpaddd %zmm2, %zmm0, %zmm0
vmovd %xmm0, %eax
to
sumint:
.LFB0:
.cfi_startproc
vpxord %zmm0, %zmm0, %zmm0
leaq 4096(%rdi), %rax
.p2align 4,,10
.p2align 3
.L2:
vpaddd (%rdi), %zmm0, %zmm0
addq $64, %rdi
cmpq %rdi, %rax
jne .L2
vextracti64x4 $0x1, %zmm0, %ymm1
vpaddd %ymm0, %ymm1, %ymm1
vmovdqa %xmm1, %xmm0
vextracti128 $1, %ymm1, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $8, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $4, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
and for -O3 -mavx2 from
sumint:
.LFB0:
.cfi_startproc
vpxor %xmm0, %xmm0, %xmm0
leaq 4096(%rdi), %rax
.p2align 4,,10
.p2align 3
.L2:
vpaddd (%rdi), %ymm0, %ymm0
addq $32, %rdi
cmpq %rdi, %rax
jne .L2
vpxor %xmm1, %xmm1, %xmm1
vperm2i128 $33, %ymm1, %ymm0, %ymm2
vpaddd %ymm2, %ymm0, %ymm0
vperm2i128 $33, %ymm1, %ymm0, %ymm2
vpalignr $8, %ymm0, %ymm2, %ymm2
vpaddd %ymm2, %ymm0, %ymm0
vperm2i128 $33, %ymm1, %ymm0, %ymm1
vpalignr $4, %ymm0, %ymm1, %ymm1
vpaddd %ymm1, %ymm0, %ymm0
vmovd %xmm0, %eax
to
sumint:
.LFB0:
.cfi_startproc
vpxor %xmm0, %xmm0, %xmm0
leaq 4096(%rdi), %rax
.p2align 4,,10
.p2align 3
.L2:
vpaddd (%rdi), %ymm0, %ymm0
addq $32, %rdi
cmpq %rdi, %rax
jne .L2
vmovdqa %xmm0, %xmm1
vextracti128 $1, %ymm0, %xmm0
vpaddd %xmm0, %xmm1, %xmm0
vpsrldq $8, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $4, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
vzeroupper
ret
which besides being faster is also smaller (fewer prefixes).
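For illustration only (this is not part of the patch and the helper name
is made up), the new -mavx2 epilogue computes roughly the following
intrinsics-level C sketch:

#include <immintrin.h>

/* Hypothetical helper mirroring the generated epilogue above.  */
static int
reduce_add_epi32 (__m256i acc)
{
  /* Add the upper 128-bit half onto the lower half; no lane-crossing
     operation is needed from here on.  */
  __m128i x = _mm_add_epi32 (_mm256_castsi256_si128 (acc),
                             _mm256_extracti128_si256 (acc, 1));
  /* Finish with whole-vector byte shifts inside the SSE register.  */
  x = _mm_add_epi32 (x, _mm_srli_si128 (x, 8));
  x = _mm_add_epi32 (x, _mm_srli_si128 (x, 4));
  return _mm_cvtsi128_si32 (x);
}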
SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
affects reduction vectorization epilogues I didn't expect big effects
except for loops that do not run many iterations (more likely with AVX512).
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
Ok for trunk?
The PR mentions some more tricks to optimize the sequence but
those look like backend only optimizations.
Thanks,
Richard.
2017-11-28 Richard Biener <rguenther@suse.de>
PR tree-optimization/80846
* target.def (split_reduction): New target hook.
* targhooks.c (default_split_reduction): New function.
* targhooks.h (default_split_reduction): Declare.
* tree-vect-loop.c (vect_create_epilog_for_reduction): If the
target requests first reduce vectors by combining low and high
parts.
* tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
(get_vectype_for_scalar_type_and_size): Export.
* tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
* doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
* doc/tm.texi: Regenerate.
i386/
* config/i386/i386.c (ix86_split_reduction): Implement
TARGET_VECTORIZE_SPLIT_REDUCTION.
* gcc.target/i386/pr80846-1.c: New testcase.
* gcc.target/i386/pr80846-2.c: Likewise.
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 255197)
+++ gcc/config/i386/i386.c (working copy)
@@ -48864,6 +48864,36 @@ ix86_preferred_simd_mode (scalar_mode mo
}
}
+/* All CPUs prefer to avoid cross-lane operations so perform reductions
+ upper against lower halves up to SSE reg size. */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+ /* Reduce lowpart against highpart until we reach SSE reg width to
+ avoid cross-lane operations. */
+ switch (mode)
+ {
+ case E_V16SImode:
+ case E_V8SImode:
+ return V4SImode;
+ case E_V32HImode:
+ case E_V16HImode:
+ return V8HImode;
+ case E_V64QImode:
+ case E_V32QImode:
+ return V16QImode;
+ case E_V16SFmode:
+ case E_V8SFmode:
+ return V4SFmode;
+ case E_V8DFmode:
+ case E_V4DFmode:
+ return V2DFmode;
+ default:
+ return mode;
+ }
+}
+
/* If AVX is enabled then try vectorizing with both 256bit and 128bit
vectors. If AVX512F is enabled then try vectorizing with 512bit,
256bit and 128bit vectors. */
@@ -50486,6 +50516,9 @@ ix86_run_selftests (void)
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
ix86_preferred_simd_mode
+#undef TARGET_VECTORIZE_SPLIT_REDUCTION
+#define TARGET_VECTORIZE_SPLIT_REDUCTION \
+ ix86_split_reduction
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
ix86_autovectorize_vector_sizes
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi (revision 255197)
+++ gcc/doc/tm.texi (working copy)
@@ -5844,6 +5844,13 @@ equal to @code{word_mode}, because the v
transformations even in absence of specialized @acronym{SIMD} hardware.
@end deftypefn
+@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode)
+This hook should return the preferred mode to split the final reduction
+step on @var{mode} to. The reduction is then carried out reducing upper
+against lower halves of vectors recursively until the specified mode is
+reached. The default is @var{mode} which means no splitting.
+@end deftypefn
+
@deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (void)
This hook should return a mask of sizes that should be iterated over
after trying to autovectorize using the vector size derived from the
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in (revision 255197)
+++ gcc/doc/tm.texi.in (working copy)
@@ -4091,6 +4091,8 @@ address; but often a machine-dependent
@hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+@hook TARGET_VECTORIZE_SPLIT_REDUCTION
+
@hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
@hook TARGET_VECTORIZE_GET_MASK_MODE
Index: gcc/target.def
===================================================================
--- gcc/target.def (revision 255197)
+++ gcc/target.def (working copy)
@@ -1875,6 +1875,17 @@ transformations even in absence of speci
(scalar_mode mode),
default_preferred_simd_mode)
+/* Returns the preferred mode for splitting SIMD reductions to. */
+DEFHOOK
+(split_reduction,
+ "This hook should return the preferred mode to split the final reduction\n\
+step on @var{mode} to. The reduction is then carried out reducing upper\n\
+against lower halves of vectors recursively until the specified mode is\n\
+reached. The default is @var{mode} which means no splitting.",
+ machine_mode,
+ (machine_mode),
+ default_split_reduction)
+
/* Returns a mask of vector sizes to iterate over when auto-vectorizing
after processing the preferred one derived from preferred_simd_mode. */
DEFHOOK
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c (revision 255197)
+++ gcc/targhooks.c (working copy)
@@ -1281,6 +1281,14 @@ default_preferred_simd_mode (scalar_mode
return word_mode;
}
+/* By default do not split reductions further. */
+
+machine_mode
+default_split_reduction (machine_mode mode)
+{
+ return mode;
+}
+
/* By default only the size derived from the preferred vector mode
is tried. */
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h (revision 255197)
+++ gcc/targhooks.h (working copy)
@@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
const_tree,
int, bool);
extern machine_mode default_preferred_simd_mode (scalar_mode mode);
+extern machine_mode default_split_reduction (machine_mode);
extern unsigned int default_autovectorize_vector_sizes (void);
extern opt_machine_mode default_get_mask_mode (unsigned, unsigned);
extern void *default_init_cost (struct loop *);
Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-1.c (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-1.c (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f" } */
+
+int sumint(const int arr[]) {
+ arr = __builtin_assume_aligned(arr, 64);
+ int sum=0;
+ for (int i=0 ; i<1024 ; i++)
+ sum+=arr[i];
+ return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 2 } } */
Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-2.c (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-2.c (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2" } */
+
+int sumint(const int arr[]) {
+ arr = __builtin_assume_aligned(arr, 64);
+ int sum=0;
+ for (int i=0 ; i<1024 ; i++)
+ sum+=arr[i];
+ return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 1 } } */
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c (revision 255197)
+++ gcc/tree-vect-loop.c (working copy)
@@ -4994,38 +4994,126 @@ vect_create_epilog_for_reduction (vec<tr
}
else
{
- bool reduce_with_shift = have_whole_vector_shift (mode);
- int element_bitsize = tree_to_uhwi (bitsize);
- int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+ bool reduce_with_shift;
tree vec_temp;
-
+
/* COND reductions all do the final reduction with MAX_EXPR. */
if (code == COND_EXPR)
code = MAX_EXPR;
- /* Regardless of whether we have a whole vector shift, if we're
- emulating the operation via tree-vect-generic, we don't want
- to use it. Only the first round of the reduction is likely
- to still be profitable via emulation. */
- /* ??? It might be better to emit a reduction tree code here, so that
- tree-vect-generic can expand the first round via bit tricks. */
- if (!VECTOR_MODE_P (mode))
- reduce_with_shift = false;
+ /* See if the target wants to do the final (shift) reduction
+ in a vector mode of smaller size and first reduce upper/lower
+ halves against each other. */
+ enum machine_mode mode1 = mode;
+ tree vectype1 = vectype;
+ unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+ unsigned sz1 = sz;
+ if (!slp_reduc
+ && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
+ sz1 = GET_MODE_SIZE (mode1);
+
+ vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+ reduce_with_shift = have_whole_vector_shift (mode1);
+ if (!VECTOR_MODE_P (mode1))
+ reduce_with_shift = false;
else
- {
- optab optab = optab_for_tree_code (code, vectype, optab_default);
- if (optab_handler (optab, mode) == CODE_FOR_nothing)
- reduce_with_shift = false;
- }
+ {
+ optab optab = optab_for_tree_code (code, vectype1, optab_default);
+ if (optab_handler (optab, mode1) == CODE_FOR_nothing)
+ reduce_with_shift = false;
+ }
+
+ /* First reduce the vector to the desired vector size we should
+ do shift reduction on by combining upper and lower halves. */
+ new_temp = new_phi_result;
+ while (sz > sz1)
+ {
+ gcc_assert (!slp_reduc);
+ sz /= 2;
+ vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
+
+ /* The target has to make sure we support lowpart/highpart
+ extraction, either via direct vector extract or through
+ an integer mode punning. */
+ tree dst1, dst2;
+ if (convert_optab_handler (vec_extract_optab,
+ TYPE_MODE (TREE_TYPE (new_temp)),
+ TYPE_MODE (vectype1))
+ != CODE_FOR_nothing)
+ {
+ /* Extract sub-vectors directly once vec_extract becomes
+ a conversion optab. */
+ dst1 = make_ssa_name (vectype1);
+ epilog_stmt
+ = gimple_build_assign (dst1, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, vectype1,
+ new_temp, TYPE_SIZE (vectype1),
+ bitsize_int (0)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst2 = make_ssa_name (vectype1);
+ epilog_stmt
+ = gimple_build_assign (dst2, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, vectype1,
+ new_temp, TYPE_SIZE (vectype1),
+ bitsize_int (sz * BITS_PER_UNIT)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
+ else
+ {
+ /* Extract via punning to appropriately sized integer mode
+ vector. */
+ tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
+ 1);
+ tree etype = build_vector_type (eltype, 2);
+ gcc_assert (convert_optab_handler (vec_extract_optab,
+ TYPE_MODE (etype),
+ TYPE_MODE (eltype))
+ != CODE_FOR_nothing);
+ tree tem = make_ssa_name (etype);
+ epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ etype, new_temp));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ new_temp = tem;
+ tem = make_ssa_name (eltype);
+ epilog_stmt
+ = gimple_build_assign (tem, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, eltype,
+ new_temp, TYPE_SIZE (eltype),
+ bitsize_int (0)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst1 = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype1, tem));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ tem = make_ssa_name (eltype);
+ epilog_stmt
+ = gimple_build_assign (tem, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, eltype,
+ new_temp, TYPE_SIZE (eltype),
+ bitsize_int (sz * BITS_PER_UNIT)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst2 = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype1, tem));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
+
+ new_temp = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
if (reduce_with_shift && !slp_reduc)
{
- int nelements = vec_size_in_bits / element_bitsize;
+ int nelements = TYPE_VECTOR_SUBPARTS (vectype1);
auto_vec_perm_indices sel (nelements);
int elt_offset;
- tree zero_vec = build_zero_cst (vectype);
+ tree zero_vec = build_zero_cst (vectype1);
/* Case 2: Create:
for (offset = nelements/2; offset >= 1; offset/=2)
{
@@ -5039,15 +5127,15 @@ vect_create_epilog_for_reduction (vec<tr
dump_printf_loc (MSG_NOTE, vect_location,
"Reduce using vector shifts\n");
- vec_dest = vect_create_destination_var (scalar_dest, vectype);
- new_temp = new_phi_result;
+ mode1 = TYPE_MODE (vectype1);
+ vec_dest = vect_create_destination_var (scalar_dest, vectype1);
for (elt_offset = nelements / 2;
elt_offset >= 1;
elt_offset /= 2)
{
sel.truncate (0);
- calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
- tree mask = vect_gen_perm_mask_any (vectype, sel);
+ calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
+ tree mask = vect_gen_perm_mask_any (vectype1, sel);
epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
new_temp, zero_vec, mask);
new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -5092,7 +5180,8 @@ vect_create_epilog_for_reduction (vec<tr
dump_printf_loc (MSG_NOTE, vect_location,
"Reduce using scalar code.\n");
- vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+ int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+ int element_bitsize = tree_to_uhwi (bitsize);
FOR_EACH_VEC_ELT (new_phis, i, new_phi)
{
int bit_offset;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c (revision 255197)
+++ gcc/tree-vect-stmts.c (working copy)
@@ -6514,7 +6514,7 @@ vect_gen_perm_mask_any (tree vectype, ve
mask_elt_type = lang_hooks.types.type_for_mode
(int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
- mask_type = get_vectype_for_scalar_type (mask_elt_type);
+ mask_type = get_same_sized_vectype (mask_elt_type, vectype);
auto_vec<tree, 32> mask_elts (nunits);
for (unsigned int i = 0; i < nunits; ++i)
@@ -9065,7 +9065,7 @@ free_stmt_vec_info (gimple *stmt)
Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported
by the target. */
-static tree
+tree
get_vectype_for_scalar_type_and_size (tree scalar_type, unsigned size)
{
tree orig_scalar_type = scalar_type;
Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h (revision 255197)
+++ gcc/tree-vectorizer.h (working copy)
@@ -1151,6 +1151,7 @@ extern bool vect_can_advance_ivs_p (loop
/* In tree-vect-stmts.c. */
extern unsigned int current_vector_size;
extern tree get_vectype_for_scalar_type (tree);
+extern tree get_vectype_for_scalar_type_and_size (tree, unsigned);
extern tree get_mask_type_for_scalar_type (tree);
extern tree get_same_sized_vectype (tree, tree);
extern bool vect_is_simple_use (tree, vec_info *, gimple **,
* Re: [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
2017-11-28 15:25 [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86) Richard Biener
@ 2017-12-05 20:18 ` Jeff Law
2017-12-06 6:42 ` Richard Biener
2018-01-05 9:01 ` Richard Biener
1 sibling, 1 reply; 8+ messages in thread
From: Jeff Law @ 2017-12-05 20:18 UTC (permalink / raw)
To: Richard Biener, gcc-patches; +Cc: Jan Hubicka, ubizjak, Martin Jambor
On 11/28/2017 08:15 AM, Richard Biener wrote:
>
> The following adds a new target hook, targetm.vectorize.split_reduction,
> which allows the target to specify a preferred mode to perform the
> final reduction on using either vector shifts or scalar extractions.
> Up to that mode the vector reduction result is reduced by combining
> lowparts and highparts recursively. This avoids lane-crossing operations
> when doing AVX256 on Zen and Bulldozer and also speeds up things on
> Haswell (I verified ~20% speedup on Broadwell).
>
> Thus the patch implements the target hook on x86 to _always_ prefer
> SSE modes for the final reduction.
>
> For the testcase in the bugzilla
>
> int sumint(const int arr[]) {
> arr = __builtin_assume_aligned(arr, 64);
> int sum=0;
> for (int i=0 ; i<1024 ; i++)
> sum+=arr[i];
> return sum;
> }
>
> this changes -O3 -mavx512f code from
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxord %zmm0, %zmm0, %zmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %zmm0, %zmm0
> addq $64, %rdi
> cmpq %rdi, %rax
> jne .L2
> vpxord %zmm1, %zmm1, %zmm1
> vshufi32x4 $78, %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovdqa64 .LC0(%rip), %zmm2
> vpermi2d %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovdqa64 .LC1(%rip), %zmm2
> vpermi2d %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovdqa64 .LC2(%rip), %zmm2
> vpermi2d %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovd %xmm0, %eax
>
> to
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxord %zmm0, %zmm0, %zmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %zmm0, %zmm0
> addq $64, %rdi
> cmpq %rdi, %rax
> jne .L2
> vextracti64x4 $0x1, %zmm0, %ymm1
> vpaddd %ymm0, %ymm1, %ymm1
> vmovdqa %xmm1, %xmm0
> vextracti128 $1, %ymm1, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vpsrldq $8, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vpsrldq $4, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vmovd %xmm0, %eax
>
> and for -O3 -mavx2 from
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxor %xmm0, %xmm0, %xmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %ymm0, %ymm0
> addq $32, %rdi
> cmpq %rdi, %rax
> jne .L2
> vpxor %xmm1, %xmm1, %xmm1
> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> vpaddd %ymm2, %ymm0, %ymm0
> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> vpalignr $8, %ymm0, %ymm2, %ymm2
> vpaddd %ymm2, %ymm0, %ymm0
> vperm2i128 $33, %ymm1, %ymm0, %ymm1
> vpalignr $4, %ymm0, %ymm1, %ymm1
> vpaddd %ymm1, %ymm0, %ymm0
> vmovd %xmm0, %eax
>
> to
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxor %xmm0, %xmm0, %xmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %ymm0, %ymm0
> addq $32, %rdi
> cmpq %rdi, %rax
> jne .L2
> vmovdqa %xmm0, %xmm1
> vextracti128 $1, %ymm0, %xmm0
> vpaddd %xmm0, %xmm1, %xmm0
> vpsrldq $8, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vpsrldq $4, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vmovd %xmm0, %eax
> vzeroupper
> ret
>
> which besides being faster is also smaller (fewer prefixes).
>
> SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
> affects reduction vectorization epilogues I didn't expect big effects
> but for loops that do not run much (more likely with AVX512).
>
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>
> Ok for trunk?
>
> The PR mentions some more tricks to optimize the sequence but
> those look like backend only optimizations.
>
> Thanks,
> Richard.
>
> 2017-11-28 Richard Biener <rguenther@suse.de>
>
> PR tree-optimization/80846
> * target.def (split_reduction): New target hook.
> * targhooks.c (default_split_reduction): New function.
> * targhooks.h (default_split_reduction): Declare.
> * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> target requests first reduce vectors by combining low and high
> parts.
> * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> (get_vectype_for_scalar_type_and_size): Export.
> * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
>
> * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> * doc/tm.texi: Regenerate.
>
> i386/
> * config/i386/i386.c (ix86_split_reduction): Implement
> TARGET_VECTORIZE_SPLIT_REDUCTION.
>
> * gcc.target/i386/pr80846-1.c: New testcase.
> * gcc.target/i386/pr80846-2.c: Likewise.
I'm not a big fan of introducing these kinds of target queries into the
gimple optimizers, but I think we've all agreed to allow them to varying
degrees within the vectorizer.
So no objections from me. You know the vectorizer bits far better than
I :-)
jeff
* Re: [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
2017-12-05 20:18 ` Jeff Law
@ 2017-12-06 6:42 ` Richard Biener
0 siblings, 0 replies; 8+ messages in thread
From: Richard Biener @ 2017-12-06 6:42 UTC (permalink / raw)
To: Jeff Law, gcc-patches; +Cc: Jan Hubicka, ubizjak, Martin Jambor
On December 5, 2017 9:18:46 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:
>On 11/28/2017 08:15 AM, Richard Biener wrote:
>>
>> The following adds a new target hook,
>targetm.vectorize.split_reduction,
>> which allows the target to specify a preferred mode to perform the
>> final reduction on using either vector shifts or scalar extractions.
>> Up to that mode the vector reduction result is reduced by combining
>> lowparts and highparts recursively. This avoids lane-crossing
>operations
>> when doing AVX256 on Zen and Bulldozer and also speeds up things on
>> Haswell (I verified ~20% speedup on Broadwell).
>>
>> Thus the patch implements the target hook on x86 to _always_ prefer
>> SSE modes for the final reduction.
>>
>> For the testcase in the bugzilla
>>
>> int sumint(const int arr[]) {
>> arr = __builtin_assume_aligned(arr, 64);
>> int sum=0;
>> for (int i=0 ; i<1024 ; i++)
>> sum+=arr[i];
>> return sum;
>> }
>>
>> this changes -O3 -mavx512f code from
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxord %zmm0, %zmm0, %zmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %zmm0, %zmm0
>> addq $64, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vpxord %zmm1, %zmm1, %zmm1
>> vshufi32x4 $78, %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovdqa64 .LC0(%rip), %zmm2
>> vpermi2d %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovdqa64 .LC1(%rip), %zmm2
>> vpermi2d %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovdqa64 .LC2(%rip), %zmm2
>> vpermi2d %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovd %xmm0, %eax
>>
>> to
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxord %zmm0, %zmm0, %zmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %zmm0, %zmm0
>> addq $64, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vextracti64x4 $0x1, %zmm0, %ymm1
>> vpaddd %ymm0, %ymm1, %ymm1
>> vmovdqa %xmm1, %xmm0
>> vextracti128 $1, %ymm1, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vpsrldq $8, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vpsrldq $4, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vmovd %xmm0, %eax
>>
>> and for -O3 -mavx2 from
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxor %xmm0, %xmm0, %xmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %ymm0, %ymm0
>> addq $32, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vpxor %xmm1, %xmm1, %xmm1
>> vperm2i128 $33, %ymm1, %ymm0, %ymm2
>> vpaddd %ymm2, %ymm0, %ymm0
>> vperm2i128 $33, %ymm1, %ymm0, %ymm2
>> vpalignr $8, %ymm0, %ymm2, %ymm2
>> vpaddd %ymm2, %ymm0, %ymm0
>> vperm2i128 $33, %ymm1, %ymm0, %ymm1
>> vpalignr $4, %ymm0, %ymm1, %ymm1
>> vpaddd %ymm1, %ymm0, %ymm0
>> vmovd %xmm0, %eax
>>
>> to
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxor %xmm0, %xmm0, %xmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %ymm0, %ymm0
>> addq $32, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vmovdqa %xmm0, %xmm1
>> vextracti128 $1, %ymm0, %xmm0
>> vpaddd %xmm0, %xmm1, %xmm0
>> vpsrldq $8, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vpsrldq $4, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vmovd %xmm0, %eax
>> vzeroupper
>> ret
>>
>> which besides being faster is also smaller (fewer prefixes).
>>
>> SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
>> affects reduction vectorization epilogues I didn't expect big effects
>> but for loops that do not run much (more likely with AVX512).
>>
>> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>>
>> Ok for trunk?
>>
>> The PR mentions some more tricks to optimize the sequence but
>> those look like backend only optimizations.
>>
>> Thanks,
>> Richard.
>>
>> 2017-11-28 Richard Biener <rguenther@suse.de>
>>
>> PR tree-optimization/80846
>> * target.def (split_reduction): New target hook.
>> * targhooks.c (default_split_reduction): New function.
>> * targhooks.h (default_split_reduction): Declare.
>> * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
>> target requests first reduce vectors by combining low and high
>> parts.
>> * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
>> (get_vectype_for_scalar_type_and_size): Export.
>> * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
>>
>> * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
>> * doc/tm.texi: Regenerate.
>>
>> i386/
>> * config/i386/i386.c (ix86_split_reduction): Implement
>> TARGET_VECTORIZE_SPLIT_REDUCTION.
>>
>> * gcc.target/i386/pr80846-1.c: New testcase.
>> * gcc.target/i386/pr80846-2.c: Likewise.
>I'm not a big fan of introducing these kinds of target queries into the
>gimple optimizers, but I think we've all agreed to allow them to
>varying
>degrees within the vectorizer.
>
>So no objections from me. You know the vectorizer bits far better than
>I :-)
I had first (ab)used the vector_sizes hook but Jakub convinced me to add a new one. There might be non-trivial costs when moving between vector sizes.
Richard.
>
>jeff
* Re: [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
2017-11-28 15:25 [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86) Richard Biener
2017-12-05 20:18 ` Jeff Law
@ 2018-01-05 9:01 ` Richard Biener
2018-01-09 23:13 ` Jeff Law
1 sibling, 1 reply; 8+ messages in thread
From: Richard Biener @ 2018-01-05 9:01 UTC (permalink / raw)
To: gcc-patches; +Cc: Jan Hubicka, ubizjak, Martin Jambor
On Tue, 28 Nov 2017, Richard Biener wrote:
>
> The following adds a new target hook, targetm.vectorize.split_reduction,
> which allows the target to specify a preferred mode to perform the
> final reduction on using either vector shifts or scalar extractions.
> Up to that mode the vector reduction result is reduced by combining
> lowparts and highparts recursively. This avoids lane-crossing operations
> when doing AVX256 on Zen and Bulldozer and also speeds up things on
> Haswell (I verified ~20% speedup on Broadwell).
>
> Thus the patch implements the target hook on x86 to _always_ prefer
> SSE modes for the final reduction.
>
> For the testcase in the bugzilla
>
> int sumint(const int arr[]) {
> arr = __builtin_assume_aligned(arr, 64);
> int sum=0;
> for (int i=0 ; i<1024 ; i++)
> sum+=arr[i];
> return sum;
> }
>
> this changes -O3 -mavx512f code from
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxord %zmm0, %zmm0, %zmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %zmm0, %zmm0
> addq $64, %rdi
> cmpq %rdi, %rax
> jne .L2
> vpxord %zmm1, %zmm1, %zmm1
> vshufi32x4 $78, %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovdqa64 .LC0(%rip), %zmm2
> vpermi2d %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovdqa64 .LC1(%rip), %zmm2
> vpermi2d %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovdqa64 .LC2(%rip), %zmm2
> vpermi2d %zmm1, %zmm0, %zmm2
> vpaddd %zmm2, %zmm0, %zmm0
> vmovd %xmm0, %eax
>
> to
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxord %zmm0, %zmm0, %zmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %zmm0, %zmm0
> addq $64, %rdi
> cmpq %rdi, %rax
> jne .L2
> vextracti64x4 $0x1, %zmm0, %ymm1
> vpaddd %ymm0, %ymm1, %ymm1
> vmovdqa %xmm1, %xmm0
> vextracti128 $1, %ymm1, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vpsrldq $8, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vpsrldq $4, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vmovd %xmm0, %eax
>
> and for -O3 -mavx2 from
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxor %xmm0, %xmm0, %xmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %ymm0, %ymm0
> addq $32, %rdi
> cmpq %rdi, %rax
> jne .L2
> vpxor %xmm1, %xmm1, %xmm1
> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> vpaddd %ymm2, %ymm0, %ymm0
> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> vpalignr $8, %ymm0, %ymm2, %ymm2
> vpaddd %ymm2, %ymm0, %ymm0
> vperm2i128 $33, %ymm1, %ymm0, %ymm1
> vpalignr $4, %ymm0, %ymm1, %ymm1
> vpaddd %ymm1, %ymm0, %ymm0
> vmovd %xmm0, %eax
>
> to
>
> sumint:
> .LFB0:
> .cfi_startproc
> vpxor %xmm0, %xmm0, %xmm0
> leaq 4096(%rdi), %rax
> .p2align 4,,10
> .p2align 3
> .L2:
> vpaddd (%rdi), %ymm0, %ymm0
> addq $32, %rdi
> cmpq %rdi, %rax
> jne .L2
> vmovdqa %xmm0, %xmm1
> vextracti128 $1, %ymm0, %xmm0
> vpaddd %xmm0, %xmm1, %xmm0
> vpsrldq $8, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vpsrldq $4, %xmm0, %xmm1
> vpaddd %xmm1, %xmm0, %xmm0
> vmovd %xmm0, %eax
> vzeroupper
> ret
>
> which besides being faster is also smaller (fewer prefixes).
>
> SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
> affects reduction vectorization epilogues I didn't expect big effects
> but for loops that do not run much (more likely with AVX512).
>
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>
> Ok for trunk?
Ping?
Richard.
> The PR mentions some more tricks to optimize the sequence but
> those look like backend only optimizations.
>
> Thanks,
> Richard.
>
> 2017-11-28 Richard Biener <rguenther@suse.de>
>
> PR tree-optimization/80846
> * target.def (split_reduction): New target hook.
> * targhooks.c (default_split_reduction): New function.
> * targhooks.h (default_split_reduction): Declare.
> * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> target requests first reduce vectors by combining low and high
> parts.
> * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> (get_vectype_for_scalar_type_and_size): Export.
> * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
>
> * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> * doc/tm.texi: Regenerate.
>
> i386/
> * config/i386/i386.c (ix86_split_reduction): Implement
> TARGET_VECTORIZE_SPLIT_REDUCTION.
>
> * gcc.target/i386/pr80846-1.c: New testcase.
> * gcc.target/i386/pr80846-2.c: Likewise.
>
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c (revision 255197)
> +++ gcc/config/i386/i386.c (working copy)
> @@ -48864,6 +48864,36 @@ ix86_preferred_simd_mode (scalar_mode mo
> }
> }
>
> +/* All CPUs prefer to avoid cross-lane operations so perform reductions
> + upper against lower halves up to SSE reg size. */
> +
> +static machine_mode
> +ix86_split_reduction (machine_mode mode)
> +{
> + /* Reduce lowpart against highpart until we reach SSE reg width to
> + avoid cross-lane operations. */
> + switch (mode)
> + {
> + case E_V16SImode:
> + case E_V8SImode:
> + return V4SImode;
> + case E_V32HImode:
> + case E_V16HImode:
> + return V8HImode;
> + case E_V64QImode:
> + case E_V32QImode:
> + return V16QImode;
> + case E_V16SFmode:
> + case E_V8SFmode:
> + return V4SFmode;
> + case E_V8DFmode:
> + case E_V4DFmode:
> + return V2DFmode;
> + default:
> + return mode;
> + }
> +}
> +
> /* If AVX is enabled then try vectorizing with both 256bit and 128bit
> vectors. If AVX512F is enabled then try vectorizing with 512bit,
> 256bit and 128bit vectors. */
> @@ -50486,6 +50516,9 @@ ix86_run_selftests (void)
> #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
> #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
> ix86_preferred_simd_mode
> +#undef TARGET_VECTORIZE_SPLIT_REDUCTION
> +#define TARGET_VECTORIZE_SPLIT_REDUCTION \
> + ix86_split_reduction
> #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
> #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
> ix86_autovectorize_vector_sizes
> Index: gcc/doc/tm.texi
> ===================================================================
> --- gcc/doc/tm.texi (revision 255197)
> +++ gcc/doc/tm.texi (working copy)
> @@ -5844,6 +5844,13 @@ equal to @code{word_mode}, because the v
> transformations even in absence of specialized @acronym{SIMD} hardware.
> @end deftypefn
>
> +@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode)
> +This hook should return the preferred mode to split the final reduction
> +step on @var{mode} to. The reduction is then carried out reducing upper
> +against lower halves of vectors recursively until the specified mode is
> +reached. The default is @var{mode} which means no splitting.
> +@end deftypefn
> +
> @deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (void)
> This hook should return a mask of sizes that should be iterated over
> after trying to autovectorize using the vector size derived from the
> Index: gcc/doc/tm.texi.in
> ===================================================================
> --- gcc/doc/tm.texi.in (revision 255197)
> +++ gcc/doc/tm.texi.in (working copy)
> @@ -4091,6 +4091,8 @@ address; but often a machine-dependent
>
> @hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
>
> +@hook TARGET_VECTORIZE_SPLIT_REDUCTION
> +
> @hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
>
> @hook TARGET_VECTORIZE_GET_MASK_MODE
> Index: gcc/target.def
> ===================================================================
> --- gcc/target.def (revision 255197)
> +++ gcc/target.def (working copy)
> @@ -1875,6 +1875,17 @@ transformations even in absence of speci
> (scalar_mode mode),
> default_preferred_simd_mode)
>
> +/* Returns the preferred mode for splitting SIMD reductions to. */
> +DEFHOOK
> +(split_reduction,
> + "This hook should return the preferred mode to split the final reduction\n\
> +step on @var{mode} to. The reduction is then carried out reducing upper\n\
> +against lower halves of vectors recursively until the specified mode is\n\
> +reached. The default is @var{mode} which means no splitting.",
> + machine_mode,
> + (machine_mode),
> + default_split_reduction)
> +
> /* Returns a mask of vector sizes to iterate over when auto-vectorizing
> after processing the preferred one derived from preferred_simd_mode. */
> DEFHOOK
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c (revision 255197)
> +++ gcc/targhooks.c (working copy)
> @@ -1281,6 +1281,14 @@ default_preferred_simd_mode (scalar_mode
> return word_mode;
> }
>
> +/* By default do not split reductions further. */
> +
> +machine_mode
> +default_split_reduction (machine_mode mode)
> +{
> + return mode;
> +}
> +
> /* By default only the size derived from the preferred vector mode
> is tried. */
>
> Index: gcc/targhooks.h
> ===================================================================
> --- gcc/targhooks.h (revision 255197)
> +++ gcc/targhooks.h (working copy)
> @@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
> const_tree,
> int, bool);
> extern machine_mode default_preferred_simd_mode (scalar_mode mode);
> +extern machine_mode default_split_reduction (machine_mode);
> extern unsigned int default_autovectorize_vector_sizes (void);
> extern opt_machine_mode default_get_mask_mode (unsigned, unsigned);
> extern void *default_init_cost (struct loop *);
> Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr80846-1.c (nonexistent)
> +++ gcc/testsuite/gcc.target/i386/pr80846-1.c (working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f" } */
> +
> +int sumint(const int arr[]) {
> + arr = __builtin_assume_aligned(arr, 64);
> + int sum=0;
> + for (int i=0 ; i<1024 ; i++)
> + sum+=arr[i];
> + return sum;
> +}
> +
> +/* { dg-final { scan-assembler-times "vextracti" 2 } } */
> Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr80846-2.c (nonexistent)
> +++ gcc/testsuite/gcc.target/i386/pr80846-2.c (working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2" } */
> +
> +int sumint(const int arr[]) {
> + arr = __builtin_assume_aligned(arr, 64);
> + int sum=0;
> + for (int i=0 ; i<1024 ; i++)
> + sum+=arr[i];
> + return sum;
> +}
> +
> +/* { dg-final { scan-assembler-times "vextracti" 1 } } */
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c (revision 255197)
> +++ gcc/tree-vect-loop.c (working copy)
> @@ -4994,38 +4994,126 @@ vect_create_epilog_for_reduction (vec<tr
> }
> else
> {
> - bool reduce_with_shift = have_whole_vector_shift (mode);
> - int element_bitsize = tree_to_uhwi (bitsize);
> - int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
> + bool reduce_with_shift;
> tree vec_temp;
> -
> +
> /* COND reductions all do the final reduction with MAX_EXPR. */
> if (code == COND_EXPR)
> code = MAX_EXPR;
>
> - /* Regardless of whether we have a whole vector shift, if we're
> - emulating the operation via tree-vect-generic, we don't want
> - to use it. Only the first round of the reduction is likely
> - to still be profitable via emulation. */
> - /* ??? It might be better to emit a reduction tree code here, so that
> - tree-vect-generic can expand the first round via bit tricks. */
> - if (!VECTOR_MODE_P (mode))
> - reduce_with_shift = false;
> + /* See if the target wants to do the final (shift) reduction
> + in a vector mode of smaller size and first reduce upper/lower
> + halves against each other. */
> + enum machine_mode mode1 = mode;
> + tree vectype1 = vectype;
> + unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
> + unsigned sz1 = sz;
> + if (!slp_reduc
> + && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
> + sz1 = GET_MODE_SIZE (mode1);
> +
> + vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
> + reduce_with_shift = have_whole_vector_shift (mode1);
> + if (!VECTOR_MODE_P (mode1))
> + reduce_with_shift = false;
> else
> - {
> - optab optab = optab_for_tree_code (code, vectype, optab_default);
> - if (optab_handler (optab, mode) == CODE_FOR_nothing)
> - reduce_with_shift = false;
> - }
> + {
> + optab optab = optab_for_tree_code (code, vectype1, optab_default);
> + if (optab_handler (optab, mode1) == CODE_FOR_nothing)
> + reduce_with_shift = false;
> + }
> +
> + /* First reduce the vector to the desired vector size we should
> + do shift reduction on by combining upper and lower halves. */
> + new_temp = new_phi_result;
> + while (sz > sz1)
> + {
> + gcc_assert (!slp_reduc);
> + sz /= 2;
> + vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
> +
> + /* The target has to make sure we support lowpart/highpart
> + extraction, either via direct vector extract or through
> + an integer mode punning. */
> + tree dst1, dst2;
> + if (convert_optab_handler (vec_extract_optab,
> + TYPE_MODE (TREE_TYPE (new_temp)),
> + TYPE_MODE (vectype1))
> + != CODE_FOR_nothing)
> + {
> + /* Extract sub-vectors directly once vec_extract becomes
> + a conversion optab. */
> + dst1 = make_ssa_name (vectype1);
> + epilog_stmt
> + = gimple_build_assign (dst1, BIT_FIELD_REF,
> + build3 (BIT_FIELD_REF, vectype1,
> + new_temp, TYPE_SIZE (vectype1),
> + bitsize_int (0)));
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + dst2 = make_ssa_name (vectype1);
> + epilog_stmt
> + = gimple_build_assign (dst2, BIT_FIELD_REF,
> + build3 (BIT_FIELD_REF, vectype1,
> + new_temp, TYPE_SIZE (vectype1),
> + bitsize_int (sz * BITS_PER_UNIT)));
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + }
> + else
> + {
> + /* Extract via punning to appropriately sized integer mode
> + vector. */
> + tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
> + 1);
> + tree etype = build_vector_type (eltype, 2);
> + gcc_assert (convert_optab_handler (vec_extract_optab,
> + TYPE_MODE (etype),
> + TYPE_MODE (eltype))
> + != CODE_FOR_nothing);
> + tree tem = make_ssa_name (etype);
> + epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
> + build1 (VIEW_CONVERT_EXPR,
> + etype, new_temp));
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + new_temp = tem;
> + tem = make_ssa_name (eltype);
> + epilog_stmt
> + = gimple_build_assign (tem, BIT_FIELD_REF,
> + build3 (BIT_FIELD_REF, eltype,
> + new_temp, TYPE_SIZE (eltype),
> + bitsize_int (0)));
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + dst1 = make_ssa_name (vectype1);
> + epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
> + build1 (VIEW_CONVERT_EXPR,
> + vectype1, tem));
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + tem = make_ssa_name (eltype);
> + epilog_stmt
> + = gimple_build_assign (tem, BIT_FIELD_REF,
> + build3 (BIT_FIELD_REF, eltype,
> + new_temp, TYPE_SIZE (eltype),
> + bitsize_int (sz * BITS_PER_UNIT)));
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + dst2 = make_ssa_name (vectype1);
> + epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
> + build1 (VIEW_CONVERT_EXPR,
> + vectype1, tem));
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + }
> +
> + new_temp = make_ssa_name (vectype1);
> + epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
> + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> + }
>
> if (reduce_with_shift && !slp_reduc)
> {
> - int nelements = vec_size_in_bits / element_bitsize;
> + int nelements = TYPE_VECTOR_SUBPARTS (vectype1);
> auto_vec_perm_indices sel (nelements);
>
> int elt_offset;
>
> - tree zero_vec = build_zero_cst (vectype);
> + tree zero_vec = build_zero_cst (vectype1);
> /* Case 2: Create:
> for (offset = nelements/2; offset >= 1; offset/=2)
> {
> @@ -5039,15 +5127,15 @@ vect_create_epilog_for_reduction (vec<tr
> dump_printf_loc (MSG_NOTE, vect_location,
> "Reduce using vector shifts\n");
>
> - vec_dest = vect_create_destination_var (scalar_dest, vectype);
> - new_temp = new_phi_result;
> + mode1 = TYPE_MODE (vectype1);
> + vec_dest = vect_create_destination_var (scalar_dest, vectype1);
> for (elt_offset = nelements / 2;
> elt_offset >= 1;
> elt_offset /= 2)
> {
> sel.truncate (0);
> - calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
> - tree mask = vect_gen_perm_mask_any (vectype, sel);
> + calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
> + tree mask = vect_gen_perm_mask_any (vectype1, sel);
> epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
> new_temp, zero_vec, mask);
> new_name = make_ssa_name (vec_dest, epilog_stmt);
> @@ -5092,7 +5180,8 @@ vect_create_epilog_for_reduction (vec<tr
> dump_printf_loc (MSG_NOTE, vect_location,
> "Reduce using scalar code.\n");
>
> - vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
> + int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
> + int element_bitsize = tree_to_uhwi (bitsize);
> FOR_EACH_VEC_ELT (new_phis, i, new_phi)
> {
> int bit_offset;
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c (revision 255197)
> +++ gcc/tree-vect-stmts.c (working copy)
> @@ -6514,7 +6514,7 @@ vect_gen_perm_mask_any (tree vectype, ve
>
> mask_elt_type = lang_hooks.types.type_for_mode
> (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
> - mask_type = get_vectype_for_scalar_type (mask_elt_type);
> + mask_type = get_same_sized_vectype (mask_elt_type, vectype);
>
> auto_vec<tree, 32> mask_elts (nunits);
> for (unsigned int i = 0; i < nunits; ++i)
> @@ -9065,7 +9065,7 @@ free_stmt_vec_info (gimple *stmt)
> Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported
> by the target. */
>
> -static tree
> +tree
> get_vectype_for_scalar_type_and_size (tree scalar_type, unsigned size)
> {
> tree orig_scalar_type = scalar_type;
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h (revision 255197)
> +++ gcc/tree-vectorizer.h (working copy)
> @@ -1151,6 +1151,7 @@ extern bool vect_can_advance_ivs_p (loop
> /* In tree-vect-stmts.c. */
> extern unsigned int current_vector_size;
> extern tree get_vectype_for_scalar_type (tree);
> +extern tree get_vectype_for_scalar_type_and_size (tree, unsigned);
> extern tree get_mask_type_for_scalar_type (tree);
> extern tree get_same_sized_vectype (tree, tree);
> extern bool vect_is_simple_use (tree, vec_info *, gimple **,
>
--
Richard Biener <rguenther@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
* Re: [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
2018-01-05 9:01 ` Richard Biener
@ 2018-01-09 23:13 ` Jeff Law
2018-01-10 8:23 ` Richard Biener
0 siblings, 1 reply; 8+ messages in thread
From: Jeff Law @ 2018-01-09 23:13 UTC (permalink / raw)
To: Richard Biener, gcc-patches; +Cc: Jan Hubicka, ubizjak, Martin Jambor
On 01/05/2018 02:01 AM, Richard Biener wrote:
> On Tue, 28 Nov 2017, Richard Biener wrote:
>
>>
>> The following adds a new target hook, targetm.vectorize.split_reduction,
>> which allows the target to specify a preferred mode to perform the
>> final reduction on using either vector shifts or scalar extractions.
>> Up to that mode the vector reduction result is reduced by combining
>> lowparts and highparts recursively. This avoids lane-crossing operations
>> when doing AVX256 on Zen and Bulldozer and also speeds up things on
>> Haswell (I verified ~20% speedup on Broadwell).
>>
>> Thus the patch implements the target hook on x86 to _always_ prefer
>> SSE modes for the final reduction.
>>
>> For the testcase in the bugzilla
>>
>> int sumint(const int arr[]) {
>> arr = __builtin_assume_aligned(arr, 64);
>> int sum=0;
>> for (int i=0 ; i<1024 ; i++)
>> sum+=arr[i];
>> return sum;
>> }
>>
>> this changes -O3 -mavx512f code from
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxord %zmm0, %zmm0, %zmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %zmm0, %zmm0
>> addq $64, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vpxord %zmm1, %zmm1, %zmm1
>> vshufi32x4 $78, %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovdqa64 .LC0(%rip), %zmm2
>> vpermi2d %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovdqa64 .LC1(%rip), %zmm2
>> vpermi2d %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovdqa64 .LC2(%rip), %zmm2
>> vpermi2d %zmm1, %zmm0, %zmm2
>> vpaddd %zmm2, %zmm0, %zmm0
>> vmovd %xmm0, %eax
>>
>> to
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxord %zmm0, %zmm0, %zmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %zmm0, %zmm0
>> addq $64, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vextracti64x4 $0x1, %zmm0, %ymm1
>> vpaddd %ymm0, %ymm1, %ymm1
>> vmovdqa %xmm1, %xmm0
>> vextracti128 $1, %ymm1, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vpsrldq $8, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vpsrldq $4, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vmovd %xmm0, %eax
>>
>> and for -O3 -mavx2 from
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxor %xmm0, %xmm0, %xmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %ymm0, %ymm0
>> addq $32, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vpxor %xmm1, %xmm1, %xmm1
>> vperm2i128 $33, %ymm1, %ymm0, %ymm2
>> vpaddd %ymm2, %ymm0, %ymm0
>> vperm2i128 $33, %ymm1, %ymm0, %ymm2
>> vpalignr $8, %ymm0, %ymm2, %ymm2
>> vpaddd %ymm2, %ymm0, %ymm0
>> vperm2i128 $33, %ymm1, %ymm0, %ymm1
>> vpalignr $4, %ymm0, %ymm1, %ymm1
>> vpaddd %ymm1, %ymm0, %ymm0
>> vmovd %xmm0, %eax
>>
>> to
>>
>> sumint:
>> .LFB0:
>> .cfi_startproc
>> vpxor %xmm0, %xmm0, %xmm0
>> leaq 4096(%rdi), %rax
>> .p2align 4,,10
>> .p2align 3
>> .L2:
>> vpaddd (%rdi), %ymm0, %ymm0
>> addq $32, %rdi
>> cmpq %rdi, %rax
>> jne .L2
>> vmovdqa %xmm0, %xmm1
>> vextracti128 $1, %ymm0, %xmm0
>> vpaddd %xmm0, %xmm1, %xmm0
>> vpsrldq $8, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vpsrldq $4, %xmm0, %xmm1
>> vpaddd %xmm1, %xmm0, %xmm0
>> vmovd %xmm0, %eax
>> vzeroupper
>> ret
>>
>> which besides being faster is also smaller (fewer prefixes).
>>
>> SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
>> affects reduction vectorization epilogues I didn't expect big effects
>> but for loops that do not run much (more likely with AVX512).
>>
>> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>>
>> Ok for trunk?
>
> Ping?
>
> Richard.
>
>> The PR mentions some more tricks to optimize the sequence but
>> those look like backend only optimizations.
>>
>> Thanks,
>> Richard.
>>
>> 2017-11-28 Richard Biener <rguenther@suse.de>
>>
>> PR tree-optimization/80846
>> * target.def (split_reduction): New target hook.
>> * targhooks.c (default_split_reduction): New function.
>> * targhooks.h (default_split_reduction): Declare.
>> * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
>> target requests first reduce vectors by combining low and high
>> parts.
>> * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
>> (get_vectype_for_scalar_type_and_size): Export.
>> * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
>>
>> * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
>> * doc/tm.texi: Regenerate.
>>
>> i386/
>> * config/i386/i386.c (ix86_split_reduction): Implement
>> TARGET_VECTORIZE_SPLIT_REDUCTION.
>>
>> * gcc.target/i386/pr80846-1.c: New testcase.
>> * gcc.target/i386/pr80846-2.c: Likewise.
I've got no objections here and you know this code far better than I.
OK.
jeff
* Re: [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
2018-01-09 23:13 ` Jeff Law
@ 2018-01-10 8:23 ` Richard Biener
2018-01-11 12:11 ` Richard Biener
2018-01-11 16:21 ` Jeff Law
0 siblings, 2 replies; 8+ messages in thread
From: Richard Biener @ 2018-01-10 8:23 UTC (permalink / raw)
To: Jeff Law; +Cc: gcc-patches, Jan Hubicka, ubizjak, Martin Jambor
On Tue, 9 Jan 2018, Jeff Law wrote:
> On 01/05/2018 02:01 AM, Richard Biener wrote:
> > On Tue, 28 Nov 2017, Richard Biener wrote:
> >
> >>
> >> The following adds a new target hook, targetm.vectorize.split_reduction,
> >> which allows the target to specify a preferred mode to perform the
> >> final reduction on using either vector shifts or scalar extractions.
> >> Up to that mode the vector reduction result is reduced by combining
> >> lowparts and highparts recursively. This avoids lane-crossing operations
> >> when doing AVX256 on Zen and Bulldozer and also speeds up things on
> >> Haswell (I verified ~20% speedup on Broadwell).
> >>
> >> Thus the patch implements the target hook on x86 to _always_ prefer
> >> SSE modes for the final reduction.
> >>
> >> For the testcase in the bugzilla
> >>
> >> int sumint(const int arr[]) {
> >> arr = __builtin_assume_aligned(arr, 64);
> >> int sum=0;
> >> for (int i=0 ; i<1024 ; i++)
> >> sum+=arr[i];
> >> return sum;
> >> }
> >>
> >> this changes -O3 -mavx512f code from
> >>
> >> sumint:
> >> .LFB0:
> >> .cfi_startproc
> >> vpxord %zmm0, %zmm0, %zmm0
> >> leaq 4096(%rdi), %rax
> >> .p2align 4,,10
> >> .p2align 3
> >> .L2:
> >> vpaddd (%rdi), %zmm0, %zmm0
> >> addq $64, %rdi
> >> cmpq %rdi, %rax
> >> jne .L2
> >> vpxord %zmm1, %zmm1, %zmm1
> >> vshufi32x4 $78, %zmm1, %zmm0, %zmm2
> >> vpaddd %zmm2, %zmm0, %zmm0
> >> vmovdqa64 .LC0(%rip), %zmm2
> >> vpermi2d %zmm1, %zmm0, %zmm2
> >> vpaddd %zmm2, %zmm0, %zmm0
> >> vmovdqa64 .LC1(%rip), %zmm2
> >> vpermi2d %zmm1, %zmm0, %zmm2
> >> vpaddd %zmm2, %zmm0, %zmm0
> >> vmovdqa64 .LC2(%rip), %zmm2
> >> vpermi2d %zmm1, %zmm0, %zmm2
> >> vpaddd %zmm2, %zmm0, %zmm0
> >> vmovd %xmm0, %eax
> >>
> >> to
> >>
> >> sumint:
> >> .LFB0:
> >> .cfi_startproc
> >> vpxord %zmm0, %zmm0, %zmm0
> >> leaq 4096(%rdi), %rax
> >> .p2align 4,,10
> >> .p2align 3
> >> .L2:
> >> vpaddd (%rdi), %zmm0, %zmm0
> >> addq $64, %rdi
> >> cmpq %rdi, %rax
> >> jne .L2
> >> vextracti64x4 $0x1, %zmm0, %ymm1
> >> vpaddd %ymm0, %ymm1, %ymm1
> >> vmovdqa %xmm1, %xmm0
> >> vextracti128 $1, %ymm1, %xmm1
> >> vpaddd %xmm1, %xmm0, %xmm0
> >> vpsrldq $8, %xmm0, %xmm1
> >> vpaddd %xmm1, %xmm0, %xmm0
> >> vpsrldq $4, %xmm0, %xmm1
> >> vpaddd %xmm1, %xmm0, %xmm0
> >> vmovd %xmm0, %eax
> >>
> >> and for -O3 -mavx2 from
> >>
> >> sumint:
> >> .LFB0:
> >> .cfi_startproc
> >> vpxor %xmm0, %xmm0, %xmm0
> >> leaq 4096(%rdi), %rax
> >> .p2align 4,,10
> >> .p2align 3
> >> .L2:
> >> vpaddd (%rdi), %ymm0, %ymm0
> >> addq $32, %rdi
> >> cmpq %rdi, %rax
> >> jne .L2
> >> vpxor %xmm1, %xmm1, %xmm1
> >> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> >> vpaddd %ymm2, %ymm0, %ymm0
> >> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> >> vpalignr $8, %ymm0, %ymm2, %ymm2
> >> vpaddd %ymm2, %ymm0, %ymm0
> >> vperm2i128 $33, %ymm1, %ymm0, %ymm1
> >> vpalignr $4, %ymm0, %ymm1, %ymm1
> >> vpaddd %ymm1, %ymm0, %ymm0
> >> vmovd %xmm0, %eax
> >>
> >> to
> >>
> >> sumint:
> >> .LFB0:
> >> .cfi_startproc
> >> vpxor %xmm0, %xmm0, %xmm0
> >> leaq 4096(%rdi), %rax
> >> .p2align 4,,10
> >> .p2align 3
> >> .L2:
> >> vpaddd (%rdi), %ymm0, %ymm0
> >> addq $32, %rdi
> >> cmpq %rdi, %rax
> >> jne .L2
> >> vmovdqa %xmm0, %xmm1
> >> vextracti128 $1, %ymm0, %xmm0
> >> vpaddd %xmm0, %xmm1, %xmm0
> >> vpsrldq $8, %xmm0, %xmm1
> >> vpaddd %xmm1, %xmm0, %xmm0
> >> vpsrldq $4, %xmm0, %xmm1
> >> vpaddd %xmm1, %xmm0, %xmm0
> >> vmovd %xmm0, %eax
> >> vzeroupper
> >> ret
> >>
> >> which besides being faster is also smaller (fewer prefixes).
> >>
> >> SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
> >> affects reduction vectorization epilogues I didn't expect big effects
> >> but for loops that do not run much (more likely with AVX512).
> >>
> >> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >>
> >> Ok for trunk?
> >
> > Ping?
> >
> > Richard.
> >
> >> The PR mentions some more tricks to optimize the sequence but
> >> those look like backend only optimizations.
> >>
> >> Thanks,
> >> Richard.
> >>
> >> 2017-11-28 Richard Biener <rguenther@suse.de>
> >>
> >> PR tree-optimization/80846
> >> * target.def (split_reduction): New target hook.
> >> * targhooks.c (default_split_reduction): New function.
> >> * targhooks.h (default_split_reduction): Declare.
> >> * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> >> target requests first reduce vectors by combining low and high
> >> parts.
> >> * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> >> (get_vectype_for_scalar_type_and_size): Export.
> >> * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
> >>
> >> * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> >> * doc/tm.texi: Regenerate.
> >>
> >> i386/
> >> * config/i386/i386.c (ix86_split_reduction): Implement
> >> TARGET_VECTORIZE_SPLIT_REDUCTION.
> >>
> >> * gcc.target/i386/pr80846-1.c: New testcase.
> >> * gcc.target/i386/pr80846-2.c: Likewise.
> I've got no objections here and you know this code far better than I.
I was really looking for an x86 maintainer ack for the target hook
implementation, which I just quote here for reference again:
+/* All CPUs perfer to avoid cross-lane operations so perform reductions
+ upper against lower halves up to SSE reg size. */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+ /* Reduce lowpart against highpart until we reach SSE reg width to
+ avoid cross-lane operations. */
+ switch (mode)
+ {
+ case E_V16SImode:
+ case E_V8SImode:
+ return V4SImode;
+ case E_V32HImode:
+ case E_V16HImode:
+ return V8HImode;
+ case E_V64QImode:
+ case E_V32QImode:
+ return V16QImode;
+ case E_V16SFmode:
+ case E_V8SFmode:
+ return V4SFmode;
+ case E_V8DFmode:
+ case E_V4DFmode:
+ return V2DFmode;
+ default:
+ return mode;
+ }
+}
this means we'll do [zmm -> ymm] -> xmm (it looks like I forgot VnDImode
in the above list; consider that added).
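Purely as an illustration (not part of the patch), the epilogue shape the
hook asks for on a V16SImode reduction is roughly the following C, written
with GNU vector extensions; reduce_v16si is just a made-up name for the
sketch:

typedef int v16si __attribute__((vector_size (64)));
typedef int v8si __attribute__((vector_size (32)));
typedef int v4si __attribute__((vector_size (16)));

int
reduce_v16si (v16si acc)
{
  /* zmm -> ymm: add the upper half onto the lower half.  */
  v8si lo8, hi8;
  __builtin_memcpy (&lo8, &acc, sizeof lo8);
  __builtin_memcpy (&hi8, (char *) &acc + sizeof lo8, sizeof hi8);
  v8si sum8 = lo8 + hi8;

  /* ymm -> xmm: same again, reaching the SSE width the hook returns.  */
  v4si lo4, hi4;
  __builtin_memcpy (&lo4, &sum8, sizeof lo4);
  __builtin_memcpy (&hi4, (char *) &sum8 + sizeof lo4, sizeof hi4);
  v4si sum4 = lo4 + hi4;

  /* Finish inside the SSE register with whole-vector shifts.  */
  v4si m1 = { 2, 3, 2, 3 };
  sum4 += __builtin_shuffle (sum4, m1);
  v4si m2 = { 1, 0, 3, 2 };
  sum4 += __builtin_shuffle (sum4, m2);
  return sum4[0];
}

The two halvings correspond to the vextract/vpaddd steps and the two
shuffles to the vpsrldq/vpaddd steps in the assembly quoted above.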
Richard.
* Re: [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
2018-01-10 8:23 ` Richard Biener
@ 2018-01-11 12:11 ` Richard Biener
2018-01-11 16:21 ` Jeff Law
1 sibling, 0 replies; 8+ messages in thread
From: Richard Biener @ 2018-01-11 12:11 UTC (permalink / raw)
To: Jeff Law; +Cc: gcc-patches, Jan Hubicka, ubizjak, Martin Jambor
On Wed, 10 Jan 2018, Richard Biener wrote:
> On Tue, 9 Jan 2018, Jeff Law wrote:
>
> > On 01/05/2018 02:01 AM, Richard Biener wrote:
> > > On Tue, 28 Nov 2017, Richard Biener wrote:
> > >
> > >>
> > >> The following adds a new target hook, targetm.vectorize.split_reduction,
> > >> which allows the target to specify a preferred mode to perform the
> > >> final reducion on using either vector shifts or scalar extractions.
> > >> Up to that mode the vector reduction result is reduced by combining
> > >> lowparts and highparts recursively. This avoids lane-crossing operations
> > >> when doing AVX256 on Zen and Bulldozer and also speeds up things on
> > >> Haswell (I verified ~20% speedup on Broadwell).
> > >>
> > >> Thus the patch implements the target hook on x86 to _always_ prefer
> > >> SSE modes for the final reduction.
> > >>
> > >> For the testcase in the bugzilla
> > >>
> > >> int sumint(const int arr[]) {
> > >> arr = __builtin_assume_aligned(arr, 64);
> > >> int sum=0;
> > >> for (int i=0 ; i<1024 ; i++)
> > >> sum+=arr[i];
> > >> return sum;
> > >> }
> > >>
> > >> this changes -O3 -mavx512f code from
> > >>
> > >> sumint:
> > >> .LFB0:
> > >> .cfi_startproc
> > >> vpxord %zmm0, %zmm0, %zmm0
> > >> leaq 4096(%rdi), %rax
> > >> .p2align 4,,10
> > >> .p2align 3
> > >> .L2:
> > >> vpaddd (%rdi), %zmm0, %zmm0
> > >> addq $64, %rdi
> > >> cmpq %rdi, %rax
> > >> jne .L2
> > >> vpxord %zmm1, %zmm1, %zmm1
> > >> vshufi32x4 $78, %zmm1, %zmm0, %zmm2
> > >> vpaddd %zmm2, %zmm0, %zmm0
> > >> vmovdqa64 .LC0(%rip), %zmm2
> > >> vpermi2d %zmm1, %zmm0, %zmm2
> > >> vpaddd %zmm2, %zmm0, %zmm0
> > >> vmovdqa64 .LC1(%rip), %zmm2
> > >> vpermi2d %zmm1, %zmm0, %zmm2
> > >> vpaddd %zmm2, %zmm0, %zmm0
> > >> vmovdqa64 .LC2(%rip), %zmm2
> > >> vpermi2d %zmm1, %zmm0, %zmm2
> > >> vpaddd %zmm2, %zmm0, %zmm0
> > >> vmovd %xmm0, %eax
> > >>
> > >> to
> > >>
> > >> sumint:
> > >> .LFB0:
> > >> .cfi_startproc
> > >> vpxord %zmm0, %zmm0, %zmm0
> > >> leaq 4096(%rdi), %rax
> > >> .p2align 4,,10
> > >> .p2align 3
> > >> .L2:
> > >> vpaddd (%rdi), %zmm0, %zmm0
> > >> addq $64, %rdi
> > >> cmpq %rdi, %rax
> > >> jne .L2
> > >> vextracti64x4 $0x1, %zmm0, %ymm1
> > >> vpaddd %ymm0, %ymm1, %ymm1
> > >> vmovdqa %xmm1, %xmm0
> > >> vextracti128 $1, %ymm1, %xmm1
> > >> vpaddd %xmm1, %xmm0, %xmm0
> > >> vpsrldq $8, %xmm0, %xmm1
> > >> vpaddd %xmm1, %xmm0, %xmm0
> > >> vpsrldq $4, %xmm0, %xmm1
> > >> vpaddd %xmm1, %xmm0, %xmm0
> > >> vmovd %xmm0, %eax
> > >>
> > >> and for -O3 -mavx2 from
> > >>
> > >> sumint:
> > >> .LFB0:
> > >> .cfi_startproc
> > >> vpxor %xmm0, %xmm0, %xmm0
> > >> leaq 4096(%rdi), %rax
> > >> .p2align 4,,10
> > >> .p2align 3
> > >> .L2:
> > >> vpaddd (%rdi), %ymm0, %ymm0
> > >> addq $32, %rdi
> > >> cmpq %rdi, %rax
> > >> jne .L2
> > >> vpxor %xmm1, %xmm1, %xmm1
> > >> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> > >> vpaddd %ymm2, %ymm0, %ymm0
> > >> vperm2i128 $33, %ymm1, %ymm0, %ymm2
> > >> vpalignr $8, %ymm0, %ymm2, %ymm2
> > >> vpaddd %ymm2, %ymm0, %ymm0
> > >> vperm2i128 $33, %ymm1, %ymm0, %ymm1
> > >> vpalignr $4, %ymm0, %ymm1, %ymm1
> > >> vpaddd %ymm1, %ymm0, %ymm0
> > >> vmovd %xmm0, %eax
> > >>
> > >> to
> > >>
> > >> sumint:
> > >> .LFB0:
> > >> .cfi_startproc
> > >> vpxor %xmm0, %xmm0, %xmm0
> > >> leaq 4096(%rdi), %rax
> > >> .p2align 4,,10
> > >> .p2align 3
> > >> .L2:
> > >> vpaddd (%rdi), %ymm0, %ymm0
> > >> addq $32, %rdi
> > >> cmpq %rdi, %rax
> > >> jne .L2
> > >> vmovdqa %xmm0, %xmm1
> > >> vextracti128 $1, %ymm0, %xmm0
> > >> vpaddd %xmm0, %xmm1, %xmm0
> > >> vpsrldq $8, %xmm0, %xmm1
> > >> vpaddd %xmm1, %xmm0, %xmm0
> > >> vpsrldq $4, %xmm0, %xmm1
> > >> vpaddd %xmm1, %xmm0, %xmm0
> > >> vmovd %xmm0, %eax
> > >> vzeroupper
> > >> ret
> > >>
> > >> which besides being faster is also smaller (less prefixes).
> > >>
> > >> SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
> > >> effects reduction vectorization epilogues I didn't expect big effects
> > >> but for loops that do not run much (more likely with AVX512).
> > >>
> > >> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> > >>
> > >> Ok for trunk?
> > >
> > > Ping?
> > >
> > > Richard.
> > >
> > >> The PR mentions some more tricks to optimize the sequence but
> > >> those look like backend only optimizations.
> > >>
> > >> Thanks,
> > >> Richard.
> > >>
> > >> 2017-11-28 Richard Biener <rguenther@suse.de>
> > >>
> > >> PR tree-optimization/80846
> > >> * target.def (split_reduction): New target hook.
> > >> * targhooks.c (default_split_reduction): New function.
> > >> * targhooks.h (default_split_reduction): Declare.
> > >> * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> > >> target requests first reduce vectors by combining low and high
> > >> parts.
> > >> * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> > >> (get_vectype_for_scalar_type_and_size): Export.
> > >> * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
> > >>
> > >> * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> > >> * doc/tm.texi: Regenerate.
> > >>
> > >> i386/
> > >> * config/i386/i386.c (ix86_split_reduction): Implement
> > >> TARGET_VECTORIZE_SPLIT_REDUCTION.
> > >>
> > >> * gcc.target/i386/pr80846-1.c: New testcase.
> > >> * gcc.target/i386/pr80846-2.c: Likewise.
> > I've got no objections here and you know this code far better than I.
>
> I was really looking for x86 maintainer ack for the target hook
> implementation which I just quote here for reference again:
>
> +/* All CPUs perfer to avoid cross-lane operations so perform reductions
> + upper against lower halves up to SSE reg size. */
> +
> +static machine_mode
> +ix86_split_reduction (machine_mode mode)
> +{
> + /* Reduce lowpart against highpart until we reach SSE reg width to
> + avoid cross-lane operations. */
> + switch (mode)
> + {
> + case E_V16SImode:
> + case E_V8SImode:
> + return V4SImode;
> + case E_V32HImode:
> + case E_V16HImode:
> + return V8HImode;
> + case E_V64QImode:
> + case E_V32QImode:
> + return V16QImode;
> + case E_V16SFmode:
> + case E_V8SFmode:
> + return V4SFmode;
> + case E_V8DFmode:
> + case E_V4DFmode:
> + return V2DFmode;
> + default:
> + return mode;
> + }
> +}
>
> this means we'll do [zmm -> ymm] -> xmm (looks like I forgot VnDImode
> in the above list, consider that added).
For the record, here's what I am re-testing (crossed with some SVE
changes again...). Also, we have regressed in code generation
for the pr80846-1.c testcase I am going to add, and for some odd reason
we have started spilling the accumulator:
vmovdqa64 %zmm4, -64(%rsp)
.p2align 4,,10
.p2align 3
.L2:
vmovdqa64 -64(%rsp), %zmm3
vpaddd (%rdi), %zmm3, %zmm2
addq $64, %rdi
vmovdqa64 %zmm2, -64(%rsp)
cmpq %rdi, %rax
jne .L2
vmovdqa -32(%rsp), %ymm5
vpaddd -64(%rsp), %ymm5, %ymm0
vmovdqa %xmm0, %xmm1
vextracti128 $1, %ymm0, %xmm0
The load to %ymm5 should have been a vextract instead of a load
from the stack. The RTL before ira/lra is
(insn 16 15 18 4 (set (reg:V8SI 107)
(plus:V8SI (subreg:V8SI (reg:V16SI 94 [ vect_sum_11.4 ]) 32)
(subreg:V8SI (reg:V16SI 94 [ vect_sum_11.4 ]) 0))) 2998
{*addv8si3}
(expr_list:REG_DEAD (reg:V16SI 94 [ vect_sum_11.4 ])
(nil)))
and for some reason that no longer gets handled properly (it
worked a few months ago), so we spill reg:V16SI 94.
We should sort this out given that the vectorizer creates such subregs
elsewhere already, so I'm going to ignore this "regression" and
leave the testcase in the FAIL state. The vectorizer produces
_20 = BIT_FIELD_REF <vect_sum_11.4_5, 256, 0>;
_13 = BIT_FIELD_REF <vect_sum_11.4_5, 256, 256>;
_18 = _20 + _13;
here after checking
if (convert_optab_handler (vec_extract_optab,
TYPE_MODE (TREE_TYPE (new_temp)),
TYPE_MODE (vectype1))
!= CODE_FOR_nothing)
{
and it does the v8si -> v4si via
_19 = VIEW_CONVERT_EXPR<vector(2) uint128_t>(_18);
_25 = BIT_FIELD_REF <_19, 128, 0>;
_26 = VIEW_CONVERT_EXPR<vector(4) int>(_25);
_27 = BIT_FIELD_REF <_19, 128, 128>;
_28 = VIEW_CONVERT_EXPR<vector(4) int>(_27);
_29 = _26 + _28;
where the target doesn't handle vec_extract with vector modes.
So something broke the AVX512 upper/lower half extraction in the
target (or the RTL that feeds it).
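For reference, the epilogue I'd expect here (modulo register
allocation, register names just for illustration) is along the lines of

vextracti64x4 $0x1, %zmm2, %ymm0
vpaddd %ymm2, %ymm0, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0

with the accumulator staying in %zmm2 rather than being reloaded
from -64(%rsp).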
Re-bootstrap & regtest running on x86_64-unknown-linux-gnu. I'm going
to commit this with Jeff's approval as the x86 maintainers don't seem
to care too much here; I've done the benchmarking after all.
Will open a bug for the above after checkin.
Thanks,
Richard.
2018-01-11 Richard Biener <rguenther@suse.de>
PR tree-optimization/80846
* target.def (split_reduction): New target hook.
* targhooks.c (default_split_reduction): New function.
* targhooks.h (default_split_reduction): Declare.
* tree-vect-loop.c (vect_create_epilog_for_reduction): If the
target requests first reduce vectors by combining low and high
parts.
* tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
(get_vectype_for_scalar_type_and_size): Export.
* tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
* doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
* doc/tm.texi: Regenerate.
i386/
* config/i386/i386.c (ix86_split_reduction): Implement
TARGET_VECTORIZE_SPLIT_REDUCTION.
* gcc.target/i386/pr80846-1.c: New testcase.
* gcc.target/i386/pr80846-2.c: Likewise.
Index: gcc/target.def
===================================================================
--- gcc/target.def (revision 256526)
+++ gcc/target.def (working copy)
@@ -1890,6 +1890,17 @@ transformations even in absence of speci
(scalar_mode mode),
default_preferred_simd_mode)
+/* Returns the preferred mode for splitting SIMD reductions to. */
+DEFHOOK
+(split_reduction,
+ "This hook should return the preferred mode to split the final reduction\n\
+step on @var{mode} to. The reduction is then carried out reducing upper\n\
+against lower halves of vectors recursively until the specified mode is\n\
+reached. The default is @var{mode} which means no splitting.",
+ machine_mode,
+ (machine_mode),
+ default_split_reduction)
+
/* Returns a mask of vector sizes to iterate over when auto-vectorizing
after processing the preferred one derived from preferred_simd_mode. */
DEFHOOK
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c (revision 256526)
+++ gcc/targhooks.c (working copy)
@@ -1283,6 +1283,14 @@ default_preferred_simd_mode (scalar_mode
return word_mode;
}
+/* By default do not split reductions further. */
+
+machine_mode
+default_split_reduction (machine_mode mode)
+{
+ return mode;
+}
+
/* By default only the size derived from the preferred vector mode
is tried. */
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h (revision 256526)
+++ gcc/targhooks.h (working copy)
@@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
const_tree,
int, bool);
extern machine_mode default_preferred_simd_mode (scalar_mode mode);
+extern machine_mode default_split_reduction (machine_mode);
extern void default_autovectorize_vector_sizes (vector_sizes *);
extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64);
extern void *default_init_cost (struct loop *);
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c (revision 256526)
+++ gcc/tree-vect-loop.c (working copy)
@@ -5062,12 +5062,7 @@ vect_create_epilog_for_reduction (vec<tr
}
else
{
- bool reduce_with_shift = have_whole_vector_shift (mode);
- int element_bitsize = tree_to_uhwi (bitsize);
- /* Enforced by vectorizable_reduction, which disallows SLP reductions
- for variable-length vectors and also requires direct target support
- for loop reductions. */
- int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+ bool reduce_with_shift;
tree vec_temp;
/* COND reductions all do the final reduction with MAX_EXPR
@@ -5081,30 +5076,125 @@ vect_create_epilog_for_reduction (vec<tr
code = MAX_EXPR;
}
- /* Regardless of whether we have a whole vector shift, if we're
- emulating the operation via tree-vect-generic, we don't want
- to use it. Only the first round of the reduction is likely
- to still be profitable via emulation. */
- /* ??? It might be better to emit a reduction tree code here, so that
- tree-vect-generic can expand the first round via bit tricks. */
- if (!VECTOR_MODE_P (mode))
- reduce_with_shift = false;
+ /* See if the target wants to do the final (shift) reduction
+ in a vector mode of smaller size and first reduce upper/lower
+ halves against each other. */
+ enum machine_mode mode1 = mode;
+ tree vectype1 = vectype;
+ unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+ unsigned sz1 = sz;
+ if (!slp_reduc
+ && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
+ sz1 = GET_MODE_SIZE (mode1).to_constant ();
+
+ vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+ reduce_with_shift = have_whole_vector_shift (mode1);
+ if (!VECTOR_MODE_P (mode1))
+ reduce_with_shift = false;
else
- {
- optab optab = optab_for_tree_code (code, vectype, optab_default);
- if (optab_handler (optab, mode) == CODE_FOR_nothing)
- reduce_with_shift = false;
- }
+ {
+ optab optab = optab_for_tree_code (code, vectype1, optab_default);
+ if (optab_handler (optab, mode1) == CODE_FOR_nothing)
+ reduce_with_shift = false;
+ }
+
+ /* First reduce the vector to the desired vector size we should
+ do shift reduction on by combining upper and lower halves. */
+ new_temp = new_phi_result;
+ while (sz > sz1)
+ {
+ gcc_assert (!slp_reduc);
+ sz /= 2;
+ vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
+
+ /* The target has to make sure we support lowpart/highpart
+ extraction, either via direct vector extract or through
+ an integer mode punning. */
+ tree dst1, dst2;
+ if (convert_optab_handler (vec_extract_optab,
+ TYPE_MODE (TREE_TYPE (new_temp)),
+ TYPE_MODE (vectype1))
+ != CODE_FOR_nothing)
+ {
+ /* Extract sub-vectors directly once vec_extract becomes
+ a conversion optab. */
+ dst1 = make_ssa_name (vectype1);
+ epilog_stmt
+ = gimple_build_assign (dst1, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, vectype1,
+ new_temp, TYPE_SIZE (vectype1),
+ bitsize_int (0)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst2 = make_ssa_name (vectype1);
+ epilog_stmt
+ = gimple_build_assign (dst2, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, vectype1,
+ new_temp, TYPE_SIZE (vectype1),
+ bitsize_int (sz * BITS_PER_UNIT)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
+ else
+ {
+ /* Extract via punning to appropriately sized integer mode
+ vector. */
+ tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
+ 1);
+ tree etype = build_vector_type (eltype, 2);
+ gcc_assert (convert_optab_handler (vec_extract_optab,
+ TYPE_MODE (etype),
+ TYPE_MODE (eltype))
+ != CODE_FOR_nothing);
+ tree tem = make_ssa_name (etype);
+ epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ etype, new_temp));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ new_temp = tem;
+ tem = make_ssa_name (eltype);
+ epilog_stmt
+ = gimple_build_assign (tem, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, eltype,
+ new_temp, TYPE_SIZE (eltype),
+ bitsize_int (0)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst1 = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype1, tem));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ tem = make_ssa_name (eltype);
+ epilog_stmt
+ = gimple_build_assign (tem, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, eltype,
+ new_temp, TYPE_SIZE (eltype),
+ bitsize_int (sz * BITS_PER_UNIT)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst2 = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype1, tem));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
+
+ new_temp = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
if (reduce_with_shift && !slp_reduc)
- {
- int nelements = vec_size_in_bits / element_bitsize;
+ {
+ int element_bitsize = tree_to_uhwi (bitsize);
+ /* Enforced by vectorizable_reduction, which disallows SLP reductions
+ for variable-length vectors and also requires direct target support
+ for loop reductions. */
+ int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+ int nelements = vec_size_in_bits / element_bitsize;
vec_perm_builder sel;
vec_perm_indices indices;
int elt_offset;
- tree zero_vec = build_zero_cst (vectype);
+ tree zero_vec = build_zero_cst (vectype1);
/* Case 2: Create:
for (offset = nelements/2; offset >= 1; offset/=2)
{
@@ -5118,15 +5208,15 @@ vect_create_epilog_for_reduction (vec<tr
dump_printf_loc (MSG_NOTE, vect_location,
"Reduce using vector shifts\n");
- vec_dest = vect_create_destination_var (scalar_dest, vectype);
- new_temp = new_phi_result;
+ mode1 = TYPE_MODE (vectype1);
+ vec_dest = vect_create_destination_var (scalar_dest, vectype1);
for (elt_offset = nelements / 2;
elt_offset >= 1;
elt_offset /= 2)
{
calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
indices.new_vector (sel, 2, nelements);
- tree mask = vect_gen_perm_mask_any (vectype, indices);
+ tree mask = vect_gen_perm_mask_any (vectype1, indices);
epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
new_temp, zero_vec, mask);
new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -5171,7 +5261,8 @@ vect_create_epilog_for_reduction (vec<tr
dump_printf_loc (MSG_NOTE, vect_location,
"Reduce using scalar code.\n");
- vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+ int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+ int element_bitsize = tree_to_uhwi (bitsize);
FOR_EACH_VEC_ELT (new_phis, i, new_phi)
{
int bit_offset;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c (revision 256526)
+++ gcc/tree-vect-stmts.c (working copy)
@@ -9068,7 +9068,7 @@ free_stmt_vec_info (gimple *stmt)
Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported
by the target. */
-static tree
+tree
get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size)
{
tree orig_scalar_type = scalar_type;
Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h (revision 256526)
+++ gcc/tree-vectorizer.h (working copy)
@@ -1209,6 +1209,7 @@ extern bool vect_can_advance_ivs_p (loop
/* In tree-vect-stmts.c. */
extern poly_uint64 current_vector_size;
extern tree get_vectype_for_scalar_type (tree);
+extern tree get_vectype_for_scalar_type_and_size (tree, poly_uint64);
extern tree get_mask_type_for_scalar_type (tree);
extern tree get_same_sized_vectype (tree, tree);
extern bool vect_is_simple_use (tree, vec_info *, gimple **,
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in (revision 256526)
+++ gcc/doc/tm.texi.in (working copy)
@@ -4093,6 +4093,8 @@ address; but often a machine-dependent
@hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+@hook TARGET_VECTORIZE_SPLIT_REDUCTION
+
@hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
@hook TARGET_VECTORIZE_GET_MASK_MODE
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 256526)
+++ gcc/config/i386/i386.c (working copy)
@@ -48969,6 +48969,39 @@ ix86_preferred_simd_mode (scalar_mode mo
}
}
+/* All CPUs perfer to avoid cross-lane operations so perform reductions
+ upper against lower halves up to SSE reg size. */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+ /* Reduce lowpart against highpart until we reach SSE reg width to
+ avoid cross-lane operations. */
+ switch (mode)
+ {
+ case E_V8DImode:
+ case E_V4DImode:
+ return V2DImode;
+ case E_V16SImode:
+ case E_V8SImode:
+ return V4SImode;
+ case E_V32HImode:
+ case E_V16HImode:
+ return V8HImode;
+ case E_V64QImode:
+ case E_V32QImode:
+ return V16QImode;
+ case E_V16SFmode:
+ case E_V8SFmode:
+ return V4SFmode;
+ case E_V8DFmode:
+ case E_V4DFmode:
+ return V2DFmode;
+ default:
+ return mode;
+ }
+}
+
/* If AVX is enabled then try vectorizing with both 256bit and 128bit
vectors. If AVX512F is enabled then try vectorizing with 512bit,
256bit and 128bit vectors. */
@@ -50601,6 +50634,9 @@ ix86_run_selftests (void)
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
ix86_preferred_simd_mode
+#undef TARGET_VECTORIZE_SPLIT_REDUCTION
+#define TARGET_VECTORIZE_SPLIT_REDUCTION \
+ ix86_split_reduction
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
ix86_autovectorize_vector_sizes
Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-1.c (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-1.c (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f" } */
+
+int sumint(const int arr[]) {
+ arr = __builtin_assume_aligned(arr, 64);
+ int sum=0;
+ for (int i=0 ; i<1024 ; i++)
+ sum+=arr[i];
+ return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 2 } } */
Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-2.c (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-2.c (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2" } */
+
+int sumint(const int arr[]) {
+ arr = __builtin_assume_aligned(arr, 64);
+ int sum=0;
+ for (int i=0 ; i<1024 ; i++)
+ sum+=arr[i];
+ return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 1 } } */
* Re: [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)
2018-01-10 8:23 ` Richard Biener
2018-01-11 12:11 ` Richard Biener
@ 2018-01-11 16:21 ` Jeff Law
1 sibling, 0 replies; 8+ messages in thread
From: Jeff Law @ 2018-01-11 16:21 UTC (permalink / raw)
To: Richard Biener; +Cc: gcc-patches, Jan Hubicka, ubizjak, Martin Jambor
On 01/10/2018 01:10 AM, Richard Biener wrote:
> On Tue, 9 Jan 2018, Jeff Law wrote:
>
>> On 01/05/2018 02:01 AM, Richard Biener wrote:
>>> On Tue, 28 Nov 2017, Richard Biener wrote:
>>>
>>>>
>>>> The following adds a new target hook, targetm.vectorize.split_reduction,
>>>> which allows the target to specify a preferred mode to perform the
>>>> final reducion on using either vector shifts or scalar extractions.
>>>> Up to that mode the vector reduction result is reduced by combining
>>>> lowparts and highparts recursively. This avoids lane-crossing operations
>>>> when doing AVX256 on Zen and Bulldozer and also speeds up things on
>>>> Haswell (I verified ~20% speedup on Broadwell).
>>>>
>>>> Thus the patch implements the target hook on x86 to _always_ prefer
>>>> SSE modes for the final reduction.
>>>>
>>>> For the testcase in the bugzilla
>>>>
>>>> int sumint(const int arr[]) {
>>>> arr = __builtin_assume_aligned(arr, 64);
>>>> int sum=0;
>>>> for (int i=0 ; i<1024 ; i++)
>>>> sum+=arr[i];
>>>> return sum;
>>>> }
>>>>
>>>> this changes -O3 -mavx512f code from
>>>>
>>>> sumint:
>>>> .LFB0:
>>>> .cfi_startproc
>>>> vpxord %zmm0, %zmm0, %zmm0
>>>> leaq 4096(%rdi), %rax
>>>> .p2align 4,,10
>>>> .p2align 3
>>>> .L2:
>>>> vpaddd (%rdi), %zmm0, %zmm0
>>>> addq $64, %rdi
>>>> cmpq %rdi, %rax
>>>> jne .L2
>>>> vpxord %zmm1, %zmm1, %zmm1
>>>> vshufi32x4 $78, %zmm1, %zmm0, %zmm2
>>>> vpaddd %zmm2, %zmm0, %zmm0
>>>> vmovdqa64 .LC0(%rip), %zmm2
>>>> vpermi2d %zmm1, %zmm0, %zmm2
>>>> vpaddd %zmm2, %zmm0, %zmm0
>>>> vmovdqa64 .LC1(%rip), %zmm2
>>>> vpermi2d %zmm1, %zmm0, %zmm2
>>>> vpaddd %zmm2, %zmm0, %zmm0
>>>> vmovdqa64 .LC2(%rip), %zmm2
>>>> vpermi2d %zmm1, %zmm0, %zmm2
>>>> vpaddd %zmm2, %zmm0, %zmm0
>>>> vmovd %xmm0, %eax
>>>>
>>>> to
>>>>
>>>> sumint:
>>>> .LFB0:
>>>> .cfi_startproc
>>>> vpxord %zmm0, %zmm0, %zmm0
>>>> leaq 4096(%rdi), %rax
>>>> .p2align 4,,10
>>>> .p2align 3
>>>> .L2:
>>>> vpaddd (%rdi), %zmm0, %zmm0
>>>> addq $64, %rdi
>>>> cmpq %rdi, %rax
>>>> jne .L2
>>>> vextracti64x4 $0x1, %zmm0, %ymm1
>>>> vpaddd %ymm0, %ymm1, %ymm1
>>>> vmovdqa %xmm1, %xmm0
>>>> vextracti128 $1, %ymm1, %xmm1
>>>> vpaddd %xmm1, %xmm0, %xmm0
>>>> vpsrldq $8, %xmm0, %xmm1
>>>> vpaddd %xmm1, %xmm0, %xmm0
>>>> vpsrldq $4, %xmm0, %xmm1
>>>> vpaddd %xmm1, %xmm0, %xmm0
>>>> vmovd %xmm0, %eax
>>>>
>>>> and for -O3 -mavx2 from
>>>>
>>>> sumint:
>>>> .LFB0:
>>>> .cfi_startproc
>>>> vpxor %xmm0, %xmm0, %xmm0
>>>> leaq 4096(%rdi), %rax
>>>> .p2align 4,,10
>>>> .p2align 3
>>>> .L2:
>>>> vpaddd (%rdi), %ymm0, %ymm0
>>>> addq $32, %rdi
>>>> cmpq %rdi, %rax
>>>> jne .L2
>>>> vpxor %xmm1, %xmm1, %xmm1
>>>> vperm2i128 $33, %ymm1, %ymm0, %ymm2
>>>> vpaddd %ymm2, %ymm0, %ymm0
>>>> vperm2i128 $33, %ymm1, %ymm0, %ymm2
>>>> vpalignr $8, %ymm0, %ymm2, %ymm2
>>>> vpaddd %ymm2, %ymm0, %ymm0
>>>> vperm2i128 $33, %ymm1, %ymm0, %ymm1
>>>> vpalignr $4, %ymm0, %ymm1, %ymm1
>>>> vpaddd %ymm1, %ymm0, %ymm0
>>>> vmovd %xmm0, %eax
>>>>
>>>> to
>>>>
>>>> sumint:
>>>> .LFB0:
>>>> .cfi_startproc
>>>> vpxor %xmm0, %xmm0, %xmm0
>>>> leaq 4096(%rdi), %rax
>>>> .p2align 4,,10
>>>> .p2align 3
>>>> .L2:
>>>> vpaddd (%rdi), %ymm0, %ymm0
>>>> addq $32, %rdi
>>>> cmpq %rdi, %rax
>>>> jne .L2
>>>> vmovdqa %xmm0, %xmm1
>>>> vextracti128 $1, %ymm0, %xmm0
>>>> vpaddd %xmm0, %xmm1, %xmm0
>>>> vpsrldq $8, %xmm0, %xmm1
>>>> vpaddd %xmm1, %xmm0, %xmm0
>>>> vpsrldq $4, %xmm0, %xmm1
>>>> vpaddd %xmm1, %xmm0, %xmm0
>>>> vmovd %xmm0, %eax
>>>> vzeroupper
>>>> ret
>>>>
>>>> which besides being faster is also smaller (less prefixes).
>>>>
>>>> SPEC 2k6 results on Haswell (thus AVX2) are neutral. As it merely
>>>> effects reduction vectorization epilogues I didn't expect big effects
>>>> but for loops that do not run much (more likely with AVX512).
>>>>
>>>> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>>>>
>>>> Ok for trunk?
>>>
>>> Ping?
>>>
>>> Richard.
>>>
>>>> The PR mentions some more tricks to optimize the sequence but
>>>> those look like backend only optimizations.
>>>>
>>>> Thanks,
>>>> Richard.
>>>>
>>>> 2017-11-28 Richard Biener <rguenther@suse.de>
>>>>
>>>> PR tree-optimization/80846
>>>> * target.def (split_reduction): New target hook.
>>>> * targhooks.c (default_split_reduction): New function.
>>>> * targhooks.h (default_split_reduction): Declare.
>>>> * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
>>>> target requests first reduce vectors by combining low and high
>>>> parts.
>>>> * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
>>>> (get_vectype_for_scalar_type_and_size): Export.
>>>> * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
>>>>
>>>> * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
>>>> * doc/tm.texi: Regenerate.
>>>>
>>>> i386/
>>>> * config/i386/i386.c (ix86_split_reduction): Implement
>>>> TARGET_VECTORIZE_SPLIT_REDUCTION.
>>>>
>>>> * gcc.target/i386/pr80846-1.c: New testcase.
>>>> * gcc.target/i386/pr80846-2.c: Likewise.
>> I've got no objections here and you know this code far better than I.
>
> I was really looking for x86 maintainer ack for the target hook
> implementation which I just quote here for reference again:
>
> +/* All CPUs perfer to avoid cross-lane operations so perform reductions
> + upper against lower halves up to SSE reg size. */
> +
> +static machine_mode
> +ix86_split_reduction (machine_mode mode)
> +{
> + /* Reduce lowpart against highpart until we reach SSE reg width to
> + avoid cross-lane operations. */
> + switch (mode)
> + {
> + case E_V16SImode:
> + case E_V8SImode:
> + return V4SImode;
> + case E_V32HImode:
> + case E_V16HImode:
> + return V8HImode;
> + case E_V64QImode:
> + case E_V32QImode:
> + return V16QImode;
> + case E_V16SFmode:
> + case E_V8SFmode:
> + return V4SFmode;
> + case E_V8DFmode:
> + case E_V4DFmode:
> + return V2DFmode;
> + default:
> + return mode;
> + }
> +}
>
> this means we'll do [zmm -> ymm] -> xmm (looks like I forgot VnDImode
> in the above list, consider that added).
That seemed reasonable to me as well. We're in an unfortunate situation
with Uros stepping back from heavy GCC x86 work, so we have to muddle
through as best we can.
I do note a nit s/perfer/prefer/.
jeff
Thread overview: 8+ messages
2017-11-28 15:25 [PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86) Richard Biener
2017-12-05 20:18 ` Jeff Law
2017-12-06 6:42 ` Richard Biener
2018-01-05 9:01 ` Richard Biener
2018-01-09 23:13 ` Jeff Law
2018-01-10 8:23 ` Richard Biener
2018-01-11 12:11 ` Richard Biener
2018-01-11 16:21 ` Jeff Law