* [PATCH] Support AVX512F masked gather loads (PR tree-optimization/88464)
@ 2018-12-12 22:43 Jakub Jelinek
2018-12-13 16:09 ` Richard Biener
0 siblings, 1 reply; 3+ messages in thread
From: Jakub Jelinek @ 2018-12-12 22:43 UTC (permalink / raw)
To: Richard Biener, Richard Sandiford; +Cc: gcc-patches
Hi!
We support either the AVX2 gather loads (128-bit or 256-bit, using masks in
vector registers), both conditional and unconditional, but AVX512F gather
loads only unconditional. The problem was that tree-vect-stmts.c didn't
have code to deal with the integral masktype argument the builtins have,
where we need to compute it as vector bool first and then VCE.
Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?
2018-12-12 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/88464
* tree-vect-stmts.c (vect_build_gather_load_calls): Handle INTEGER_TYPE
masktype if mask is non-NULL.
(vectorizable_load): Don't reject masked gather loads if masktype
in the decl is INTEGER_TYPE.
* gcc.target/i386/avx512f-pr88462-1.c: New test.
* gcc.target/i386/avx512f-pr88462-2.c: New test.
--- gcc/tree-vect-stmts.c.jj 2018-11-20 21:39:06.983478940 +0100
+++ gcc/tree-vect-stmts.c 2018-12-12 20:05:42.980019685 +0100
@@ -2647,8 +2647,13 @@ vect_build_gather_load_calls (stmt_vec_i
tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
tree scaletype = TREE_VALUE (arglist);
+ tree real_masktype = masktype;
gcc_checking_assert (types_compatible_p (srctype, rettype)
- && (!mask || types_compatible_p (srctype, masktype)));
+ && (!mask
+ || TREE_CODE (masktype) == INTEGER_TYPE
+ || types_compatible_p (srctype, masktype)));
+ if (mask && TREE_CODE (masktype) == INTEGER_TYPE)
+ masktype = build_same_sized_truth_vector_type (srctype);
tree perm_mask = NULL_TREE;
tree mask_perm_mask = NULL_TREE;
@@ -2763,9 +2768,9 @@ vect_build_gather_load_calls (stmt_vec_i
mask_op = vec_mask;
if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask)))
{
- gcc_assert
- (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)),
- TYPE_VECTOR_SUBPARTS (masktype)));
+ poly_uint64 sub1 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op));
+ poly_uint64 sub2 = TYPE_VECTOR_SUBPARTS (masktype);
+ gcc_assert (known_eq (sub1, sub2));
var = vect_get_new_ssa_name (masktype, vect_simple_var);
mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op);
gassign *new_stmt
@@ -2777,8 +2782,33 @@ vect_build_gather_load_calls (stmt_vec_i
src_op = mask_op;
}
+ tree mask_arg = mask_op;
+ if (masktype != real_masktype)
+ {
+ tree utype;
+ if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype))
+ utype = real_masktype;
+ else
+ utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1);
+ var = vect_get_new_ssa_name (utype, vect_scalar_var);
+ mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op);
+ gassign *new_stmt
+ = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+ mask_arg = var;
+ if (!useless_type_conversion_p (real_masktype, utype))
+ {
+ gcc_assert (TYPE_PRECISION (utype)
+ <= TYPE_PRECISION (real_masktype));
+ var = vect_get_new_ssa_name (real_masktype, vect_scalar_var);
+ new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+ mask_arg = var;
+ }
+ src_op = build_zero_cst (srctype);
+ }
gcall *new_call = gimple_build_call (gs_info->decl, 5, src_op, ptr, op,
- mask_op, scale);
+ mask_arg, scale);
stmt_vec_info new_stmt_info;
if (!useless_type_conversion_p (vectype, rettype))
@@ -7555,20 +7585,6 @@ vectorizable_load (stmt_vec_info stmt_in
TYPE_MODE (mask_vectype), true))
return false;
}
- else if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
- {
- tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
- tree masktype
- = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
- if (TREE_CODE (masktype) == INTEGER_TYPE)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "masked gather with integer mask not"
- " supported.");
- return false;
- }
- }
else if (memory_access_type != VMAT_LOAD_STORE_LANES
&& memory_access_type != VMAT_GATHER_SCATTER)
{
--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-12 20:39:23.207028399 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-12-12 20:44:22.195148359 +0100
@@ -0,0 +1,35 @@
+/* PR tree-optimization/88462 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */
+
+__attribute__((noipa)) void
+f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
+{
+ int i;
+#pragma GCC ivdep
+ for (i = 0; i < n; ++i)
+ if (a[i] > 10.0)
+ a[i] = b[c[i]];
+}
+
+__attribute__((noipa)) void
+f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
+{
+ int i;
+#pragma GCC ivdep
+ for (i = 0; i < n; ++i)
+ if (a[i] > 10.0)
+ a[i] = b[c[i]];
+}
+
+__attribute__((noipa)) void
+f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
+{
+ int i;
+#pragma GCC ivdep
+ for (i = 0; i < n; ++i)
+ if (a[i] > 10.0f)
+ a[i] = b[c[i]];
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-12 20:58:36.540211660 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-12-12 21:00:26.182423054 +0100
@@ -0,0 +1,51 @@
+/* PR tree-optimization/88462 */
+/* { dg-do run { target { avx512f } } } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-pr88462-1.c"
+
+static void
+avx512f_test (void)
+{
+ double a[1024], b[1024];
+ float c[1024], f[1024];
+ int d[1024];
+ long e[1024];
+ int i;
+ for (i = 0; i < 1024; i++)
+ {
+ asm volatile ("" : "+g" (i));
+ a[i] = (i % 3) != 0 ? 15.0 : -5.0;
+ b[i] = 2 * i;
+ d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
+ }
+ f1 (a, b, d, 1024);
+ for (i = 0; i < 1024; i++)
+ {
+ asm volatile ("" : "+g" (i));
+ if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
+ abort ();
+ a[i] = (i % 3) != 1 ? 15.0 : -5.0;
+ b[i] = 3 * i;
+ e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
+ }
+ f2 (a, b, e, 1024);
+ for (i = 0; i < 1024; i++)
+ {
+ asm volatile ("" : "+g" (i));
+ if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
+ abort ();
+ c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
+ d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
+ f[i] = 4 * i;
+ }
+ f3 (c, f, d, 1024);
+ for (i = 0; i < 1024; i++)
+ {
+ asm volatile ("" : "+g" (i));
+ if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
+ abort ();
+ }
+}
Jakub
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] Support AVX512F masked gather loads (PR tree-optimization/88464)
2018-12-12 22:43 [PATCH] Support AVX512F masked gather loads (PR tree-optimization/88464) Jakub Jelinek
@ 2018-12-13 16:09 ` Richard Biener
2018-12-13 16:17 ` Jakub Jelinek
0 siblings, 1 reply; 3+ messages in thread
From: Richard Biener @ 2018-12-13 16:09 UTC (permalink / raw)
To: Jakub Jelinek, Richard Sandiford; +Cc: gcc-patches
On December 12, 2018 11:43:10 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>We support either the AVX2 gather loads (128-bit or 256-bit, using
>masks in
>vector registers), both conditional and unconditional, but AVX512F
>gather
>loads only unconditional. The problem was that tree-vect-stmts.c
>didn't
>have code to deal with the integral masktype argument the builtins
>have,
>where we need to compute it as vector bool first and then VCE.
>
>Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok
>for
>trunk?
OK.
Is there any chance you could look at replacing the i386 code to work with the new IFN style used by Aarch64?
>2018-12-12 Jakub Jelinek <jakub@redhat.com>
>
> PR tree-optimization/88464
> * tree-vect-stmts.c (vect_build_gather_load_calls): Handle
>INTEGER_TYPE
> masktype if mask is non-NULL.
> (vectorizable_load): Don't reject masked gather loads if masktype
> in the decl is INTEGER_TYPE.
>
> * gcc.target/i386/avx512f-pr88462-1.c: New test.
> * gcc.target/i386/avx512f-pr88462-2.c: New test.
>
>--- gcc/tree-vect-stmts.c.jj 2018-11-20 21:39:06.983478940 +0100
>+++ gcc/tree-vect-stmts.c 2018-12-12 20:05:42.980019685 +0100
>@@ -2647,8 +2647,13 @@ vect_build_gather_load_calls (stmt_vec_i
> tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
> tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
> tree scaletype = TREE_VALUE (arglist);
>+ tree real_masktype = masktype;
> gcc_checking_assert (types_compatible_p (srctype, rettype)
>- && (!mask || types_compatible_p (srctype, masktype)));
>+ && (!mask
>+ || TREE_CODE (masktype) == INTEGER_TYPE
>+ || types_compatible_p (srctype, masktype)));
>+ if (mask && TREE_CODE (masktype) == INTEGER_TYPE)
>+ masktype = build_same_sized_truth_vector_type (srctype);
>
> tree perm_mask = NULL_TREE;
> tree mask_perm_mask = NULL_TREE;
>@@ -2763,9 +2768,9 @@ vect_build_gather_load_calls (stmt_vec_i
> mask_op = vec_mask;
> if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask)))
> {
>- gcc_assert
>- (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)),
>- TYPE_VECTOR_SUBPARTS (masktype)));
>+ poly_uint64 sub1 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op));
>+ poly_uint64 sub2 = TYPE_VECTOR_SUBPARTS (masktype);
>+ gcc_assert (known_eq (sub1, sub2));
> var = vect_get_new_ssa_name (masktype, vect_simple_var);
> mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op);
> gassign *new_stmt
>@@ -2777,8 +2782,33 @@ vect_build_gather_load_calls (stmt_vec_i
> src_op = mask_op;
> }
>
>+ tree mask_arg = mask_op;
>+ if (masktype != real_masktype)
>+ {
>+ tree utype;
>+ if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype))
>+ utype = real_masktype;
>+ else
>+ utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1);
>+ var = vect_get_new_ssa_name (utype, vect_scalar_var);
>+ mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op);
>+ gassign *new_stmt
>+ = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
>+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>+ mask_arg = var;
>+ if (!useless_type_conversion_p (real_masktype, utype))
>+ {
>+ gcc_assert (TYPE_PRECISION (utype)
>+ <= TYPE_PRECISION (real_masktype));
>+ var = vect_get_new_ssa_name (real_masktype, vect_scalar_var);
>+ new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
>+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>+ mask_arg = var;
>+ }
>+ src_op = build_zero_cst (srctype);
>+ }
>gcall *new_call = gimple_build_call (gs_info->decl, 5, src_op, ptr, op,
>- mask_op, scale);
>+ mask_arg, scale);
>
> stmt_vec_info new_stmt_info;
> if (!useless_type_conversion_p (vectype, rettype))
>@@ -7555,20 +7585,6 @@ vectorizable_load (stmt_vec_info stmt_in
> TYPE_MODE (mask_vectype), true))
> return false;
> }
>- else if (memory_access_type == VMAT_GATHER_SCATTER &&
>gs_info.decl)
>- {
>- tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
>- tree masktype
>- = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
>- if (TREE_CODE (masktype) == INTEGER_TYPE)
>- {
>- if (dump_enabled_p ())
>- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>- "masked gather with integer mask not"
>- " supported.");
>- return false;
>- }
>- }
> else if (memory_access_type != VMAT_LOAD_STORE_LANES
> && memory_access_type != VMAT_GATHER_SCATTER)
> {
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-12
>20:39:23.207028399 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-12-12
>20:44:22.195148359 +0100
>@@ -0,0 +1,35 @@
>+/* PR tree-optimization/88462 */
>+/* { dg-do compile } */
>+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
>-mtune=skylake-avx512 -fdump-tree-vect-details" } */
>+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte
>vectors" 3 "vect" } } */
>+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function"
>3 "vect" } } */
>+
>+__attribute__((noipa)) void
>+f1 (double * __restrict__ a, const double * __restrict__ b, const int
>* __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (a[i] > 10.0)
>+ a[i] = b[c[i]];
>+}
>+
>+__attribute__((noipa)) void
>+f2 (double * __restrict__ a, const double * __restrict__ b, const long
>* __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (a[i] > 10.0)
>+ a[i] = b[c[i]];
>+}
>+
>+__attribute__((noipa)) void
>+f3 (float * __restrict__ a, const float * __restrict__ b, const int *
>__restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (a[i] > 10.0f)
>+ a[i] = b[c[i]];
>+}
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-12
>20:58:36.540211660 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-12-12
>21:00:26.182423054 +0100
>@@ -0,0 +1,51 @@
>+/* PR tree-optimization/88462 */
>+/* { dg-do run { target { avx512f } } } */
>+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
>-mtune=skylake-avx512" } */
>+
>+#include "avx512f-check.h"
>+
>+#include "avx512f-pr88462-1.c"
>+
>+static void
>+avx512f_test (void)
>+{
>+ double a[1024], b[1024];
>+ float c[1024], f[1024];
>+ int d[1024];
>+ long e[1024];
>+ int i;
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ a[i] = (i % 3) != 0 ? 15.0 : -5.0;
>+ b[i] = 2 * i;
>+ d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
>+ }
>+ f1 (a, b, d, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
>+ abort ();
>+ a[i] = (i % 3) != 1 ? 15.0 : -5.0;
>+ b[i] = 3 * i;
>+ e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
>+ }
>+ f2 (a, b, e, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
>+ abort ();
>+ c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
>+ d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
>+ f[i] = 4 * i;
>+ }
>+ f3 (c, f, d, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
>+ abort ();
>+ }
>+}
>
> Jakub
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] Support AVX512F masked gather loads (PR tree-optimization/88464)
2018-12-13 16:09 ` Richard Biener
@ 2018-12-13 16:17 ` Jakub Jelinek
0 siblings, 0 replies; 3+ messages in thread
From: Jakub Jelinek @ 2018-12-13 16:17 UTC (permalink / raw)
To: Richard Biener; +Cc: Richard Sandiford, gcc-patches
On Thu, Dec 13, 2018 at 05:09:17PM +0100, Richard Biener wrote:
> Is there any chance you could look at replacing the i386 code to work with
> the new IFN style used by Aarch64?
Maybe for GCC 10, I'm afraid it is a lot of work mainly due to the weirdnesses of
x86 mixed index vs. data sized types, that would be non-fun to represent in
the optabs.
Jakub
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2018-12-13 16:17 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-12 22:43 [PATCH] Support AVX512F masked gather loads (PR tree-optimization/88464) Jakub Jelinek
2018-12-13 16:09 ` Richard Biener
2018-12-13 16:17 ` Jakub Jelinek
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).