public inbox for gcc-patches@gcc.gnu.org
* [PATCH] [AVX512F] Add scatter support for vectorizer
@ 2015-07-31 11:06 Petr Murzin
  2015-08-04 12:15 ` Richard Biener
  0 siblings, 1 reply; 8+ messages in thread
From: Petr Murzin @ 2015-07-31 11:06 UTC (permalink / raw)
  To: rguenther, Kirill Yukhin, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1977 bytes --]

Hello,
This patch adds scatter store support to the vectorizer (for AVX512F
instructions). Please have a look. Is it OK for trunk?
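
For reference, a minimal example of the kind of loop this enables the
vectorizer to handle (sketched after the f1 test in the attached
avx512f-scatter-1.c); with -O3 -mavx512f such an indirect store can now
be vectorized using an AVX512F scatter instruction:

#define N 1024
float vf1[N], vf2[2 * N + 16];
int k[N];

/* Each element of vf1 is stored into vf2 at the index k[i],
   i.e. a scatter store.  */
void
f1 (void)
{
  int i;
  for (i = 0; i < N; i++)
    vf2[k[i]] = vf1[i];
}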

Thanks,
Petr


2015-07-31  Andrey Turetskiy  <andrey.turetskiy@intel.com>
                  Petr Murzin  <petr.murzin@intel.com>

gcc/

* config/i386/i386-builtin-types.def
(VOID_PFLOAT_HI_V8DI_V16SF_INT): New.
(VOID_PDOUBLE_QI_V16SI_V8DF_INT): Ditto.
(VOID_PINT_HI_V8DI_V16SI_INT): Ditto.
(VOID_PLONGLONG_QI_V16SI_V8DI_INT): Ditto.
* config/i386/i386.c
(ix86_builtins): Add IX86_BUILTIN_SCATTERALTSIV8DF,
IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
IX86_BUILTIN_SCATTERALTDIV16SI.
(ix86_init_mmx_sse_builtins): Define __builtin_ia32_scatteraltsiv8df,
__builtin_ia32_scatteraltdiv8sf, __builtin_ia32_scatteraltsiv8di,
__builtin_ia32_scatteraltdiv8si.
(ix86_expand_builtin): Handle IX86_BUILTIN_SCATTERALTSIV8DF,
IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
IX86_BUILTIN_SCATTERALTDIV16SI.
(ix86_vectorize_builtin_scatter): New.
(TARGET_VECTORIZE_BUILTIN_SCATTER): Define as
ix86_vectorize_builtin_scatter.
* doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_SCATTER): New.
* doc/tm.texi: Regenerate.
* target.def: Add scatter builtin.
* tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Add new
checks for STMT_VINFO_SCATTER_P.
(vect_check_gather): Rename to ...
(vect_check_gather_scatter): ... this and add a new argument.
(vect_analyze_data_refs): Add scatter and maybe_scatter variables and
new checks for them accordingly.
* tree-vectorizer.h (STMT_VINFO_SCATTER_P(S)): Define.
(STMT_VINFO_STRIDE_LOAD_P(S)): Ditto.
(vect_check_gather): Rename to ...
(vect_check_gather_scatter): ... this.
* tree-vect-stmts.c (vectorizable_mask_load_store): Ditto.
(vectorizable_store): Add checks for STMT_VINFO_SCATTER_P.
(vect_mark_stmts_to_be_vectorized): Ditto.

gcc/testsuite/

* gcc.target/i386/avx512f-scatter-1.c: New.
* gcc.target/i386/avx512f-scatter-2.c: Ditto.
* gcc.target/i386/avx512f-scatter-3.c: Ditto.

[-- Attachment #2: scatter.patch --]
[-- Type: application/octet-stream, Size: 37440 bytes --]

commit e00159729e4070bf8e019ee0714ea8d4ed498cc6
Author: Petr Murzin <petr.murzin@intel.com>
Date:   Fri Jul 31 13:42:51 2015 +0300

    [AVX512F] Add scatter support for vectorizer

diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index ee31ee3..b892f08 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1021,6 +1021,10 @@ DEF_FUNCTION_TYPE (VOID, PINT, QI, V8DI, V8SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V4DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V2DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8DI, V8DI, INT)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, HI, V8DI, V16SF, INT)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V16SI, V8DF, INT)
+DEF_FUNCTION_TYPE (VOID, PINT, HI, V8DI, V16SI, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V16SI, V8DI, INT)
 
 DEF_FUNCTION_TYPE (VOID, QI, V8SI, PCINT64, INT, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V4DI, V4DI, INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 128c5af..1e01c9f8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -30386,6 +30386,10 @@ enum ix86_builtins
   IX86_BUILTIN_GATHER3SIV16SI,
   IX86_BUILTIN_GATHER3SIV8DF,
   IX86_BUILTIN_GATHER3SIV8DI,
+  IX86_BUILTIN_SCATTERALTSIV8DF,
+  IX86_BUILTIN_SCATTERALTDIV16SF,
+  IX86_BUILTIN_SCATTERALTSIV8DI,
+  IX86_BUILTIN_SCATTERALTDIV16SI,
   IX86_BUILTIN_SCATTERDIV16SF,
   IX86_BUILTIN_SCATTERDIV16SI,
   IX86_BUILTIN_SCATTERDIV8DF,
@@ -34202,6 +34206,21 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
 	       VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
 	       IX86_BUILTIN_SCATTERDIV2DI);
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
+	       VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
+	       VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
+	       VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
+	       VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SI);
 
   /* AVX512PF */
   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
@@ -39851,6 +39870,18 @@ addcarryx:
     case IX86_BUILTIN_GATHERPFDPD:
       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
       goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DF:
+      icode = CODE_FOR_avx512f_scattersiv8df;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SF:
+      icode = CODE_FOR_avx512f_scatterdiv16sf;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DI:
+      icode = CODE_FOR_avx512f_scattersiv8di;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SI:
+      icode = CODE_FOR_avx512f_scatterdiv16si;
+      goto scatter_gen;
     case IX86_BUILTIN_GATHERPFDPS:
       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
       goto vec_prefetch_gen;
@@ -40114,6 +40145,36 @@ addcarryx:
       mode3 = insn_data[icode].operand[3].mode;
       mode4 = insn_data[icode].operand[4].mode;
 
+      /* Scatter instruction stores operand op3 to memory with
+	 indices from op2 and scale from op4 under writemask op1.
+	 If index operand op2 has more elements than source operand
+	 op3, only its low half needs to be used, and vice versa.  */
+      switch (fcode)
+	{
+	case IX86_BUILTIN_SCATTERALTSIV8DF:
+	case IX86_BUILTIN_SCATTERALTSIV8DI:
+	  half = gen_reg_rtx (V8SImode);
+	  if (!nonimmediate_operand (op2, V16SImode))
+	    op2 = copy_to_mode_reg (V16SImode, op2);
+	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
+	  op2 = half;
+	  break;
+	case IX86_BUILTIN_SCATTERALTDIV16SF:
+	case IX86_BUILTIN_SCATTERALTDIV16SI:
+	  half = gen_reg_rtx (mode3);
+	  if (mode3 == V8SFmode)
+	    gen = gen_vec_extract_lo_v16sf;
+	  else
+	    gen = gen_vec_extract_lo_v16si;
+	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
+	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+	  emit_insn (gen (half, op3));
+	  op3 = half;
+	  break;
+	default:
+	  break;
+	}
+
       /* Force memory operand only with base register here.  But we
 	 don't want to do it on memory operand for other builtin
 	 functions.  */
@@ -41193,6 +41254,62 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
   return ix86_get_builtin (code);
 }
 
+/* Returns a decl of a function that implements scatter store with
+   register type VECTYPE and index type INDEX_TYPE and SCALE.
+   Return NULL_TREE if it is not available.  */
+
+static tree
+ix86_vectorize_builtin_scatter (const_tree vectype,
+				const_tree index_type, int scale)
+{
+  bool si;
+  enum ix86_builtins code;
+
+  if (! TARGET_AVX512F)
+    return NULL_TREE;
+
+  if ((TREE_CODE (index_type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (index_type))
+      || (TYPE_MODE (index_type) != SImode
+	  && TYPE_MODE (index_type) != DImode))
+    return NULL_TREE;
+
+  if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+    return NULL_TREE;
+
+  /* v*scatter* insn sign extends index to pointer mode.  */
+  if (TYPE_PRECISION (index_type) < POINTER_SIZE
+      && TYPE_UNSIGNED (index_type))
+    return NULL_TREE;
+
+  /* Scale can be 1, 2, 4 or 8.  */
+  if (scale <= 0
+      || scale > 8
+      || (scale & (scale - 1)) != 0)
+    return NULL_TREE;
+
+  si = TYPE_MODE (index_type) == SImode;
+  switch (TYPE_MODE (vectype))
+    {
+    case V8DFmode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
+      break;
+    case V8DImode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
+      break;
+    case V16SFmode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
+      break;
+    case V16SImode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
+      break;
+    default:
+      return NULL_TREE;
+    }
+
+  return ix86_builtins[code];
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -52324,6 +52441,9 @@ ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_BUILTIN_SCATTER
+#define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index f95646c..02dab1a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5739,6 +5739,13 @@ in vectorized loops in current function, or non-negative number if it is
 usable.  In that case, the smaller the number is, the more desirable it is
 to use it.
 @end deftypefn
+@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_SCATTER (const_tree @var{vectype}, const_tree @var{index_type}, int @var{scale})
+Target builtin that implements vector scatter operation.  @var{vectype}
+is the vector type of the store and @var{index_type} is scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize scatter
+stores.
+@end deftypefn
 
 @node Anchored Addresses
 @section Anchored Addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 2383fb9..e2655a8 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4244,6 +4244,13 @@ address;  but often a machine-dependent strategy can generate better code.
 @hook TARGET_SIMD_CLONE_ADJUST
 
 @hook TARGET_SIMD_CLONE_USABLE
+@hook TARGET_VECTORIZE_BUILTIN_SCATTER
+Target builtin that implements vector scatter operation.  @var{vectype}
+is the vector type of the store and @var{index_type} is scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize scatter
+stores.
+@end deftypefn
 
 @node Anchored Addresses
 @section Anchored Addresses
diff --git a/gcc/target.def b/gcc/target.def
index 4edc209..7eef7c1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1801,6 +1801,14 @@ loads.",
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Target builtin that implements vector scatter operation.  */
+DEFHOOK
+(builtin_scatter,
+ "",
+ tree,
+ (const_tree vectype, const_tree index_type, int scale),
+ NULL)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c
new file mode 100644
index 0000000..7631849
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c
@@ -0,0 +1,216 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+float vf1[N], vf2[2*N+16];
+double vd1[N], vd2[2*N+16];
+int vi1[N], vi2[2*N+16], k[N];
+long vl1[N], vl2[2*N+16], l[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[k[i]] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[k[i] + x] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[k[i]] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[k[i] + x] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[l[i]] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[l[i] + x] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[l[i]] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[l[i] + x] = vl1[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      vf1[i] = 17.0f + i;
+      vd1[i] = 19.0 + i;
+      vi1[i] = 21 + i;
+      vl1[i] = 23L + i;
+    }
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      k[i] = (i % 2) ? (N / 2 + i) : (N / 2 - i / 2);
+      l[i] = 2 * i + i % 2;
+    }
+
+  f1 ();
+  f2 ();
+  for (i = 0; i < N; i++)
+    if (vf2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17
+	|| vi2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 21)
+      abort ();
+
+  f3 (12);
+  f4 (14);
+  for (i = 0; i < N; i++)
+    if (vf2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 12] != i + 17
+	|| vi2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 14] != i + 21)
+      abort ();
+
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; i++)
+    if (vd2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19
+	|| vl2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 23)
+      abort ();
+
+  f7 (7);
+  f8 (9);
+  for (i = 0; i < N; i++)
+    if (vd2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 7] != i + 19
+	|| vl2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 9] != i + 23)
+      abort ();
+
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2] != i + 17
+	|| vi2[2 * i + i % 2] != i + 21)
+      abort ();
+
+  f11 (2);
+  f12 (4);
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2 + 2] != i + 17
+	|| vi2[2 * i + i % 2 + 4] != i + 21)
+      abort ();
+
+  f13 ();
+  f14 ();
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2] != i + 19
+	|| vl2[2 * i + i % 2] != i + 23)
+      abort ();
+
+  f15 (13);
+  f16 (15);
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2 + 13] != i + 19
+	|| vl2[2 * i + i % 2 + 15] != i + 23)
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c
new file mode 100644
index 0000000..5eabab6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c
@@ -0,0 +1,215 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+float vf1[N], vf2[2*N+16];
+double vd1[N], vd2[2*N+16];
+int k[N];
+long l[N];
+short n[2*N+16];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i]] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i] + x] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i]] = (int) vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i]] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i] + x] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i]] = (int) vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i] + x] = (int) vd1[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      vf1[i] = 17.0f + i;
+      vd1[i] = 19.0 + i;
+    }
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      k[i] = (i % 2) ? (N / 2 + i) : (N / 2 - i / 2);
+      l[i] = 2 * i + i % 2;
+    }
+
+  f1 ();
+  f2 ();
+  for (i = 0; i < N; i++)
+    if (vf2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17
+	|| n[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17)
+      abort ();
+
+  f3 (12);
+  f4 (14);
+  for (i = 0; i < N; i++)
+    if (vf2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 12] != i + 17
+	|| n[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 14] != i + 17)
+      abort ();
+
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; i++)
+    if (vd2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19
+	|| n[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19)
+      abort ();
+
+  f7 (7);
+  f8 (9);
+  for (i = 0; i < N; i++)
+    if (vd2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 7] != i + 19
+	|| n[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 9] != i + 19)
+      abort ();
+
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2] != i + 17
+	|| n[2 * i + i % 2] != i + 17)
+      abort ();
+
+  f11 (2);
+  f12 (4);
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2 + 2] != i + 17
+	|| n[2 * i + i % 2 + 4] != i + 17)
+      abort ();
+
+  f13 ();
+  f14 ();
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2] != i + 19
+	|| n[2 * i + i % 2] != i + 19)
+      abort ();
+
+  f15 (13);
+  f16 (15);
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2 + 13] != i + 19
+	|| n[2 * i + i % 2 + 15] != i + 19)
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c
new file mode 100644
index 0000000..dccbdb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+int a[N], b[N];
+
+__attribute__((noinline, noclone)) void
+foo (float *__restrict p, float *__restrict q,
+     int s1, int s2, int s3)
+{
+  int i;
+  for (i = 0; i < (N / 8); i++)
+    p[a[i] * s1 + b[i] * s2 + s3] = q[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+  float c[N], d[N];
+  for (i = 0; i < N; i++)
+    {
+      a[i] = (i * 7) & (N / 8 - 1);
+      b[i] = (i * 13) & (N / 8 - 1);
+      c[i] = 179.13 + i;
+    }
+  foo (d, c, 3, 2, 4);
+  for (i = 0; i < (N / 8); i++)
+    if (d[a[i] * 3 + b[i] * 2 + 4] != (float) (179.13 + i))
+      abort ();
+}
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 731fe7d..2de0369 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -65,6 +65,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "params.h"
 
+
+
 /* Return true if load- or store-lanes optab OPTAB is implemented for
    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
 
@@ -268,7 +270,9 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	}
 
       if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+	  || STMT_VINFO_GATHER_P (stmtinfo_b)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -316,7 +320,9 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	}
 
       if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+	  || STMT_VINFO_GATHER_P (stmtinfo_b)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -2307,10 +2313,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 	                     "zero step in outer loop.\n");
-	  if (DR_IS_READ (dr))
-  	    return true;
-	  else
-	    return false;
+	  return (DR_IS_READ (dr)) ? true : false;
 	}
     }
 
@@ -2956,12 +2959,12 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
   return true;
 }
 
-/* Check whether a non-affine read in stmt is suitable for gather load
-   and if so, return a builtin decl for that operation.  */
+/* Check whether a non-affine read or write in stmt is suitable for gather load
+   or scatter store and if so, return a builtin decl for that operation.  */
 
 tree
-vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
-		   tree *offp, int *scalep)
+vect_check_gather_scatter (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
+			   tree *offp, int *scalep, bool is_load)
 {
   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -2990,7 +2993,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
     }
 
-  /* The gather builtins need address of the form
+  /* The gather and scatter builtins need address of the form
      loop_invariant + vector * {1, 2, 4, 8}
      or
      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
@@ -3153,8 +3156,13 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
   if (offtype == NULL_TREE)
     offtype = TREE_TYPE (off);
 
-  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
-					   offtype, scale);
+  if (is_load)
+    decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
+					     offtype, scale);
+  else
+    decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
+					      offtype, scale);
+
   if (decl == NULL_TREE)
     return NULL_TREE;
 
@@ -3304,6 +3312,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
       stmt_vec_info stmt_info;
       tree base, offset, init;
       bool gather = false;
+      bool scatter = false;
       bool simd_lane_access = false;
       int vf;
 
@@ -3342,18 +3351,22 @@ again:
 	    = DR_IS_READ (dr)
 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
 	      && targetm.vectorize.builtin_gather != NULL;
+	  bool maybe_scatter
+	    = DR_IS_WRITE (dr)
+	      && !TREE_THIS_VOLATILE (DR_REF (dr))
+	      && targetm.vectorize.builtin_scatter != NULL;
 	  bool maybe_simd_lane_access
 	    = loop_vinfo && loop->simduid;
 
-	  /* If target supports vector gather loads, or if this might be
-	     a SIMD lane access, see if they can't be used.  */
+	  /* If target supports vector gather loads or scatter stores, or if
+	     this might be a SIMD lane access, see if they can't be used.  */
 	  if (loop_vinfo
-	      && (maybe_gather || maybe_simd_lane_access)
+	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
 	      && !nested_in_vect_loop_p (loop, stmt))
 	    {
 	      struct data_reference *newdr
 		= create_data_ref (NULL, loop_containing_stmt (stmt),
-				   DR_REF (dr), stmt, true);
+				   DR_REF (dr), stmt, maybe_scatter ? false : true);
 	      gcc_assert (newdr != NULL && DR_REF (newdr));
 	      if (DR_BASE_ADDRESS (newdr)
 		  && DR_OFFSET (newdr)
@@ -3406,17 +3419,18 @@ again:
 			    }
 			}
 		    }
-		  if (!simd_lane_access && maybe_gather)
+		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
 		    {
 		      dr = newdr;
-		      gather = true;
+		      gather = DR_IS_READ (dr);
+		      scatter = DR_IS_WRITE (dr);
 		    }
 		}
-	      if (!gather && !simd_lane_access)
+	      if (!gather && !scatter && !simd_lane_access)
 		free_data_ref (newdr);
 	    }
 
-	  if (!gather && !simd_lane_access)
+	  if (!gather && !scatter && !simd_lane_access)
 	    {
 	      if (dump_enabled_p ())
 		{
@@ -3444,7 +3458,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
         }
@@ -3479,7 +3493,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3499,7 +3513,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
           return false;
 	}
@@ -3524,7 +3538,7 @@ again:
 	  if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
 	}
@@ -3662,7 +3676,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3695,10 +3709,10 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
-	      if (gather)
+	      if (gather || scatter)
 		free_data_ref (dr);
 	    }
 	  return false;
@@ -3722,23 +3736,29 @@ again:
       if (vf > *min_vf)
 	*min_vf = vf;
 
-      if (gather)
+      if (gather || scatter)
 	{
 	  tree off;
 
-	  gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
-	  if (gather
+	  gather = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, true);
+	  scatter = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, false);
+
+          if ((gather || scatter)
 	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
-	    gather = false;
-	  if (!gather)
+            {
+	      gather = false;
+	      scatter = false;
+            }
+
+	  if (!gather && !scatter)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
 	      free_data_ref (dr);
 	      if (dump_enabled_p ())
 		{
-		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                   "not vectorized: not suitable for gather "
-                                   "load ");
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				   gimple_store_p (stmt)
+				   ? "not vectorized: not suitable for scatter store "
+				   : "not vectorized: not suitable for gather load ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 		}
@@ -3747,7 +3767,9 @@ again:
 
 	  datarefs[i] = dr;
 	  STMT_VINFO_GATHER_P (stmt_info) = true;
+	  STMT_VINFO_SCATTER_P (stmt_info) = true;
 	}
+
       else if (loop_vinfo
 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
 	{
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f06e57c..8b3f539 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -810,14 +810,22 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
               return false;
           }
 
-      if (STMT_VINFO_GATHER_P (stmt_vinfo))
+      if (STMT_VINFO_GATHER_P (stmt_vinfo) || STMT_VINFO_SCATTER_P (stmt_vinfo))
 	{
 	  tree off;
-	  tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+	  tree decl = vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL,
+						 (STMT_VINFO_GATHER_P (stmt_vinfo)) ? true : false);
 	  gcc_assert (decl);
 	  if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
 			    &worklist, true))
-	    return false;
+            {
+	      if (STMT_VINFO_SCATTER_P (stmt_vinfo) &&
+                  !process_use (stmt, gimple_assign_rhs1 (stmt), loop_vinfo, live_p,
+			        relevant, &worklist, true))
+                worklist.release();
+
+	      return false;
+            }
 	}
     } /* while worklist */
 
@@ -1819,8 +1827,8 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
-				       &gather_off, &gather_scale);
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
+				       &gather_off, &gather_scale, true);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, NULL,
 				 &def_stmt, &def, &gather_dt,
@@ -5142,6 +5150,12 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   unsigned int vec_num;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree aggr_type;
+  tree scatter_base = NULL_TREE, scatter_off = NULL_TREE;
+  tree scatter_off_vectype = NULL_TREE, scatter_decl = NULL_TREE;
+  int scatter_scale = 1;
+  enum vect_def_type scatter_idx_dt = vect_unknown_def_type;
+  enum vect_def_type scatter_src_dt = vect_unknown_def_type;
+  gimple new_stmt;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
@@ -5299,6 +5313,32 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
         }
     }
 
+  if (STMT_VINFO_SCATTER_P (stmt_info))
+    {
+      gimple def_stmt;
+      tree def;
+      scatter_decl = vect_check_gather_scatter (stmt, loop_vinfo, &scatter_base,
+						&scatter_off, &scatter_scale, false);
+      gcc_assert (scatter_decl);
+      if (!vect_is_simple_use_1 (scatter_off, NULL, loop_vinfo, bb_vinfo,
+				 &def_stmt, &def, &scatter_idx_dt,
+				 &scatter_off_vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "scatter index use not simple.");
+	  return false;
+	}
+      if (!vect_is_simple_use (gimple_assign_rhs1 (stmt), NULL, loop_vinfo, bb_vinfo,
+			       &def_stmt, &def, &scatter_src_dt))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "scatter source use not simple.");
+	  return false;
+	}
+    }
+
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
@@ -5313,6 +5353,150 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
   ensure_base_align (stmt_info, dr);
 
+  if (STMT_VINFO_SCATTER_P (stmt_info))
+    {
+      tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
+      tree arglist = TYPE_ARG_TYPES (TREE_TYPE (scatter_decl));
+      tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+      tree ptr, mask, var, scale, perm_mask = NULL_TREE;
+      edge pe = loop_preheader_edge (loop);
+      gimple_seq seq;
+      basic_block new_bb;
+      enum { NARROW, NONE, WIDEN } modifier;
+      int scatter_off_nunits = TYPE_VECTOR_SUBPARTS (scatter_off_vectype);
+
+      if (nunits == scatter_off_nunits)
+	modifier = NONE;
+      else if (nunits == scatter_off_nunits / 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, scatter_off_nunits);
+	  modifier = WIDEN;
+
+	  for (i = 0; i < (unsigned int) scatter_off_nunits; ++i)
+	    sel[i] = i | nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (scatter_off_vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	}
+      else if (nunits == scatter_off_nunits * 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+	  modifier = NARROW;
+
+	  for (i = 0; i < (unsigned int) nunits; ++i)
+	    sel[i] = i | scatter_off_nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	  ncopies *= 2;
+	}
+      else
+	gcc_unreachable ();
+
+      rettype = TREE_TYPE (TREE_TYPE (scatter_decl));
+      ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      scaletype = TREE_VALUE (arglist);
+
+      gcc_checking_assert (TREE_CODE (masktype) == INTEGER_TYPE
+			   && TREE_CODE (rettype) == VOID_TYPE);
+
+      ptr = fold_convert (ptrtype, scatter_base);
+      if (!is_gimple_min_invariant (ptr))
+	{
+	  ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+	  gcc_assert (!new_bb);
+	}
+
+      /* Currently we support only unconditional scatter stores,
+	 so mask should be all ones.  */
+      mask = build_int_cst (masktype, -1);
+      mask = vect_init_vector (stmt, mask, masktype, NULL);
+
+      scale = build_int_cst (scaletype, scatter_scale);
+
+      prev_stmt_info = NULL;
+      for (j = 0; j < ncopies; ++j)
+	{
+	  if (j == 0)
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_operand (gimple_assign_rhs1 (stmt), stmt, NULL);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_operand (scatter_off, stmt, NULL);
+	    }
+	  else if (modifier != NONE && (j & 1))
+	    {
+	      if (modifier == WIDEN)
+		{
+		  src = vec_oprnd1
+		    = vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+		  op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask,
+					     stmt, gsi);
+		}
+	      else if (modifier == NARROW)
+		{
+		  src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask,
+					      stmt, gsi);
+		  op = vec_oprnd0
+		    = vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+		}
+	      else
+		gcc_unreachable ();
+	    }
+	  else
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+	    }
+
+	  if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (src))
+			  == TYPE_VECTOR_SUBPARTS (srctype));
+	      var = vect_get_new_vect_var (srctype, vect_simple_var, NULL);
+	      var = make_ssa_name (var, NULL);
+	      src = build1 (VIEW_CONVERT_EXPR, srctype, src);
+	      new_stmt
+		= gimple_build_assign (var, VIEW_CONVERT_EXPR,
+						src, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      src = var;
+	    }
+
+	  if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+			  == TYPE_VECTOR_SUBPARTS (idxtype));
+	      var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+	      var = make_ssa_name (var, NULL);
+	      op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+	      new_stmt
+		= gimple_build_assign (var, VIEW_CONVERT_EXPR,
+						op, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      op = var;
+	    }
+
+	  new_stmt
+	    = gimple_build_call (scatter_decl, 5, ptr, mask, op, src, scale);
+
+	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	  if (prev_stmt_info == NULL)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+      return true;
+    }
+
   if (grouped_store)
     {
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
@@ -5586,8 +5770,6 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   prev_stmt_info = NULL;
   for (j = 0; j < ncopies; j++)
     {
-      gimple new_stmt;
-
       if (j == 0)
 	{
           if (slp)
@@ -5853,10 +6035,12 @@ permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt,
 {
   tree vectype = TREE_TYPE (x);
   tree perm_dest, data_ref;
+  tree scalar_dest = TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
+		     ? gimple_assign_lhs (stmt) : x;
   gimple perm_stmt;
 
-  perm_dest = vect_create_destination_var (gimple_get_lhs (stmt), vectype);
-  data_ref = make_ssa_name (perm_dest);
+  perm_dest = vect_create_destination_var (scalar_dest, vectype);
+  data_ref = make_ssa_name (perm_dest, NULL);
 
   /* Generate the permute statement.  */
   perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, x, y, mask_vec);
@@ -6136,8 +6320,8 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
-				       &gather_off, &gather_scale);
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
+					       &gather_off, &gather_scale, true);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, bb_vinfo,
 				 &def_stmt, &def, &gather_dt,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index dfa8795..3b8bce4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -652,6 +652,9 @@ typedef struct _stmt_vec_info {
   /* True if this is an access with loop-invariant stride.  */
   bool strided_p;
 
+  /* For stores only, true if this is a scatter store.  */
+  bool scatter_p;
+
   /* For both loads and stores.  */
   bool simd_lane_access_p;
 } *stmt_vec_info;
@@ -669,6 +672,8 @@ typedef struct _stmt_vec_info {
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
 #define STMT_VINFO_GATHER_P(S)		   (S)->gather_p
 #define STMT_VINFO_STRIDED_P(S)	   	   (S)->strided_p
+#define STMT_VINFO_STRIDE_LOAD_P(S)	   (S)->stride_load_p
+#define STMT_VINFO_SCATTER_P(S)		   (S)->scatter_p
 #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
 
 #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_base_address
@@ -1060,8 +1065,8 @@ extern bool vect_analyze_data_refs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info);
 extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
-extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
-			       int *);
+extern tree vect_check_gather_scatter (gimple, loop_vec_info, tree *,
+				       tree *, int *, bool);
 extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *,
 				    unsigned *);
 extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,


* Re: [PATCH] [AVX512F] Add scatter support for vectorizer
  2015-07-31 11:06 [PATCH] [AVX512F] Add scatter support for vectorizer Petr Murzin
@ 2015-08-04 12:15 ` Richard Biener
  2015-08-04 12:42   ` Uros Bizjak
  2015-08-21 12:21   ` Petr Murzin
  0 siblings, 2 replies; 8+ messages in thread
From: Richard Biener @ 2015-08-04 12:15 UTC (permalink / raw)
  To: Petr Murzin; +Cc: Kirill Yukhin, gcc-patches, ubizjak

On Fri, 31 Jul 2015, Petr Murzin wrote:

> Hello,
> This patch adds scatter store support to the vectorizer (for AVX512F
> instructions). Please have a look. Is it OK for trunk?

+/* Target builtin that implements vector scatter operation.  */
+DEFHOOK
+(builtin_scatter,
+ "",
+ tree,
+ (const_tree vectype, const_tree index_type, int scale),
+ NULL)

please add documentation inline here, like for builtin_gather,
and let tm.texi be auto-populated.
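
For example (untested sketch, reusing the wording from the tm.texi hunk
of your patch):

/* Target builtin that implements vector scatter operation.  */
DEFHOOK
(builtin_scatter,
 "Target builtin that implements vector scatter operation.  @var{vectype}\n\
is the vector type of the store and @var{index_type} is scalar type of\n\
the index, scaled by @var{scale}.\n\
The default is @code{NULL_TREE} which means to not vectorize scatter\n\
stores.",
 tree,
 (const_tree vectype, const_tree index_type, int scale),
 NULL)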

Note that the i386 changes need target maintainer approval, CCing
Uros.

diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 731fe7d..2de0369 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -65,6 +65,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "params.h"

+
+
 /* Return true if load- or store-lanes optab OPTAB is implemented for
    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */


please avoid this kind of spurious whitespace changes.

@@ -2307,10 +2313,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in outer loop.\n");
-         if (DR_IS_READ (dr))
-           return true;
-         else
-           return false;
+         return (DR_IS_READ (dr)) ? true : false;
        }
     }

Likewise.  If anything then do

     return DR_IS_READ (dr);

-      if (gather)
+      if (gather || scatter)
        {
          tree off;

-         gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
-         if (gather
+         gather = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, true);
+         scatter = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, false);
+

please only check gather/scatter once - only one of gather or scatter
can ever be true.  This also means that the idea of having both
bools is not reflecting the state in a very good way.  Instead
please add a

  enum { SG_NONE, SCATTER, GATHER } gatherscatter;

and replace 'gather' with it.
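
i.e. roughly (untested, names are only a suggestion):

  enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
  ...
  if (gatherscatter != SG_NONE)
    {
      tree off;
      /* The DR is either a read or a write, so a single call is
         enough to decide between gather and scatter.  */
      if (!vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL,
                                      gatherscatter == GATHER)
          || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
        gatherscatter = SG_NONE;
    }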

@@ -3747,7 +3767,9 @@ again:

          datarefs[i] = dr;
          STMT_VINFO_GATHER_P (stmt_info) = true;
+         STMT_VINFO_SCATTER_P (stmt_info) = true;
        }

this looks bogus as well due to the mechanical change - a stmt
cannot be gather and scatter at the same time.
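
i.e. set only the flag matching the access kind, something like:

          datarefs[i] = dr;
          STMT_VINFO_GATHER_P (stmt_info) = gather;
          STMT_VINFO_SCATTER_P (stmt_info) = scatter;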

-         tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+         tree decl = vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL,
+                                                (STMT_VINFO_GATHER_P (stmt_vinfo)) ? true : false);

watch long lines

          if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
                            &worklist, true))
-           return false;
+            {
+             if (STMT_VINFO_SCATTER_P (stmt_vinfo) &&
+                  !process_use (stmt, gimple_assign_rhs1 (stmt), 
loop_vinfo, live_p,
+                               relevant, &worklist, true))
+                worklist.release();
+
+             return false;
+            }

no need to cut off the early return, no?  Also rhs1 should already be
handled via

        FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, iter, SSA_OP_USE)
          {
            tree op = USE_FROM_PTR (use_p);
            if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
                              &worklist, false))
              return false;
          }

note that 'force' doesn't apply here.

I wonder why vect_check_gather_scatter cannot figure out by itself
whether scatter or gather is used.  After all it does

  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

so DR_IS_READ/WRITE is readily available.  Please rework accordingly.
This should also simplify the patch.
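
Roughly (sketch):

tree
vect_check_gather_scatter (gimple stmt, loop_vec_info loop_vinfo,
                           tree *basep, tree *offp, int *scalep)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  ...
  /* The DR already knows whether this is a load or a store, so the
     extra bool argument becomes unnecessary.  */
  if (DR_IS_READ (dr))
    decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
                                             offtype, scale);
  else
    decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
                                              offtype, scale);
  ...
}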

+      if (!vect_is_simple_use (gimple_assign_rhs1 (stmt), NULL, loop_vinfo, bb_vinfo,
+                              &def_stmt, &def, &scatter_src_dt))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "scatter source use not simple.");
+         return false;
+       }

This is redundant, it is verified earlier.

+             var = make_ssa_name (var, NULL);

make_ssa_name (var);

+             new_stmt
+               = gimple_build_assign (var, VIEW_CONVERT_EXPR,
+                                               src, NULL_TREE);

you can omit the NULL_TREE
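
i.e. just

  new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, src);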

@@ -5586,8 +5770,6 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   prev_stmt_info = NULL;
   for (j = 0; j < ncopies; j++)
     {
-      gimple new_stmt;
-
       if (j == 0)
        {
           if (slp)

spurious change?

@@ -5853,10 +6035,12 @@ permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt,
 {
   tree vectype = TREE_TYPE (x);
   tree perm_dest, data_ref;
+  tree scalar_dest = TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
+                    ? gimple_assign_lhs (stmt) : x;

please instead rework vect_create_destination_var to remove the
assert this triggers for non-SSA names.

-  perm_dest = vect_create_destination_var (gimple_get_lhs (stmt), vectype);
-  data_ref = make_ssa_name (perm_dest);
+  perm_dest = vect_create_destination_var (scalar_dest, vectype);
+  data_ref = make_ssa_name (perm_dest, NULL);

spurious (bad) change.

@@ -652,6 +652,9 @@ typedef struct _stmt_vec_info {
   /* True if this is an access with loop-invariant stride.  */
   bool strided_p;

+  /* For stores only, true if this is a scatter store.  */
+  bool scatter_p;
+

it can be only scatter or gather, so IMHO unifying the flags
makes sense.  So

   /* For stores if this is a scatter, for loads if this is a gather.  */
   bool scatter_gather_p;

@@ -669,6 +672,8 @@ typedef struct _stmt_vec_info {
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
 #define STMT_VINFO_GATHER_P(S)            (S)->gather_p
 #define STMT_VINFO_STRIDED_P(S)                   (S)->strided_p
+#define STMT_VINFO_STRIDE_LOAD_P(S)       (S)->stride_load_p
+#define STMT_VINFO_SCATTER_P(S)                   (S)->scatter_p

spurious change.

Thanks,
Richard.

> Thanks,
> Petr
> 
> 
> 2015-07-31  Andrey Turetskiy  <andrey.turetskiy@intel.com>
>                   Petr Murzin  <petr.murzin@intel.com>
> 
> gcc/
> 
> * config/i386/i386-builtin-types.def
> (VOID_PFLOAT_HI_V8DI_V16SF_INT): New.
> (VOID_PDOUBLE_QI_V16SI_V8DF_INT): Ditto.
> (VOID_PINT_HI_V8DI_V16SI_INT): Ditto.
> (VOID_PLONGLONG_QI_V16SI_V8DI_INT): Ditto.
> * config/i386/i386.c
> (ix86_builtins): Add IX86_BUILTIN_SCATTERALTSIV8DF,
> IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
> IX86_BUILTIN_SCATTERALTDIV16SI.
> (ix86_init_mmx_sse_builtins): Define __builtin_ia32_scatteraltsiv8df,
> __builtin_ia32_scatteraltdiv8sf, __builtin_ia32_scatteraltsiv8di,
> __builtin_ia32_scatteraltdiv8si.
> (ix86_expand_builtin): Handle IX86_BUILTIN_SCATTERALTSIV8DF,
> IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
> IX86_BUILTIN_SCATTERALTDIV16SI.
> (ix86_vectorize_builtin_scatter): New.
> (TARGET_VECTORIZE_BUILTIN_SCATTER): Define as
> ix86_vectorize_builtin_scatter.
> * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_SCATTER): New.
> * doc/tm.texi: Regenerate.
> * target.def: Add scatter builtin.
> * tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Add new
> checks for STMT_VINFO_SCATTER_P.
> (vect_check_gather): Rename to ...
> (vect_check_gather_scatter): ... this and add a new argument.
> (vect_analyze_data_refs): Add scatter and maybe_scatter variables and
> new checks for them accordingly.
> * tree-vectorizer.h (STMT_VINFO_SCATTER_P(S)): Define.
> (STMT_VINFO_STRIDE_LOAD_P(S)): Ditto.
> (vect_check_gather): Rename to ...
> (vect_check_gather_scatter): ... this.
> * tree-vect-stmts.c (vectorizable_mask_load_store): Ditto.
> (vectorizable_store): Add checks for STMT_VINFO_SCATTER_P.
> (vect_mark_stmts_to_be_vectorized): Ditto.
> 
> gcc/testsuite/
> 
> * gcc.target/i386/avx512f-scatter-1.c: New.
> * gcc.target/i386/avx512f-scatter-2.c: Ditto.
> * gcc.target/i386/avx512f-scatter-3.c: Ditto.
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Dilip Upmanyu, Graham Norton, HRB 21284 (AG Nuernberg)


* Re: [PATCH] [AVX512F] Add scatter support for vectorizer
  2015-08-04 12:15 ` Richard Biener
@ 2015-08-04 12:42   ` Uros Bizjak
  2015-08-21 12:21   ` Petr Murzin
  1 sibling, 0 replies; 8+ messages in thread
From: Uros Bizjak @ 2015-08-04 12:42 UTC (permalink / raw)
  To: Richard Biener; +Cc: Petr Murzin, Kirill Yukhin, gcc-patches

On Tue, Aug 4, 2015 at 2:15 PM, Richard Biener <rguenther@suse.de> wrote:

>> This patch adds scatter store support to the vectorizer (for AVX512F
>> instructions). Please have a look. Is it OK for trunk?
>
> +/* Target builtin that implements vector scatter operation.  */
> +DEFHOOK
> +(builtin_scatter,
> + "",
> + tree,
> + (const_tree vectype, const_tree index_type, int scale),
> + NULL)
>
> please add documentation inline here, like for builtin_gather,
> and let tm.texi be auto-populated.
>
> Note that the i386 changes need target maintainer approval, CCing
> Uros.

As said many times, please don't mix middle-end and target parts into
one patch. The middle-end part (usually the algorithmic one) has to be
discussed, reviewed and approved first, and at that stage the target
part can be used as an implementation example. Only *after* approval
of the middle-end part can the target part be reviewed.

Not to mention that each part has a different reviewer, so the review
of the patch can stall because of this.

Uros.


* Re: [PATCH] [AVX512F] Add scatter support for vectorizer
  2015-08-04 12:15 ` Richard Biener
  2015-08-04 12:42   ` Uros Bizjak
@ 2015-08-21 12:21   ` Petr Murzin
  2015-08-26  7:44     ` Richard Biener
  1 sibling, 1 reply; 8+ messages in thread
From: Petr Murzin @ 2015-08-21 12:21 UTC (permalink / raw)
  To: Richard Biener; +Cc: Kirill Yukhin, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2320 bytes --]

Hello,
Please have a look at the updated patch.

On Tue, Aug 4, 2015 at 3:15 PM, Richard Biener <rguenther@suse.de> wrote:
> On Fri, 31 Jul 2015, Petr Murzin wrote:
> @@ -5586,8 +5770,6 @@ vectorizable_store (gimple stmt,
> gimple_stmt_iterator *gsi, gimple *vec_stmt,
>    prev_stmt_info = NULL;
>    for (j = 0; j < ncopies; j++)
>      {
> -      gimple new_stmt;
> -
>        if (j == 0)
>         {
>            if (slp)
>
> spurious change?

I have increased the scope of this variable so that it can also be used
in the code handling STMT_VINFO_SCATTER_P (stmt_info).

Thanks,
Petr

2015-08-21  Andrey Turetskiy  <andrey.turetskiy@intel.com>
            Petr Murzin  <petr.murzin@intel.com>

gcc/

* config/i386/i386-builtin-types.def
(VOID_PFLOAT_HI_V8DI_V16SF_INT): New.
(VOID_PDOUBLE_QI_V16SI_V8DF_INT): Ditto.
(VOID_PINT_HI_V8DI_V16SI_INT): Ditto.
(VOID_PLONGLONG_QI_V16SI_V8DI_INT): Ditto.
* config/i386/i386.c
(ix86_builtins): Add IX86_BUILTIN_SCATTERALTSIV8DF,
IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
IX86_BUILTIN_SCATTERALTDIV16SI.
(ix86_init_mmx_sse_builtins): Define __builtin_ia32_scatteraltsiv8df,
__builtin_ia32_scatteraltdiv8sf, __builtin_ia32_scatteraltsiv8di,
__builtin_ia32_scatteraltdiv8si.
(ix86_expand_builtin): Handle IX86_BUILTIN_SCATTERALTSIV8DF,
IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
IX86_BUILTIN_SCATTERALTDIV16SI.
(ix86_vectorize_builtin_scatter): New.
(TARGET_VECTORIZE_BUILTIN_SCATTER): Define as
ix86_vectorize_builtin_scatter.
* doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_SCATTER): New.
* doc/tm.texi: Regenerate.
* target.def: Add scatter builtin.
* tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Add new
checks for STMT_VINFO_SCATTER_P.
(vect_check_gather): Rename to ...
(vect_check_gather_scatter): ... this and add a new argument.
(vect_analyze_data_refs): Add gatherscatter enum and maybe_scatter
variable and new checks for them accordingly.
* tree-vectorizer.h: Rename gather_p to gather_scatter_p and use it
for loads/stores in case of gather/scatter accordingly.
(STMT_VINFO_SCATTER_P(S)): Define.
(vect_check_gather): Rename to ...
(vect_check_gather_scatter): ... this.
* tree-vect-stmts.c (vectorizable_mask_load_store): Ditto.
(vectorizable_store): Add checks for STMT_VINFO_SCATTER_P.
(vect_mark_stmts_to_be_vectorized): Ditto.

[-- Attachment #2: scatter_patch_upd --]
[-- Type: application/octet-stream, Size: 27061 bytes --]

commit 055bd3159c6f70fc43affbc6a8639e27b3aa111a
Author: Petr Murzin <petr.murzin@intel.com>
Date:   Fri Aug 21 14:35:29 2015 +0300

    Scatter patch

diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index ee31ee3..b892f08 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1021,6 +1021,10 @@ DEF_FUNCTION_TYPE (VOID, PINT, QI, V8DI, V8SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V4DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V2DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8DI, V8DI, INT)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, HI, V8DI, V16SF, INT)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V16SI, V8DF, INT)
+DEF_FUNCTION_TYPE (VOID, PINT, HI, V8DI, V16SI, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V16SI, V8DI, INT)
 
 DEF_FUNCTION_TYPE (VOID, QI, V8SI, PCINT64, INT, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V4DI, V4DI, INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 05fa5e1..a086d7c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -30388,6 +30388,10 @@ enum ix86_builtins
   IX86_BUILTIN_GATHER3SIV16SI,
   IX86_BUILTIN_GATHER3SIV8DF,
   IX86_BUILTIN_GATHER3SIV8DI,
+  IX86_BUILTIN_SCATTERALTSIV8DF,
+  IX86_BUILTIN_SCATTERALTDIV16SF,
+  IX86_BUILTIN_SCATTERALTSIV8DI,
+  IX86_BUILTIN_SCATTERALTDIV16SI,
   IX86_BUILTIN_SCATTERDIV16SF,
   IX86_BUILTIN_SCATTERDIV16SI,
   IX86_BUILTIN_SCATTERDIV8DF,
@@ -34204,6 +34208,21 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
 	       VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
 	       IX86_BUILTIN_SCATTERDIV2DI);
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
+	       VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
+	       VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
+	       VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
+	       VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SI);
 
   /* AVX512PF */
   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
@@ -39859,6 +39878,18 @@ addcarryx:
     case IX86_BUILTIN_GATHERPFDPD:
       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
       goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DF:
+      icode = CODE_FOR_avx512f_scattersiv8df;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SF:
+      icode = CODE_FOR_avx512f_scatterdiv16sf;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DI:
+      icode = CODE_FOR_avx512f_scattersiv8di;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SI:
+      icode = CODE_FOR_avx512f_scatterdiv16si;
+      goto scatter_gen;
     case IX86_BUILTIN_GATHERPFDPS:
       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
       goto vec_prefetch_gen;
@@ -40122,6 +40153,36 @@ addcarryx:
       mode3 = insn_data[icode].operand[3].mode;
       mode4 = insn_data[icode].operand[4].mode;
 
+      /* Scatter instruction stores operand op3 to memory with
+	 indices from op2 and scale from op4 under writemask op1.
+	 If index operand op2 has more elements than source operand
+	 op3, only its low half needs to be used, and vice versa.  */
+      switch (fcode)
+	{
+	case IX86_BUILTIN_SCATTERALTSIV8DF:
+	case IX86_BUILTIN_SCATTERALTSIV8DI:
+	  half = gen_reg_rtx (V8SImode);
+	  if (!nonimmediate_operand (op2, V16SImode))
+	    op2 = copy_to_mode_reg (V16SImode, op2);
+	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
+	  op2 = half;
+	  break;
+	case IX86_BUILTIN_SCATTERALTDIV16SF:
+	case IX86_BUILTIN_SCATTERALTDIV16SI:
+	  half = gen_reg_rtx (mode3);
+	  if (mode3 == V8SFmode)
+	    gen = gen_vec_extract_lo_v16sf;
+	  else
+	    gen = gen_vec_extract_lo_v16si;
+	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
+	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+	  emit_insn (gen (half, op3));
+	  op3 = half;
+	  break;
+	default:
+	  break;
+	}
+
       /* Force memory operand only with base register here.  But we
 	 don't want to do it on memory operand for other builtin
 	 functions.  */
@@ -41201,6 +41262,62 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
   return ix86_get_builtin (code);
 }
 
+/* Returns a decl of a function that implements scatter store with
+   register type VECTYPE and index type INDEX_TYPE and SCALE.
+   Return NULL_TREE if it is not available.  */
+
+static tree
+ix86_vectorize_builtin_scatter (const_tree vectype,
+				const_tree index_type, int scale)
+{
+  bool si;
+  enum ix86_builtins code;
+
+  if (!TARGET_AVX512F)
+    return NULL_TREE;
+
+  if ((TREE_CODE (index_type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (index_type))
+      || (TYPE_MODE (index_type) != SImode
+	  && TYPE_MODE (index_type) != DImode))
+    return NULL_TREE;
+
+  if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+    return NULL_TREE;
+
+  /* v*scatter* insn sign extends index to pointer mode.  */
+  if (TYPE_PRECISION (index_type) < POINTER_SIZE
+      && TYPE_UNSIGNED (index_type))
+    return NULL_TREE;
+
+  /* Scale can be 1, 2, 4 or 8.  */
+  if (scale <= 0
+      || scale > 8
+      || (scale & (scale - 1)) != 0)
+    return NULL_TREE;
+
+  si = TYPE_MODE (index_type) == SImode;
+  switch (TYPE_MODE (vectype))
+    {
+    case V8DFmode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
+      break;
+    case V8DImode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
+      break;
+    case V16SFmode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
+      break;
+    case V16SImode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
+      break;
+    default:
+      return NULL_TREE;
+    }
+
+  return ix86_builtins[code];
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -52331,6 +52448,9 @@ ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_BUILTIN_SCATTER
+#define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index f5a1f84..d548d96 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5720,6 +5720,14 @@ The default is @code{NULL_TREE} which means to not vectorize gather
 loads.
 @end deftypefn
 
+@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_SCATTER (const_tree @var{vectype}, const_tree @var{index_type}, int @var{scale})
+Target builtin that implements vector scatter operation.  @var{vectype}
+is the vector type of the store and @var{index_type} is the scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize scatter
+stores.
+@end deftypefn
+
 @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int})
 This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
 fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 9d5ac0a..9bef4a5 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4239,6 +4239,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
+@hook TARGET_VECTORIZE_BUILTIN_SCATTER
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
diff --git a/gcc/target.def b/gcc/target.def
index 4edc209..aa5a1f1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1801,6 +1801,18 @@ loads.",
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Target builtin that implements vector scatter operation.  */
+DEFHOOK
+(builtin_scatter,
+"Target builtin that implements vector scatter operation.  @var{vectype}\n\
+is the vector type of the store and @var{index_type} is scalar type of\n\
+the index, scaled by @var{scale}.\n\
+The default is @code{NULL_TREE} which means to not vectorize scatter\n\
+stores.",
+ tree,
+ (const_tree vectype, const_tree index_type, int scale),
+ NULL)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index f1eaef4..bea7b45 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -268,7 +268,9 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	}
 
       if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+	  || STMT_VINFO_GATHER_P (stmtinfo_b)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -316,7 +318,9 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	}
 
       if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+	  || STMT_VINFO_GATHER_P (stmtinfo_b)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -2344,10 +2348,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 	                     "zero step in outer loop.\n");
-	  if (DR_IS_READ (dr))
-  	    return true;
-	  else
-	    return false;
+	  return DR_IS_READ (dr);
 	}
     }
 
@@ -2997,12 +2998,12 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
   return true;
 }
 
-/* Check whether a non-affine read in stmt is suitable for gather load
-   and if so, return a builtin decl for that operation.  */
+/* Check whether a non-affine read or write in stmt is suitable for gather load
+   or scatter store and if so, return a builtin decl for that operation.  */
 
 tree
-vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
-		   tree *offp, int *scalep)
+vect_check_gather_scatter (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
+			   tree *offp, int *scalep, bool is_load)
 {
   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -3031,7 +3032,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
     }
 
-  /* The gather builtins need address of the form
+  /* The gather and scatter builtins need an address of the form
      loop_invariant + vector * {1, 2, 4, 8}
      or
      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
@@ -3194,8 +3195,13 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
   if (offtype == NULL_TREE)
     offtype = TREE_TYPE (off);
 
-  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
-					   offtype, scale);
+  if (is_load)
+    decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
+					     offtype, scale);
+  else
+    decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
+					      offtype, scale);
+
   if (decl == NULL_TREE)
     return NULL_TREE;
 
@@ -3344,7 +3350,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
       gimple stmt;
       stmt_vec_info stmt_info;
       tree base, offset, init;
-      bool gather = false;
+      enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
       bool simd_lane_access = false;
       int vf;
 
@@ -3383,18 +3389,22 @@ again:
 	    = DR_IS_READ (dr)
 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
 	      && targetm.vectorize.builtin_gather != NULL;
+	  bool maybe_scatter
+	    = DR_IS_WRITE (dr)
+	      && !TREE_THIS_VOLATILE (DR_REF (dr))
+	      && targetm.vectorize.builtin_scatter != NULL;
 	  bool maybe_simd_lane_access
 	    = loop_vinfo && loop->simduid;
 
-	  /* If target supports vector gather loads, or if this might be
-	     a SIMD lane access, see if they can't be used.  */
+	  /* If target supports vector gather loads or scatter stores, or if
+	     this might be a SIMD lane access, see if they can't be used.  */
 	  if (loop_vinfo
-	      && (maybe_gather || maybe_simd_lane_access)
+	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
 	      && !nested_in_vect_loop_p (loop, stmt))
 	    {
 	      struct data_reference *newdr
 		= create_data_ref (NULL, loop_containing_stmt (stmt),
-				   DR_REF (dr), stmt, true);
+				   DR_REF (dr), stmt, !maybe_scatter);
 	      gcc_assert (newdr != NULL && DR_REF (newdr));
 	      if (DR_BASE_ADDRESS (newdr)
 		  && DR_OFFSET (newdr)
@@ -3447,17 +3457,20 @@ again:
 			    }
 			}
 		    }
-		  if (!simd_lane_access && maybe_gather)
+		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
 		    {
 		      dr = newdr;
-		      gather = true;
+		      if (maybe_gather)
+			gatherscatter = GATHER;
+		      else
+			gatherscatter = SCATTER;
 		    }
 		}
-	      if (!gather && !simd_lane_access)
+	      if (gatherscatter == SG_NONE && !simd_lane_access)
 		free_data_ref (newdr);
 	    }
 
-	  if (!gather && !simd_lane_access)
+	  if (gatherscatter == SG_NONE && !simd_lane_access)
 	    {
 	      if (dump_enabled_p ())
 		{
@@ -3485,7 +3498,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
         }
@@ -3520,7 +3533,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3540,7 +3553,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
           return false;
 	}
@@ -3565,7 +3578,7 @@ again:
 	  if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
 	}
@@ -3703,7 +3716,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3736,10 +3749,10 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
-	      if (gather)
+	      if (gatherscatter != SG_NONE)
 		free_data_ref (dr);
 	    }
 	  return false;
@@ -3763,32 +3776,46 @@ again:
       if (vf > *min_vf)
 	*min_vf = vf;
 
-      if (gather)
+      if (gatherscatter != SG_NONE)
 	{
 	  tree off;
+	  if (vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off,
+					 NULL, true) != 0)
+	    gatherscatter = GATHER;
+	  else if (vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off,
+					      NULL, false) != 0)
+	    gatherscatter = SCATTER;
+	  else
+	    gatherscatter = SG_NONE;
 
-	  gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
-	  if (gather
+	  if (gatherscatter != SG_NONE
 	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
-	    gather = false;
-	  if (!gather)
+	    gatherscatter = SG_NONE;
+
+	  if (gatherscatter == SG_NONE)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
 	      free_data_ref (dr);
 	      if (dump_enabled_p ())
 		{
-		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                   "not vectorized: not suitable for gather "
-                                   "load ");
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				   DR_IS_READ (dr) ?
+				   "not vectorized: not suitable for gather "
+				   "load " :
+				   "not vectorized: not suitable for scatter "
+				   "store ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 		}
 	      return false;
 	    }
 
 	  datarefs[i] = dr;
-	  STMT_VINFO_GATHER_P (stmt_info) = true;
+	  if (gatherscatter == GATHER)
+	    STMT_VINFO_GATHER_P (stmt_info) = true;
+	  else
+	    STMT_VINFO_SCATTER_P (stmt_info) = true;
 	}
       else if (loop_vinfo
 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
 	{
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 2ddd434..b3959f3 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -810,10 +810,12 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
               return false;
           }
 
-      if (STMT_VINFO_GATHER_P (stmt_vinfo))
+      if (STMT_VINFO_GATHER_P (stmt_vinfo) || STMT_VINFO_SCATTER_P (stmt_vinfo))
 	{
 	  tree off;
-	  tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+	  tree decl = vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off,
+						 NULL,
+						 STMT_VINFO_GATHER_P (stmt_vinfo));
 	  gcc_assert (decl);
 	  if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
 			    &worklist, true))
@@ -1819,8 +1821,8 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
-				       &gather_off, &gather_scale);
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
+					       &gather_off, &gather_scale, true);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, NULL,
 				 &def_stmt, &def, &gather_dt,
@@ -5144,6 +5146,12 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   unsigned int vec_num;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree aggr_type;
+  tree scatter_base = NULL_TREE, scatter_off = NULL_TREE;
+  tree scatter_off_vectype = NULL_TREE, scatter_decl = NULL_TREE;
+  int scatter_scale = 1;
+  enum vect_def_type scatter_idx_dt = vect_unknown_def_type;
+  enum vect_def_type scatter_src_dt = vect_unknown_def_type;
+  gimple new_stmt;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
@@ -5301,6 +5309,24 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
         }
     }
 
+  if (STMT_VINFO_SCATTER_P (stmt_info))
+    {
+      gimple def_stmt;
+      tree def;
+      scatter_decl = vect_check_gather_scatter (stmt, loop_vinfo, &scatter_base,
+						&scatter_off, &scatter_scale, false);
+      gcc_assert (scatter_decl);
+      if (!vect_is_simple_use_1 (scatter_off, NULL, loop_vinfo, bb_vinfo,
+				 &def_stmt, &def, &scatter_idx_dt,
+				 &scatter_off_vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "scatter index use not simple.");
+	  return false;
+	}
+    }
+
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
@@ -5315,6 +5341,146 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
   ensure_base_align (stmt_info, dr);
 
+  if (STMT_VINFO_SCATTER_P (stmt_info))
+    {
+      tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
+      tree arglist = TYPE_ARG_TYPES (TREE_TYPE (scatter_decl));
+      tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+      tree ptr, mask, var, scale, perm_mask = NULL_TREE;
+      edge pe = loop_preheader_edge (loop);
+      gimple_seq seq;
+      basic_block new_bb;
+      enum { NARROW, NONE, WIDEN } modifier;
+      int scatter_off_nunits = TYPE_VECTOR_SUBPARTS (scatter_off_vectype);
+
+      if (nunits == (unsigned int) scatter_off_nunits)
+	modifier = NONE;
+      else if (nunits == (unsigned int) scatter_off_nunits / 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, scatter_off_nunits);
+	  modifier = WIDEN;
+
+	  for (i = 0; i < (unsigned int) scatter_off_nunits; ++i)
+	    sel[i] = i | nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (scatter_off_vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	}
+      else if (nunits == (unsigned int) scatter_off_nunits * 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+	  modifier = NARROW;
+
+	  for (i = 0; i < (unsigned int) nunits; ++i)
+	    sel[i] = i | scatter_off_nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	  ncopies *= 2;
+	}
+      else
+	gcc_unreachable ();
+
+      rettype = TREE_TYPE (TREE_TYPE (scatter_decl));
+      ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      scaletype = TREE_VALUE (arglist);
+
+      gcc_checking_assert (TREE_CODE (masktype) == INTEGER_TYPE
+			   && TREE_CODE (rettype) == VOID_TYPE);
+
+      ptr = fold_convert (ptrtype, scatter_base);
+      if (!is_gimple_min_invariant (ptr))
+	{
+	  ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+	  gcc_assert (!new_bb);
+	}
+
+      /* Currently we support only unconditional scatter stores,
+	 so mask should be all ones.  */
+      mask = build_int_cst (masktype, -1);
+      mask = vect_init_vector (stmt, mask, masktype, NULL);
+
+      scale = build_int_cst (scaletype, scatter_scale);
+
+      prev_stmt_info = NULL;
+      for (j = 0; j < ncopies; ++j)
+	{
+	  if (j == 0)
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_operand (gimple_assign_rhs1 (stmt), stmt, NULL);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_operand (scatter_off, stmt, NULL);
+	    }
+	  else if (modifier != NONE && (j & 1))
+	    {
+	      if (modifier == WIDEN)
+		{
+		  src = vec_oprnd1
+		    = vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+		  op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask,
+					     stmt, gsi);
+		}
+	      else if (modifier == NARROW)
+		{
+		  src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask,
+					      stmt, gsi);
+		  op = vec_oprnd0
+		    = vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+		}
+	      else
+		gcc_unreachable ();
+	    }
+	  else
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+	    }
+
+	  if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (src))
+			  == TYPE_VECTOR_SUBPARTS (srctype));
+	      var = vect_get_new_vect_var (srctype, vect_simple_var, NULL);
+	      var = make_ssa_name (var);
+	      src = build1 (VIEW_CONVERT_EXPR, srctype, src);
+	      new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, src);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      src = var;
+	    }
+
+	  if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+			  == TYPE_VECTOR_SUBPARTS (idxtype));
+	      var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+	      var = make_ssa_name (var);
+	      op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+	      new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      op = var;
+	    }
+
+	  new_stmt
+	    = gimple_build_call (scatter_decl, 5, ptr, mask, op, src, scale);
+
+	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	  if (prev_stmt_info == NULL)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+      return true;
+    }
+
   if (grouped_store)
     {
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
@@ -5588,7 +5754,6 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   prev_stmt_info = NULL;
   for (j = 0; j < ncopies; j++)
     {
-      gimple new_stmt;
 
       if (j == 0)
 	{
@@ -6142,8 +6307,8 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
-				       &gather_off, &gather_scale);
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
+					       &gather_off, &gather_scale, true);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, bb_vinfo,
 				 &def_stmt, &def, &gather_dt,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 58e8f10..3a30985 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -646,8 +646,8 @@ typedef struct _stmt_vec_info {
      vectorization.  */
   bool vectorizable;
 
-  /* For loads only, true if this is a gather load.  */
-  bool gather_p;
+  /* For loads, true if this is a gather; for stores, true if this
+     is a scatter.  */
+  bool gather_scatter_p;
 
   /* True if this is an access with loop-invariant stride.  */
   bool strided_p;
@@ -667,7 +667,8 @@ typedef struct _stmt_vec_info {
 #define STMT_VINFO_VEC_STMT(S)             (S)->vectorized_stmt
 #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
-#define STMT_VINFO_GATHER_P(S)		   (S)->gather_p
+#define STMT_VINFO_GATHER_P(S)		   (S)->gather_scatter_p
+#define STMT_VINFO_SCATTER_P(S)		   (S)->gather_scatter_p
 #define STMT_VINFO_STRIDED_P(S)	   	   (S)->strided_p
 #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
 
@@ -1063,8 +1064,8 @@ extern bool vect_analyze_data_refs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info);
 extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
-extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
-			       int *);
+extern tree vect_check_gather_scatter (gimple, loop_vec_info, tree *, tree *,
+				       int *, bool);
 extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *,
 				    unsigned *);
 extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] [AVX512F] Add scatter support for vectorizer
  2015-08-21 12:21   ` Petr Murzin
@ 2015-08-26  7:44     ` Richard Biener
  2015-08-26 18:46       ` Petr Murzin
  0 siblings, 1 reply; 8+ messages in thread
From: Richard Biener @ 2015-08-26  7:44 UTC (permalink / raw)
  To: Petr Murzin; +Cc: Richard Biener, Kirill Yukhin, gcc-patches

On Fri, Aug 21, 2015 at 2:18 PM, Petr Murzin <petrmurzin1@gmail.com> wrote:
> Hello,
> Please have a look at updated patch.
>
> On Tue, Aug 4, 2015 at 3:15 PM, Richard Biener <rguenther@suse.de> wrote:
>> On Fri, 31 Jul 2015, Petr Murzin wrote:
>> @@ -5586,8 +5770,6 @@ vectorizable_store (gimple stmt,
>> gimple_stmt_iterator *gsi, gimple *vec_stmt,
>>    prev_stmt_info = NULL;
>>    for (j = 0; j < ncopies; j++)
>>      {
>> -      gimple new_stmt;
>> -
>>        if (j == 0)
>>         {
>>            if (slp)
>>
>> spurious change?
>
> I have increased the scope of this variable to use it in checking for
> STMT_VINFO_SCATTER_P (stmt_info).

@@ -3763,32 +3776,46 @@ again:
       if (vf > *min_vf)
        *min_vf = vf;

-      if (gather)
+      if (gatherscatter != SG_NONE)
        {
          tree off;
+         if (vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off,
+                                        NULL, true) != 0)
+           gatherscatter = GATHER;
+         else if (vect_check_gather_scatter (stmt, loop_vinfo, NULL,
+                                             &off, NULL, false) != 0)
+           gatherscatter = SCATTER;
+         else
+           gatherscatter = SG_NONE;

as I said vect_check_gather_scatter already knows whether the DR is a read or
a write and thus whether it needs to check for gather or scatter.  Remove
the new argument.  And simply do

   if (!vect_check_gather_scatter (stmt....))
     gatherscatter = SG_NONE;
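
Inside vect_check_gather_scatter the selection can then key off the DR
itself; a minimal sketch of the relevant spot (this is what the updated
patch in the follow-up does, with dr being the statement's data
reference):

  if (DR_IS_READ (dr))
    decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
                                             offtype, scale);
  else
    decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
                                              offtype, scale);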

-         STMT_VINFO_GATHER_P (stmt_info) = true;
+         if (gatherscatter == GATHER)
+           STMT_VINFO_GATHER_P (stmt_info) = true;
+         else
+           STMT_VINFO_SCATTER_P (stmt_info) = true;
        }

and as suggested merge STMT_VINFO_GATHER_P and STMT_VINFO_SCATTER_P
using the enum so you can simply do

 STMT_VINFO_SCATTER_GATHER_P (stmt_info) = gatherscatter;
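
A sketch of the merged representation (names here are illustrative; the
follow-up patch instead keeps a bool gather_scatter_p field, to which
the nonzero enum values convert):

  enum gather_scatter_kind { SG_NONE = 0, GATHER, SCATTER };

  /* Field in struct _stmt_vec_info, plus its accessor:  */
  enum gather_scatter_kind gather_scatter;
  #define STMT_VINFO_GATHER_SCATTER_P(S) (S)->gather_scatter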

I miss a few testcases that exercise scatter vectorization.  And as Uros
said, the i386 specific parts should be split out.
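
The shape such testcases need to exercise is a loop with an indexed
store, e.g. (this is the pattern the avx512f-scatter-*.c tests added
later in the thread use):

  for (i = 0; i < N; i++)
    vf2[k[i]] = vf1[i];	/* indexed store -> scatter candidate */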

Otherwise the patch looks ok to me.

Thanks,
Richard.


> Thanks,
> Petr
>
> 2015-08-21  Andrey Turetskiy  <andrey.turetskiy@intel.com>
>             Petr Murzin  <petr.murzin@intel.com>
>
> gcc/
>
> * config/i386/i386-builtin-types.def
> (VOID_PFLOAT_HI_V8DI_V16SF_INT): New.
> (VOID_PDOUBLE_QI_V16SI_V8DF_INT): Ditto.
> (VOID_PINT_HI_V8DI_V16SI_INT): Ditto.
> (VOID_PLONGLONG_QI_V16SI_V8DI_INT): Ditto.
> * config/i386/i386.c
> (ix86_builtins): Add IX86_BUILTIN_SCATTERALTSIV8DF,
> IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
> IX86_BUILTIN_SCATTERALTDIV16SI.
> (ix86_init_mmx_sse_builtins): Define __builtin_ia32_scatteraltsiv8df,
> __builtin_ia32_scatteraltdiv8sf, __builtin_ia32_scatteraltsiv8di,
> __builtin_ia32_scatteraltdiv8si.
> (ix86_expand_builtin): Handle IX86_BUILTIN_SCATTERALTSIV8DF,
> IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
> IX86_BUILTIN_SCATTERALTDIV16SI.
> (ix86_vectorize_builtin_scatter): New.
> (TARGET_VECTORIZE_BUILTIN_SCATTER): Define as
> ix86_vectorize_builtin_scatter.
> * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_SCATTER): New.
> * doc/tm.texi: Regenerate.
> * target.def: Add scatter builtin.
> * tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Add new
> checks for STMT_VINFO_SCATTER_P.
> (vect_check_gather): Rename to ...
> (vect_check_gather_scatter): this and add a new argument.
> (vect_analyze_data_refs): Add gatherscatter enum and maybe_scatter variable
> and new checks for it accordingly.
> * tree-vectorizer.h: Rename gather_p to gather_scatter_p and use it
> for loads/stores in case of gather/scatter accordingly.
> (STMT_VINFO_SCATTER_P(S)): Define.
> (vect_check_gather): Rename to ...
> (vect_check_gather_scatter): this.
> * tree-vect-stmts.c (vectorizable_mask_load_store): Ditto.
> (vectorizable_store): Add checks for STMT_VINFO_SCATTER_P.
> (vect_mark_stmts_to_be_vectorized): Ditto.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] [AVX512F] Add scatter support for vectorizer
  2015-08-26  7:44     ` Richard Biener
@ 2015-08-26 18:46       ` Petr Murzin
  2015-08-26 22:15         ` Uros Bizjak
  0 siblings, 1 reply; 8+ messages in thread
From: Petr Murzin @ 2015-08-26 18:46 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Richard Biener, Kirill Yukhin, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3463 bytes --]

On Wed, Aug 26, 2015 at 10:41 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> @@ -3763,32 +3776,46 @@ again:
>        if (vf > *min_vf)
>         *min_vf = vf;
>
> -      if (gather)
> +      if (gatherscatter != SG_NONE)
>         {
>           tree off;
> +         if (vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off,
> +                                        NULL, true) != 0)
> +           gatherscatter = GATHER;
> +         else if (vect_check_gather_scatter (stmt, loop_vinfo, NULL,
> +                                             &off, NULL, false) != 0)
> +           gatherscatter = SCATTER;
> +         else
> +           gatherscatter = SG_NONE;
>
> as I said vect_check_gather_scatter already knows whether the DR is a read or
> a write and thus whether it needs to check for gather or scatter.  Remove
> the new argument.  And simply do
>
>    if (!vect_check_gather_scatter (stmt....))
>      gatherscatter = SG_NONE;
>
> -         STMT_VINFO_GATHER_P (stmt_info) = true;
> +         if (gatherscatter == GATHER)
> +           STMT_VINFO_GATHER_P (stmt_info) = true;
> +         else
> +           STMT_VINFO_SCATTER_P (stmt_info) = true;
>         }
>
> and as suggested merge STMT_VINFO_GATHER_P and STMT_VINFO_SCATTER_P
> using the enum so you can simply do
>
>  STMT_VINFO_SCATTER_GATHER_P (stmt_info) = gatherscatter;
> Otherwise the patch looks ok to me.

Fixed.
Uros, could you please have a look at target part of patch?

Thanks,
Petr

2015-08-26  Andrey Turetskiy  <andrey.turetskiy@intel.com>
            Petr Murzin  <petr.murzin@intel.com>

gcc/

* config/i386/i386-builtin-types.def
(VOID_PFLOAT_HI_V8DI_V16SF_INT): New.
(VOID_PDOUBLE_QI_V16SI_V8DF_INT): Ditto.
(VOID_PINT_HI_V8DI_V16SI_INT): Ditto.
(VOID_PLONGLONG_QI_V16SI_V8DI_INT): Ditto.
* config/i386/i386.c
(ix86_builtins): Add IX86_BUILTIN_SCATTERALTSIV8DF,
IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
IX86_BUILTIN_SCATTERALTDIV16SI.
(ix86_init_mmx_sse_builtins): Define __builtin_ia32_scatteraltsiv8df,
__builtin_ia32_scatteraltdiv8sf, __builtin_ia32_scatteraltsiv8di,
__builtin_ia32_scatteraltdiv8si.
(ix86_expand_builtin): Handle IX86_BUILTIN_SCATTERALTSIV8DF,
IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
IX86_BUILTIN_SCATTERALTDIV16SI.
(ix86_vectorize_builtin_scatter): New.
(TARGET_VECTORIZE_BUILTIN_SCATTER): Define as
ix86_vectorize_builtin_scatter.
* doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_SCATTER): New.
* doc/tm.texi: Regenerate.
* target.def: Add scatter builtin.
* tree-vectorizer.h: Rename gather_p to gather_scatter_p and use it
for loads/stores in case of gather/scatter accordingly.
(STMT_VINFO_GATHER_SCATTER_P(S)): Use it instead of STMT_VINFO_GATHER_P(S).
(vect_check_gather): Rename to ...
(vect_check_gather_scatter): this.
* tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Use
STMT_VINFO_GATHER_SCATTER_P instead of STMT_VINFO_GATHER_P.
(vect_check_gather_scatter): Use it instead of vect_check_gather.
(vect_analyze_data_refs): Add gatherscatter enum and maybe_scatter variable
and new checks for it accordingly.
* tree-vect-stmts.c
(STMT_VINFO_GATHER_SCATTER_P(S)): Use it instead of STMT_VINFO_GATHER_P(S).
(vect_check_gather_scatter): Use it instead of vect_check_gather.
(vectorizable_store): Add checks for STMT_VINFO_GATHER_SCATTER_P.

gcc/testsuite/

        * gcc.target/i386/avx512f-scatter-1.c: New.
        * gcc.target/i386/avx512f-scatter-2.c: Ditto.
        * gcc.target/i386/avx512f-scatter-3.c: Ditto.
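
For reference, a hypothetical direct call of one of the new builtins
(the vectorizer is the intended user; per the
VOID_PDOUBLE_QI_V16SI_V8DF_INT signature the operands are base pointer,
writemask, index vector, source vector and scale):

typedef int v16si __attribute__ ((vector_size (64)));
typedef double v8df __attribute__ ((vector_size (64)));

void
f (double *base, v16si idx, v8df val)
{
  /* Stores val[i] to base + idx[i] * 8 for i = 0..7; only the low
     eight elements of idx are used.  Mask -1 selects all elements.  */
  __builtin_ia32_scatteraltsiv8df (base, (char) -1, idx, val, 8);
}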

[-- Attachment #2: scatter --]
[-- Type: application/octet-stream, Size: 27741 bytes --]

commit d2189141aeefd0c5b579801d93dabb0c2df17c40
Author: Petr Murzin <petr.murzin@intel.com>
Date:   Wed Aug 26 19:54:18 2015 +0300

    Scatter patch

diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index ee31ee3..b892f08 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1021,6 +1021,10 @@ DEF_FUNCTION_TYPE (VOID, PINT, QI, V8DI, V8SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V4DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V2DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8DI, V8DI, INT)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, HI, V8DI, V16SF, INT)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V16SI, V8DF, INT)
+DEF_FUNCTION_TYPE (VOID, PINT, HI, V8DI, V16SI, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V16SI, V8DI, INT)
 
 DEF_FUNCTION_TYPE (VOID, QI, V8SI, PCINT64, INT, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V4DI, V4DI, INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 070605f..272f827 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -30388,6 +30388,10 @@ enum ix86_builtins
   IX86_BUILTIN_GATHER3SIV16SI,
   IX86_BUILTIN_GATHER3SIV8DF,
   IX86_BUILTIN_GATHER3SIV8DI,
+  IX86_BUILTIN_SCATTERALTSIV8DF,
+  IX86_BUILTIN_SCATTERALTDIV16SF,
+  IX86_BUILTIN_SCATTERALTSIV8DI,
+  IX86_BUILTIN_SCATTERALTDIV16SI,
   IX86_BUILTIN_SCATTERDIV16SF,
   IX86_BUILTIN_SCATTERDIV16SI,
   IX86_BUILTIN_SCATTERDIV8DF,
@@ -34204,6 +34208,21 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
 	       VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
 	       IX86_BUILTIN_SCATTERDIV2DI);
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df",
+	       VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf",
+	       VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di",
+	       VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si",
+	       VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SI);
 
   /* AVX512PF */
   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
@@ -39859,6 +39878,18 @@ addcarryx:
     case IX86_BUILTIN_GATHERPFDPD:
       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
       goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DF:
+      icode = CODE_FOR_avx512f_scattersiv8df;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SF:
+      icode = CODE_FOR_avx512f_scatterdiv16sf;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DI:
+      icode = CODE_FOR_avx512f_scattersiv8di;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SI:
+      icode = CODE_FOR_avx512f_scatterdiv16si;
+      goto scatter_gen;
     case IX86_BUILTIN_GATHERPFDPS:
       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
       goto vec_prefetch_gen;
@@ -40122,6 +40153,36 @@ addcarryx:
       mode3 = insn_data[icode].operand[3].mode;
       mode4 = insn_data[icode].operand[4].mode;
 
+      /* Scatter instruction stores operand op3 to memory with
+	 indices from op2 and scale from op4 under writemask op1.
+	 If index operand op2 has more elements than source operand
+	 op3, only its low half is used, and vice versa.  */
+      switch (fcode)
+	{
+	case IX86_BUILTIN_SCATTERALTSIV8DF:
+	case IX86_BUILTIN_SCATTERALTSIV8DI:
+	  half = gen_reg_rtx (V8SImode);
+	  if (!nonimmediate_operand (op2, V16SImode))
+	    op2 = copy_to_mode_reg (V16SImode, op2);
+	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
+	  op2 = half;
+	  break;
+	case IX86_BUILTIN_SCATTERALTDIV16SF:
+	case IX86_BUILTIN_SCATTERALTDIV16SI:
+	  half = gen_reg_rtx (mode3);
+	  if (mode3 == V8SFmode)
+	    gen = gen_vec_extract_lo_v16sf;
+	  else
+	    gen = gen_vec_extract_lo_v16si;
+	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
+	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+	  emit_insn (gen (half, op3));
+	  op3 = half;
+	  break;
+	default:
+	  break;
+	}
+
       /* Force memory operand only with base register here.  But we
 	 don't want to do it on memory operand for other builtin
 	 functions.  */
@@ -41201,6 +41262,62 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
   return ix86_get_builtin (code);
 }
 
+/* Returns a decl of a function that implements scatter store with
+   register type VECTYPE and index type INDEX_TYPE and SCALE.
+   Return NULL_TREE if it is not available.  */
+
+static tree
+ix86_vectorize_builtin_scatter (const_tree vectype,
+				const_tree index_type, int scale)
+{
+  bool si;
+  enum ix86_builtins code;
+
+  if (!TARGET_AVX512F)
+    return NULL_TREE;
+
+  if ((TREE_CODE (index_type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (index_type))
+      || (TYPE_MODE (index_type) != SImode
+	  && TYPE_MODE (index_type) != DImode))
+    return NULL_TREE;
+
+  if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+    return NULL_TREE;
+
+  /* v*scatter* insn sign extends index to pointer mode.  */
+  if (TYPE_PRECISION (index_type) < POINTER_SIZE
+      && TYPE_UNSIGNED (index_type))
+    return NULL_TREE;
+
+  /* Scale can be 1, 2, 4 or 8.  */
+  if (scale <= 0
+      || scale > 8
+      || (scale & (scale - 1)) != 0)
+    return NULL_TREE;
+
+  si = TYPE_MODE (index_type) == SImode;
+  switch (TYPE_MODE (vectype))
+    {
+    case V8DFmode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
+      break;
+    case V8DImode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
+      break;
+    case V16SFmode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
+      break;
+    case V16SImode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
+      break;
+    default:
+      return NULL_TREE;
+    }
+
+  return ix86_builtins[code];
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -52331,6 +52448,9 @@ ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_BUILTIN_SCATTER
+#define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index f5a1f84..d548d96 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5720,6 +5720,14 @@ The default is @code{NULL_TREE} which means to not vectorize gather
 loads.
 @end deftypefn
 
+@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_SCATTER (const_tree @var{vectype}, const_tree @var{index_type}, int @var{scale})
+Target builtin that implements vector scatter operation.  @var{vectype}
+is the vector type of the store and @var{index_type} is the scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize scatter
+stores.
+@end deftypefn
+
 @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int})
 This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
 fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 9d5ac0a..9bef4a5 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4239,6 +4239,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
+@hook TARGET_VECTORIZE_BUILTIN_SCATTER
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
diff --git a/gcc/target.def b/gcc/target.def
index 4edc209..aa5a1f1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1801,6 +1801,18 @@ loads.",
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Target builtin that implements vector scatter operation.  */
+DEFHOOK
+(builtin_scatter,
+"Target builtin that implements vector scatter operation.  @var{vectype}\n\
+is the vector type of the store and @var{index_type} is scalar type of\n\
+the index, scaled by @var{scale}.\n\
+The default is @code{NULL_TREE} which means to not vectorize scatter\n\
+stores.",
+ tree,
+ (const_tree vectype, const_tree index_type, int scale),
+ NULL)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index f1eaef4..2439bd6 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -267,8 +267,8 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	  return false;
 	}
 
-      if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -315,8 +315,8 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	  return false;
 	}
 
-      if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -2344,10 +2344,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 	                     "zero step in outer loop.\n");
-	  if (DR_IS_READ (dr))
-  	    return true;
-	  else
-	    return false;
+	  return DR_IS_READ (dr);
 	}
     }
 
@@ -2997,12 +2994,12 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
   return true;
 }
 
-/* Check whether a non-affine read in stmt is suitable for gather load
-   and if so, return a builtin decl for that operation.  */
+/* Check whether a non-affine read or write in stmt is suitable for gather load
+   or scatter store and if so, return a builtin decl for that operation.  */
 
 tree
-vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
-		   tree *offp, int *scalep)
+vect_check_gather_scatter (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
+			   tree *offp, int *scalep)
 {
   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -3031,7 +3028,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
     }
 
-  /* The gather builtins need address of the form
+  /* The gather and scatter builtins need an address of the form
      loop_invariant + vector * {1, 2, 4, 8}
      or
      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
@@ -3194,8 +3191,13 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
   if (offtype == NULL_TREE)
     offtype = TREE_TYPE (off);
 
-  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
-					   offtype, scale);
+  if (DR_IS_READ (dr))
+    decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
+					     offtype, scale);
+  else
+    decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
+					      offtype, scale);
+
   if (decl == NULL_TREE)
     return NULL_TREE;
 
@@ -3344,7 +3346,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
       gimple stmt;
       stmt_vec_info stmt_info;
       tree base, offset, init;
-      bool gather = false;
+      enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
       bool simd_lane_access = false;
       int vf;
 
@@ -3383,18 +3385,22 @@ again:
 	    = DR_IS_READ (dr)
 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
 	      && targetm.vectorize.builtin_gather != NULL;
+	  bool maybe_scatter
+	    = DR_IS_WRITE (dr)
+	      && !TREE_THIS_VOLATILE (DR_REF (dr))
+	      && targetm.vectorize.builtin_scatter != NULL;
 	  bool maybe_simd_lane_access
 	    = loop_vinfo && loop->simduid;
 
-	  /* If target supports vector gather loads, or if this might be
-	     a SIMD lane access, see if they can't be used.  */
+	  /* If target supports vector gather loads or scatter stores, or if
+	     this might be a SIMD lane access, see if they can't be used.  */
 	  if (loop_vinfo
-	      && (maybe_gather || maybe_simd_lane_access)
+	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
 	      && !nested_in_vect_loop_p (loop, stmt))
 	    {
 	      struct data_reference *newdr
 		= create_data_ref (NULL, loop_containing_stmt (stmt),
-				   DR_REF (dr), stmt, true);
+				   DR_REF (dr), stmt, !maybe_scatter);
 	      gcc_assert (newdr != NULL && DR_REF (newdr));
 	      if (DR_BASE_ADDRESS (newdr)
 		  && DR_OFFSET (newdr)
@@ -3447,17 +3453,20 @@ again:
 			    }
 			}
 		    }
-		  if (!simd_lane_access && maybe_gather)
+		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
 		    {
 		      dr = newdr;
-		      gather = true;
+		      if (maybe_gather)
+			gatherscatter = GATHER;
+		      else
+			gatherscatter = SCATTER;
 		    }
 		}
-	      if (!gather && !simd_lane_access)
+	      if (gatherscatter == SG_NONE && !simd_lane_access)
 		free_data_ref (newdr);
 	    }
 
-	  if (!gather && !simd_lane_access)
+	  if (gatherscatter == SG_NONE && !simd_lane_access)
 	    {
 	      if (dump_enabled_p ())
 		{
@@ -3485,7 +3494,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
         }
@@ -3520,7 +3529,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3540,7 +3549,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
           return false;
 	}
@@ -3565,7 +3574,7 @@ again:
 	  if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
 	}
@@ -3703,7 +3712,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3736,10 +3745,10 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gatherscatter != SG_NONE || simd_lane_access)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
-	      if (gather)
+	      if (gatherscatter != SG_NONE)
 		free_data_ref (dr);
 	    }
 	  return false;
@@ -3763,23 +3772,22 @@ again:
       if (vf > *min_vf)
 	*min_vf = vf;
 
-      if (gather)
+      if (gatherscatter != SG_NONE)
 	{
 	  tree off;
-
-	  gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
-	  if (gather
-	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
-	    gather = false;
-	  if (!gather)
+	  if (!vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL)
+	      || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
 	      free_data_ref (dr);
 	      if (dump_enabled_p ())
 		{
-		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                   "not vectorized: not suitable for gather "
-                                   "load ");
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				   (gatherscatter == GATHER) ?
+				   "not vectorized: not suitable for gather "
+				   "load " :
+				   "not vectorized: not suitable for scatter "
+				   "store ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 		}
@@ -3787,8 +3795,9 @@ again:
 	    }
 
 	  datarefs[i] = dr;
-	  STMT_VINFO_GATHER_P (stmt_info) = true;
+	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
 	}
       else if (loop_vinfo
 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
 	{
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f87c066..359e010 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -810,10 +810,10 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
               return false;
           }
 
-      if (STMT_VINFO_GATHER_P (stmt_vinfo))
+      if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
 	{
 	  tree off;
-	  tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+	  tree decl = vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off,
+						 NULL);
 	  gcc_assert (decl);
 	  if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
 			    &worklist, true))
@@ -1815,11 +1815,11 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
   if (STMT_VINFO_STRIDED_P (stmt_info))
     return false;
 
-  if (STMT_VINFO_GATHER_P (stmt_info))
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
 				       &gather_off, &gather_scale);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, NULL,
@@ -1879,7 +1879,7 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
 
   /** Transform.  **/
 
-  if (STMT_VINFO_GATHER_P (stmt_info))
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     {
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
@@ -5140,6 +5140,12 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   unsigned int vec_num;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree aggr_type;
+  tree scatter_base = NULL_TREE, scatter_off = NULL_TREE;
+  tree scatter_off_vectype = NULL_TREE, scatter_decl = NULL_TREE;
+  int scatter_scale = 1;
+  enum vect_def_type scatter_idx_dt = vect_unknown_def_type;
+  enum vect_def_type scatter_src_dt = vect_unknown_def_type;
+  gimple new_stmt;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
@@ -5297,6 +5303,24 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
         }
     }
 
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    {
+      gimple def_stmt;
+      tree def;
+      scatter_decl = vect_check_gather_scatter (stmt, loop_vinfo, &scatter_base,
+						&scatter_off, &scatter_scale);
+      gcc_assert (scatter_decl);
+      if (!vect_is_simple_use_1 (scatter_off, NULL, loop_vinfo, bb_vinfo,
+				 &def_stmt, &def, &scatter_idx_dt,
+				 &scatter_off_vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "scatter index use not simple.");
+	  return false;
+	}
+    }
+
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
@@ -5311,6 +5335,146 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
   ensure_base_align (stmt_info, dr);
 
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    {
+      tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
+      tree arglist = TYPE_ARG_TYPES (TREE_TYPE (scatter_decl));
+      tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+      tree ptr, mask, var, scale, perm_mask = NULL_TREE;
+      edge pe = loop_preheader_edge (loop);
+      gimple_seq seq;
+      basic_block new_bb;
+      enum { NARROW, NONE, WIDEN } modifier;
+      int scatter_off_nunits = TYPE_VECTOR_SUBPARTS (scatter_off_vectype);
+
+      if (nunits == (unsigned int) scatter_off_nunits)
+	modifier = NONE;
+      else if (nunits == (unsigned int) scatter_off_nunits / 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, scatter_off_nunits);
+	  modifier = WIDEN;
+
+	  for (i = 0; i < (unsigned int) scatter_off_nunits; ++i)
+	    sel[i] = i | nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (scatter_off_vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	}
+      else if (nunits == (unsigned int) scatter_off_nunits * 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+	  modifier = NARROW;
+
+	  for (i = 0; i < (unsigned int) nunits; ++i)
+	    sel[i] = i | scatter_off_nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	  ncopies *= 2;
+	}
+      else
+	gcc_unreachable ();
+
+      rettype = TREE_TYPE (TREE_TYPE (scatter_decl));
+      ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      scaletype = TREE_VALUE (arglist);
+
+      gcc_checking_assert (TREE_CODE (masktype) == INTEGER_TYPE
+			   && TREE_CODE (rettype) == VOID_TYPE);
+
+      ptr = fold_convert (ptrtype, scatter_base);
+      if (!is_gimple_min_invariant (ptr))
+	{
+	  ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+	  gcc_assert (!new_bb);
+	}
+
+      /* Currently we support only unconditional scatter stores,
+	 so mask should be all ones.  */
+      mask = build_int_cst (masktype, -1);
+      mask = vect_init_vector (stmt, mask, masktype, NULL);
+
+      scale = build_int_cst (scaletype, scatter_scale);
+
+      prev_stmt_info = NULL;
+      for (j = 0; j < ncopies; ++j)
+	{
+	  if (j == 0)
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_operand (gimple_assign_rhs1 (stmt), stmt, NULL);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_operand (scatter_off, stmt, NULL);
+	    }
+	  else if (modifier != NONE && (j & 1))
+	    {
+	      if (modifier == WIDEN)
+		{
+		  src = vec_oprnd1
+		    = vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+		  op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask,
+					     stmt, gsi);
+		}
+	      else if (modifier == NARROW)
+		{
+		  src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask,
+					      stmt, gsi);
+		  op = vec_oprnd0
+		    = vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+		}
+	      else
+		gcc_unreachable ();
+	    }
+	  else
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+	    }
+
+	  if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (src))
+			  == TYPE_VECTOR_SUBPARTS (srctype));
+	      var = vect_get_new_vect_var (srctype, vect_simple_var, NULL);
+	      var = make_ssa_name (var);
+	      src = build1 (VIEW_CONVERT_EXPR, srctype, src);
+	      new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, src);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      src = var;
+	    }
+
+	  if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+			  == TYPE_VECTOR_SUBPARTS (idxtype));
+	      var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+	      var = make_ssa_name (var);
+	      op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+	      new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      op = var;
+	    }
+
+	  new_stmt
+	    = gimple_build_call (scatter_decl, 5, ptr, mask, op, src, scale);
+
+	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	  if (prev_stmt_info == NULL)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+      return true;
+    }
+
   if (grouped_store)
     {
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
@@ -5584,7 +5748,6 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   prev_stmt_info = NULL;
   for (j = 0; j < ncopies; j++)
     {
-      gimple new_stmt;
 
       if (j == 0)
 	{
@@ -6071,7 +6234,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
     {
       grouped_load = true;
       /* FORNOW */
-      gcc_assert (! nested_in_vect_loop && !STMT_VINFO_GATHER_P (stmt_info));
+      gcc_assert (!nested_in_vect_loop
+		  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
 
@@ -6134,12 +6297,12 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
     }
 
 
-  if (STMT_VINFO_GATHER_P (stmt_info))
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
-				       &gather_off, &gather_scale);
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
+					       &gather_off, &gather_scale);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, bb_vinfo,
 				 &def_stmt, &def, &gather_dt,
@@ -6225,7 +6388,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
   ensure_base_align (stmt_info, dr);
 
-  if (STMT_VINFO_GATHER_P (stmt_info))
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     {
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 58e8f10..95276fa 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -646,8 +646,8 @@ typedef struct _stmt_vec_info {
      vectorization.  */
   bool vectorizable;
 
-  /* For loads only, true if this is a gather load.  */
-  bool gather_p;
+  /* For loads if this is a gather, for stores if this is a scatter.  */
+  bool gather_scatter_p;
 
   /* True if this is an access with loop-invariant stride.  */
   bool strided_p;
@@ -667,7 +667,7 @@ typedef struct _stmt_vec_info {
 #define STMT_VINFO_VEC_STMT(S)             (S)->vectorized_stmt
 #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
-#define STMT_VINFO_GATHER_P(S)		   (S)->gather_p
+#define STMT_VINFO_GATHER_SCATTER_P(S)	   (S)->gather_scatter_p
 #define STMT_VINFO_STRIDED_P(S)	   	   (S)->strided_p
 #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
 
@@ -1063,8 +1063,8 @@ extern bool vect_analyze_data_refs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info);
 extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
-extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
-			       int *);
+extern tree vect_check_gather_scatter (gimple, loop_vec_info, tree *, tree *,
+				       int *);
 extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *,
 				    unsigned *);
 extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,
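
To make the generated code concrete: for a loop like f1 in the tests
below (float data, int indices, so the data and offset vectors are both
16-element zmm operands and modifier == NONE), the transformed loop body
ends up containing an unconditional scatter call of roughly this shape
(a sketch only; SSA names and the dump syntax are illustrative):

  vect_src_1 = MEM[... vf1 ...];	/* vector load of the stored values */
  vect_idx_2 = MEM[... k ...];		/* vector load of the indices */
  /* ptr = &vf2, mask = all ones for the HImode mask, scale = sizeof (float).  */
  __builtin_ia32_scattersiv16sf (&vf2, 65535, vect_idx_2, vect_src_1, 4);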

[-- Attachment #3: tests --]
[-- Type: application/octet-stream, Size: 9638 bytes --]

commit c53a830c72f803522dc719f9525638f25d8e1d7a
Author: Petr Murzin <petr.murzin@intel.com>
Date:   Wed Aug 26 19:54:49 2015 +0300

    Tests for scatter patch

diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c
new file mode 100644
index 0000000..7631849
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c
@@ -0,0 +1,216 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+float vf1[N], vf2[2*N+16];
+double vd1[N], vd2[2*N+16];
+int vi1[N], vi2[2*N+16], k[N];
+long vl1[N], vl2[2*N+16], l[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[k[i]] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[k[i] + x] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[k[i]] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[k[i] + x] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[l[i]] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[l[i] + x] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[l[i]] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[l[i] + x] = vl1[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      vf1[i] = 17.0f + i;
+      vd1[i] = 19.0 + i;
+      vi1[i] = 21 + i;
+      vl1[i] = 23L + i;
+    }
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      k[i] = (i % 2) ? (N / 2 + i) : (N / 2 - i / 2);
+      l[i] = 2 * i + i % 2;
+    }
+
+  f1 ();
+  f2 ();
+  for (i = 0; i < N; i++)
+    if (vf2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17
+	|| vi2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 21)
+      abort ();
+
+  f3 (12);
+  f4 (14);
+  for (i = 0; i < N; i++)
+    if (vf2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 12] != i + 17
+	|| vi2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 14] != i + 21)
+      abort ();
+
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; i++)
+    if (vd2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19
+	|| vl2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 23)
+      abort ();
+
+  f7 (7);
+  f8 (9);
+  for (i = 0; i < N; i++)
+    if (vd2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 7] != i + 19
+	|| vl2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 9] != i + 23)
+      abort ();
+
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2] != i + 17
+	|| vi2[2 * i + i % 2] != i + 21)
+      abort ();
+
+  f11 (2);
+  f12 (4);
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2 + 2] != i + 17
+	|| vi2[2 * i + i % 2 + 4] != i + 21)
+      abort ();
+
+  f13 ();
+  f14 ();
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2] != i + 19
+	|| vl2[2 * i + i % 2] != i + 23)
+      abort ();
+
+  f15 (13);
+  f16 (15);
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2 + 13] != i + 19
+	|| vl2[2 * i + i % 2 + 15] != i + 23)
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c
new file mode 100644
index 0000000..5eabab6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c
@@ -0,0 +1,215 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+float vf1[N], vf2[2*N+16];
+double vd1[N], vd2[2*N+16];
+int k[N];
+long l[N];
+short n[2*N+16];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i]] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i] + x] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i]] = (int) vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i]] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i] + x] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i]] = (int) vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i] + x] = (int) vd1[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      vf1[i] = 17.0f + i;
+      vd1[i] = 19.0 + i;
+    }
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      k[i] = (i % 2) ? (N / 2 + i) : (N / 2 - i / 2);
+      l[i] = 2 * i + i % 2;
+    }
+
+  f1 ();
+  f2 ();
+  for (i = 0; i < N; i++)
+    if (vf2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17
+	|| n[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17)
+      abort ();
+
+  f3 (12);
+  f4 (14);
+  for (i = 0; i < N; i++)
+    if (vf2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 12] != i + 17
+	|| n[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 14] != i + 17)
+      abort ();
+
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; i++)
+    if (vd2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19
+	|| n[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19)
+      abort ();
+
+  f7 (7);
+  f8 (9);
+  for (i = 0; i < N; i++)
+    if (vd2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 7] != i + 19
+	|| n[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 9] != i + 19)
+      abort ();
+
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2] != i + 17
+	|| n[2 * i + i % 2] != i + 17)
+      abort ();
+
+  f11 (2);
+  f12 (4);
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2 + 2] != i + 17
+	|| n[2 * i + i % 2 + 4] != i + 17)
+      abort ();
+
+  f13 ();
+  f14 ();
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2] != i + 19
+	|| n[2 * i + i % 2] != i + 19)
+      abort ();
+
+  f15 (13);
+  f16 (15);
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2 + 13] != i + 19
+	|| n[2 * i + i % 2 + 15] != i + 19)
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c
new file mode 100644
index 0000000..dccbdb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+int a[N], b[N];
+
+__attribute__((noinline, noclone)) void
+foo (float *__restrict p, float *__restrict q,
+     int s1, int s2, int s3)
+{
+  int i;
+  for (i = 0; i < (N / 8); i++)
+    p[a[i] * s1 + b[i] * s2 + s3] = q[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+  float c[N], d[N];
+  for (i = 0; i < N; i++)
+    {
+      a[i] = (i * 7) & (N / 8 - 1);
+      b[i] = (i * 13) & (N / 8 - 1);
+      c[i] = 179.13 + i;
+    }
+  foo (d, c, 3, 2, 4);
+  for (i = 0; i < (N / 8); i++)
+    if (d[a[i] * 3 + b[i] * 2 + 4] != (float) (179.13 + i))
+      abort ();
+}
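
A compile-only companion to this runtime test would also confirm that a
zmm scatter instruction is actually emitted; a minimal sketch
(hypothetical extra file, modelled on the avx512f-scatter-5.c test from
the original submission archived later in this thread):

/* { dg-do compile } */
/* { dg-options "-O3 -mavx512f" } */

#include "avx512f-scatter-3.c"

/* { dg-final { scan-assembler "scatter\[^\n\]*zmm" } } */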

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] [AVX512F] Add scatter support for vectorizer
  2015-08-26 18:46       ` Petr Murzin
@ 2015-08-26 22:15         ` Uros Bizjak
  0 siblings, 0 replies; 8+ messages in thread
From: Uros Bizjak @ 2015-08-26 22:15 UTC (permalink / raw)
  To: Petr Murzin; +Cc: Richard Biener, Kirill Yukhin, gcc-patches

On Wed, Aug 26, 2015 at 7:39 PM, Petr Murzin <petrmurzin1@gmail.com> wrote:
> On Wed, Aug 26, 2015 at 10:41 AM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> @@ -3763,32 +3776,46 @@ again:
>>        if (vf > *min_vf)
>>         *min_vf = vf;
>>
>> -      if (gather)
>> +      if (gatherscatter != SG_NONE)
>>         {
>>           tree off;
>> +         if (vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off,
>> +                                        NULL, true) != 0)
>> +           gatherscatter = GATHER;
>> +         else if (vect_check_gather_scatter (stmt, loop_vinfo, NULL,
>> +                                             &off, NULL, false) != 0)
>> +           gatherscatter = SCATTER;
>> +         else
>> +           gatherscatter = SG_NONE;
>>
>> as I said vect_check_gather_scatter already knows whether the DR is a read or
>> a write and thus whether it needs to check for gather or scatter.  Remove
>> the new argument.  And simply do
>>
>>    if (!vect_check_gather_scatter (stmt....))
>>      gatherscatter = SG_NONE;
>>
>> -         STMT_VINFO_GATHER_P (stmt_info) = true;
>> +         if (gatherscatter == GATHER)
>> +           STMT_VINFO_GATHER_P (stmt_info) = true;
>> +         else
>> +           STMT_VINFO_SCATTER_P (stmt_info) = true;
>>         }
>>
>> and as suggested merge STMT_VINFO_GATHER_P and STMT_VINFO_SCATTER_P
>> using the enum so you can simply do
>>
>>  STMT_VINFO_SCATTER_GATHER_P (stmt_info) = gatherscatter;
>> Otherwise the patch looks ok to me.
>
> Fixed.
> Uros, could you please have a look at target part of patch?
>
> 2015-08-26  Andrey Turetskiy  <andrey.turetskiy@intel.com>
>             Petr Murzin  <petr.murzin@intel.com>
>
> gcc/
>
> * config/i386/i386-builtin-types.def
> (VOID_PFLOAT_HI_V8DI_V16SF_INT): New.
> (VOID_PDOUBLE_QI_V16SI_V8DF_INT): Ditto.
> (VOID_PINT_HI_V8DI_V16SI_INT): Ditto.
> (VOID_PLONGLONG_QI_V16SI_V8DI_INT): Ditto.
> * config/i386/i386.c
> (ix86_builtins): Add IX86_BUILTIN_SCATTERALTSIV8DF,
> IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
> IX86_BUILTIN_SCATTERALTDIV16SI.
> (ix86_init_mmx_sse_builtins): Define __builtin_ia32_scatteraltsiv8df,
> __builtin_ia32_scatteraltdiv8sf, __builtin_ia32_scatteraltsiv8di,
> __builtin_ia32_scatteraltdiv8si.
> (ix86_expand_builtin): Handle IX86_BUILTIN_SCATTERALTSIV8DF,
> IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
> IX86_BUILTIN_SCATTERALTDIV16SI.
> (ix86_vectorize_builtin_scatter): New.
> (TARGET_VECTORIZE_BUILTIN_SCATTER): Define as
> ix86_vectorize_builtin_scatter.
> * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_SCATTER): New.
> * doc/tm.texi: Regenerate.
> * target.def: Add scatter builtin.
> * tree-vectorizer.h: Rename gather_p field to gather_scatter_p and use
> it for gather loads and scatter stores accordingly.
> (STMT_VINFO_GATHER_SCATTER_P(S)): Define instead of STMT_VINFO_GATHER_P(S).
> (vect_check_gather): Rename to ...
> (vect_check_gather_scatter): this.
> * tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Use
> STMT_VINFO_GATHER_SCATTER_P instead of STMT_VINFO_SCATTER_P.
> (vect_check_gather_scatter): Use it instead of vect_check_gather.
> (vect_analyze_data_refs): Add gatherscatter enum and maybe_scatter
> variable and new checks for them accordingly.
> * tree-vect-stmts.c
> (STMT_VINFO_GATHER_SCATTER_P(S)): Use it instead of STMT_VINFO_GATHER_P(S).
> (vect_check_gather_scatter): Use it instead of vect_check_gather.
> (vectorizable_store): Add checks for STMT_VINFO_GATHER_SCATTER_P.
>
> gcc/testsuite/
>
>         * gcc.target/i386/avx512f-scatter-1.c: New.
>         * gcc.target/i386/avx512f-scatter-2.c: Ditto.
>         * gcc.target/i386/avx512f-scatter-3.c: Ditto.

x86 target part and testsuite are OK with the following change to the testcases:

> +/* { dg-do run } */
> +/* { dg-require-effective-target avx512f } */
> +/* { dg-options "-O3 -mavx512f -DAVX512F" } */
> +
> +#include "avx512f-check.h"
> +
> +#define N 1024

We don't want -D in the options, please move these to the source:

/* { dg-do run } */
/* { dg-require-effective-target avx512f } */
/* { dg-options "-O3 -mavx512f" } */

#define AVX512F

#include "avx512f-check.h"

#define N 1024

Thanks,
Uros.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] [AVX512F] Add scatter support for vectorizer
@ 2015-03-05 10:16 Petr Murzin
  0 siblings, 0 replies; 8+ messages in thread
From: Petr Murzin @ 2015-03-05 10:16 UTC (permalink / raw)
  To: gcc-patches, Uros Bizjak; +Cc: Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 2296 bytes --]

Hello,
This patch adds scatter support for vectorizer (for AVX512F
instructions). Please have a look. Is it ok for stage 1?

2015-03-05  Andrey Turetskiy  <andrey.turetskiy@intel.com>

        * config/i386/i386-builtin-types.def
        (VOID_PFLOAT_HI_V8DI_V16SF_INT): New.
        (VOID_PDOUBLE_QI_V16SI_V8DF_INT): Ditto.
        (VOID_PINT_HI_V8DI_V16SI_INT): Ditto.
        (VOID_PLONGLONG_QI_V16SI_V8DI_INT): Ditto.
        * config/i386/i386.c
        (ix86_builtins): Add IX86_BUILTIN_SCATTERALTSIV8DF,
        IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
        IX86_BUILTIN_SCATTERALTDIV16SI.
        (ix86_init_mmx_sse_builtins): Define __builtin_ia32_scatteraltsiv8df,
        __builtin_ia32_scatteraltdiv8sf, __builtin_ia32_scatteraltsiv8di,
        __builtin_ia32_scatteraltdiv8si.
        (ix86_expand_builtin): Handle IX86_BUILTIN_SCATTERALTSIV8DF,
        IX86_BUILTIN_SCATTERALTDIV16SF, IX86_BUILTIN_SCATTERALTSIV8DI,
        IX86_BUILTIN_SCATTERALTDIV16SI.
        (ix86_vectorize_builtin_scatter): New.
        (TARGET_VECTORIZE_BUILTIN_SCATTER): Ditto.
        * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_SCATTER): Ditto.
        * doc/tm.texi: Regenerate.
        * target.def: Add scatter builtin.
        * tree-vect-data-refs.c (vect_analyze_data_ref_dependence): Add new
        checks for STMT_VINFO_SCATTER_P.
        (vect_check_gather): Rename to ...
        (vect_check_gather_scatter): this.  Add IS_LOAD argument.
        (vect_analyze_data_ref_access): Update comment and return values.
        (vect_analyze_data_refs): Add maybe_scatter and new checks for it
        accordingly.
        * tree-vectorizer.h (STMT_VINFO_SCATTER_P(S)): Define.
        (vect_check_gather): Rename to ...
        (vect_check_gather_scatter): this.
        * tree-vect-stmts.c: Ditto.
        (vectorizable_store): Add checks for STMT_VINFO_SCATTER_P.

2015-03-05  Andrey Turetskiy  <andrey.turetskiy@intel.com>

testsuite/

        * gcc.target/i386/avx512f-scatter-1.c: New.
        * gcc.target/i386/avx512f-scatter-2.c: Ditto.
        * gcc.target/i386/avx512f-scatter-3.c: Ditto.
        * gcc.target/i386/avx512f-scatter-4.c: Ditto.
        * gcc.target/i386/avx512f-scatter-5.c: Ditto.

Thanks,
Petr

[-- Attachment #2: scatter_patch --]
[-- Type: application/octet-stream, Size: 38520 bytes --]

commit c6583396252185c6fdeba2487c5892505f4227e4
Author: Andrey Turetskiy <andrey.turetskiy@intel.com>
Date:   Tue Oct 8 18:27:56 2013 +0400

    [AVX512F] Add scatter support for vectorizer

diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 864d0ea..92a858b 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1171,6 +1171,10 @@ DEF_FUNCTION_TYPE (VOID, PINT, QI, V8DI, V8SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V4DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PINT, QI, V2DI, V4SI, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8DI, V8DI, INT)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, HI, V8DI, V16SF, INT)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V16SI, V8DF, INT)
+DEF_FUNCTION_TYPE (VOID, PINT, HI, V8DI, V16SI, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V16SI, V8DI, INT)
 
 DEF_FUNCTION_TYPE (VOID, QI, V8SI, PCINT64, INT, INT)
 DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V4DI, V4DI, INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ab8f03a..5d93f22 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -30220,6 +30220,10 @@ enum ix86_builtins
   IX86_BUILTIN_GATHER3SIV16SI,
   IX86_BUILTIN_GATHER3SIV8DF,
   IX86_BUILTIN_GATHER3SIV8DI,
+  IX86_BUILTIN_SCATTERALTSIV8DF,
+  IX86_BUILTIN_SCATTERALTDIV16SF,
+  IX86_BUILTIN_SCATTERALTSIV8DI,
+  IX86_BUILTIN_SCATTERALTDIV16SI,
   IX86_BUILTIN_SCATTERDIV16SF,
   IX86_BUILTIN_SCATTERDIV16SI,
   IX86_BUILTIN_SCATTERDIV8DF,
@@ -34021,6 +34025,21 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
 	       VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
 	       IX86_BUILTIN_SCATTERDIV2DI);
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
+	       VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
+	       VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
+	       VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
+	       IX86_BUILTIN_SCATTERALTSIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
+	       VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
+	       IX86_BUILTIN_SCATTERALTDIV16SI);
 
   /* AVX512PF */
   def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
@@ -39694,6 +39713,18 @@ addcarryx:
     case IX86_BUILTIN_GATHERPFDPD:
       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
       goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DF:
+      icode = CODE_FOR_avx512f_scattersiv8df;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SF:
+      icode = CODE_FOR_avx512f_scatterdiv16sf;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTSIV8DI:
+      icode = CODE_FOR_avx512f_scattersiv8di;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERALTDIV16SI:
+      icode = CODE_FOR_avx512f_scatterdiv16si;
+      goto scatter_gen;
     case IX86_BUILTIN_GATHERPFDPS:
       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
       goto vec_prefetch_gen;
@@ -39954,6 +39985,36 @@ addcarryx:
       mode3 = insn_data[icode].operand[3].mode;
       mode4 = insn_data[icode].operand[4].mode;
 
+      /* Scatter instruction stores operand op3 to memory with
+	 indices from op2 and scale from op4 under writemask op1.
+	 If index operand op2 has more elements than source operand
+	 op3, one needs to use only its low half.  And vice versa.  */
+      switch (fcode)
+	{
+	case IX86_BUILTIN_SCATTERALTSIV8DF:
+	case IX86_BUILTIN_SCATTERALTSIV8DI:
+	  half = gen_reg_rtx (V8SImode);
+	  if (!nonimmediate_operand (op2, V16SImode))
+	    op2 = copy_to_mode_reg (V16SImode, op2);
+	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
+	  op2 = half;
+	  break;
+	case IX86_BUILTIN_SCATTERALTDIV16SF:
+	case IX86_BUILTIN_SCATTERALTDIV16SI:
+	  half = gen_reg_rtx (mode3);
+	  if (mode3 == V8SFmode)
+	    gen = gen_vec_extract_lo_v16sf;
+	  else
+	    gen = gen_vec_extract_lo_v16si;
+	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
+	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+	  emit_insn (gen (half, op3));
+	  op3 = half;
+	  break;
+	default:
+	  break;
+	}
+
       /* Force memory operand only with base register here.  But we
 	 don't want to do it on memory operand for other builtin
 	 functions.  */
@@ -41023,6 +41084,62 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
   return ix86_get_builtin (code);
 }
 
+/* Returns a decl of a function that implements scatter store with
+   register type VECTYPE and index type INDEX_TYPE and SCALE.
+   Return NULL_TREE if it is not available.  */
+
+static tree
+ix86_vectorize_builtin_scatter (const_tree vectype,
+				const_tree index_type, int scale)
+{
+  bool si;
+  enum ix86_builtins code;
+
+  if (! TARGET_AVX512F)
+    return NULL_TREE;
+
+  if ((TREE_CODE (index_type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (index_type))
+      || (TYPE_MODE (index_type) != SImode
+	  && TYPE_MODE (index_type) != DImode))
+    return NULL_TREE;
+
+  if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+    return NULL_TREE;
+
+  /* v*scatter* insn sign extends index to pointer mode.  */
+  if (TYPE_PRECISION (index_type) < POINTER_SIZE
+      && TYPE_UNSIGNED (index_type))
+    return NULL_TREE;
+
+  /* Scale can be 1, 2, 4 or 8.  */
+  if (scale <= 0
+      || scale > 8
+      || (scale & (scale - 1)) != 0)
+    return NULL_TREE;
+
+  si = TYPE_MODE (index_type) == SImode;
+  switch (TYPE_MODE (vectype))
+    {
+    case V8DFmode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
+      break;
+    case V8DImode:
+      code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
+      break;
+    case V16SFmode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
+      break;
+    case V16SImode:
+      code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
+      break;
+    default:
+      return NULL_TREE;
+    }
+
+  return ix86_builtins[code];
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -51785,6 +51902,9 @@ ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_BUILTIN_SCATTER
+#define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 6c5bfab..41a685b 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5736,6 +5736,13 @@ in vectorized loops in current function, or non-negative number if it is
 usable.  In that case, the smaller the number is, the more desirable it is
 to use it.
 @end deftypefn
+@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_SCATTER (const_tree @var{vectype}, const_tree @var{index_type}, int @var{scale})
+Target builtin that implements vector scatter operation.  @var{vectype}
+is the vector type of the store and @var{index_type} is scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize scatter
+stores.
+@end deftypefn
 
 @node Anchored Addresses
 @section Anchored Addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 8d6dfbc..11fe2c2 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4246,6 +4246,13 @@ address;  but often a machine-dependent strategy can generate better code.
 @hook TARGET_SIMD_CLONE_ADJUST
 
 @hook TARGET_SIMD_CLONE_USABLE
+@hook TARGET_VECTORIZE_BUILTIN_SCATTER
+Target builtin that implements vector scatter operation.  @var{vectype}
+is the vector type of the store and @var{index_type} is scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE} which means to not vectorize scatter
+stores.
+@end deftypefn
 
 @node Anchored Addresses
 @section Anchored Addresses
diff --git a/gcc/target.def b/gcc/target.def
index a00181a..b2c70ec 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1801,6 +1801,14 @@ loads.",
  (const_tree mem_vectype, const_tree index_type, int scale),
  NULL)
 
+/* Target builtin that implements vector scatter operation.  */
+DEFHOOK
+(builtin_scatter,
+ "",
+ tree,
+ (const_tree vectype, const_tree index_type, int scale),
+ NULL)
+
 /* Target function to initialize the cost model for a loop or block.  */
 DEFHOOK
 (init_cost,
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c
new file mode 100644
index 0000000..7631849
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-1.c
@@ -0,0 +1,216 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+float vf1[N], vf2[2*N+16];
+double vd1[N], vd2[2*N+16];
+int vi1[N], vi2[2*N+16], k[N];
+long vl1[N], vl2[2*N+16], l[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[k[i]] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[k[i] + x] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[k[i]] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[k[i] + x] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[l[i]] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vi2[l[i] + x] = vi1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[l[i]] = vl1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vl2[l[i] + x] = vl1[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      vf1[i] = 17.0f + i;
+      vd1[i] = 19.0 + i;
+      vi1[i] = 21 + i;
+      vl1[i] = 23L + i;
+    }
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      k[i] = (i % 2) ? (N / 2 + i) : (N / 2 - i / 2);
+      l[i] = 2 * i + i % 2;
+    }
+
+  f1 ();
+  f2 ();
+  for (i = 0; i < N; i++)
+    if (vf2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17
+	|| vi2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 21)
+      abort ();
+
+  f3 (12);
+  f4 (14);
+  for (i = 0; i < N; i++)
+    if (vf2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 12] != i + 17
+	|| vi2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 14] != i + 21)
+      abort ();
+
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; i++)
+    if (vd2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19
+	|| vl2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 23)
+      abort ();
+
+  f7 (7);
+  f8 (9);
+  for (i = 0; i < N; i++)
+    if (vd2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 7] != i + 19
+	|| vl2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 9] != i + 23)
+      abort ();
+
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2] != i + 17
+	|| vi2[2 * i + i % 2] != i + 21)
+      abort ();
+
+  f11 (2);
+  f12 (4);
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2 + 2] != i + 17
+	|| vi2[2 * i + i % 2 + 4] != i + 21)
+      abort ();
+
+  f13 ();
+  f14 ();
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2] != i + 19
+	|| vl2[2 * i + i % 2] != i + 23)
+      abort ();
+
+  f15 (13);
+  f16 (15);
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2 + 13] != i + 19
+	|| vl2[2 * i + i % 2 + 15] != i + 23)
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c
new file mode 100644
index 0000000..6f81ca1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -DAVX512F -fdump-tree-vect-details" } */
+
+#include "avx512f-scatter-1.c"
+
+/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 16 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c
new file mode 100644
index 0000000..5eabab6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-3.c
@@ -0,0 +1,215 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+float vf1[N], vf2[2*N+16];
+double vd1[N], vd2[2*N+16];
+int k[N];
+long l[N];
+short n[2*N+16];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i]] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[k[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i] + x] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i]] = (int) vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[k[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i]] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i]] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vf2[l[i] + x] = vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i] + x] = (int) vf1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i]] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i]] = (int) vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    vd2[l[i] + x] = vd1[i];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    n[l[i] + x] = (int) vd1[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      vf1[i] = 17.0f + i;
+      vd1[i] = 19.0 + i;
+    }
+  for (i = 0; i < N; i++)
+    {
+      asm ("");
+      k[i] = (i % 2) ? (N / 2 + i) : (N / 2 - i / 2);
+      l[i] = 2 * i + i % 2;
+    }
+
+  f1 ();
+  f2 ();
+  for (i = 0; i < N; i++)
+    if (vf2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17
+	|| n[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 17)
+      abort ();
+
+  f3 (12);
+  f4 (14);
+  for (i = 0; i < N; i++)
+    if (vf2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 12] != i + 17
+	|| n[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 14] != i + 17)
+      abort ();
+
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; i++)
+    if (vd2[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19
+	|| n[(i % 2) ? (N / 2 + i) : (N / 2 - i / 2)] != i + 19)
+      abort ();
+
+  f7 (7);
+  f8 (9);
+  for (i = 0; i < N; i++)
+    if (vd2[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 7] != i + 19
+	|| n[((i % 2) ? (N / 2 + i) : (N / 2 - i / 2)) + 9] != i + 19)
+      abort ();
+
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2] != i + 17
+	|| n[2 * i + i % 2] != i + 17)
+      abort ();
+
+  f11 (2);
+  f12 (4);
+  for (i = 0; i < N; i++)
+    if (vf2[2 * i + i % 2 + 2] != i + 17
+	|| n[2 * i + i % 2 + 4] != i + 17)
+      abort ();
+
+  f13 ();
+  f14 ();
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2] != i + 19
+	|| n[2 * i + i % 2] != i + 19)
+      abort ();
+
+  f15 (13);
+  f16 (15);
+  for (i = 0; i < N; i++)
+    if (vd2[2 * i + i % 2 + 13] != i + 19
+	|| n[2 * i + i % 2 + 15] != i + 19)
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-4.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-4.c
new file mode 100644
index 0000000..dccbdb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-4.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-check.h"
+
+#define N 1024
+int a[N], b[N];
+
+__attribute__((noinline, noclone)) void
+foo (float *__restrict p, float *__restrict q,
+     int s1, int s2, int s3)
+{
+  int i;
+  for (i = 0; i < (N / 8); i++)
+    p[a[i] * s1 + b[i] * s2 + s3] = q[i];
+}
+
+static void
+avx512f_test (void)
+{
+  int i;
+  float c[N], d[N];
+  for (i = 0; i < N; i++)
+    {
+      a[i] = (i * 7) & (N / 8 - 1);
+      b[i] = (i * 13) & (N / 8 - 1);
+      c[i] = 179.13 + i;
+    }
+  foo (d, c, 3, 2, 4);
+  for (i = 0; i < (N / 8); i++)
+    if (d[a[i] * 3 + b[i] * 2 + 4] != (float) (179.13 + i))
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-scatter-5.c b/gcc/testsuite/gcc.target/i386/avx512f-scatter-5.c
new file mode 100644
index 0000000..f80c9ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-scatter-5.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -DAVX512F" } */
+
+#include "avx512f-scatter-4.c"
+
+/* { dg-final { scan-assembler "scatter\[^\n\]*zmm" } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index ffe83e2..71621f5 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -294,7 +294,9 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	}
 
       if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+	  || STMT_VINFO_GATHER_P (stmtinfo_b)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -342,7 +344,9 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 	}
 
       if (STMT_VINFO_GATHER_P (stmtinfo_a)
-	  || STMT_VINFO_GATHER_P (stmtinfo_b))
+	  || STMT_VINFO_GATHER_P (stmtinfo_b)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_a)
+	  || STMT_VINFO_SCATTER_P (stmtinfo_b))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -2319,7 +2323,8 @@ vect_analyze_data_ref_access (struct data_reference *dr)
       return false;
     }
 
-  /* Allow invariant loads in not nested loops.  */
+  /* Allow invariant loads and stores in loops.  Invariant stores
+     are allowed only if they are scatter stores.  */
   if (loop_vinfo && integer_zerop (step))
     {
       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
@@ -2330,7 +2335,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
 			     "zero step in inner loop of nest\n");
 	  return false;
 	}
-      return DR_IS_READ (dr);
+      return DR_IS_READ (dr) || STMT_VINFO_SCATTER_P (stmt_info);
     }
 
   if (loop && nested_in_vect_loop_p (loop, stmt))
@@ -2346,10 +2351,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 	                     "zero step in outer loop.\n");
-	  if (DR_IS_READ (dr))
-  	    return true;
-	  else
-	    return false;
+	  return DR_IS_READ (dr) || STMT_VINFO_SCATTER_P (stmt_info);
 	}
     }
 
@@ -2978,12 +2980,12 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
   return true;
 }
 
-/* Check whether a non-affine read in stmt is suitable for gather load
-   and if so, return a builtin decl for that operation.  */
+/* Check whether a non-affine read or write in stmt is suitable for gather load
+   or scatter store and if so, return a builtin decl for that operation.  */
 
 tree
-vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
-		   tree *offp, int *scalep)
+vect_check_gather_scatter (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
+			   tree *offp, int *scalep, bool is_load)
 {
   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -3012,7 +3014,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
     }
 
-  /* The gather builtins need address of the form
+  /* The gather and scatter builtins need address of the form
      loop_invariant + vector * {1, 2, 4, 8}
      or
      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
@@ -3175,8 +3177,13 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
   if (offtype == NULL_TREE)
     offtype = TREE_TYPE (off);
 
-  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
-					   offtype, scale);
+  if (is_load)
+    decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
+					     offtype, scale);
+  else
+    decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
+					      offtype, scale);
+
   if (decl == NULL_TREE)
     return NULL_TREE;
 
@@ -3325,7 +3332,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
       gimple stmt;
       stmt_vec_info stmt_info;
       tree base, offset, init;
-      bool gather = false;
+      bool gather = false, scatter = false;
       bool simd_lane_access = false;
       int vf;
 
@@ -3364,18 +3371,22 @@ again:
 	    = DR_IS_READ (dr)
 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
 	      && targetm.vectorize.builtin_gather != NULL;
+	  bool maybe_scatter
+	    = DR_IS_WRITE (dr)
+	      && !TREE_THIS_VOLATILE (DR_REF (dr))
+	      && targetm.vectorize.builtin_scatter != NULL;
 	  bool maybe_simd_lane_access
 	    = loop_vinfo && loop->simduid;
 
-	  /* If target supports vector gather loads, or if this might be
-	     a SIMD lane access, see if they can't be used.  */
+	  /* If target supports vector gather loads or scatter stores, or if
+	     this might be a SIMD lane access, see if they can't be used.  */
 	  if (loop_vinfo
-	      && (maybe_gather || maybe_simd_lane_access)
+	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
 	      && !nested_in_vect_loop_p (loop, stmt))
 	    {
 	      struct data_reference *newdr
 		= create_data_ref (NULL, loop_containing_stmt (stmt),
-				   DR_REF (dr), stmt, true);
+				   DR_REF (dr), stmt, !maybe_scatter);
 	      gcc_assert (newdr != NULL && DR_REF (newdr));
 	      if (DR_BASE_ADDRESS (newdr)
 		  && DR_OFFSET (newdr)
@@ -3428,17 +3439,18 @@ again:
 			    }
 			}
 		    }
-		  if (!simd_lane_access && maybe_gather)
+		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
 		    {
 		      dr = newdr;
-		      gather = true;
+		      gather = DR_IS_READ (dr);
+		      scatter = DR_IS_WRITE (dr);
 		    }
 		}
-	      if (!gather && !simd_lane_access)
+	      if (!gather && !scatter && !simd_lane_access)
 		free_data_ref (newdr);
 	    }
 
-	  if (!gather && !simd_lane_access)
+	  if (!gather && !scatter && !simd_lane_access)
 	    {
 	      if (dump_enabled_p ())
 		{
@@ -3466,7 +3478,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
         }
@@ -3501,7 +3513,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3521,7 +3533,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
           return false;
 	}
@@ -3546,7 +3558,7 @@ again:
 	  if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
 	}
@@ -3684,7 +3696,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -3717,7 +3729,7 @@ again:
           if (bb_vinfo)
 	    break;
 
-	  if (gather || simd_lane_access)
+	  if (gather || scatter || simd_lane_access)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
 	      if (gather)
@@ -3744,11 +3756,36 @@ again:
       if (vf > *min_vf)
 	*min_vf = vf;
 
-      if (gather)
+      if (scatter)
+	{
+	  tree off;
+
+	  scatter = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, false);
+	  if (scatter
+	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
+	    scatter = false;
+	  if (!scatter)
+	    {
+	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
+	      free_data_ref (dr);
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+                                   "not vectorized: not suitable for scatter "
+                                   "store ");
+		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+		}
+	      return false;
+	    }
+
+	  datarefs[i] = dr;
+	  STMT_VINFO_SCATTER_P (stmt_info) = true;
+	}
+      else if (gather)
 	{
 	  tree off;
 
-	  gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+	  gather = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, true);
 	  if (gather
 	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
 	    gather = false;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index aa9d43f..1458528 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -840,12 +840,27 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
       if (STMT_VINFO_GATHER_P (stmt_vinfo))
 	{
 	  tree off;
-	  tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+	  tree decl = vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, true);
 	  gcc_assert (decl);
 	  if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
 			    &worklist, true))
 	    return false;
 	}
+
+      if (STMT_VINFO_SCATTER_P (stmt_vinfo))
+	{
+	  tree off;
+	  tree decl = vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, false);
+	  gcc_assert (decl);
+	  if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
+			    &worklist, true)
+	      || !process_use (stmt, gimple_assign_rhs1 (stmt), loop_vinfo, live_p,
+			       relevant, &worklist, true))
+	    {
+	      worklist.release ();
+	      return false;
+	    }
+	}
     } /* while worklist */
 
   return true;
@@ -1835,8 +1850,8 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
-				       &gather_off, &gather_scale);
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
+					       &gather_off, &gather_scale, true);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, NULL,
 				 &def_stmt, &def, &gather_dt,
@@ -5034,6 +5049,12 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   unsigned int vec_num;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree aggr_type;
+  tree scatter_base = NULL_TREE, scatter_off = NULL_TREE;
+  tree scatter_off_vectype = NULL_TREE, scatter_decl = NULL_TREE;
+  int scatter_scale = 1;
+  enum vect_def_type scatter_idx_dt = vect_unknown_def_type;
+  enum vect_def_type scatter_src_dt = vect_unknown_def_type;
+  gimple new_stmt;
 
   if (loop_vinfo)
     loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -5173,6 +5194,32 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
         }
     }
 
+  if (STMT_VINFO_SCATTER_P (stmt_info))
+    {
+      gimple def_stmt;
+      tree def;
+      scatter_decl = vect_check_gather_scatter (stmt, loop_vinfo, &scatter_base,
+						&scatter_off, &scatter_scale, false);
+      gcc_assert (scatter_decl);
+      if (!vect_is_simple_use_1 (scatter_off, NULL, loop_vinfo, bb_vinfo,
+				 &def_stmt, &def, &scatter_idx_dt,
+				 &scatter_off_vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "scatter index use not simple.");
+	  return false;
+	}
+      if (!vect_is_simple_use (gimple_assign_rhs1 (stmt), NULL, loop_vinfo, bb_vinfo,
+			       &def_stmt, &def, &scatter_src_dt))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "scatter source use not simple.");
+	  return false;
+	}
+    }
+
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
@@ -5185,6 +5232,150 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
   ensure_base_align (stmt_info, dr);
 
+  if (STMT_VINFO_SCATTER_P (stmt_info))
+    {
+      tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
+      tree arglist = TYPE_ARG_TYPES (TREE_TYPE (scatter_decl));
+      tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+      tree ptr, mask, var, scale, perm_mask = NULL_TREE;
+      edge pe = loop_preheader_edge (loop);
+      gimple_seq seq;
+      basic_block new_bb;
+      enum { NARROW, NONE, WIDEN } modifier;
+      int scatter_off_nunits = TYPE_VECTOR_SUBPARTS (scatter_off_vectype);
+
+      if (nunits == scatter_off_nunits)
+	modifier = NONE;
+      else if (nunits == scatter_off_nunits / 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, scatter_off_nunits);
+	  modifier = WIDEN;
+
+	  for (i = 0; i < (unsigned int) scatter_off_nunits; ++i)
+	    sel[i] = i | nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (scatter_off_vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	}
+      else if (nunits == scatter_off_nunits * 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+	  modifier = NARROW;
+
+	  for (i = 0; i < (unsigned int) nunits; ++i)
+	    sel[i] = i | scatter_off_nunits;
+
+	  perm_mask = vect_gen_perm_mask_checked (vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	  ncopies *= 2;
+	}
+      else
+	gcc_unreachable ();
+
+      rettype = TREE_TYPE (TREE_TYPE (scatter_decl));
+      ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      scaletype = TREE_VALUE (arglist);
+
+      gcc_checking_assert (TREE_CODE (masktype) == INTEGER_TYPE
+			   && TREE_CODE (rettype) == VOID_TYPE);
+
+      ptr = fold_convert (ptrtype, scatter_base);
+      if (!is_gimple_min_invariant (ptr))
+	{
+	  ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+	  gcc_assert (!new_bb);
+	}
+
+      /* Currently we support only unconditional scatter stores,
+	 so mask should be all ones.  */
+      mask = build_int_cst (masktype, -1);
+      mask = vect_init_vector (stmt, mask, masktype, NULL);
+
+      scale = build_int_cst (scaletype, scatter_scale);
+
+      prev_stmt_info = NULL;
+      for (j = 0; j < ncopies; ++j)
+	{
+	  if (j == 0)
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_operand (gimple_assign_rhs1 (stmt), stmt, NULL);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_operand (scatter_off, stmt, NULL);
+	    }
+	  else if (modifier != NONE && (j & 1))
+	    {
+	      if (modifier == WIDEN)
+		{
+		  src = vec_oprnd1
+		    = vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+		  op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask,
+					     stmt, gsi);
+		}
+	      else if (modifier == NARROW)
+		{
+		  src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask,
+					      stmt, gsi);
+		  op = vec_oprnd0
+		    = vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+		}
+	      else
+		gcc_unreachable ();
+	    }
+	  else
+	    {
+	      src = vec_oprnd1
+		= vect_get_vec_def_for_stmt_copy (scatter_src_dt, vec_oprnd1);
+	      op = vec_oprnd0
+		= vect_get_vec_def_for_stmt_copy (scatter_idx_dt, vec_oprnd0);
+	    }
+
+	  if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (src))
+			  == TYPE_VECTOR_SUBPARTS (srctype));
+	      var = vect_get_new_vect_var (srctype, vect_simple_var, NULL);
+	      var = make_ssa_name (var, NULL);
+	      src = build1 (VIEW_CONVERT_EXPR, srctype, src);
+	      new_stmt
+		= gimple_build_assign (var, VIEW_CONVERT_EXPR,
+						src, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      src = var;
+	    }
+
+	  if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+			  == TYPE_VECTOR_SUBPARTS (idxtype));
+	      var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+	      var = make_ssa_name (var, NULL);
+	      op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+	      new_stmt
+		= gimple_build_assign (var, VIEW_CONVERT_EXPR,
+						op, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      op = var;
+	    }
+
+	  new_stmt
+	    = gimple_build_call (scatter_decl, 5, ptr, mask, op, src, scale);
+
+	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	  if (prev_stmt_info == NULL)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+      return true;
+    }
+
   if (grouped_store)
     {
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
@@ -5292,8 +5483,6 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   prev_stmt_info = NULL;
   for (j = 0; j < ncopies; j++)
     {
-      gimple new_stmt;
-
       if (j == 0)
 	{
           if (slp)
@@ -5557,10 +5746,12 @@ permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt,
 {
   tree vectype = TREE_TYPE (x);
   tree perm_dest, data_ref;
+  tree scalar_dest = TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
+		     ? gimple_assign_lhs (stmt) : x;
   gimple perm_stmt;
 
-  perm_dest = vect_create_destination_var (gimple_get_lhs (stmt), vectype);
-  data_ref = make_ssa_name (perm_dest);
+  perm_dest = vect_create_destination_var (scalar_dest, vectype);
+  data_ref = make_ssa_name (perm_dest, NULL);
 
   /* Generate the permute statement.  */
   perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, x, y, mask_vec);
@@ -5818,8 +6009,8 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
     {
       gimple def_stmt;
       tree def;
-      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
-				       &gather_off, &gather_scale);
+      gather_decl = vect_check_gather_scatter (stmt, loop_vinfo, &gather_base,
+					       &gather_off, &gather_scale, true);
       gcc_assert (gather_decl);
       if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, bb_vinfo,
 				 &def_stmt, &def, &gather_dt,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 0ede623..b5ef9d5 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -650,6 +650,9 @@ typedef struct _stmt_vec_info {
   bool gather_p;
   bool stride_load_p;
 
+  /* For stores only, true if this is a scatter store.  */
+  bool scatter_p;
+
   /* For both loads and stores.  */
   bool simd_lane_access_p;
 } *stmt_vec_info;
@@ -667,6 +670,7 @@ typedef struct _stmt_vec_info {
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
 #define STMT_VINFO_GATHER_P(S)		   (S)->gather_p
 #define STMT_VINFO_STRIDE_LOAD_P(S)	   (S)->stride_load_p
+#define STMT_VINFO_SCATTER_P(S)		   (S)->scatter_p
 #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
 
 #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_base_address
@@ -1058,10 +1062,11 @@ extern bool vect_analyze_data_refs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info);
 extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info);
 extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
-extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
-			       int *);
+extern tree vect_check_gather_scatter (gimple, loop_vec_info, tree *,
+				       tree *, int *, bool);
 extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *,
 				    unsigned *);
 extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,
 				      tree *, gimple_stmt_iterator *,
 				      gimple *, bool, bool *,
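
The mixed-width case that the new IX86_BUILTIN_SCATTERALT* expanders
handle corresponds, at the source level, to storing 64-bit elements
through 32-bit indices: only the low half of a 16-element index vector
feeds each eight-element scatter, which is exactly the low-half
extraction done in ix86_expand_builtin above.  A sketch in intrinsics
terms (the function name and the n % 8 == 0 assumption are
illustrative, not part of the patch):

#include <immintrin.h>

/* Scatter eight doubles per iteration through 32-bit indices.
   Scale 8 == sizeof (double); the implicit all-ones mask matches the
   unconditional scatter the vectorizer generates.  */
void
scatter_pd_by_hand (double *base, const double *src, const int *idx, int n)
{
  int i;
  for (i = 0; i < n; i += 8)
    {
      __m256i vindex = _mm256_loadu_si256 ((const __m256i *) &idx[i]);
      __m512d vsrc = _mm512_loadu_pd (&src[i]);
      _mm512_i32scatter_pd (base, vindex, vsrc, 8);
    }
}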

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2015-08-26 21:18 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-31 11:06 [PATCH] [AVX512F] Add scatter support for vectorizer Petr Murzin
2015-08-04 12:15 ` Richard Biener
2015-08-04 12:42   ` Uros Bizjak
2015-08-21 12:21   ` Petr Murzin
2015-08-26  7:44     ` Richard Biener
2015-08-26 18:46       ` Petr Murzin
2015-08-26 22:15         ` Uros Bizjak
  -- strict thread matches above, loose matches on Subject: below --
2015-03-05 10:16 Petr Murzin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).