public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
@ 2020-07-02 13:22 xiezhiheng
  2020-07-02 14:45 ` Richard Biener
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-07-02 13:22 UTC (permalink / raw)
  To: gcc-patches

Hi,

This is a fix for pr94442.
I modify get_inner_reference to handle the case for MEM[ptr, off].
I extract the "off" and add it to the recorded offset, then I build a
MEM[ptr, 0] and return it later.

diff --git a/gcc/expr.c b/gcc/expr.c
index 3c68b0d754c..8cc18449a0c 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -7362,7 +7362,8 @@ tree
 get_inner_reference (tree exp, poly_int64_pod *pbitsize,
 		     poly_int64_pod *pbitpos, tree *poffset,
 		     machine_mode *pmode, int *punsignedp,
-		     int *preversep, int *pvolatilep)
+		     int *preversep, int *pvolatilep,
+		     bool include_memref_p)
 {
   tree size_tree = 0;
   machine_mode mode = VOIDmode;
@@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod *pbitsize,
 		}
 	      exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
 	    }
+	  else if (include_memref_p
+		   && TREE_CODE (TREE_OPERAND (exp, 0)) == SSA_NAME)
+	    {
+	      tree off = TREE_OPERAND (exp, 1);
+	      if (!integer_zerop (off))
+		{
+		  poly_offset_int boff = mem_ref_offset (exp);
+		  boff <<= LOG2_BITS_PER_UNIT;
+		  bit_offset += boff;
+
+		  exp = build2 (MEM_REF, TREE_TYPE (exp),
+				TREE_OPERAND (exp, 0),
+				build_int_cst (TREE_TYPE (off), 0));
+		}
+	    }
 	  goto done;
 
 	default:
@@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,
 	int reversep, volatilep = 0, must_force_mem;
 	tree tem
 	  = get_inner_reference (exp, &bitsize, &bitpos, &offset, &mode1,
-				 &unsignedp, &reversep, &volatilep);
+				 &unsignedp, &reversep, &volatilep, true);
 	rtx orig_op0, memloc;
 	bool clear_mem_expr = false;
 
diff --git a/gcc/tree.h b/gcc/tree.h
index a74872f5f3e..7df0d15f7f9 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p (const_tree, HOST_WIDE_INT, const_tree);
    look for the ultimate containing object, which is returned and specify
    the access position and size.  */
 extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod *,
-				 tree *, machine_mode *, int *, int *, int *);
+				 tree *, machine_mode *, int *, int *, int *,
+				 bool = false);
 
 extern tree build_personality_function (const char *);


I add an argument "include_memref_p" to control whether to go into MEM_REF,
because without it the test case "Warray-bounds-46.c" would fail in the regression tests.

That is because the function set_base_and_offset in gimple-ssa-warn-restrict.c
  base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
                              &mode, &sign, &reverse, &vol);
  ...
  ...
  if (TREE_CODE (base) == MEM_REF)
    {
      tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND (base, 1));
      extend_offset_range (memrefoff);
      base = TREE_OPERAND (base, 0);

      if (refoff != HOST_WIDE_INT_MIN
          && TREE_CODE (expr) == COMPONENT_REF)
        {
          /* Bump up the offset of the referenced subobject to reflect
             the offset to the enclosing object.  For example, so that
             in
               struct S { char a, b[3]; } s[2];
               strcpy (s[1].b, "1234");
             REFOFF is set to s[1].b - (char*)s.  */
          offset_int off = tree_to_shwi (memrefoff);
          refoff += off;
        }

      if (!integer_zerop (memrefoff))       <=================
        /* A non-zero offset into an array of struct with flexible array
           members implies that the array is empty because there is no
           way to initialize such a member when it belongs to an array.
           This must be some sort of a bug.  */
        refsize = 0;
    }

needs MEM_REF offset to judge whether refsize should be set to zero.
But I fold the offset into bitpos and the offset will always be zero.

Suggestion?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-02 13:22 [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3 xiezhiheng
@ 2020-07-02 14:45 ` Richard Biener
  2020-07-06  9:10   ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Biener @ 2020-07-02 14:45 UTC (permalink / raw)
  To: xiezhiheng; +Cc: gcc-patches

On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:
>
> Hi,
>
> This is a fix for pr94442.
> I modify get_inner_reference to handle the case for MEM[ptr, off].
> I extract the "off" and add it to the recorded offset, then I build a
> MEM[ptr, 0] and return it later.
>
> diff --git a/gcc/expr.c b/gcc/expr.c
> index 3c68b0d754c..8cc18449a0c 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -7362,7 +7362,8 @@ tree
>  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
>                      poly_int64_pod *pbitpos, tree *poffset,
>                      machine_mode *pmode, int *punsignedp,
> -                    int *preversep, int *pvolatilep)
> +                    int *preversep, int *pvolatilep,
> +                    bool include_memref_p)
>  {
>    tree size_tree = 0;
>    machine_mode mode = VOIDmode;
> @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod *pbitsize,
>                 }
>               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
>             }
> +         else if (include_memref_p
> +                  && TREE_CODE (TREE_OPERAND (exp, 0)) == SSA_NAME)
> +           {
> +             tree off = TREE_OPERAND (exp, 1);
> +             if (!integer_zerop (off))
> +               {
> +                 poly_offset_int boff = mem_ref_offset (exp);
> +                 boff <<= LOG2_BITS_PER_UNIT;
> +                 bit_offset += boff;
> +
> +                 exp = build2 (MEM_REF, TREE_TYPE (exp),
> +                               TREE_OPERAND (exp, 0),
> +                               build_int_cst (TREE_TYPE (off), 0));
> +               }
> +           }
>           goto done;
>
>         default:
> @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,
>         int reversep, volatilep = 0, must_force_mem;
>         tree tem
>           = get_inner_reference (exp, &bitsize, &bitpos, &offset, &mode1,
> -                                &unsignedp, &reversep, &volatilep);
> +                                &unsignedp, &reversep, &volatilep, true);
>         rtx orig_op0, memloc;
>         bool clear_mem_expr = false;
>
> diff --git a/gcc/tree.h b/gcc/tree.h
> index a74872f5f3e..7df0d15f7f9 100644
> --- a/gcc/tree.h
> +++ b/gcc/tree.h
> @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p (const_tree, HOST_WIDE_INT, const_tree);
>     look for the ultimate containing object, which is returned and specify
>     the access position and size.  */
>  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod *,
> -                                tree *, machine_mode *, int *, int *, int *);
> +                                tree *, machine_mode *, int *, int *, int *,
> +                                bool = false);
>
>  extern tree build_personality_function (const char *);
>
>
> I add an argument "include_memref_p" to control whether to go into MEM_REF,
> because without it will cause the test case "Warray-bounds-46.c" to fail in regression.
>
> It because function set_base_and_offset in gimple-ssa-warn-restrict.c
>   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
>                               &mode, &sign, &reverse, &vol);
>   ...
>   ...
>   if (TREE_CODE (base) == MEM_REF)
>     {
>       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND (base, 1));
>       extend_offset_range (memrefoff);
>       base = TREE_OPERAND (base, 0);
>
>       if (refoff != HOST_WIDE_INT_MIN
>           && TREE_CODE (expr) == COMPONENT_REF)
>         {
>           /* Bump up the offset of the referenced subobject to reflect
>              the offset to the enclosing object.  For example, so that
>              in
>                struct S { char a, b[3]; } s[2];
>                strcpy (s[1].b, "1234");
>              REFOFF is set to s[1].b - (char*)s.  */
>           offset_int off = tree_to_shwi (memrefoff);
>           refoff += off;
>         }
>
>       if (!integer_zerop (memrefoff))       <=================
>         /* A non-zero offset into an array of struct with flexible array
>            members implies that the array is empty because there is no
>            way to initialize such a member when it belongs to an array.
>            This must be some sort of a bug.  */
>         refsize = 0;
>     }
>
> needs MEM_REF offset to judge whether refsize should be set to zero.
> But I fold the offset into bitpos and the offset will always be zero.
>
> Suggestion?

The thing you want to fix is not get_inner_reference but the aarch64 backend
to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That way
CSE can happen on GIMPLE which can handle the difference in the IL just
fine.

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-02 14:45 ` Richard Biener
@ 2020-07-06  9:10   ` xiezhiheng
  2020-07-06  9:31     ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-07-06  9:10 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, richard.sandiford

> -----Original Message-----
> From: Richard Biener [mailto:richard.guenther@gmail.com]
> Sent: Thursday, July 2, 2020 10:46 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:
> >
> > Hi,
> >
> > This is a fix for pr94442.
> > I modify get_inner_reference to handle the case for MEM[ptr, off].
> > I extract the "off" and add it to the recorded offset, then I build a
> > MEM[ptr, 0] and return it later.
> >
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 3c68b0d754c..8cc18449a0c 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -7362,7 +7362,8 @@ tree
> >  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
> >                      poly_int64_pod *pbitpos, tree *poffset,
> >                      machine_mode *pmode, int *punsignedp,
> > -                    int *preversep, int *pvolatilep)
> > +                    int *preversep, int *pvolatilep,
> > +                    bool include_memref_p)
> >  {
> >    tree size_tree = 0;
> >    machine_mode mode = VOIDmode;
> > @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod
> *pbitsize,
> >                 }
> >               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
> >             }
> > +         else if (include_memref_p
> > +                  && TREE_CODE (TREE_OPERAND (exp, 0)) ==
> SSA_NAME)
> > +           {
> > +             tree off = TREE_OPERAND (exp, 1);
> > +             if (!integer_zerop (off))
> > +               {
> > +                 poly_offset_int boff = mem_ref_offset (exp);
> > +                 boff <<= LOG2_BITS_PER_UNIT;
> > +                 bit_offset += boff;
> > +
> > +                 exp = build2 (MEM_REF, TREE_TYPE (exp),
> > +                               TREE_OPERAND (exp, 0),
> > +                               build_int_cst (TREE_TYPE (off), 0));
> > +               }
> > +           }
> >           goto done;
> >
> >         default:
> > @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target,
> machine_mode tmode,
> >         int reversep, volatilep = 0, must_force_mem;
> >         tree tem
> >           = get_inner_reference (exp, &bitsize, &bitpos, &offset,
> &mode1,
> > -                                &unsignedp, &reversep, &volatilep);
> > +                                &unsignedp, &reversep, &volatilep,
> true);
> >         rtx orig_op0, memloc;
> >         bool clear_mem_expr = false;
> >
> > diff --git a/gcc/tree.h b/gcc/tree.h
> > index a74872f5f3e..7df0d15f7f9 100644
> > --- a/gcc/tree.h
> > +++ b/gcc/tree.h
> > @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p
> (const_tree, HOST_WIDE_INT, const_tree);
> >     look for the ultimate containing object, which is returned and specify
> >     the access position and size.  */
> >  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod
> *,
> > -                                tree *, machine_mode *, int *, int *,
> int *);
> > +                                tree *, machine_mode *, int *, int *,
> int *,
> > +                                bool = false);
> >
> >  extern tree build_personality_function (const char *);
> >
> >
> > I add an argument "include_memref_p" to control whether to go into
> MEM_REF,
> > because without it will cause the test case "Warray-bounds-46.c" to fail in
> regression.
> >
> > It because function set_base_and_offset in gimple-ssa-warn-restrict.c
> >   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
> >                               &mode, &sign, &reverse, &vol);
> >   ...
> >   ...
> >   if (TREE_CODE (base) == MEM_REF)
> >     {
> >       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND
> (base, 1));
> >       extend_offset_range (memrefoff);
> >       base = TREE_OPERAND (base, 0);
> >
> >       if (refoff != HOST_WIDE_INT_MIN
> >           && TREE_CODE (expr) == COMPONENT_REF)
> >         {
> >           /* Bump up the offset of the referenced subobject to reflect
> >              the offset to the enclosing object.  For example, so that
> >              in
> >                struct S { char a, b[3]; } s[2];
> >                strcpy (s[1].b, "1234");
> >              REFOFF is set to s[1].b - (char*)s.  */
> >           offset_int off = tree_to_shwi (memrefoff);
> >           refoff += off;
> >         }
> >
> >       if (!integer_zerop (memrefoff))       <=================
> >         /* A non-zero offset into an array of struct with flexible array
> >            members implies that the array is empty because there is no
> >            way to initialize such a member when it belongs to an array.
> >            This must be some sort of a bug.  */
> >         refsize = 0;
> >     }
> >
> > needs MEM_REF offset to judge whether refsize should be set to zero.
> > But I fold the offset into bitpos and the offset will always be zero.
> >
> > Suggestion?
> 
> The thing you want to fix is not get_inner_reference but the aarch64 backend
> to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That
> way
> CSE can happen on GIMPLE which can handle the difference in the IL just
> fine.
> 
> Richard.

Yes, __builtin_aarch64_sqaddv16qi does not have any attributes set to describe that
it would not clobber global memory.  But I find it strange that when the SIMD
built-in FUNCTION_DECLs are built, no attributes are set on them in the backend.

void
aarch64_init_simd_builtins (void)
{
...
      ftype = build_function_type (return_type, args);

      gcc_assert (ftype != NULL);

      if (print_type_signature_p)
        snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s_%s",
                  d->name, type_signature);
      else
        snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s",
                  d->name);

      fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode);
      aarch64_builtin_decls[fcode] = fndecl;
...
}
static tree
aarch64_general_add_builtin (const char *name, tree type, unsigned int code)
{
  code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL;
  return add_builtin_function (name, type, code, BUILT_IN_MD,
                               NULL, NULL_TREE);
}

The loop in aarch64_init_simd_builtins creates a FUNCTION_DECL node for each
built-in function and puts the node in an array.  But it does not set any attributes.
And I did not find an interface for each built-in function to control the attributes.

Did I miss anything?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-06  9:10   ` xiezhiheng
@ 2020-07-06  9:31     ` Richard Sandiford
  2020-07-07 12:49       ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-07-06  9:31 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>> Sent: Thursday, July 2, 2020 10:46 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:
>> >
>> > Hi,
>> >
>> > This is a fix for pr94442.
>> > I modify get_inner_reference to handle the case for MEM[ptr, off].
>> > I extract the "off" and add it to the recorded offset, then I build a
>> > MEM[ptr, 0] and return it later.
>> >
>> > diff --git a/gcc/expr.c b/gcc/expr.c
>> > index 3c68b0d754c..8cc18449a0c 100644
>> > --- a/gcc/expr.c
>> > +++ b/gcc/expr.c
>> > @@ -7362,7 +7362,8 @@ tree
>> >  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
>> >                      poly_int64_pod *pbitpos, tree *poffset,
>> >                      machine_mode *pmode, int *punsignedp,
>> > -                    int *preversep, int *pvolatilep)
>> > +                    int *preversep, int *pvolatilep,
>> > +                    bool include_memref_p)
>> >  {
>> >    tree size_tree = 0;
>> >    machine_mode mode = VOIDmode;
>> > @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod
>> *pbitsize,
>> >                 }
>> >               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
>> >             }
>> > +         else if (include_memref_p
>> > +                  && TREE_CODE (TREE_OPERAND (exp, 0)) ==
>> SSA_NAME)
>> > +           {
>> > +             tree off = TREE_OPERAND (exp, 1);
>> > +             if (!integer_zerop (off))
>> > +               {
>> > +                 poly_offset_int boff = mem_ref_offset (exp);
>> > +                 boff <<= LOG2_BITS_PER_UNIT;
>> > +                 bit_offset += boff;
>> > +
>> > +                 exp = build2 (MEM_REF, TREE_TYPE (exp),
>> > +                               TREE_OPERAND (exp, 0),
>> > +                               build_int_cst (TREE_TYPE (off), 0));
>> > +               }
>> > +           }
>> >           goto done;
>> >
>> >         default:
>> > @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target,
>> machine_mode tmode,
>> >         int reversep, volatilep = 0, must_force_mem;
>> >         tree tem
>> >           = get_inner_reference (exp, &bitsize, &bitpos, &offset,
>> &mode1,
>> > -                                &unsignedp, &reversep, &volatilep);
>> > +                                &unsignedp, &reversep, &volatilep,
>> true);
>> >         rtx orig_op0, memloc;
>> >         bool clear_mem_expr = false;
>> >
>> > diff --git a/gcc/tree.h b/gcc/tree.h
>> > index a74872f5f3e..7df0d15f7f9 100644
>> > --- a/gcc/tree.h
>> > +++ b/gcc/tree.h
>> > @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p
>> (const_tree, HOST_WIDE_INT, const_tree);
>> >     look for the ultimate containing object, which is returned and specify
>> >     the access position and size.  */
>> >  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod
>> *,
>> > -                                tree *, machine_mode *, int *, int *,
>> int *);
>> > +                                tree *, machine_mode *, int *, int *,
>> int *,
>> > +                                bool = false);
>> >
>> >  extern tree build_personality_function (const char *);
>> >
>> >
>> > I add an argument "include_memref_p" to control whether to go into
>> MEM_REF,
>> > because without it will cause the test case "Warray-bounds-46.c" to fail in
>> regression.
>> >
>> > It because function set_base_and_offset in gimple-ssa-warn-restrict.c
>> >   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
>> >                               &mode, &sign, &reverse, &vol);
>> >   ...
>> >   ...
>> >   if (TREE_CODE (base) == MEM_REF)
>> >     {
>> >       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND
>> (base, 1));
>> >       extend_offset_range (memrefoff);
>> >       base = TREE_OPERAND (base, 0);
>> >
>> >       if (refoff != HOST_WIDE_INT_MIN
>> >           && TREE_CODE (expr) == COMPONENT_REF)
>> >         {
>> >           /* Bump up the offset of the referenced subobject to reflect
>> >              the offset to the enclosing object.  For example, so that
>> >              in
>> >                struct S { char a, b[3]; } s[2];
>> >                strcpy (s[1].b, "1234");
>> >              REFOFF is set to s[1].b - (char*)s.  */
>> >           offset_int off = tree_to_shwi (memrefoff);
>> >           refoff += off;
>> >         }
>> >
>> >       if (!integer_zerop (memrefoff))       <=================
>> >         /* A non-zero offset into an array of struct with flexible array
>> >            members implies that the array is empty because there is no
>> >            way to initialize such a member when it belongs to an array.
>> >            This must be some sort of a bug.  */
>> >         refsize = 0;
>> >     }
>> >
>> > needs MEM_REF offset to judge whether refsize should be set to zero.
>> > But I fold the offset into bitpos and the offset will always be zero.
>> >
>> > Suggestion?
>> 
>> The thing you want to fix is not get_inner_reference but the aarch64 backend
>> to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That
>> way
>> CSE can happen on GIMPLE which can handle the difference in the IL just
>> fine.
>> 
>> Richard.
>
> Yes, __builtin_aarch64_sqaddv16qi is not set any attributes to describe that
> it would not clobber global memory.  But I find it strange that when building
> SIMD built-in FUNCTION_DECLs they are not set any attributes in the backend.
>
> void
> aarch64_init_simd_builtins (void)
> {
> ...
>       ftype = build_function_type (return_type, args);
>
>       gcc_assert (ftype != NULL);
>
>       if (print_type_signature_p)
>         snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s_%s",
>                   d->name, type_signature);
>       else
>         snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s",
>                   d->name);
>
>       fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode);
>       aarch64_builtin_decls[fcode] = fndecl;
> ...
> }
> static tree
> aarch64_general_add_builtin (const char *name, tree type, unsigned int code)
> {
>   code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL;
>   return add_builtin_function (name, type, code, BUILT_IN_MD,
>                                NULL, NULL_TREE);
> }
>
> The loop in aarch64_init_simd_builtins creates FUNCTION_DECL node for each
> build-in function and put the node in array.  But it does not set any attributes.
> And I did not find interface for each build-in function to control the attributes.
>
> Did I miss anything?

No, this is unfortunately a known bug.  See:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

(Although the PR is recent, it's been a known bug for longer.)

As you say, the difficulty is that the correct attributes depend on what
the built-in function does.  Most integer arithmetic is “const”, but things
get more complicated for floating-point arithmetic.

The SVE intrinsics use a three stage process:

- each function is classified into one of several groups
- each group has a set of flags that describe what functions in the
  group can do
- these flags get converted into attributes based on the current
  command-line options

I guess we should have something similar for the arm_neon.h built-ins.

If you're willing to help fix this, that'd be great.  I think a first
step would be to agree a design.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-06  9:31     ` Richard Sandiford
@ 2020-07-07 12:49       ` xiezhiheng
  2020-07-07 14:07         ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-07-07 12:49 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Monday, July 6, 2020 5:31 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> No, this is unfortunately a known bug.  See:
> 
>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
> 
> (Although the PR is recent, it's been a known bug for longer.)
> 
> As you say, the difficulty is that the correct attributes depend on what
> the built-in function does.  Most integer arithmetic is “const”, but things
> get more complicated for floating-point arithmetic.
> 
> The SVE intrinsics use a three stage process:
> 
> - each function is classified into one of several groups
> - each group has a set of flags that describe what functions in the
>   group can do
> - these flags get converted into attributes based on the current
>   command-line options
> 
> I guess we should have something similar for the arm_neon.h built-ins.
> 
> If you're willing to help fix this, that'd be great.  I think a first
> step would be to agree a design.
> 
> Thanks,
> Richard

I'd like to have a try.  I have checked the steps in SVE intrinsics.
It defines a base class "function_base" and derives different classes
to describe several intrinsics for each.  And each class may
have its own unique flags described in virtual function "call_properties".
The specific attributes will be converted from these flags in
"get_attributes" later.

I find that there are more than 100 classes in total, and if I only
need to classify them into different groups by attributes, maybe
we do not need so many classes?

The difficult part, I think, is how to classify the neon intrinsics into
different groups.  I'm going to start by following the approach used for
the SVE intrinsics.

Xie Zhiheng

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-07 12:49       ` xiezhiheng
@ 2020-07-07 14:07         ` Richard Sandiford
  2020-07-15  8:49           ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-07-07 14:07 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Monday, July 6, 2020 5:31 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> No, this is unfortunately a known bug.  See:
>> 
>>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
>> 
>> (Although the PR is recent, it's been a known bug for longer.)
>> 
>> As you say, the difficulty is that the correct attributes depend on what
>> the built-in function does.  Most integer arithmetic is “const”, but things
>> get more complicated for floating-point arithmetic.
>> 
>> The SVE intrinsics use a three stage process:
>> 
>> - each function is classified into one of several groups
>> - each group has a set of flags that describe what functions in the
>>   group can do
>> - these flags get converted into attributes based on the current
>>   command-line options
>> 
>> I guess we should have something similar for the arm_neon.h built-ins.
>> 
>> If you're willing to help fix this, that'd be great.  I think a first
>> step would be to agree a design.
>> 
>> Thanks,
>> Richard
>
> I'd like to have a try.

Great!

> I have checked the steps in SVE intrinsics.
> It defines a base class "function_base" and derives different classes
> to describe several intrinsics for each.  And each class may
> have its own unique flags described in virtual function "call_properties".
> The specific attributes will be converted from these flags in
> "get_attributes" later.
>
> I find that there are more than 100 classes in total and if I only
> need to classify them into different groups by attributes, maybe
> we does not need so many classes?

Yeah, I agree.

Long term, there might be value in defining arm_neon.h in a similar
way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
a special compiler pragma.  But that's going to be a lot of work.

I think it's possible to make incremental improvements to the current
arm_neon.h implementation without that work being thrown away if we ever
did switch to a pragma in future.  And the incremental approach seems
more practical.

> The difficult thing I think is how to classify neon intrinsics into
> different groups.  I'm going to follow up the way in SVE intrinsics
> first now.

For now I'd suggest just giving a name to each combination of flags
that the intrinsics need, rather than splitting instructions in a
more fine-grained way.  (It's not at all obvious from the final state
of the SVE code, but even there, the idea was to have as few groups as
possible.  I.e. the groups were supposedly only split where necessary.
As you say, there still ended up being a lot of groups in the end…)

It'd be easier to review if the work was split up into smaller steps.
E.g. maybe one way would be this, with each number being a single
patch:

(1) (a) Add a flags field to the built-in function definitions
        that for now is always zero.
    (b) Pick a name N to describe the most conservative set of flags.
    (c) Make every built-in function definition use N.

(2) (a) Pick one type of function that cannot yet be described properly.
    (b) Pick a name N for that type of function.
    (c) Add whichever new flags are needed.
    (d) Add the appropriate attributes when the flags are set,
        possibly based on command-line options.
    (e) Make (exactly) one built-in function definition use N.

(3) (a) Pick some functions that all need the same attributes and
        that can already be described properly
    (b) Update all of their built-in function definitions accordingly,
        as a single change.

So after (1), filling out the table is an iterative process of (2) and
(3), in any order that's convenient (although it might help to order the
(2) patches so that each one adds as few flags as possible).  Each patch
would then be fairly small and self-contained.

That's just a suggestion though.  Please let me know if you have
any other suggestions.

I guess there are two obvious ways of adding the flags field:

- add a new parameter to every built-in function macro, e.g.
  BUILTIN_VSDQ_I and VAR1.

- wrap the definitions in a new macro, e.g.
  MY_NEW_GROUP (BUILTIN_VSDQ_I (BINOP, sqshl, 0))

I don't really have a preference, and I guess all other things being
equal, the first one wins by being more obvious than the second.
Just thought I'd mention the second way in case anyone preferred it.

Thanks,
Richard


^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-07 14:07         ` Richard Sandiford
@ 2020-07-15  8:49           ` xiezhiheng
  2020-07-16 12:41             ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-07-15  8:49 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 6228 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, July 7, 2020 10:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Monday, July 6, 2020 5:31 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> No, this is unfortunately a known bug.  See:
> >>
> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
> >>
> >> (Although the PR is recent, it's been a known bug for longer.)
> >>
> >> As you say, the difficulty is that the correct attributes depend on what
> >> the built-in function does.  Most integer arithmetic is “const”, but
> things
> >> get more complicated for floating-point arithmetic.
> >>
> >> The SVE intrinsics use a three stage process:
> >>
> >> - each function is classified into one of several groups
> >> - each group has a set of flags that describe what functions in the
> >>   group can do
> >> - these flags get converted into attributes based on the current
> >>   command-line options
> >>
> >> I guess we should have something similar for the arm_neon.h built-ins.
> >>
> >> If you're willing to help fix this, that'd be great.  I think a first
> >> step would be to agree a design.
> >>
> >> Thanks,
> >> Richard
> >
> > I'd like to have a try.
> 
> Great!
> 
> > I have checked the steps in SVE intrinsics.
> > It defines a base class "function_base" and derives different classes
> > to describe several intrinsics for each.  And each class may
> > have its own unique flags described in virtual function "call_properties".
> > The specific attributes will be converted from these flags in
> > "get_attributes" later.
> >
> > I find that there are more than 100 classes in total and if I only
> > need to classify them into different groups by attributes, maybe
> > we does not need so many classes?
> 
> Yeah, I agree.
> 
> Long term, there might be value in defining arm_neon.h in a similar
> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
> a special compiler pragma.  But that's going to be a lot of work.
> 
> I think it's possible to make incremental improvements to the current
> arm_neon.h implementation without that work being thrown away if we
> ever
> did switch to a pragma in future.  And the incremental approach seems
> more practical.
> 
> > The difficult thing I think is how to classify neon intrinsics into
> > different groups.  I'm going to follow up the way in SVE intrinsics
> > first now.
> 
> For now I'd suggest just giving a name to each combination of flags
> that the intrinsics need, rather than splitting instructions in a
> more fine-grained way.  (It's not at all obvious from the final state
> of the SVE code, but even there, the idea was to have as few groups as
> possible.  I.e. the groups were supposedly only split where necessary.
> As you say, there still ended up being a lot of groups in the end…)
> 
> It'd be easier to review if the work was split up into smaller steps.
> E.g. maybe one way would be this, with each number being a single
> patch:
> 
> (1) (a) Add a flags field to the built-in function definitions
>         that for now is always zero.
>     (b) Pick a name N to describe the most conservative set of flags.
>     (c) Make every built-in function definition use N.
> 

I have finished the first part.

(a) I add a new parameter called FLAG to every built-in function macro.

(b) I define some flags in aarch64-builtins.c
FLAG_NONE for functions that need no flags
FLAG_READ_FPCR for functions that read the FPCR register
FLAG_RAISE_FP_EXCEPTIONS for functions that can raise floating-point exceptions
FLAG_READ_MEMORY for functions that read global memory
FLAG_PREFETCH_MEMORY for functions that prefetch data to memory
FLAG_WRITE_MEMORY for functions that write global memory

FLAG_FP is used for floating-point arithmetic
FLAG_ALL is all flags above

(c) I add a field in struct aarch64_simd_builtin_datum to record flags
for each built-in function.  But the default flags I set for built-in functions
are FLAG_ALL because by default the built-in functions might do anything.

Bootstrap and regression tests passed on the aarch64 Linux platform.

Any suggestions?

Thanks,
Xie Zhiheng

> (2) (a) Pick one type of function that cannot yet be described properly.
>     (b) Pick a name N for that type of function.
>     (c) Add whichever new flags are needed.
>     (d) Add the appropriate attributes when the flags are set,
>         possibly based on command-line options.
>     (e) Make (exactly) one built-in function definition use N.
> 
> (3) (a) Pick some functions that all need the same attributes and
>         that can already be described properly
>     (b) Update all of their built-in function definitions accordingly,
>         as a single change.
> 
> So after (1), filling out the table is an iterative process of (2) and
> (3), in any order that's convenient (although it might help to order the
> (2) patches so that each one adds as few flags as possible).  Each patch
> would then be fairly small and self-contained.
> 
> That's just a suggestion though.  Please let me know if you have
> any other suggestions.
> 
> I guess there are two obvious ways of adding the flags field:
> 
> - add a new parameter to every built-in function macro, e.g.
>   BUILTIN_VSDQ_I and VAR1.
> 
> - wrap the definitions in a new macro, e.g.
>   MY_NEW_GROUP (BUILTIN_VSDQ_I (BINOP, sqshl, 0))
> 
> I don't really have a preference, and I guess all other things being
> equal, the first one wins by being more obvious than the second.
> Just thought I'd mention the second way in case anyone preferred it.
> 
> Thanks,
> Richard


[-- Attachment #2: pr94442-v1.patch --]
[-- Type: application/octet-stream, Size: 56848 bytes --]

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 49dfbafec3a..d5fb29048c4 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -117,6 +117,18 @@ enum aarch64_type_qualifiers
   qualifier_lane_quadtup_index = 0x1000,
 };
 
+/* Flags that describe what a function might do.  */
+const unsigned int FLAG_NONE = 0U;
+const unsigned int FLAG_READ_FPCR = 1U << 0;
+const unsigned int FLAG_RAISE_FP_EXCEPTIONS = 1U << 1;
+const unsigned int FLAG_READ_MEMORY = 1U << 2;
+const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
+const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
+
+const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
+const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
+  | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
+
 typedef struct
 {
   const char *name;
@@ -124,6 +136,7 @@ typedef struct
   const enum insn_code code;
   unsigned int fcode;
   enum aarch64_type_qualifiers *qualifiers;
+  unsigned int flags;
 } aarch64_simd_builtin_datum;
 
 static enum aarch64_type_qualifiers
@@ -336,53 +349,53 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define CF4(N, X) CODE_FOR_##N##X##4
 #define CF10(N, X) CODE_FOR_##N##X
 
-#define VAR1(T, N, MAP, A) \
-  {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T},
-#define VAR2(T, N, MAP, A, B) \
-  VAR1 (T, N, MAP, A) \
-  VAR1 (T, N, MAP, B)
-#define VAR3(T, N, MAP, A, B, C) \
-  VAR2 (T, N, MAP, A, B) \
-  VAR1 (T, N, MAP, C)
-#define VAR4(T, N, MAP, A, B, C, D) \
-  VAR3 (T, N, MAP, A, B, C) \
-  VAR1 (T, N, MAP, D)
-#define VAR5(T, N, MAP, A, B, C, D, E) \
-  VAR4 (T, N, MAP, A, B, C, D) \
-  VAR1 (T, N, MAP, E)
-#define VAR6(T, N, MAP, A, B, C, D, E, F) \
-  VAR5 (T, N, MAP, A, B, C, D, E) \
-  VAR1 (T, N, MAP, F)
-#define VAR7(T, N, MAP, A, B, C, D, E, F, G) \
-  VAR6 (T, N, MAP, A, B, C, D, E, F) \
-  VAR1 (T, N, MAP, G)
-#define VAR8(T, N, MAP, A, B, C, D, E, F, G, H) \
-  VAR7 (T, N, MAP, A, B, C, D, E, F, G) \
-  VAR1 (T, N, MAP, H)
-#define VAR9(T, N, MAP, A, B, C, D, E, F, G, H, I) \
-  VAR8 (T, N, MAP, A, B, C, D, E, F, G, H) \
-  VAR1 (T, N, MAP, I)
-#define VAR10(T, N, MAP, A, B, C, D, E, F, G, H, I, J) \
-  VAR9 (T, N, MAP, A, B, C, D, E, F, G, H, I) \
-  VAR1 (T, N, MAP, J)
-#define VAR11(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \
-  VAR10 (T, N, MAP, A, B, C, D, E, F, G, H, I, J) \
-  VAR1 (T, N, MAP, K)
-#define VAR12(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
-  VAR11 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \
-  VAR1 (T, N, MAP, L)
-#define VAR13(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
-  VAR12 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
-  VAR1 (T, N, MAP, M)
-#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
-  VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
-  VAR1 (T, X, MAP, N)
-#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
-  VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
-  VAR1 (T, X, MAP, O)
-#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
-  VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
-  VAR1 (T, X, MAP, P)
+#define VAR1(T, N, MAP, FLAG, A) \
+  {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
+#define VAR2(T, N, MAP, FLAG, A, B) \
+  VAR1 (T, N, MAP, FLAG, A) \
+  VAR1 (T, N, MAP, FLAG, B)
+#define VAR3(T, N, MAP, FLAG, A, B, C) \
+  VAR2 (T, N, MAP, FLAG, A, B) \
+  VAR1 (T, N, MAP, FLAG, C)
+#define VAR4(T, N, MAP, FLAG, A, B, C, D) \
+  VAR3 (T, N, MAP, FLAG, A, B, C) \
+  VAR1 (T, N, MAP, FLAG, D)
+#define VAR5(T, N, MAP, FLAG, A, B, C, D, E) \
+  VAR4 (T, N, MAP, FLAG, A, B, C, D) \
+  VAR1 (T, N, MAP, FLAG, E)
+#define VAR6(T, N, MAP, FLAG, A, B, C, D, E, F) \
+  VAR5 (T, N, MAP, FLAG, A, B, C, D, E) \
+  VAR1 (T, N, MAP, FLAG, F)
+#define VAR7(T, N, MAP, FLAG, A, B, C, D, E, F, G) \
+  VAR6 (T, N, MAP, FLAG, A, B, C, D, E, F) \
+  VAR1 (T, N, MAP, FLAG, G)
+#define VAR8(T, N, MAP, FLAG, A, B, C, D, E, F, G, H) \
+  VAR7 (T, N, MAP, FLAG, A, B, C, D, E, F, G) \
+  VAR1 (T, N, MAP, FLAG, H)
+#define VAR9(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I) \
+  VAR8 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H) \
+  VAR1 (T, N, MAP, FLAG, I)
+#define VAR10(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J) \
+  VAR9 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I) \
+  VAR1 (T, N, MAP, FLAG, J)
+#define VAR11(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR10 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J) \
+  VAR1 (T, N, MAP, FLAG, K)
+#define VAR12(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR11 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR1 (T, N, MAP, FLAG, L)
+#define VAR13(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, MAP, FLAG, M)
+#define VAR14(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+  VAR13 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR1 (T, X, MAP, FLAG, N)
+#define VAR15(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+  VAR14 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+  VAR1 (T, X, MAP, FLAG, O)
+#define VAR16(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+  VAR15 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+  VAR1 (T, X, MAP, FLAG, P)
 
 #include "aarch64-builtin-iterators.h"
 
@@ -438,7 +451,7 @@ typedef struct
   AARCH64_SIMD_BUILTIN_FCMLA_LANEQ##I##_##M,
 
 #undef VAR1
-#define VAR1(T, N, MAP, A) \
+#define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
 
 enum aarch64_builtins
@@ -2196,7 +2209,7 @@ aarch64_general_builtin_rsqrt (unsigned int fn)
 }
 
 #undef VAR1
-#define VAR1(T, N, MAP, A) \
+#define VAR1(T, N, MAP, FLAG, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
 
 /* Try to fold a call to the built-in function with subcode FCODE.  The
@@ -2209,11 +2222,11 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
 {
   switch (fcode)
     {
-      BUILTIN_VDQF (UNOP, abs, 2)
+      BUILTIN_VDQF (UNOP, abs, 2, ALL)
 	return fold_build1 (ABS_EXPR, type, args[0]);
-      VAR1 (UNOP, floatv2si, 2, v2sf)
-      VAR1 (UNOP, floatv4si, 2, v4sf)
-      VAR1 (UNOP, floatv2di, 2, v2df)
+      VAR1 (UNOP, floatv2si, 2, ALL, v2sf)
+      VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
+      VAR1 (UNOP, floatv2di, 2, ALL, v2df)
 	return fold_build1 (FLOAT_EXPR, type, args[0]);
       default:
 	break;
@@ -2239,24 +2252,24 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt)
      the arguments to the __builtin.  */
   switch (fcode)
     {
-      BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
+      BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
 	new_stmt = gimple_build_call_internal (IFN_REDUC_PLUS,
 					       1, args[0]);
 	gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
 	break;
-      BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
-      BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
+      BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10, ALL)
+      BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
 	new_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
 					       1, args[0]);
 	gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
 	break;
-      BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
-      BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
+      BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10, ALL)
+      BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, ALL)
 	new_stmt = gimple_build_call_internal (IFN_REDUC_MIN,
 					       1, args[0]);
 	gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
 	break;
-      BUILTIN_GPF (BINOP, fmulx, 0)
+      BUILTIN_GPF (BINOP, fmulx, 0, ALL)
 	{
 	  gcc_assert (nargs == 2);
 	  bool a0_cst_p = TREE_CODE (args[0]) == REAL_CST;
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 332a0b6b1ea..e8650121cd6 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -39,693 +39,693 @@
    1-9 - CODE_FOR_<name><mode><1-9>
    10 - CODE_FOR_<name><mode>.  */
 
-  BUILTIN_VDC (COMBINE, combine, 0)
-  VAR1 (COMBINEP, combine, 0, di)
-  BUILTIN_VB (BINOP, pmul, 0)
-  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2)
-  BUILTIN_VD_BHSI (BINOP, addp, 0)
-  VAR1 (UNOP, addp, 0, di)
-  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
-  BUILTIN_VDQ_BHSI (UNOP, clz, 2)
-  BUILTIN_VS (UNOP, ctz, 2)
-  BUILTIN_VB (UNOP, popcount, 2)
+  BUILTIN_VDC (COMBINE, combine, 0, ALL)
+  VAR1 (COMBINEP, combine, 0, ALL, di)
+  BUILTIN_VB (BINOP, pmul, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL)
+  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
+  BUILTIN_VD_BHSI (BINOP, addp, 0, ALL)
+  VAR1 (UNOP, addp, 0, ALL, di)
+  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL)
+  BUILTIN_VDQ_BHSI (UNOP, clz, 2, ALL)
+  BUILTIN_VS (UNOP, ctz, 2, ALL)
+  BUILTIN_VB (UNOP, popcount, 2, ALL)
 
   /* Implemented by aarch64_<sur>q<r>shl<mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqshl, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0)
-  BUILTIN_VSDQ_I (BINOP, sqrshl, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0)
+  BUILTIN_VSDQ_I (BINOP, sqshl, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP, sqrshl, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, ALL)
   /* Implemented by aarch64_<su_optab><optab><mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqadd, 0)
-  BUILTIN_VSDQ_I (BINOPU, uqadd, 0)
-  BUILTIN_VSDQ_I (BINOP, sqsub, 0)
-  BUILTIN_VSDQ_I (BINOPU, uqsub, 0)
+  BUILTIN_VSDQ_I (BINOP, sqadd, 0, ALL)
+  BUILTIN_VSDQ_I (BINOPU, uqadd, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP, sqsub, 0, ALL)
+  BUILTIN_VSDQ_I (BINOPU, uqsub, 0, ALL)
   /* Implemented by aarch64_<sur>qadd<mode>.  */
-  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0)
+  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, ALL)
 
   /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>.  */
-  BUILTIN_VDC (GETREG, get_dregoi, 0)
-  BUILTIN_VDC (GETREG, get_dregci, 0)
-  BUILTIN_VDC (GETREG, get_dregxi, 0)
-  VAR1 (GETREGP, get_dregoi, 0, di)
-  VAR1 (GETREGP, get_dregci, 0, di)
-  VAR1 (GETREGP, get_dregxi, 0, di)
+  BUILTIN_VDC (GETREG, get_dregoi, 0, ALL)
+  BUILTIN_VDC (GETREG, get_dregci, 0, ALL)
+  BUILTIN_VDC (GETREG, get_dregxi, 0, ALL)
+  VAR1 (GETREGP, get_dregoi, 0, ALL, di)
+  VAR1 (GETREGP, get_dregci, 0, ALL, di)
+  VAR1 (GETREGP, get_dregxi, 0, ALL, di)
   /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (GETREG, get_qregoi, 0)
-  BUILTIN_VQ (GETREG, get_qregci, 0)
-  BUILTIN_VQ (GETREG, get_qregxi, 0)
-  VAR1 (GETREGP, get_qregoi, 0, v2di)
-  VAR1 (GETREGP, get_qregci, 0, v2di)
-  VAR1 (GETREGP, get_qregxi, 0, v2di)
+  BUILTIN_VQ (GETREG, get_qregoi, 0, ALL)
+  BUILTIN_VQ (GETREG, get_qregci, 0, ALL)
+  BUILTIN_VQ (GETREG, get_qregxi, 0, ALL)
+  VAR1 (GETREGP, get_qregoi, 0, ALL, v2di)
+  VAR1 (GETREGP, get_qregci, 0, ALL, v2di)
+  VAR1 (GETREGP, get_qregxi, 0, ALL, v2di)
   /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (SETREG, set_qregoi, 0)
-  BUILTIN_VQ (SETREG, set_qregci, 0)
-  BUILTIN_VQ (SETREG, set_qregxi, 0)
-  VAR1 (SETREGP, set_qregoi, 0, v2di)
-  VAR1 (SETREGP, set_qregci, 0, v2di)
-  VAR1 (SETREGP, set_qregxi, 0, v2di)
+  BUILTIN_VQ (SETREG, set_qregoi, 0, ALL)
+  BUILTIN_VQ (SETREG, set_qregci, 0, ALL)
+  BUILTIN_VQ (SETREG, set_qregxi, 0, ALL)
+  VAR1 (SETREGP, set_qregoi, 0, ALL, v2di)
+  VAR1 (SETREGP, set_qregci, 0, ALL, v2di)
+  VAR1 (SETREGP, set_qregxi, 0, ALL, v2di)
   /* Implemented by aarch64_ld1x2<VQ:mode>. */
-  BUILTIN_VQ (LOADSTRUCT, ld1x2, 0)
+  BUILTIN_VQ (LOADSTRUCT, ld1x2, 0, ALL)
   /* Implemented by aarch64_ld1x2<VDC:mode>. */
-  BUILTIN_VDC (LOADSTRUCT, ld1x2, 0)
+  BUILTIN_VDC (LOADSTRUCT, ld1x2, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (LOADSTRUCT, ld2, 0)
-  BUILTIN_VDC (LOADSTRUCT, ld3, 0)
-  BUILTIN_VDC (LOADSTRUCT, ld4, 0)
+  BUILTIN_VDC (LOADSTRUCT, ld2, 0, ALL)
+  BUILTIN_VDC (LOADSTRUCT, ld3, 0, ALL)
+  BUILTIN_VDC (LOADSTRUCT, ld4, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (LOADSTRUCT, ld2, 0)
-  BUILTIN_VQ (LOADSTRUCT, ld3, 0)
-  BUILTIN_VQ (LOADSTRUCT, ld4, 0)
+  BUILTIN_VQ (LOADSTRUCT, ld2, 0, ALL)
+  BUILTIN_VQ (LOADSTRUCT, ld3, 0, ALL)
+  BUILTIN_VQ (LOADSTRUCT, ld4, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs>r<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0, ALL)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (STORESTRUCT, st2, 0)
-  BUILTIN_VDC (STORESTRUCT, st3, 0)
-  BUILTIN_VDC (STORESTRUCT, st4, 0)
+  BUILTIN_VDC (STORESTRUCT, st2, 0, ALL)
+  BUILTIN_VDC (STORESTRUCT, st3, 0, ALL)
+  BUILTIN_VDC (STORESTRUCT, st4, 0, ALL)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (STORESTRUCT, st2, 0)
-  BUILTIN_VQ (STORESTRUCT, st3, 0)
-  BUILTIN_VQ (STORESTRUCT, st4, 0)
-
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0)
-
-  BUILTIN_VQW (BINOP, saddl2, 0)
-  BUILTIN_VQW (BINOP, uaddl2, 0)
-  BUILTIN_VQW (BINOP, ssubl2, 0)
-  BUILTIN_VQW (BINOP, usubl2, 0)
-  BUILTIN_VQW (BINOP, saddw2, 0)
-  BUILTIN_VQW (BINOP, uaddw2, 0)
-  BUILTIN_VQW (BINOP, ssubw2, 0)
-  BUILTIN_VQW (BINOP, usubw2, 0)
+  BUILTIN_VQ (STORESTRUCT, st2, 0, ALL)
+  BUILTIN_VQ (STORESTRUCT, st3, 0, ALL)
+  BUILTIN_VQ (STORESTRUCT, st4, 0, ALL)
+
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0, ALL)
+
+  BUILTIN_VQW (BINOP, saddl2, 0, ALL)
+  BUILTIN_VQW (BINOP, uaddl2, 0, ALL)
+  BUILTIN_VQW (BINOP, ssubl2, 0, ALL)
+  BUILTIN_VQW (BINOP, usubl2, 0, ALL)
+  BUILTIN_VQW (BINOP, saddw2, 0, ALL)
+  BUILTIN_VQW (BINOP, uaddw2, 0, ALL)
+  BUILTIN_VQW (BINOP, ssubw2, 0, ALL)
+  BUILTIN_VQW (BINOP, usubw2, 0, ALL)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddl, 0)
-  BUILTIN_VD_BHSI (BINOP, uaddl, 0)
-  BUILTIN_VD_BHSI (BINOP, ssubl, 0)
-  BUILTIN_VD_BHSI (BINOP, usubl, 0)
+  BUILTIN_VD_BHSI (BINOP, saddl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, uaddl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, ssubl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, usubl, 0, ALL)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddw, 0)
-  BUILTIN_VD_BHSI (BINOP, uaddw, 0)
-  BUILTIN_VD_BHSI (BINOP, ssubw, 0)
-  BUILTIN_VD_BHSI (BINOP, usubw, 0)
+  BUILTIN_VD_BHSI (BINOP, saddw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, uaddw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, ssubw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, usubw, 0, ALL)
   /* Implemented by aarch64_<sur>h<addsub><mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, shadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, shsub, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0)
-  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0)
+  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, ALL)
   /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
-  BUILTIN_VQN (BINOP, addhn, 0)
-  BUILTIN_VQN (BINOP, subhn, 0)
-  BUILTIN_VQN (BINOP, raddhn, 0)
-  BUILTIN_VQN (BINOP, rsubhn, 0)
+  BUILTIN_VQN (BINOP, addhn, 0, ALL)
+  BUILTIN_VQN (BINOP, subhn, 0, ALL)
+  BUILTIN_VQN (BINOP, raddhn, 0, ALL)
+  BUILTIN_VQN (BINOP, rsubhn, 0, ALL)
   /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
-  BUILTIN_VQN (TERNOP, addhn2, 0)
-  BUILTIN_VQN (TERNOP, subhn2, 0)
-  BUILTIN_VQN (TERNOP, raddhn2, 0)
-  BUILTIN_VQN (TERNOP, rsubhn2, 0)
+  BUILTIN_VQN (TERNOP, addhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, subhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, raddhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, rsubhn2, 0, ALL)
 
-  BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0)
+  BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0, ALL)
   /* Implemented by aarch64_<sur>qmovn<mode>.  */
-  BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0)
-  BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0)
+  BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, ALL)
+  BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, ALL)
   /* Implemented by aarch64_s<optab><mode>.  */
-  BUILTIN_VSDQ_I (UNOP, sqabs, 0)
-  BUILTIN_VSDQ_I (UNOP, sqneg, 0)
+  BUILTIN_VSDQ_I (UNOP, sqabs, 0, ALL)
+  BUILTIN_VSDQ_I (UNOP, sqneg, 0, ALL)
 
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l<mode>.  */
-  BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0)
-  BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0)
+  BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0, ALL)
+  BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0, ALL)
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l_lane<mode>.  */
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0)
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0, ALL)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0, ALL)
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l_laneq<mode>.  */
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0)
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0, ALL)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0, ALL)
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l_n<mode>.  */
-  BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0)
-  BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0)
-
-  BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0)
-
-  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0)
-  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0)
-
-  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
-  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
-
-  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0)
-  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0)
-  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0)
-  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0)
-  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0)
-  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0)
-  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0)
-  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0)
-
-  BUILTIN_VSD_HSI (BINOP, sqdmull, 0)
-  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0)
-  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0)
-  BUILTIN_VD_HSI (BINOP, sqdmull_n, 0)
-  BUILTIN_VQ_HSI (BINOP, sqdmull2, 0)
-  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0)
-  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0)
-  BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0)
+  BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0, ALL)
+  BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0, ALL)
+
+  BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0, ALL)
+
+  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, ALL)
+  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, ALL)
+
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, ALL)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, ALL)
+
+  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, ALL)
+  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0, ALL)
+  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0, ALL)
+  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0, ALL)
+
+  BUILTIN_VSD_HSI (BINOP, sqdmull, 0, ALL)
+  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0, ALL)
+  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0, ALL)
+  BUILTIN_VD_HSI (BINOP, sqdmull_n, 0, ALL)
+  BUILTIN_VQ_HSI (BINOP, sqdmull2, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0, ALL)
+  BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0, ALL)
   /* Implemented by aarch64_sq<r>dmulh<mode>.  */
-  BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0)
-  BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0)
+  BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0, ALL)
+  BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0, ALL)
   /* Implemented by aarch64_sq<r>dmulh_lane<q><mode>.  */
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0, ALL)
 
-  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
+  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3, ALL)
   /* Implemented by aarch64_<sur>shl<mode>.  */
-  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0)
+  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, ALL)
 
   /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot, 0)
-  BUILTIN_VB (TERNOPU, udot, 0)
-  BUILTIN_VB (TERNOP_SSUS, usdot, 0)
-  BUILTIN_VB (QUADOP_LANE, sdot_lane, 0)
-  BUILTIN_VB (QUADOPU_LANE, udot_lane, 0)
-  BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0)
-  BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0)
-  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0)
-  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0)
-  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0)
-  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0)
+  BUILTIN_VB (TERNOP, sdot, 0, ALL)
+  BUILTIN_VB (TERNOPU, udot, 0, ALL)
+  BUILTIN_VB (TERNOP_SSUS, usdot, 0, ALL)
+  BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, ALL)
+  BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, ALL)
+  BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, ALL)
+  BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0, ALL)
+  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0, ALL)
+  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0, ALL)
+  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0, ALL)
+  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0, ALL)
 
   /* Implemented by aarch64_fcadd<rot><mode>.   */
-  BUILTIN_VHSDF (BINOP, fcadd90, 0)
-  BUILTIN_VHSDF (BINOP, fcadd270, 0)
+  BUILTIN_VHSDF (BINOP, fcadd90, 0, ALL)
+  BUILTIN_VHSDF (BINOP, fcadd270, 0, ALL)
 
   /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>.   */
-  BUILTIN_VHSDF (TERNOP, fcmla0, 0)
-  BUILTIN_VHSDF (TERNOP, fcmla90, 0)
-  BUILTIN_VHSDF (TERNOP, fcmla180, 0)
-  BUILTIN_VHSDF (TERNOP, fcmla270, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0)
-
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0)
-
-  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3)
-  VAR1 (SHIFTIMM, ashr_simd, 0, di)
-  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3)
-  VAR1 (USHIFTIMM, lshr_simd, 0, di)
+  BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL)
+  BUILTIN_VHSDF (TERNOP, fcmla90, 0, ALL)
+  BUILTIN_VHSDF (TERNOP, fcmla180, 0, ALL)
+  BUILTIN_VHSDF (TERNOP, fcmla270, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0, ALL)
+
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0, ALL)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0, ALL)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0, ALL)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0, ALL)
+
+  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3, ALL)
+  VAR1 (SHIFTIMM, ashr_simd, 0, ALL, di)
+  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3, ALL)
+  VAR1 (USHIFTIMM, lshr_simd, 0, ALL, di)
   /* Implemented by aarch64_<sur>shr_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0, ALL)
   /* Implemented by aarch64_<sur>sra_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0)
-  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0, ALL)
   /* Implemented by aarch64_<sur>shll_n<mode>.  */
-  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0)
-  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0)
+  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0, ALL)
+  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0, ALL)
   /* Implemented by aarch64_<sur>shll2_n<mode>.  */
-  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0)
-  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0)
+  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0, ALL)
+  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0, ALL)
   /* Implemented by aarch64_<sur>q<r>shr<u>n_n<mode>.  */
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0)
-  BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0)
-  BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0, ALL)
   /* Implemented by aarch64_<sur>s<lr>i_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0)
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0)
-  VAR2 (SHIFTINSERTP, ssli_n, 0, di, v2di)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, ALL)
+  VAR2 (SHIFTINSERTP, ssli_n, 0, ALL, di, v2di)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, ALL)
   /* Implemented by aarch64_<sur>qshl<u>_n<mode>.  */
-  BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0)
-  BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0)
-  BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0)
+  BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, ALL)
+  BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0, ALL)
+  BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
 
   /* Implemented by aarch64_reduc_plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
+  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
 
   /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10)
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
-  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10)
-  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, ALL)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10, ALL)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, ALL)
+  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10, ALL)
+  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10, ALL)
 
   /* Implemented by <maxmin_uns><mode>3.
      smax variants map to fmaxnm,
      smax_nan variants map to fmax.  */
-  BUILTIN_VDQ_BHSI (BINOP, smax, 3)
-  BUILTIN_VDQ_BHSI (BINOP, smin, 3)
-  BUILTIN_VDQ_BHSI (BINOP, umax, 3)
-  BUILTIN_VDQ_BHSI (BINOP, umin, 3)
-  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3)
-  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3)
+  BUILTIN_VDQ_BHSI (BINOP, smax, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, smin, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, umax, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, umin, 3, ALL)
+  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3, ALL)
+  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3, ALL)
 
   /* Implemented by <maxmin_uns><mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3)
-  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3)
+  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3, ALL)
 
   /* Implemented by aarch64_<maxmin_uns>p<mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, sminp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uminp, 0)
-  BUILTIN_VHSDF (BINOP, smaxp, 0)
-  BUILTIN_VHSDF (BINOP, sminp, 0)
-  BUILTIN_VHSDF (BINOP, smax_nanp, 0)
-  BUILTIN_VHSDF (BINOP, smin_nanp, 0)
+  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, sminp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, uminp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, smaxp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, sminp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, smax_nanp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, smin_nanp, 0, ALL)
 
   /* Implemented by <frint_pattern><mode>2.  */
-  BUILTIN_VHSDF (UNOP, btrunc, 2)
-  BUILTIN_VHSDF (UNOP, ceil, 2)
-  BUILTIN_VHSDF (UNOP, floor, 2)
-  BUILTIN_VHSDF (UNOP, nearbyint, 2)
-  BUILTIN_VHSDF (UNOP, rint, 2)
-  BUILTIN_VHSDF (UNOP, round, 2)
-  BUILTIN_VHSDF_DF (UNOP, frintn, 2)
-
-  VAR1 (UNOP, btrunc, 2, hf)
-  VAR1 (UNOP, ceil, 2, hf)
-  VAR1 (UNOP, floor, 2, hf)
-  VAR1 (UNOP, frintn, 2, hf)
-  VAR1 (UNOP, nearbyint, 2, hf)
-  VAR1 (UNOP, rint, 2, hf)
-  VAR1 (UNOP, round, 2, hf)
+  BUILTIN_VHSDF (UNOP, btrunc, 2, ALL)
+  BUILTIN_VHSDF (UNOP, ceil, 2, ALL)
+  BUILTIN_VHSDF (UNOP, floor, 2, ALL)
+  BUILTIN_VHSDF (UNOP, nearbyint, 2, ALL)
+  BUILTIN_VHSDF (UNOP, rint, 2, ALL)
+  BUILTIN_VHSDF (UNOP, round, 2, ALL)
+  BUILTIN_VHSDF_DF (UNOP, frintn, 2, ALL)
+
+  VAR1 (UNOP, btrunc, 2, ALL, hf)
+  VAR1 (UNOP, ceil, 2, ALL, hf)
+  VAR1 (UNOP, floor, 2, ALL, hf)
+  VAR1 (UNOP, frintn, 2, ALL, hf)
+  VAR1 (UNOP, nearbyint, 2, ALL, hf)
+  VAR1 (UNOP, rint, 2, ALL, hf)
+  VAR1 (UNOP, round, 2, ALL, hf)
 
   /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
-  VAR1 (UNOP, lbtruncv4hf, 2, v4hi)
-  VAR1 (UNOP, lbtruncv8hf, 2, v8hi)
-  VAR1 (UNOP, lbtruncv2sf, 2, v2si)
-  VAR1 (UNOP, lbtruncv4sf, 2, v4si)
-  VAR1 (UNOP, lbtruncv2df, 2, v2di)
-
-  VAR1 (UNOPUS, lbtruncuv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lbtruncuv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si)
-  VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si)
-  VAR1 (UNOPUS, lbtruncuv2df, 2, v2di)
-
-  VAR1 (UNOP, lroundv4hf, 2, v4hi)
-  VAR1 (UNOP, lroundv8hf, 2, v8hi)
-  VAR1 (UNOP, lroundv2sf, 2, v2si)
-  VAR1 (UNOP, lroundv4sf, 2, v4si)
-  VAR1 (UNOP, lroundv2df, 2, v2di)
+  VAR1 (UNOP, lbtruncv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lbtruncv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lbtruncv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lbtruncv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lbtruncv2df, 2, ALL, v2di)
+
+  VAR1 (UNOPUS, lbtruncuv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lbtruncuv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lbtruncuv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lbtruncuv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lbtruncuv2df, 2, ALL, v2di)
+
+  VAR1 (UNOP, lroundv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lroundv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lroundv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lroundv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lroundv2df, 2, ALL, v2di)
   /* Implemented by l<fcvt_pattern><su_optab><GPF_F16:mode><GPI:mode>2.  */
-  BUILTIN_GPI_I16 (UNOP, lroundhf, 2)
-  VAR1 (UNOP, lroundsf, 2, si)
-  VAR1 (UNOP, lrounddf, 2, di)
-
-  VAR1 (UNOPUS, lrounduv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lrounduv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lrounduv2sf, 2, v2si)
-  VAR1 (UNOPUS, lrounduv4sf, 2, v4si)
-  VAR1 (UNOPUS, lrounduv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2)
-  VAR1 (UNOPUS, lroundusf, 2, si)
-  VAR1 (UNOPUS, lroundudf, 2, di)
-
-  VAR1 (UNOP, lceilv4hf, 2, v4hi)
-  VAR1 (UNOP, lceilv8hf, 2, v8hi)
-  VAR1 (UNOP, lceilv2sf, 2, v2si)
-  VAR1 (UNOP, lceilv4sf, 2, v4si)
-  VAR1 (UNOP, lceilv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOP, lceilhf, 2)
-
-  VAR1 (UNOPUS, lceiluv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lceiluv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lceiluv2sf, 2, v2si)
-  VAR1 (UNOPUS, lceiluv4sf, 2, v4si)
-  VAR1 (UNOPUS, lceiluv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2)
-  VAR1 (UNOPUS, lceilusf, 2, si)
-  VAR1 (UNOPUS, lceiludf, 2, di)
-
-  VAR1 (UNOP, lfloorv4hf, 2, v4hi)
-  VAR1 (UNOP, lfloorv8hf, 2, v8hi)
-  VAR1 (UNOP, lfloorv2sf, 2, v2si)
-  VAR1 (UNOP, lfloorv4sf, 2, v4si)
-  VAR1 (UNOP, lfloorv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2)
-
-  VAR1 (UNOPUS, lflooruv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lflooruv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lflooruv2sf, 2, v2si)
-  VAR1 (UNOPUS, lflooruv4sf, 2, v4si)
-  VAR1 (UNOPUS, lflooruv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2)
-  VAR1 (UNOPUS, lfloorusf, 2, si)
-  VAR1 (UNOPUS, lfloorudf, 2, di)
-
-  VAR1 (UNOP, lfrintnv4hf, 2, v4hi)
-  VAR1 (UNOP, lfrintnv8hf, 2, v8hi)
-  VAR1 (UNOP, lfrintnv2sf, 2, v2si)
-  VAR1 (UNOP, lfrintnv4sf, 2, v4si)
-  VAR1 (UNOP, lfrintnv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2)
-  VAR1 (UNOP, lfrintnsf, 2, si)
-  VAR1 (UNOP, lfrintndf, 2, di)
-
-  VAR1 (UNOPUS, lfrintnuv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lfrintnuv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si)
-  VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si)
-  VAR1 (UNOPUS, lfrintnuv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2)
-  VAR1 (UNOPUS, lfrintnusf, 2, si)
-  VAR1 (UNOPUS, lfrintnudf, 2, di)
+  BUILTIN_GPI_I16 (UNOP, lroundhf, 2, ALL)
+  VAR1 (UNOP, lroundsf, 2, ALL, si)
+  VAR1 (UNOP, lrounddf, 2, ALL, di)
+
+  VAR1 (UNOPUS, lrounduv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lrounduv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lrounduv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lrounduv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lrounduv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2, ALL)
+  VAR1 (UNOPUS, lroundusf, 2, ALL, si)
+  VAR1 (UNOPUS, lroundudf, 2, ALL, di)
+
+  VAR1 (UNOP, lceilv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lceilv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lceilv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lceilv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lceilv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOP, lceilhf, 2, ALL)
+
+  VAR1 (UNOPUS, lceiluv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lceiluv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lceiluv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lceiluv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lceiluv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2, ALL)
+  VAR1 (UNOPUS, lceilusf, 2, ALL, si)
+  VAR1 (UNOPUS, lceiludf, 2, ALL, di)
+
+  VAR1 (UNOP, lfloorv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lfloorv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lfloorv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lfloorv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lfloorv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2, ALL)
+
+  VAR1 (UNOPUS, lflooruv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lflooruv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lflooruv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lflooruv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lflooruv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2, ALL)
+  VAR1 (UNOPUS, lfloorusf, 2, ALL, si)
+  VAR1 (UNOPUS, lfloorudf, 2, ALL, di)
+
+  VAR1 (UNOP, lfrintnv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lfrintnv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lfrintnv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lfrintnv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lfrintnv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2, ALL)
+  VAR1 (UNOP, lfrintnsf, 2, ALL, si)
+  VAR1 (UNOP, lfrintndf, 2, ALL, di)
+
+  VAR1 (UNOPUS, lfrintnuv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lfrintnuv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lfrintnuv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lfrintnuv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lfrintnuv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2, ALL)
+  VAR1 (UNOPUS, lfrintnusf, 2, ALL, si)
+  VAR1 (UNOPUS, lfrintnudf, 2, ALL, di)
 
   /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
-  VAR1 (UNOP, floatv4hi, 2, v4hf)
-  VAR1 (UNOP, floatv8hi, 2, v8hf)
-  VAR1 (UNOP, floatv2si, 2, v2sf)
-  VAR1 (UNOP, floatv4si, 2, v4sf)
-  VAR1 (UNOP, floatv2di, 2, v2df)
+  VAR1 (UNOP, floatv4hi, 2, ALL, v4hf)
+  VAR1 (UNOP, floatv8hi, 2, ALL, v8hf)
+  VAR1 (UNOP, floatv2si, 2, ALL, v2sf)
+  VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
+  VAR1 (UNOP, floatv2di, 2, ALL, v2df)
 
-  VAR1 (UNOP, floatunsv4hi, 2, v4hf)
-  VAR1 (UNOP, floatunsv8hi, 2, v8hf)
-  VAR1 (UNOP, floatunsv2si, 2, v2sf)
-  VAR1 (UNOP, floatunsv4si, 2, v4sf)
-  VAR1 (UNOP, floatunsv2di, 2, v2df)
+  VAR1 (UNOP, floatunsv4hi, 2, ALL, v4hf)
+  VAR1 (UNOP, floatunsv8hi, 2, ALL, v8hf)
+  VAR1 (UNOP, floatunsv2si, 2, ALL, v2sf)
+  VAR1 (UNOP, floatunsv4si, 2, ALL, v4sf)
+  VAR1 (UNOP, floatunsv2di, 2, ALL, v2df)
 
-  VAR5 (UNOPU, bswap, 2, v4hi, v8hi, v2si, v4si, v2di)
+  VAR5 (UNOPU, bswap, 2, ALL, v4hi, v8hi, v2si, v4si, v2di)
 
-  BUILTIN_VB (UNOP, rbit, 0)
+  BUILTIN_VB (UNOP, rbit, 0, ALL)
 
   /* Implemented by
      aarch64_<PERMUTE:perm_insn><mode>.  */
-  BUILTIN_VALL (BINOP, zip1, 0)
-  BUILTIN_VALL (BINOP, zip2, 0)
-  BUILTIN_VALL (BINOP, uzp1, 0)
-  BUILTIN_VALL (BINOP, uzp2, 0)
-  BUILTIN_VALL (BINOP, trn1, 0)
-  BUILTIN_VALL (BINOP, trn2, 0)
+  BUILTIN_VALL (BINOP, zip1, 0, ALL)
+  BUILTIN_VALL (BINOP, zip2, 0, ALL)
+  BUILTIN_VALL (BINOP, uzp1, 0, ALL)
+  BUILTIN_VALL (BINOP, uzp2, 0, ALL)
+  BUILTIN_VALL (BINOP, trn1, 0, ALL)
+  BUILTIN_VALL (BINOP, trn2, 0, ALL)
 
-  BUILTIN_GPF_F16 (UNOP, frecpe, 0)
-  BUILTIN_GPF_F16 (UNOP, frecpx, 0)
+  BUILTIN_GPF_F16 (UNOP, frecpe, 0, ALL)
+  BUILTIN_GPF_F16 (UNOP, frecpx, 0, ALL)
 
-  BUILTIN_VDQ_SI (UNOP, urecpe, 0)
+  BUILTIN_VDQ_SI (UNOP, urecpe, 0, ALL)
 
-  BUILTIN_VHSDF (UNOP, frecpe, 0)
-  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0)
+  BUILTIN_VHSDF (UNOP, frecpe, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0, ALL)
 
   /* Implemented by a mixture of abs2 patterns.  Note the DImode builtin is
      only ever used for the int64x1_t intrinsic, there is no scalar version.  */
-  BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
-  BUILTIN_VHSDF (UNOP, abs, 2)
-  VAR1 (UNOP, abs, 2, hf)
+  BUILTIN_VSDQ_I_DI (UNOP, abs, 0, ALL)
+  BUILTIN_VHSDF (UNOP, abs, 2, ALL)
+  VAR1 (UNOP, abs, 2, ALL, hf)
 
-  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
-  VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
-  VAR1 (BINOP, float_truncate_hi_, 0, v8hf)
+  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10, ALL)
+  VAR1 (BINOP, float_truncate_hi_, 0, ALL, v4sf)
+  VAR1 (BINOP, float_truncate_hi_, 0, ALL, v8hf)
 
-  VAR1 (UNOP, float_extend_lo_, 0, v2df)
-  VAR1 (UNOP, float_extend_lo_,  0, v4sf)
-  BUILTIN_VDF (UNOP, float_truncate_lo_, 0)
+  VAR1 (UNOP, float_extend_lo_, 0, ALL, v2df)
+  VAR1 (UNOP, float_extend_lo_,  0, ALL, v4sf)
+  BUILTIN_VDF (UNOP, float_truncate_lo_, 0, ALL)
 
   /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (LOAD1, ld1, 0)
-  VAR1(STORE1P, ld1, 0, v2di)
+  BUILTIN_VALL_F16 (LOAD1, ld1, 0, ALL)
+  VAR1(STORE1P, ld1, 0, ALL, v2di)
 
   /* Implemented by aarch64_st1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (STORE1, st1, 0)
-  VAR1(STORE1P, st1, 0, v2di)
+  BUILTIN_VALL_F16 (STORE1, st1, 0, ALL)
+  VAR1(STORE1P, st1, 0, ALL, v2di)
 
   /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0, ALL)
 
   /* Implemented by aarch64_ld1x4<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0, ALL)
 
   /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0, ALL)
 
   /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0, ALL)
 
   /* Implemented by aarch64_st1x4<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, ALL)
 
   /* Implemented by fma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fma, 4)
-  VAR1 (TERNOP, fma, 4, hf)
+  BUILTIN_VHSDF (TERNOP, fma, 4, ALL)
+  VAR1 (TERNOP, fma, 4, ALL, hf)
   /* Implemented by fnma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fnma, 4)
-  VAR1 (TERNOP, fnma, 4, hf)
+  BUILTIN_VHSDF (TERNOP, fnma, 4, ALL)
+  VAR1 (TERNOP, fnma, 4, ALL, hf)
 
   /* Implemented by aarch64_simd_bsl<mode>.  */
-  BUILTIN_VDQQH (BSL_P, simd_bsl, 0)
-  VAR2 (BSL_P, simd_bsl,0, di, v2di)
-  BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0)
-  BUILTIN_VALLDIF (BSL_S, simd_bsl, 0)
+  BUILTIN_VDQQH (BSL_P, simd_bsl, 0, ALL)
+  VAR2 (BSL_P, simd_bsl,0, ALL, di, v2di)
+  BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0, ALL)
+  BUILTIN_VALLDIF (BSL_S, simd_bsl, 0, ALL)
 
   /* Implemented by aarch64_crypto_aes<op><mode>.  */
-  VAR1 (BINOPU, crypto_aese, 0, v16qi)
-  VAR1 (BINOPU, crypto_aesd, 0, v16qi)
-  VAR1 (UNOPU, crypto_aesmc, 0, v16qi)
-  VAR1 (UNOPU, crypto_aesimc, 0, v16qi)
+  VAR1 (BINOPU, crypto_aese, 0, ALL, v16qi)
+  VAR1 (BINOPU, crypto_aesd, 0, ALL, v16qi)
+  VAR1 (UNOPU, crypto_aesmc, 0, ALL, v16qi)
+  VAR1 (UNOPU, crypto_aesimc, 0, ALL, v16qi)
 
   /* Implemented by aarch64_crypto_sha1<op><mode>.  */
-  VAR1 (UNOPU, crypto_sha1h, 0, si)
-  VAR1 (BINOPU, crypto_sha1su1, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1c, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1m, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1p, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1su0, 0, v4si)
+  VAR1 (UNOPU, crypto_sha1h, 0, ALL, si)
+  VAR1 (BINOPU, crypto_sha1su1, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1c, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1m, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1p, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1su0, 0, ALL, v4si)
 
   /* Implemented by aarch64_crypto_sha256<op><mode>.  */
-  VAR1 (TERNOPU, crypto_sha256h, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha256h2, 0, v4si)
-  VAR1 (BINOPU, crypto_sha256su0, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha256su1, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha256h, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha256h2, 0, ALL, v4si)
+  VAR1 (BINOPU, crypto_sha256su0, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha256su1, 0, ALL, v4si)
 
   /* Implemented by aarch64_crypto_pmull<mode>.  */
-  VAR1 (BINOPP, crypto_pmull, 0, di)
-  VAR1 (BINOPP, crypto_pmull, 0, v2di)
+  VAR1 (BINOPP, crypto_pmull, 0, ALL, di)
+  VAR1 (BINOPP, crypto_pmull, 0, ALL, v2di)
 
   /* Implemented by aarch64_tbl3<mode>.  */
-  VAR1 (BINOP, tbl3, 0, v8qi)
-  VAR1 (BINOP, tbl3, 0, v16qi)
+  VAR1 (BINOP, tbl3, 0, ALL, v8qi)
+  VAR1 (BINOP, tbl3, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbl3<mode>.  */
-  VAR1 (BINOP, qtbl3, 0, v8qi)
-  VAR1 (BINOP, qtbl3, 0, v16qi)
+  VAR1 (BINOP, qtbl3, 0, ALL, v8qi)
+  VAR1 (BINOP, qtbl3, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbl4<mode>.  */
-  VAR1 (BINOP, qtbl4, 0, v8qi)
-  VAR1 (BINOP, qtbl4, 0, v16qi)
+  VAR1 (BINOP, qtbl4, 0, ALL, v8qi)
+  VAR1 (BINOP, qtbl4, 0, ALL, v16qi)
 
   /* Implemented by aarch64_tbx4<mode>.  */
-  VAR1 (TERNOP, tbx4, 0, v8qi)
-  VAR1 (TERNOP, tbx4, 0, v16qi)
+  VAR1 (TERNOP, tbx4, 0, ALL, v8qi)
+  VAR1 (TERNOP, tbx4, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbx3<mode>.  */
-  VAR1 (TERNOP, qtbx3, 0, v8qi)
-  VAR1 (TERNOP, qtbx3, 0, v16qi)
+  VAR1 (TERNOP, qtbx3, 0, ALL, v8qi)
+  VAR1 (TERNOP, qtbx3, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbx4<mode>.  */
-  VAR1 (TERNOP, qtbx4, 0, v8qi)
-  VAR1 (TERNOP, qtbx4, 0, v16qi)
+  VAR1 (TERNOP, qtbx4, 0, ALL, v8qi)
+  VAR1 (TERNOP, qtbx4, 0, ALL, v16qi)
 
   /* Builtins for ARMv8.1-A Adv.SIMD instructions.  */
 
   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h<mode>.  */
-  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0)
-  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0)
+  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0, ALL)
 
   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_lane<mode>.  */
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0)
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0, ALL)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0, ALL)
 
   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>.  */
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0, ALL)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0, ALL)
 
   /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3.  */
-  BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3)
-  BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3)
-  BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3)
-  BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3)
-  VAR1 (SHIFTIMM, scvtfsi, 3, hf)
-  VAR1 (SHIFTIMM, scvtfdi, 3, hf)
-  VAR1 (FCVTIMM_SUS, ucvtfsi, 3, hf)
-  VAR1 (FCVTIMM_SUS, ucvtfdi, 3, hf)
-  BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3)
-  BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3)
+  BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3, ALL)
+  BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3, ALL)
+  BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3, ALL)
+  BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3, ALL)
+  VAR1 (SHIFTIMM, scvtfsi, 3, ALL, hf)
+  VAR1 (SHIFTIMM, scvtfdi, 3, ALL, hf)
+  VAR1 (FCVTIMM_SUS, ucvtfsi, 3, ALL, hf)
+  VAR1 (FCVTIMM_SUS, ucvtfdi, 3, ALL, hf)
+  BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3, ALL)
+  BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3, ALL)
 
   /* Implemented by aarch64_rsqrte<mode>.  */
-  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0)
+  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0, ALL)
 
   /* Implemented by aarch64_rsqrts<mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0)
+  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0, ALL)
 
   /* Implemented by fabd<mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3)
+  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, ALL)
 
   /* Implemented by aarch64_faddp<mode>.  */
-  BUILTIN_VHSDF (BINOP, faddp, 0)
+  BUILTIN_VHSDF (BINOP, faddp, 0, ALL)
 
   /* Implemented by aarch64_cm<optab><mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0, ALL)
 
   /* Implemented by neg<mode>2.  */
-  BUILTIN_VHSDF_HSDF (UNOP, neg, 2)
+  BUILTIN_VHSDF_HSDF (UNOP, neg, 2, ALL)
 
   /* Implemented by aarch64_fac<optab><mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, ALL)
 
   /* Implemented by sqrt<mode>2.  */
-  VAR1 (UNOP, sqrt, 2, hf)
+  VAR1 (UNOP, sqrt, 2, ALL, hf)
 
   /* Implemented by <optab><mode>hf2.  */
-  VAR1 (UNOP, floatdi, 2, hf)
-  VAR1 (UNOP, floatsi, 2, hf)
-  VAR1 (UNOP, floathi, 2, hf)
-  VAR1 (UNOPUS, floatunsdi, 2, hf)
-  VAR1 (UNOPUS, floatunssi, 2, hf)
-  VAR1 (UNOPUS, floatunshi, 2, hf)
-  BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2)
-  BUILTIN_GPI (UNOP, fix_truncsf, 2)
-  BUILTIN_GPI (UNOP, fix_truncdf, 2)
-  BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2)
-  BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2)
-  BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2)
+  VAR1 (UNOP, floatdi, 2, ALL, hf)
+  VAR1 (UNOP, floatsi, 2, ALL, hf)
+  VAR1 (UNOP, floathi, 2, ALL, hf)
+  VAR1 (UNOPUS, floatunsdi, 2, ALL, hf)
+  VAR1 (UNOPUS, floatunssi, 2, ALL, hf)
+  VAR1 (UNOPUS, floatunshi, 2, ALL, hf)
+  BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2, ALL)
+  BUILTIN_GPI (UNOP, fix_truncsf, 2, ALL)
+  BUILTIN_GPI (UNOP, fix_truncdf, 2, ALL)
+  BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2, ALL)
+  BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2, ALL)
+  BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2, ALL)
 
   /* Implemented by aarch64_sm3ss1qv4si.  */
-  VAR1 (TERNOPU, sm3ss1q, 0, v4si)
+  VAR1 (TERNOPU, sm3ss1q, 0, ALL, v4si)
   /* Implemented by aarch64_sm3tt<sm3tt_op>qv4si.  */
-  VAR1 (QUADOPUI, sm3tt1aq, 0, v4si)
-  VAR1 (QUADOPUI, sm3tt1bq, 0, v4si)
-  VAR1 (QUADOPUI, sm3tt2aq, 0, v4si)
-  VAR1 (QUADOPUI, sm3tt2bq, 0, v4si)
+  VAR1 (QUADOPUI, sm3tt1aq, 0, ALL, v4si)
+  VAR1 (QUADOPUI, sm3tt1bq, 0, ALL, v4si)
+  VAR1 (QUADOPUI, sm3tt2aq, 0, ALL, v4si)
+  VAR1 (QUADOPUI, sm3tt2bq, 0, ALL, v4si)
   /* Implemented by aarch64_sm3partw<sm3part_op>qv4si.  */
-  VAR1 (TERNOPU, sm3partw1q, 0, v4si)
-  VAR1 (TERNOPU, sm3partw2q, 0, v4si)
+  VAR1 (TERNOPU, sm3partw1q, 0, ALL, v4si)
+  VAR1 (TERNOPU, sm3partw2q, 0, ALL, v4si)
   /* Implemented by aarch64_sm4eqv4si.  */
-  VAR1 (BINOPU, sm4eq, 0, v4si)
+  VAR1 (BINOPU, sm4eq, 0, ALL, v4si)
   /* Implemented by aarch64_sm4ekeyqv4si.  */
-  VAR1 (BINOPU, sm4ekeyq, 0, v4si)
+  VAR1 (BINOPU, sm4ekeyq, 0, ALL, v4si)
   /* Implemented by aarch64_crypto_sha512hqv2di.  */
-  VAR1 (TERNOPU, crypto_sha512hq, 0, v2di)
+  VAR1 (TERNOPU, crypto_sha512hq, 0, ALL, v2di)
   /* Implemented by aarch64_sha512h2qv2di.  */
-  VAR1 (TERNOPU, crypto_sha512h2q, 0, v2di)
+  VAR1 (TERNOPU, crypto_sha512h2q, 0, ALL, v2di)
   /* Implemented by aarch64_crypto_sha512su0qv2di.  */
-  VAR1 (BINOPU, crypto_sha512su0q, 0, v2di)
+  VAR1 (BINOPU, crypto_sha512su0q, 0, ALL, v2di)
   /* Implemented by aarch64_crypto_sha512su1qv2di.  */
-  VAR1 (TERNOPU, crypto_sha512su1q, 0, v2di)
+  VAR1 (TERNOPU, crypto_sha512su1q, 0, ALL, v2di)
   /* Implemented by eor3q<mode>4.  */
-  BUILTIN_VQ_I (TERNOPU, eor3q, 4)
-  BUILTIN_VQ_I (TERNOP, eor3q, 4)
+  BUILTIN_VQ_I (TERNOPU, eor3q, 4, ALL)
+  BUILTIN_VQ_I (TERNOP, eor3q, 4, ALL)
   /* Implemented by aarch64_rax1qv2di.  */
-  VAR1 (BINOPU, rax1q, 0, v2di)
+  VAR1 (BINOPU, rax1q, 0, ALL, v2di)
   /* Implemented by aarch64_xarqv2di.  */
-  VAR1 (TERNOPUI, xarq, 0, v2di)
+  VAR1 (TERNOPUI, xarq, 0, ALL, v2di)
   /* Implemented by bcaxq<mode>4.  */
-  BUILTIN_VQ_I (TERNOPU, bcaxq, 4)
-  BUILTIN_VQ_I (TERNOP, bcaxq, 4)
+  BUILTIN_VQ_I (TERNOPU, bcaxq, 4, ALL)
+  BUILTIN_VQ_I (TERNOP, bcaxq, 4, ALL)
 
   /* Implemented by aarch64_fml<f16mac1>l<f16quad>_low<mode>.  */
-  VAR1 (TERNOP, fmlal_low, 0, v2sf)
-  VAR1 (TERNOP, fmlsl_low, 0, v2sf)
-  VAR1 (TERNOP, fmlalq_low, 0, v4sf)
-  VAR1 (TERNOP, fmlslq_low, 0, v4sf)
+  VAR1 (TERNOP, fmlal_low, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlsl_low, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlalq_low, 0, ALL, v4sf)
+  VAR1 (TERNOP, fmlslq_low, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l<f16quad>_high<mode>.  */
-  VAR1 (TERNOP, fmlal_high, 0, v2sf)
-  VAR1 (TERNOP, fmlsl_high, 0, v2sf)
-  VAR1 (TERNOP, fmlalq_high, 0, v4sf)
-  VAR1 (TERNOP, fmlslq_high, 0, v4sf)
+  VAR1 (TERNOP, fmlal_high, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlsl_high, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlalq_high, 0, ALL, v4sf)
+  VAR1 (TERNOP, fmlslq_high, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l_lane_lowv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_lane_low, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_lane_low, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>l_laneq_lowv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_laneq_low, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_laneq_low, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_laneq_low, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_laneq_low, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>lq_lane_lowv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_lane_low, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_lane_low, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_lane_low, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_lane_low, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>lq_laneq_lowv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_laneq_low, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_laneq_low, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_laneq_low, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_laneq_low, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l_lane_highv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_lane_high, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_lane_high, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_lane_high, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_lane_high, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>l_laneq_highv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_laneq_high, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_laneq_high, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_laneq_high, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_laneq_high, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>lq_lane_highv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_lane_high, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_lane_high, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_lane_high, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_lane_high, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>lq_laneq_highv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, ALL, v4sf)
 
   /* Implemented by aarch64_<frintnzs_op><mode>.  */
-  BUILTIN_VSFDF (UNOP, frint32z, 0)
-  BUILTIN_VSFDF (UNOP, frint32x, 0)
-  BUILTIN_VSFDF (UNOP, frint64z, 0)
-  BUILTIN_VSFDF (UNOP, frint64x, 0)
+  BUILTIN_VSFDF (UNOP, frint32z, 0, ALL)
+  BUILTIN_VSFDF (UNOP, frint32x, 0, ALL)
+  BUILTIN_VSFDF (UNOP, frint64z, 0, ALL)
+  BUILTIN_VSFDF (UNOP, frint64x, 0, ALL)
 
   /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
-  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
-  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
-  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
+  VAR2 (TERNOP, bfdot, 0, ALL, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, ALL, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, ALL, v2sf, v4sf)
 
   /* Implemented by aarch64_bfmmlaqv4sf  */
-  VAR1 (TERNOP, bfmmlaq, 0, v4sf)
+  VAR1 (TERNOP, bfmmlaq, 0, ALL, v4sf)
 
   /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf  */
-  VAR1 (TERNOP, bfmlalb, 0, v4sf)
-  VAR1 (TERNOP, bfmlalt, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf)
+  VAR1 (TERNOP, bfmlalb, 0, ALL, v4sf)
+  VAR1 (TERNOP, bfmlalt, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
 
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
-  VAR1 (TERNOP, simd_smmla, 0, v16qi)
-  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
-  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
+  VAR1 (TERNOP, simd_smmla, 0, ALL, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, ALL, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, ALL, v16qi)
 
   /* Implemented by aarch64_bfcvtn{q}{2}<mode>  */
-  VAR1 (UNOP, bfcvtn, 0, v4bf)
-  VAR1 (UNOP, bfcvtn_q, 0, v8bf)
-  VAR1 (BINOP, bfcvtn2, 0, v8bf)
-  VAR1 (UNOP, bfcvt, 0, bf)
+  VAR1 (UNOP, bfcvtn, 0, ALL, v4bf)
+  VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
+  VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
+  VAR1 (UNOP, bfcvt, 0, ALL, bf)
diff --git a/gcc/config/aarch64/geniterators.sh b/gcc/config/aarch64/geniterators.sh
index a7420964f85..43feb482ce9 100644
--- a/gcc/config/aarch64/geniterators.sh
+++ b/gcc/config/aarch64/geniterators.sh
@@ -70,8 +70,8 @@ iterdef {
 	sub(/ *\]/, "", s)
 
 	n = split(s, a)
-	printf "#define BUILTIN_" a[1] "(T, N, MAP) \\\n"
-	printf "  VAR" (n-1) " (T, N, MAP"
+	printf "#define BUILTIN_" a[1] "(T, N, MAP, FLAG) \\\n"
+	printf "  VAR" (n-1) " (T, N, MAP, FLAG"
 	for (i = 2; i <= n; i++)
 		printf ", "  tolower(a[i])
 	printf ")\n"

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-15  8:49           ` xiezhiheng
@ 2020-07-16 12:41             ` Richard Sandiford
  2020-07-16 14:05               ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-07-16 12:41 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, July 7, 2020 10:08 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Monday, July 6, 2020 5:31 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> No, this is unfortunately a known bug.  See:
>> >>
>> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
>> >>
>> >> (Although the PR is recent, it's been a known bug for longer.)
>> >>
>> >> As you say, the difficulty is that the correct attributes depend on what
>> >> the built-in function does.  Most integer arithmetic is “const”, but
>> things
>> >> get more complicated for floating-point arithmetic.
>> >>
>> >> The SVE intrinsics use a three stage process:
>> >>
>> >> - each function is classified into one of several groups
>> >> - each group has a set of flags that describe what functions in the
>> >>   group can do
>> >> - these flags get converted into attributes based on the current
>> >>   command-line options
>> >>
>> >> I guess we should have something similar for the arm_neon.h built-ins.
>> >>
>> >> If you're willing to help fix this, that'd be great.  I think a first
>> >> step would be to agree a design.
>> >>
>> >> Thanks,
>> >> Richard
>> >
>> > I'd like to have a try.
>> 
>> Great!
>> 
>> > I have checked the steps in SVE intrinsics.
>> > It defines a base class "function_base" and derives different classes
>> > to describe several intrinsics for each.  And each class may
>> > have its own unique flags described in virtual function "call_properties".
>> > The specific attributes will be converted from these flags in
>> > "get_attributes" later.
>> >
>> > I find that there are more than 100 classes in total and if I only
>> > need to classify them into different groups by attributes, maybe
>> > we do not need so many classes?
>> 
>> Yeah, I agree.
>> 
>> Long term, there might be value in defining arm_neon.h in a similar
>> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
>> a special compiler pragma.  But that's going to be a lot of work.
>> 
>> I think it's possible to make incremental improvements to the current
>> arm_neon.h implementation without that work being thrown away if we
>> ever
>> did switch to a pragma in future.  And the incremental approach seems
>> more practical.
>> 
>> > The difficult thing I think is how to classify neon intrinsics into
>> > different groups.  I'm going to follow up the way in SVE intrinsics
>> > first now.
>> 
>> For now I'd suggest just giving a name to each combination of flags
>> that the intrinsics need, rather than splitting instructions in a
>> more fine-grained way.  (It's not at all obvious from the final state
>> of the SVE code, but even there, the idea was to have as few groups as
>> possible.  I.e. the groups were supposedly only split where necessary.
>> As you say, there still ended up being a lot of groups in the end…)
>> 
>> It'd be easier to review if the work was split up into smaller steps.
>> E.g. maybe one way would be this, with each number being a single
>> patch:
>> 
>> (1) (a) Add a flags field to the built-in function definitions
>>         that for now is always zero.
>>     (b) Pick a name N to describe the most conservative set of flags.
>>     (c) Make every built-in function definition use N.
>> 
>
> I have finished the first part.
>
> (a) I add a new parameter called FLAG to every built-in function macro.
>
> (b) I define some flags in aarch64-builtins.c
> FLAG_NONE for functions that need no flags
> FLAG_READ_FPCR for functions that will read the FPCR register
> FLAG_RAISE_FP_EXCEPTIONS for functions that will raise fp exceptions
> FLAG_READ_MEMORY for functions that will read global memory
> FLAG_PREFETCH_MEMORY for functions that will prefetch data to memory
> FLAG_WRITE_MEMORY for functions that will write global memory
>
> FLAG_FP is used for floating-point arithmetic
> FLAG_ALL is all flags above
>
> (c) I add a field in struct aarch64_simd_builtin_datum to record flags
> for each built-in function.  But the default flags I set for built-in functions
> are FLAG_ALL because by default the built-in functions might do anything.
>
> And bootstrap and regression tests passed OK on the aarch64 Linux platform.

This looks great.

The patch is OK for trunk, but could you send a changelog too,
so that I can include it in the commit message?

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-16 12:41             ` Richard Sandiford
@ 2020-07-16 14:05               ` xiezhiheng
  2020-07-17  9:03                 ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-07-16 14:05 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 6492 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Thursday, July 16, 2020 8:42 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Tuesday, July 7, 2020 10:08 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> >> Sent: Monday, July 6, 2020 5:31 PM
> >> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> >> gcc-patches@gcc.gnu.org
> >> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp
> instructions
> >> >> emitted at -O3
> >> >>
> >> >> No, this is unfortunately a known bug.  See:
> >> >>
> >> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
> >> >>
> >> >> (Although the PR is recent, it's been a known bug for longer.)
> >> >>
> >> >> As you say, the difficulty is that the correct attributes depend on what
> >> >> the built-in function does.  Most integer arithmetic is “const”, but
> >> things
> >> >> get more complicated for floating-point arithmetic.
> >> >>
> >> >> The SVE intrinsics use a three stage process:
> >> >>
> >> >> - each function is classified into one of several groups
> >> >> - each group has a set of flags that describe what functions in the
> >> >>   group can do
> >> >> - these flags get converted into attributes based on the current
> >> >>   command-line options
> >> >>
> >> >> I guess we should have something similar for the arm_neon.h built-ins.
> >> >>
> >> >> If you're willing to help fix this, that'd be great.  I think a first
> >> >> step would be to agree a design.
> >> >>
> >> >> Thanks,
> >> >> Richard
> >> >
> >> > I'd like to have a try.
> >>
> >> Great!
> >>
> >> > I have checked the steps in SVE intrinsics.
> >> > It defines a base class "function_base" and derives different classes
> >> > to describe several intrinsics for each.  And each class may
> >> > have its own unique flags described in virtual function "call_properties".
> >> > The specific attributes will be converted from these flags in
> >> > "get_attributes" later.
> >> >
> >> > I find that there are more than 100 classes in total and if I only
> >> > need to classify them into different groups by attributes, maybe
> >> > we do not need so many classes?
> >>
> >> Yeah, I agree.
> >>
> >> Long term, there might be value in defining arm_neon.h in a similar
> >> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
> >> a special compiler pragma.  But that's going to be a lot of work.
> >>
> >> I think it's possible to make incremental improvements to the current
> >> arm_neon.h implementation without that work being thrown away if we
> >> ever
> >> did switch to a pragma in future.  And the incremental approach seems
> >> more practical.
> >>
> >> > The difficult thing I think is how to classify neon intrinsics into
> >> > different groups.  I'm going to follow up the way in SVE intrinsics
> >> > first now.
> >>
> >> For now I'd suggest just giving a name to each combination of flags
> >> that the intrinsics need, rather than splitting instructions in a
> >> more fine-grained way.  (It's not at all obvious from the final state
> >> of the SVE code, but even there, the idea was to have as few groups as
> >> possible.  I.e. the groups were supposedly only split where necessary.
> >> As you say, there still ended up being a lot of groups in the end…)
> >>
> >> It'd be easier to review if the work was split up into smaller steps.
> >> E.g. maybe one way would be this, with each number being a single
> >> patch:
> >>
> >> (1) (a) Add a flags field to the built-in function definitions
> >>         that for now is always zero.
> >>     (b) Pick a name N to describe the most conservative set of flags.
> >>     (c) Make every built-in function definition use N.
> >>
> >
> > I have finished the first part.
> >
> > (a) I add a new parameter called FLAG to every built-in function macro.
> >
> > (b) I define some flags in aarch64-builtins.c
> > FLAG_NONE for functions that need no flags
> > FLAG_READ_FPCR for functions that will read the FPCR register
> > FLAG_RAISE_FP_EXCEPTIONS for functions that will raise fp exceptions
> > FLAG_READ_MEMORY for functions that will read global memory
> > FLAG_PREFETCH_MEMORY for functions that will prefetch data to memory
> > FLAG_WRITE_MEMORY for functions that will write global memory
> >
> > FLAG_FP is used for floating-point arithmetic
> > FLAG_ALL is all flags above
> >
> > (c) I add a field in struct aarch64_simd_builtin_datum to record flags
> > for each built-in function.  But the default flags I set for built-in functions
> > are FLAG_ALL because by default the built-in functions might do anything.
> >
> > And bootstrap and regression tests passed OK on the aarch64 Linux platform.
> 
> This looks great.
> 
> The patch is OK for trunk, but could you send a changelog too,
> so that I can include it in the commit message?
> 
> Thanks,
> Richard

OK, and I have added the git commit message to the patch.

Thanks,
XieZhiheng

+2020-07-16  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	PR tree-optimization/94442
+	* config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers):
+	Add new field flags.
+	(VAR1): Add new field FLAG in macro.
+	(VAR2): Likewise.
+	(VAR3): Likewise.
+	(VAR4): Likewise.
+	(VAR5): Likewise.
+	(VAR6): Likewise.
+	(VAR7): Likewise.
+	(VAR8): Likewise.
+	(VAR9): Likewise.
+	(VAR10): Likewise.
+	(VAR11): Likewise.
+	(VAR12): Likewise.
+	(VAR13): Likewise.
+	(VAR14): Likewise.
+	(VAR15): Likewise.
+	(VAR16): Likewise.
+	(aarch64_general_fold_builtin): Likewise.
+	(aarch64_general_gimple_fold_builtin): Likewise.
+	* config/aarch64/aarch64-simd-builtins.def: Add default flag for
+	each built-in function.
+	* config/aarch64/geniterators.sh: Add new field in BUILTIN macro.
+

[-- Attachment #2: pr94442-v2.patch --]
[-- Type: application/octet-stream, Size: 58061 bytes --]

From 9b6fd3669b71f5b8b42e4a804284f8b2f1479b79 Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Thu, 16 Jul 2020 09:40:22 -0400
Subject: [PATCH] AArch64: Add flags in built-in functions [PR94442]

2020-07-16  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers):
	Add new field flags.
	(VAR1): Add new field FLAG in macro.
	(VAR2): Likewise.
	(VAR3): Likewise.
	(VAR4): Likewise.
	(VAR5): Likewise.
	(VAR6): Likewise.
	(VAR7): Likewise.
	(VAR8): Likewise.
	(VAR9): Likewise.
	(VAR10): Likewise.
	(VAR11): Likewise.
	(VAR12): Likewise.
	(VAR13): Likewise.
	(VAR14): Likewise.
	(VAR15): Likewise.
	(VAR16): Likewise.
	(aarch64_general_fold_builtin): Likewise.
	(aarch64_general_gimple_fold_builtin): Likewise.
	* config/aarch64/aarch64-simd-builtins.def: Add default flag for
	each built-in function.
	* config/aarch64/geniterators.sh: Add new field in BUILTIN macro.
---
 gcc/config/aarch64/aarch64-builtins.c        |  131 ++-
 gcc/config/aarch64/aarch64-simd-builtins.def | 1014 +++++++++---------
 gcc/config/aarch64/geniterators.sh           |    4 +-
 3 files changed, 581 insertions(+), 568 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 49dfbafec3a..d5fb29048c4 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -117,6 +117,18 @@ enum aarch64_type_qualifiers
   qualifier_lane_quadtup_index = 0x1000,
 };
 
+/* Flags that describe what a function might do.  */
+const unsigned int FLAG_NONE = 0U;
+const unsigned int FLAG_READ_FPCR = 1U << 0;
+const unsigned int FLAG_RAISE_FP_EXCEPTIONS = 1U << 1;
+const unsigned int FLAG_READ_MEMORY = 1U << 2;
+const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
+const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
+
+const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
+const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
+  | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
+
 typedef struct
 {
   const char *name;
@@ -124,6 +136,7 @@ typedef struct
   const enum insn_code code;
   unsigned int fcode;
   enum aarch64_type_qualifiers *qualifiers;
+  unsigned int flags;
 } aarch64_simd_builtin_datum;
 
 static enum aarch64_type_qualifiers
@@ -336,53 +349,53 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define CF4(N, X) CODE_FOR_##N##X##4
 #define CF10(N, X) CODE_FOR_##N##X
 
-#define VAR1(T, N, MAP, A) \
-  {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T},
-#define VAR2(T, N, MAP, A, B) \
-  VAR1 (T, N, MAP, A) \
-  VAR1 (T, N, MAP, B)
-#define VAR3(T, N, MAP, A, B, C) \
-  VAR2 (T, N, MAP, A, B) \
-  VAR1 (T, N, MAP, C)
-#define VAR4(T, N, MAP, A, B, C, D) \
-  VAR3 (T, N, MAP, A, B, C) \
-  VAR1 (T, N, MAP, D)
-#define VAR5(T, N, MAP, A, B, C, D, E) \
-  VAR4 (T, N, MAP, A, B, C, D) \
-  VAR1 (T, N, MAP, E)
-#define VAR6(T, N, MAP, A, B, C, D, E, F) \
-  VAR5 (T, N, MAP, A, B, C, D, E) \
-  VAR1 (T, N, MAP, F)
-#define VAR7(T, N, MAP, A, B, C, D, E, F, G) \
-  VAR6 (T, N, MAP, A, B, C, D, E, F) \
-  VAR1 (T, N, MAP, G)
-#define VAR8(T, N, MAP, A, B, C, D, E, F, G, H) \
-  VAR7 (T, N, MAP, A, B, C, D, E, F, G) \
-  VAR1 (T, N, MAP, H)
-#define VAR9(T, N, MAP, A, B, C, D, E, F, G, H, I) \
-  VAR8 (T, N, MAP, A, B, C, D, E, F, G, H) \
-  VAR1 (T, N, MAP, I)
-#define VAR10(T, N, MAP, A, B, C, D, E, F, G, H, I, J) \
-  VAR9 (T, N, MAP, A, B, C, D, E, F, G, H, I) \
-  VAR1 (T, N, MAP, J)
-#define VAR11(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \
-  VAR10 (T, N, MAP, A, B, C, D, E, F, G, H, I, J) \
-  VAR1 (T, N, MAP, K)
-#define VAR12(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
-  VAR11 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K) \
-  VAR1 (T, N, MAP, L)
-#define VAR13(T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
-  VAR12 (T, N, MAP, A, B, C, D, E, F, G, H, I, J, K, L) \
-  VAR1 (T, N, MAP, M)
-#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
-  VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
-  VAR1 (T, X, MAP, N)
-#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
-  VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
-  VAR1 (T, X, MAP, O)
-#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
-  VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
-  VAR1 (T, X, MAP, P)
+#define VAR1(T, N, MAP, FLAG, A) \
+  {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
+#define VAR2(T, N, MAP, FLAG, A, B) \
+  VAR1 (T, N, MAP, FLAG, A) \
+  VAR1 (T, N, MAP, FLAG, B)
+#define VAR3(T, N, MAP, FLAG, A, B, C) \
+  VAR2 (T, N, MAP, FLAG, A, B) \
+  VAR1 (T, N, MAP, FLAG, C)
+#define VAR4(T, N, MAP, FLAG, A, B, C, D) \
+  VAR3 (T, N, MAP, FLAG, A, B, C) \
+  VAR1 (T, N, MAP, FLAG, D)
+#define VAR5(T, N, MAP, FLAG, A, B, C, D, E) \
+  VAR4 (T, N, MAP, FLAG, A, B, C, D) \
+  VAR1 (T, N, MAP, FLAG, E)
+#define VAR6(T, N, MAP, FLAG, A, B, C, D, E, F) \
+  VAR5 (T, N, MAP, FLAG, A, B, C, D, E) \
+  VAR1 (T, N, MAP, FLAG, F)
+#define VAR7(T, N, MAP, FLAG, A, B, C, D, E, F, G) \
+  VAR6 (T, N, MAP, FLAG, A, B, C, D, E, F) \
+  VAR1 (T, N, MAP, FLAG, G)
+#define VAR8(T, N, MAP, FLAG, A, B, C, D, E, F, G, H) \
+  VAR7 (T, N, MAP, FLAG, A, B, C, D, E, F, G) \
+  VAR1 (T, N, MAP, FLAG, H)
+#define VAR9(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I) \
+  VAR8 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H) \
+  VAR1 (T, N, MAP, FLAG, I)
+#define VAR10(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J) \
+  VAR9 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I) \
+  VAR1 (T, N, MAP, FLAG, J)
+#define VAR11(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR10 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J) \
+  VAR1 (T, N, MAP, FLAG, K)
+#define VAR12(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR11 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K) \
+  VAR1 (T, N, MAP, FLAG, L)
+#define VAR13(T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, MAP, FLAG, M)
+#define VAR14(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+  VAR13 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR1 (T, X, MAP, FLAG, N)
+#define VAR15(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+  VAR14 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+  VAR1 (T, X, MAP, FLAG, O)
+#define VAR16(T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+  VAR15 (T, X, MAP, FLAG, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+  VAR1 (T, X, MAP, FLAG, P)
 
 #include "aarch64-builtin-iterators.h"
 
@@ -438,7 +451,7 @@ typedef struct
   AARCH64_SIMD_BUILTIN_FCMLA_LANEQ##I##_##M,
 
 #undef VAR1
-#define VAR1(T, N, MAP, A) \
+#define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
 
 enum aarch64_builtins
@@ -2196,7 +2209,7 @@ aarch64_general_builtin_rsqrt (unsigned int fn)
 }
 
 #undef VAR1
-#define VAR1(T, N, MAP, A) \
+#define VAR1(T, N, MAP, FLAG, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
 
 /* Try to fold a call to the built-in function with subcode FCODE.  The
@@ -2209,11 +2222,11 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
 {
   switch (fcode)
     {
-      BUILTIN_VDQF (UNOP, abs, 2)
+      BUILTIN_VDQF (UNOP, abs, 2, ALL)
 	return fold_build1 (ABS_EXPR, type, args[0]);
-      VAR1 (UNOP, floatv2si, 2, v2sf)
-      VAR1 (UNOP, floatv4si, 2, v4sf)
-      VAR1 (UNOP, floatv2di, 2, v2df)
+      VAR1 (UNOP, floatv2si, 2, ALL, v2sf)
+      VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
+      VAR1 (UNOP, floatv2di, 2, ALL, v2df)
 	return fold_build1 (FLOAT_EXPR, type, args[0]);
       default:
 	break;
@@ -2239,24 +2252,24 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt)
      the arguments to the __builtin.  */
   switch (fcode)
     {
-      BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
+      BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
 	new_stmt = gimple_build_call_internal (IFN_REDUC_PLUS,
 					       1, args[0]);
 	gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
 	break;
-      BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
-      BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
+      BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10, ALL)
+      BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
 	new_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
 					       1, args[0]);
 	gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
 	break;
-      BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
-      BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
+      BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10, ALL)
+      BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, ALL)
 	new_stmt = gimple_build_call_internal (IFN_REDUC_MIN,
 					       1, args[0]);
 	gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
 	break;
-      BUILTIN_GPF (BINOP, fmulx, 0)
+      BUILTIN_GPF (BINOP, fmulx, 0, ALL)
 	{
 	  gcc_assert (nargs == 2);
 	  bool a0_cst_p = TREE_CODE (args[0]) == REAL_CST;
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 332a0b6b1ea..e8650121cd6 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -39,693 +39,693 @@
    1-9 - CODE_FOR_<name><mode><1-9>
    10 - CODE_FOR_<name><mode>.  */
 
-  BUILTIN_VDC (COMBINE, combine, 0)
-  VAR1 (COMBINEP, combine, 0, di)
-  BUILTIN_VB (BINOP, pmul, 0)
-  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2)
-  BUILTIN_VD_BHSI (BINOP, addp, 0)
-  VAR1 (UNOP, addp, 0, di)
-  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
-  BUILTIN_VDQ_BHSI (UNOP, clz, 2)
-  BUILTIN_VS (UNOP, ctz, 2)
-  BUILTIN_VB (UNOP, popcount, 2)
+  BUILTIN_VDC (COMBINE, combine, 0, ALL)
+  VAR1 (COMBINEP, combine, 0, ALL, di)
+  BUILTIN_VB (BINOP, pmul, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL)
+  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
+  BUILTIN_VD_BHSI (BINOP, addp, 0, ALL)
+  VAR1 (UNOP, addp, 0, ALL, di)
+  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL)
+  BUILTIN_VDQ_BHSI (UNOP, clz, 2, ALL)
+  BUILTIN_VS (UNOP, ctz, 2, ALL)
+  BUILTIN_VB (UNOP, popcount, 2, ALL)
 
   /* Implemented by aarch64_<sur>q<r>shl<mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqshl, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0)
-  BUILTIN_VSDQ_I (BINOP, sqrshl, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0)
+  BUILTIN_VSDQ_I (BINOP, sqshl, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP, sqrshl, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, ALL)
   /* Implemented by aarch64_<su_optab><optab><mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqadd, 0)
-  BUILTIN_VSDQ_I (BINOPU, uqadd, 0)
-  BUILTIN_VSDQ_I (BINOP, sqsub, 0)
-  BUILTIN_VSDQ_I (BINOPU, uqsub, 0)
+  BUILTIN_VSDQ_I (BINOP, sqadd, 0, ALL)
+  BUILTIN_VSDQ_I (BINOPU, uqadd, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP, sqsub, 0, ALL)
+  BUILTIN_VSDQ_I (BINOPU, uqsub, 0, ALL)
   /* Implemented by aarch64_<sur>qadd<mode>.  */
-  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0)
+  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, ALL)
 
   /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>.  */
-  BUILTIN_VDC (GETREG, get_dregoi, 0)
-  BUILTIN_VDC (GETREG, get_dregci, 0)
-  BUILTIN_VDC (GETREG, get_dregxi, 0)
-  VAR1 (GETREGP, get_dregoi, 0, di)
-  VAR1 (GETREGP, get_dregci, 0, di)
-  VAR1 (GETREGP, get_dregxi, 0, di)
+  BUILTIN_VDC (GETREG, get_dregoi, 0, ALL)
+  BUILTIN_VDC (GETREG, get_dregci, 0, ALL)
+  BUILTIN_VDC (GETREG, get_dregxi, 0, ALL)
+  VAR1 (GETREGP, get_dregoi, 0, ALL, di)
+  VAR1 (GETREGP, get_dregci, 0, ALL, di)
+  VAR1 (GETREGP, get_dregxi, 0, ALL, di)
   /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (GETREG, get_qregoi, 0)
-  BUILTIN_VQ (GETREG, get_qregci, 0)
-  BUILTIN_VQ (GETREG, get_qregxi, 0)
-  VAR1 (GETREGP, get_qregoi, 0, v2di)
-  VAR1 (GETREGP, get_qregci, 0, v2di)
-  VAR1 (GETREGP, get_qregxi, 0, v2di)
+  BUILTIN_VQ (GETREG, get_qregoi, 0, ALL)
+  BUILTIN_VQ (GETREG, get_qregci, 0, ALL)
+  BUILTIN_VQ (GETREG, get_qregxi, 0, ALL)
+  VAR1 (GETREGP, get_qregoi, 0, ALL, v2di)
+  VAR1 (GETREGP, get_qregci, 0, ALL, v2di)
+  VAR1 (GETREGP, get_qregxi, 0, ALL, v2di)
   /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (SETREG, set_qregoi, 0)
-  BUILTIN_VQ (SETREG, set_qregci, 0)
-  BUILTIN_VQ (SETREG, set_qregxi, 0)
-  VAR1 (SETREGP, set_qregoi, 0, v2di)
-  VAR1 (SETREGP, set_qregci, 0, v2di)
-  VAR1 (SETREGP, set_qregxi, 0, v2di)
+  BUILTIN_VQ (SETREG, set_qregoi, 0, ALL)
+  BUILTIN_VQ (SETREG, set_qregci, 0, ALL)
+  BUILTIN_VQ (SETREG, set_qregxi, 0, ALL)
+  VAR1 (SETREGP, set_qregoi, 0, ALL, v2di)
+  VAR1 (SETREGP, set_qregci, 0, ALL, v2di)
+  VAR1 (SETREGP, set_qregxi, 0, ALL, v2di)
   /* Implemented by aarch64_ld1x2<VQ:mode>. */
-  BUILTIN_VQ (LOADSTRUCT, ld1x2, 0)
+  BUILTIN_VQ (LOADSTRUCT, ld1x2, 0, ALL)
   /* Implemented by aarch64_ld1x2<VDC:mode>. */
-  BUILTIN_VDC (LOADSTRUCT, ld1x2, 0)
+  BUILTIN_VDC (LOADSTRUCT, ld1x2, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (LOADSTRUCT, ld2, 0)
-  BUILTIN_VDC (LOADSTRUCT, ld3, 0)
-  BUILTIN_VDC (LOADSTRUCT, ld4, 0)
+  BUILTIN_VDC (LOADSTRUCT, ld2, 0, ALL)
+  BUILTIN_VDC (LOADSTRUCT, ld3, 0, ALL)
+  BUILTIN_VDC (LOADSTRUCT, ld4, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (LOADSTRUCT, ld2, 0)
-  BUILTIN_VQ (LOADSTRUCT, ld3, 0)
-  BUILTIN_VQ (LOADSTRUCT, ld4, 0)
+  BUILTIN_VQ (LOADSTRUCT, ld2, 0, ALL)
+  BUILTIN_VQ (LOADSTRUCT, ld3, 0, ALL)
+  BUILTIN_VQ (LOADSTRUCT, ld4, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs>r<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0, ALL)
   /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0, ALL)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (STORESTRUCT, st2, 0)
-  BUILTIN_VDC (STORESTRUCT, st3, 0)
-  BUILTIN_VDC (STORESTRUCT, st4, 0)
+  BUILTIN_VDC (STORESTRUCT, st2, 0, ALL)
+  BUILTIN_VDC (STORESTRUCT, st3, 0, ALL)
+  BUILTIN_VDC (STORESTRUCT, st4, 0, ALL)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (STORESTRUCT, st2, 0)
-  BUILTIN_VQ (STORESTRUCT, st3, 0)
-  BUILTIN_VQ (STORESTRUCT, st4, 0)
-
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0)
-
-  BUILTIN_VQW (BINOP, saddl2, 0)
-  BUILTIN_VQW (BINOP, uaddl2, 0)
-  BUILTIN_VQW (BINOP, ssubl2, 0)
-  BUILTIN_VQW (BINOP, usubl2, 0)
-  BUILTIN_VQW (BINOP, saddw2, 0)
-  BUILTIN_VQW (BINOP, uaddw2, 0)
-  BUILTIN_VQW (BINOP, ssubw2, 0)
-  BUILTIN_VQW (BINOP, usubw2, 0)
+  BUILTIN_VQ (STORESTRUCT, st2, 0, ALL)
+  BUILTIN_VQ (STORESTRUCT, st3, 0, ALL)
+  BUILTIN_VQ (STORESTRUCT, st4, 0, ALL)
+
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0, ALL)
+
+  BUILTIN_VQW (BINOP, saddl2, 0, ALL)
+  BUILTIN_VQW (BINOP, uaddl2, 0, ALL)
+  BUILTIN_VQW (BINOP, ssubl2, 0, ALL)
+  BUILTIN_VQW (BINOP, usubl2, 0, ALL)
+  BUILTIN_VQW (BINOP, saddw2, 0, ALL)
+  BUILTIN_VQW (BINOP, uaddw2, 0, ALL)
+  BUILTIN_VQW (BINOP, ssubw2, 0, ALL)
+  BUILTIN_VQW (BINOP, usubw2, 0, ALL)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddl, 0)
-  BUILTIN_VD_BHSI (BINOP, uaddl, 0)
-  BUILTIN_VD_BHSI (BINOP, ssubl, 0)
-  BUILTIN_VD_BHSI (BINOP, usubl, 0)
+  BUILTIN_VD_BHSI (BINOP, saddl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, uaddl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, ssubl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, usubl, 0, ALL)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddw, 0)
-  BUILTIN_VD_BHSI (BINOP, uaddw, 0)
-  BUILTIN_VD_BHSI (BINOP, ssubw, 0)
-  BUILTIN_VD_BHSI (BINOP, usubw, 0)
+  BUILTIN_VD_BHSI (BINOP, saddw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, uaddw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, ssubw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, usubw, 0, ALL)
   /* Implemented by aarch64_<sur>h<addsub><mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, shadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, shsub, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0)
-  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0)
+  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, ALL)
   /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
-  BUILTIN_VQN (BINOP, addhn, 0)
-  BUILTIN_VQN (BINOP, subhn, 0)
-  BUILTIN_VQN (BINOP, raddhn, 0)
-  BUILTIN_VQN (BINOP, rsubhn, 0)
+  BUILTIN_VQN (BINOP, addhn, 0, ALL)
+  BUILTIN_VQN (BINOP, subhn, 0, ALL)
+  BUILTIN_VQN (BINOP, raddhn, 0, ALL)
+  BUILTIN_VQN (BINOP, rsubhn, 0, ALL)
   /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
-  BUILTIN_VQN (TERNOP, addhn2, 0)
-  BUILTIN_VQN (TERNOP, subhn2, 0)
-  BUILTIN_VQN (TERNOP, raddhn2, 0)
-  BUILTIN_VQN (TERNOP, rsubhn2, 0)
+  BUILTIN_VQN (TERNOP, addhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, subhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, raddhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, rsubhn2, 0, ALL)
 
-  BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0)
+  BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0, ALL)
   /* Implemented by aarch64_<sur>qmovn<mode>.  */
-  BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0)
-  BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0)
+  BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, ALL)
+  BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, ALL)
   /* Implemented by aarch64_s<optab><mode>.  */
-  BUILTIN_VSDQ_I (UNOP, sqabs, 0)
-  BUILTIN_VSDQ_I (UNOP, sqneg, 0)
+  BUILTIN_VSDQ_I (UNOP, sqabs, 0, ALL)
+  BUILTIN_VSDQ_I (UNOP, sqneg, 0, ALL)
 
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l<mode>.  */
-  BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0)
-  BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0)
+  BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0, ALL)
+  BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0, ALL)
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l_lane<mode>.  */
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0)
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0, ALL)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0, ALL)
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l_laneq<mode>.  */
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0)
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0, ALL)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0, ALL)
   /* Implemented by aarch64_sqdml<SBINQOPS:as>l_n<mode>.  */
-  BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0)
-  BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0)
-
-  BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0)
-
-  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0)
-  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0)
-
-  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
-  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
-
-  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0)
-  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0)
-  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0)
-  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0)
-  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0)
-  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0)
-  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0)
-  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0)
-
-  BUILTIN_VSD_HSI (BINOP, sqdmull, 0)
-  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0)
-  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0)
-  BUILTIN_VD_HSI (BINOP, sqdmull_n, 0)
-  BUILTIN_VQ_HSI (BINOP, sqdmull2, 0)
-  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0)
-  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0)
-  BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0)
+  BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0, ALL)
+  BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0, ALL)
+
+  BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0, ALL)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0, ALL)
+
+  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, ALL)
+  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, ALL)
+
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, ALL)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, ALL)
+
+  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, ALL)
+  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0, ALL)
+  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0, ALL)
+  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0, ALL)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0, ALL)
+
+  BUILTIN_VSD_HSI (BINOP, sqdmull, 0, ALL)
+  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0, ALL)
+  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0, ALL)
+  BUILTIN_VD_HSI (BINOP, sqdmull_n, 0, ALL)
+  BUILTIN_VQ_HSI (BINOP, sqdmull2, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0, ALL)
+  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0, ALL)
+  BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0, ALL)
   /* Implemented by aarch64_sq<r>dmulh<mode>.  */
-  BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0)
-  BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0)
+  BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0, ALL)
+  BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0, ALL)
   /* Implemented by aarch64_sq<r>dmulh_lane<q><mode>.  */
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0, ALL)
 
-  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
+  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3, ALL)
   /* Implemented by aarch64_<sur>shl<mode>.  */
-  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0)
+  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, ALL)
 
   /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot, 0)
-  BUILTIN_VB (TERNOPU, udot, 0)
-  BUILTIN_VB (TERNOP_SSUS, usdot, 0)
-  BUILTIN_VB (QUADOP_LANE, sdot_lane, 0)
-  BUILTIN_VB (QUADOPU_LANE, udot_lane, 0)
-  BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0)
-  BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0)
-  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0)
-  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0)
-  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0)
-  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0)
+  BUILTIN_VB (TERNOP, sdot, 0, ALL)
+  BUILTIN_VB (TERNOPU, udot, 0, ALL)
+  BUILTIN_VB (TERNOP_SSUS, usdot, 0, ALL)
+  BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, ALL)
+  BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, ALL)
+  BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, ALL)
+  BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0, ALL)
+  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0, ALL)
+  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0, ALL)
+  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0, ALL)
+  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0, ALL)
 
   /* Implemented by aarch64_fcadd<rot><mode>.   */
-  BUILTIN_VHSDF (BINOP, fcadd90, 0)
-  BUILTIN_VHSDF (BINOP, fcadd270, 0)
+  BUILTIN_VHSDF (BINOP, fcadd90, 0, ALL)
+  BUILTIN_VHSDF (BINOP, fcadd270, 0, ALL)
 
   /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>.   */
-  BUILTIN_VHSDF (TERNOP, fcmla0, 0)
-  BUILTIN_VHSDF (TERNOP, fcmla90, 0)
-  BUILTIN_VHSDF (TERNOP, fcmla180, 0)
-  BUILTIN_VHSDF (TERNOP, fcmla270, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0)
-
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0)
-
-  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3)
-  VAR1 (SHIFTIMM, ashr_simd, 0, di)
-  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3)
-  VAR1 (USHIFTIMM, lshr_simd, 0, di)
+  BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL)
+  BUILTIN_VHSDF (TERNOP, fcmla90, 0, ALL)
+  BUILTIN_VHSDF (TERNOP, fcmla180, 0, ALL)
+  BUILTIN_VHSDF (TERNOP, fcmla270, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0, ALL)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0, ALL)
+
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0, ALL)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0, ALL)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0, ALL)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0, ALL)
+
+  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3, ALL)
+  VAR1 (SHIFTIMM, ashr_simd, 0, ALL, di)
+  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3, ALL)
+  VAR1 (USHIFTIMM, lshr_simd, 0, ALL, di)
   /* Implemented by aarch64_<sur>shr_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0, ALL)
   /* Implemented by aarch64_<sur>sra_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0)
-  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0, ALL)
   /* Implemented by aarch64_<sur>shll_n<mode>.  */
-  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0)
-  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0)
+  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0, ALL)
+  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0, ALL)
   /* Implemented by aarch64_<sur>shll2_n<mode>.  */
-  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0)
-  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0)
+  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0, ALL)
+  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0, ALL)
   /* Implemented by aarch64_<sur>q<r>shr<u>n_n<mode>.  */
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0)
-  BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0)
-  BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0, ALL)
+  BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0, ALL)
   /* Implemented by aarch64_<sur>s<lr>i_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0)
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0)
-  VAR2 (SHIFTINSERTP, ssli_n, 0, di, v2di)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, ALL)
+  VAR2 (SHIFTINSERTP, ssli_n, 0, ALL, di, v2di)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, ALL)
   /* Implemented by aarch64_<sur>qshl<u>_n<mode>.  */
-  BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0)
-  BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0)
-  BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0)
+  BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, ALL)
+  BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0, ALL)
+  BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
 
   /* Implemented by aarch64_reduc_plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
+  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
 
   /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10)
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
-  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10)
-  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, ALL)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10, ALL)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, ALL)
+  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10, ALL)
+  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10, ALL)
 
   /* Implemented by <maxmin_uns><mode>3.
      smax variants map to fmaxnm,
      smax_nan variants map to fmax.  */
-  BUILTIN_VDQ_BHSI (BINOP, smax, 3)
-  BUILTIN_VDQ_BHSI (BINOP, smin, 3)
-  BUILTIN_VDQ_BHSI (BINOP, umax, 3)
-  BUILTIN_VDQ_BHSI (BINOP, umin, 3)
-  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3)
-  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3)
+  BUILTIN_VDQ_BHSI (BINOP, smax, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, smin, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, umax, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, umin, 3, ALL)
+  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3, ALL)
+  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3, ALL)
 
   /* Implemented by <maxmin_uns><mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3)
-  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3)
+  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3, ALL)
 
   /* Implemented by aarch64_<maxmin_uns>p<mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, sminp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uminp, 0)
-  BUILTIN_VHSDF (BINOP, smaxp, 0)
-  BUILTIN_VHSDF (BINOP, sminp, 0)
-  BUILTIN_VHSDF (BINOP, smax_nanp, 0)
-  BUILTIN_VHSDF (BINOP, smin_nanp, 0)
+  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, sminp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, uminp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, smaxp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, sminp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, smax_nanp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, smin_nanp, 0, ALL)
 
   /* Implemented by <frint_pattern><mode>2.  */
-  BUILTIN_VHSDF (UNOP, btrunc, 2)
-  BUILTIN_VHSDF (UNOP, ceil, 2)
-  BUILTIN_VHSDF (UNOP, floor, 2)
-  BUILTIN_VHSDF (UNOP, nearbyint, 2)
-  BUILTIN_VHSDF (UNOP, rint, 2)
-  BUILTIN_VHSDF (UNOP, round, 2)
-  BUILTIN_VHSDF_DF (UNOP, frintn, 2)
-
-  VAR1 (UNOP, btrunc, 2, hf)
-  VAR1 (UNOP, ceil, 2, hf)
-  VAR1 (UNOP, floor, 2, hf)
-  VAR1 (UNOP, frintn, 2, hf)
-  VAR1 (UNOP, nearbyint, 2, hf)
-  VAR1 (UNOP, rint, 2, hf)
-  VAR1 (UNOP, round, 2, hf)
+  BUILTIN_VHSDF (UNOP, btrunc, 2, ALL)
+  BUILTIN_VHSDF (UNOP, ceil, 2, ALL)
+  BUILTIN_VHSDF (UNOP, floor, 2, ALL)
+  BUILTIN_VHSDF (UNOP, nearbyint, 2, ALL)
+  BUILTIN_VHSDF (UNOP, rint, 2, ALL)
+  BUILTIN_VHSDF (UNOP, round, 2, ALL)
+  BUILTIN_VHSDF_DF (UNOP, frintn, 2, ALL)
+
+  VAR1 (UNOP, btrunc, 2, ALL, hf)
+  VAR1 (UNOP, ceil, 2, ALL, hf)
+  VAR1 (UNOP, floor, 2, ALL, hf)
+  VAR1 (UNOP, frintn, 2, ALL, hf)
+  VAR1 (UNOP, nearbyint, 2, ALL, hf)
+  VAR1 (UNOP, rint, 2, ALL, hf)
+  VAR1 (UNOP, round, 2, ALL, hf)
 
   /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
-  VAR1 (UNOP, lbtruncv4hf, 2, v4hi)
-  VAR1 (UNOP, lbtruncv8hf, 2, v8hi)
-  VAR1 (UNOP, lbtruncv2sf, 2, v2si)
-  VAR1 (UNOP, lbtruncv4sf, 2, v4si)
-  VAR1 (UNOP, lbtruncv2df, 2, v2di)
-
-  VAR1 (UNOPUS, lbtruncuv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lbtruncuv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si)
-  VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si)
-  VAR1 (UNOPUS, lbtruncuv2df, 2, v2di)
-
-  VAR1 (UNOP, lroundv4hf, 2, v4hi)
-  VAR1 (UNOP, lroundv8hf, 2, v8hi)
-  VAR1 (UNOP, lroundv2sf, 2, v2si)
-  VAR1 (UNOP, lroundv4sf, 2, v4si)
-  VAR1 (UNOP, lroundv2df, 2, v2di)
+  VAR1 (UNOP, lbtruncv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lbtruncv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lbtruncv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lbtruncv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lbtruncv2df, 2, ALL, v2di)
+
+  VAR1 (UNOPUS, lbtruncuv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lbtruncuv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lbtruncuv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lbtruncuv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lbtruncuv2df, 2, ALL, v2di)
+
+  VAR1 (UNOP, lroundv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lroundv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lroundv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lroundv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lroundv2df, 2, ALL, v2di)
   /* Implemented by l<fcvt_pattern><su_optab><GPF_F16:mode><GPI:mode>2.  */
-  BUILTIN_GPI_I16 (UNOP, lroundhf, 2)
-  VAR1 (UNOP, lroundsf, 2, si)
-  VAR1 (UNOP, lrounddf, 2, di)
-
-  VAR1 (UNOPUS, lrounduv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lrounduv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lrounduv2sf, 2, v2si)
-  VAR1 (UNOPUS, lrounduv4sf, 2, v4si)
-  VAR1 (UNOPUS, lrounduv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2)
-  VAR1 (UNOPUS, lroundusf, 2, si)
-  VAR1 (UNOPUS, lroundudf, 2, di)
-
-  VAR1 (UNOP, lceilv4hf, 2, v4hi)
-  VAR1 (UNOP, lceilv8hf, 2, v8hi)
-  VAR1 (UNOP, lceilv2sf, 2, v2si)
-  VAR1 (UNOP, lceilv4sf, 2, v4si)
-  VAR1 (UNOP, lceilv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOP, lceilhf, 2)
-
-  VAR1 (UNOPUS, lceiluv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lceiluv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lceiluv2sf, 2, v2si)
-  VAR1 (UNOPUS, lceiluv4sf, 2, v4si)
-  VAR1 (UNOPUS, lceiluv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2)
-  VAR1 (UNOPUS, lceilusf, 2, si)
-  VAR1 (UNOPUS, lceiludf, 2, di)
-
-  VAR1 (UNOP, lfloorv4hf, 2, v4hi)
-  VAR1 (UNOP, lfloorv8hf, 2, v8hi)
-  VAR1 (UNOP, lfloorv2sf, 2, v2si)
-  VAR1 (UNOP, lfloorv4sf, 2, v4si)
-  VAR1 (UNOP, lfloorv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2)
-
-  VAR1 (UNOPUS, lflooruv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lflooruv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lflooruv2sf, 2, v2si)
-  VAR1 (UNOPUS, lflooruv4sf, 2, v4si)
-  VAR1 (UNOPUS, lflooruv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2)
-  VAR1 (UNOPUS, lfloorusf, 2, si)
-  VAR1 (UNOPUS, lfloorudf, 2, di)
-
-  VAR1 (UNOP, lfrintnv4hf, 2, v4hi)
-  VAR1 (UNOP, lfrintnv8hf, 2, v8hi)
-  VAR1 (UNOP, lfrintnv2sf, 2, v2si)
-  VAR1 (UNOP, lfrintnv4sf, 2, v4si)
-  VAR1 (UNOP, lfrintnv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2)
-  VAR1 (UNOP, lfrintnsf, 2, si)
-  VAR1 (UNOP, lfrintndf, 2, di)
-
-  VAR1 (UNOPUS, lfrintnuv4hf, 2, v4hi)
-  VAR1 (UNOPUS, lfrintnuv8hf, 2, v8hi)
-  VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si)
-  VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si)
-  VAR1 (UNOPUS, lfrintnuv2df, 2, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2)
-  VAR1 (UNOPUS, lfrintnusf, 2, si)
-  VAR1 (UNOPUS, lfrintnudf, 2, di)
+  BUILTIN_GPI_I16 (UNOP, lroundhf, 2, ALL)
+  VAR1 (UNOP, lroundsf, 2, ALL, si)
+  VAR1 (UNOP, lrounddf, 2, ALL, di)
+
+  VAR1 (UNOPUS, lrounduv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lrounduv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lrounduv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lrounduv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lrounduv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2, ALL)
+  VAR1 (UNOPUS, lroundusf, 2, ALL, si)
+  VAR1 (UNOPUS, lroundudf, 2, ALL, di)
+
+  VAR1 (UNOP, lceilv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lceilv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lceilv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lceilv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lceilv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOP, lceilhf, 2, ALL)
+
+  VAR1 (UNOPUS, lceiluv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lceiluv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lceiluv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lceiluv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lceiluv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2, ALL)
+  VAR1 (UNOPUS, lceilusf, 2, ALL, si)
+  VAR1 (UNOPUS, lceiludf, 2, ALL, di)
+
+  VAR1 (UNOP, lfloorv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lfloorv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lfloorv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lfloorv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lfloorv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2, ALL)
+
+  VAR1 (UNOPUS, lflooruv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lflooruv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lflooruv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lflooruv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lflooruv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2, ALL)
+  VAR1 (UNOPUS, lfloorusf, 2, ALL, si)
+  VAR1 (UNOPUS, lfloorudf, 2, ALL, di)
+
+  VAR1 (UNOP, lfrintnv4hf, 2, ALL, v4hi)
+  VAR1 (UNOP, lfrintnv8hf, 2, ALL, v8hi)
+  VAR1 (UNOP, lfrintnv2sf, 2, ALL, v2si)
+  VAR1 (UNOP, lfrintnv4sf, 2, ALL, v4si)
+  VAR1 (UNOP, lfrintnv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2, ALL)
+  VAR1 (UNOP, lfrintnsf, 2, ALL, si)
+  VAR1 (UNOP, lfrintndf, 2, ALL, di)
+
+  VAR1 (UNOPUS, lfrintnuv4hf, 2, ALL, v4hi)
+  VAR1 (UNOPUS, lfrintnuv8hf, 2, ALL, v8hi)
+  VAR1 (UNOPUS, lfrintnuv2sf, 2, ALL, v2si)
+  VAR1 (UNOPUS, lfrintnuv4sf, 2, ALL, v4si)
+  VAR1 (UNOPUS, lfrintnuv2df, 2, ALL, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2, ALL)
+  VAR1 (UNOPUS, lfrintnusf, 2, ALL, si)
+  VAR1 (UNOPUS, lfrintnudf, 2, ALL, di)
 
   /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
-  VAR1 (UNOP, floatv4hi, 2, v4hf)
-  VAR1 (UNOP, floatv8hi, 2, v8hf)
-  VAR1 (UNOP, floatv2si, 2, v2sf)
-  VAR1 (UNOP, floatv4si, 2, v4sf)
-  VAR1 (UNOP, floatv2di, 2, v2df)
+  VAR1 (UNOP, floatv4hi, 2, ALL, v4hf)
+  VAR1 (UNOP, floatv8hi, 2, ALL, v8hf)
+  VAR1 (UNOP, floatv2si, 2, ALL, v2sf)
+  VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
+  VAR1 (UNOP, floatv2di, 2, ALL, v2df)
 
-  VAR1 (UNOP, floatunsv4hi, 2, v4hf)
-  VAR1 (UNOP, floatunsv8hi, 2, v8hf)
-  VAR1 (UNOP, floatunsv2si, 2, v2sf)
-  VAR1 (UNOP, floatunsv4si, 2, v4sf)
-  VAR1 (UNOP, floatunsv2di, 2, v2df)
+  VAR1 (UNOP, floatunsv4hi, 2, ALL, v4hf)
+  VAR1 (UNOP, floatunsv8hi, 2, ALL, v8hf)
+  VAR1 (UNOP, floatunsv2si, 2, ALL, v2sf)
+  VAR1 (UNOP, floatunsv4si, 2, ALL, v4sf)
+  VAR1 (UNOP, floatunsv2di, 2, ALL, v2df)
 
-  VAR5 (UNOPU, bswap, 2, v4hi, v8hi, v2si, v4si, v2di)
+  VAR5 (UNOPU, bswap, 2, ALL, v4hi, v8hi, v2si, v4si, v2di)
 
-  BUILTIN_VB (UNOP, rbit, 0)
+  BUILTIN_VB (UNOP, rbit, 0, ALL)
 
   /* Implemented by
      aarch64_<PERMUTE:perm_insn><mode>.  */
-  BUILTIN_VALL (BINOP, zip1, 0)
-  BUILTIN_VALL (BINOP, zip2, 0)
-  BUILTIN_VALL (BINOP, uzp1, 0)
-  BUILTIN_VALL (BINOP, uzp2, 0)
-  BUILTIN_VALL (BINOP, trn1, 0)
-  BUILTIN_VALL (BINOP, trn2, 0)
+  BUILTIN_VALL (BINOP, zip1, 0, ALL)
+  BUILTIN_VALL (BINOP, zip2, 0, ALL)
+  BUILTIN_VALL (BINOP, uzp1, 0, ALL)
+  BUILTIN_VALL (BINOP, uzp2, 0, ALL)
+  BUILTIN_VALL (BINOP, trn1, 0, ALL)
+  BUILTIN_VALL (BINOP, trn2, 0, ALL)
 
-  BUILTIN_GPF_F16 (UNOP, frecpe, 0)
-  BUILTIN_GPF_F16 (UNOP, frecpx, 0)
+  BUILTIN_GPF_F16 (UNOP, frecpe, 0, ALL)
+  BUILTIN_GPF_F16 (UNOP, frecpx, 0, ALL)
 
-  BUILTIN_VDQ_SI (UNOP, urecpe, 0)
+  BUILTIN_VDQ_SI (UNOP, urecpe, 0, ALL)
 
-  BUILTIN_VHSDF (UNOP, frecpe, 0)
-  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0)
+  BUILTIN_VHSDF (UNOP, frecpe, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0, ALL)
 
   /* Implemented by a mixture of abs2 patterns.  Note the DImode builtin is
      only ever used for the int64x1_t intrinsic, there is no scalar version.  */
-  BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
-  BUILTIN_VHSDF (UNOP, abs, 2)
-  VAR1 (UNOP, abs, 2, hf)
+  BUILTIN_VSDQ_I_DI (UNOP, abs, 0, ALL)
+  BUILTIN_VHSDF (UNOP, abs, 2, ALL)
+  VAR1 (UNOP, abs, 2, ALL, hf)
 
-  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
-  VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
-  VAR1 (BINOP, float_truncate_hi_, 0, v8hf)
+  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10, ALL)
+  VAR1 (BINOP, float_truncate_hi_, 0, ALL, v4sf)
+  VAR1 (BINOP, float_truncate_hi_, 0, ALL, v8hf)
 
-  VAR1 (UNOP, float_extend_lo_, 0, v2df)
-  VAR1 (UNOP, float_extend_lo_,  0, v4sf)
-  BUILTIN_VDF (UNOP, float_truncate_lo_, 0)
+  VAR1 (UNOP, float_extend_lo_, 0, ALL, v2df)
+  VAR1 (UNOP, float_extend_lo_,  0, ALL, v4sf)
+  BUILTIN_VDF (UNOP, float_truncate_lo_, 0, ALL)
 
   /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (LOAD1, ld1, 0)
-  VAR1(STORE1P, ld1, 0, v2di)
+  BUILTIN_VALL_F16 (LOAD1, ld1, 0, ALL)
+  VAR1(STORE1P, ld1, 0, ALL, v2di)
 
   /* Implemented by aarch64_st1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (STORE1, st1, 0)
-  VAR1(STORE1P, st1, 0, v2di)
+  BUILTIN_VALL_F16 (STORE1, st1, 0, ALL)
+  VAR1(STORE1P, st1, 0, ALL, v2di)
 
   /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0, ALL)
 
   /* Implemented by aarch64_ld1x4<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0, ALL)
 
   /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0, ALL)
 
   /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0, ALL)
 
   /* Implemented by aarch64_st1x4<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, ALL)
 
   /* Implemented by fma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fma, 4)
-  VAR1 (TERNOP, fma, 4, hf)
+  BUILTIN_VHSDF (TERNOP, fma, 4, ALL)
+  VAR1 (TERNOP, fma, 4, ALL, hf)
   /* Implemented by fnma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fnma, 4)
-  VAR1 (TERNOP, fnma, 4, hf)
+  BUILTIN_VHSDF (TERNOP, fnma, 4, ALL)
+  VAR1 (TERNOP, fnma, 4, ALL, hf)
 
   /* Implemented by aarch64_simd_bsl<mode>.  */
-  BUILTIN_VDQQH (BSL_P, simd_bsl, 0)
-  VAR2 (BSL_P, simd_bsl,0, di, v2di)
-  BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0)
-  BUILTIN_VALLDIF (BSL_S, simd_bsl, 0)
+  BUILTIN_VDQQH (BSL_P, simd_bsl, 0, ALL)
+  VAR2 (BSL_P, simd_bsl,0, ALL, di, v2di)
+  BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0, ALL)
+  BUILTIN_VALLDIF (BSL_S, simd_bsl, 0, ALL)
 
   /* Implemented by aarch64_crypto_aes<op><mode>.  */
-  VAR1 (BINOPU, crypto_aese, 0, v16qi)
-  VAR1 (BINOPU, crypto_aesd, 0, v16qi)
-  VAR1 (UNOPU, crypto_aesmc, 0, v16qi)
-  VAR1 (UNOPU, crypto_aesimc, 0, v16qi)
+  VAR1 (BINOPU, crypto_aese, 0, ALL, v16qi)
+  VAR1 (BINOPU, crypto_aesd, 0, ALL, v16qi)
+  VAR1 (UNOPU, crypto_aesmc, 0, ALL, v16qi)
+  VAR1 (UNOPU, crypto_aesimc, 0, ALL, v16qi)
 
   /* Implemented by aarch64_crypto_sha1<op><mode>.  */
-  VAR1 (UNOPU, crypto_sha1h, 0, si)
-  VAR1 (BINOPU, crypto_sha1su1, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1c, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1m, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1p, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1su0, 0, v4si)
+  VAR1 (UNOPU, crypto_sha1h, 0, ALL, si)
+  VAR1 (BINOPU, crypto_sha1su1, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1c, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1m, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1p, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha1su0, 0, ALL, v4si)
 
   /* Implemented by aarch64_crypto_sha256<op><mode>.  */
-  VAR1 (TERNOPU, crypto_sha256h, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha256h2, 0, v4si)
-  VAR1 (BINOPU, crypto_sha256su0, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha256su1, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha256h, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha256h2, 0, ALL, v4si)
+  VAR1 (BINOPU, crypto_sha256su0, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha256su1, 0, ALL, v4si)
 
   /* Implemented by aarch64_crypto_pmull<mode>.  */
-  VAR1 (BINOPP, crypto_pmull, 0, di)
-  VAR1 (BINOPP, crypto_pmull, 0, v2di)
+  VAR1 (BINOPP, crypto_pmull, 0, ALL, di)
+  VAR1 (BINOPP, crypto_pmull, 0, ALL, v2di)
 
   /* Implemented by aarch64_tbl3<mode>.  */
-  VAR1 (BINOP, tbl3, 0, v8qi)
-  VAR1 (BINOP, tbl3, 0, v16qi)
+  VAR1 (BINOP, tbl3, 0, ALL, v8qi)
+  VAR1 (BINOP, tbl3, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbl3<mode>.  */
-  VAR1 (BINOP, qtbl3, 0, v8qi)
-  VAR1 (BINOP, qtbl3, 0, v16qi)
+  VAR1 (BINOP, qtbl3, 0, ALL, v8qi)
+  VAR1 (BINOP, qtbl3, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbl4<mode>.  */
-  VAR1 (BINOP, qtbl4, 0, v8qi)
-  VAR1 (BINOP, qtbl4, 0, v16qi)
+  VAR1 (BINOP, qtbl4, 0, ALL, v8qi)
+  VAR1 (BINOP, qtbl4, 0, ALL, v16qi)
 
   /* Implemented by aarch64_tbx4<mode>.  */
-  VAR1 (TERNOP, tbx4, 0, v8qi)
-  VAR1 (TERNOP, tbx4, 0, v16qi)
+  VAR1 (TERNOP, tbx4, 0, ALL, v8qi)
+  VAR1 (TERNOP, tbx4, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbx3<mode>.  */
-  VAR1 (TERNOP, qtbx3, 0, v8qi)
-  VAR1 (TERNOP, qtbx3, 0, v16qi)
+  VAR1 (TERNOP, qtbx3, 0, ALL, v8qi)
+  VAR1 (TERNOP, qtbx3, 0, ALL, v16qi)
 
   /* Implemented by aarch64_qtbx4<mode>.  */
-  VAR1 (TERNOP, qtbx4, 0, v8qi)
-  VAR1 (TERNOP, qtbx4, 0, v16qi)
+  VAR1 (TERNOP, qtbx4, 0, ALL, v8qi)
+  VAR1 (TERNOP, qtbx4, 0, ALL, v16qi)
 
   /* Builtins for ARMv8.1-A Adv.SIMD instructions.  */
 
   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h<mode>.  */
-  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0)
-  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0)
+  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0, ALL)
+  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0, ALL)
 
   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_lane<mode>.  */
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0)
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0, ALL)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0, ALL)
 
   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>.  */
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0, ALL)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0, ALL)
 
   /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3.  */
-  BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3)
-  BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3)
-  BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3)
-  BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3)
-  VAR1 (SHIFTIMM, scvtfsi, 3, hf)
-  VAR1 (SHIFTIMM, scvtfdi, 3, hf)
-  VAR1 (FCVTIMM_SUS, ucvtfsi, 3, hf)
-  VAR1 (FCVTIMM_SUS, ucvtfdi, 3, hf)
-  BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3)
-  BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3)
+  BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3, ALL)
+  BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3, ALL)
+  BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3, ALL)
+  BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3, ALL)
+  VAR1 (SHIFTIMM, scvtfsi, 3, ALL, hf)
+  VAR1 (SHIFTIMM, scvtfdi, 3, ALL, hf)
+  VAR1 (FCVTIMM_SUS, ucvtfsi, 3, ALL, hf)
+  VAR1 (FCVTIMM_SUS, ucvtfdi, 3, ALL, hf)
+  BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3, ALL)
+  BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3, ALL)
 
   /* Implemented by aarch64_rsqrte<mode>.  */
-  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0)
+  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0, ALL)
 
   /* Implemented by aarch64_rsqrts<mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0)
+  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0, ALL)
 
   /* Implemented by fabd<mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3)
+  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, ALL)
 
   /* Implemented by aarch64_faddp<mode>.  */
-  BUILTIN_VHSDF (BINOP, faddp, 0)
+  BUILTIN_VHSDF (BINOP, faddp, 0, ALL)
 
   /* Implemented by aarch64_cm<optab><mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0, ALL)
 
   /* Implemented by neg<mode>2.  */
-  BUILTIN_VHSDF_HSDF (UNOP, neg, 2)
+  BUILTIN_VHSDF_HSDF (UNOP, neg, 2, ALL)
 
   /* Implemented by aarch64_fac<optab><mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, ALL)
 
   /* Implemented by sqrt<mode>2.  */
-  VAR1 (UNOP, sqrt, 2, hf)
+  VAR1 (UNOP, sqrt, 2, ALL, hf)
 
   /* Implemented by <optab><mode>hf2.  */
-  VAR1 (UNOP, floatdi, 2, hf)
-  VAR1 (UNOP, floatsi, 2, hf)
-  VAR1 (UNOP, floathi, 2, hf)
-  VAR1 (UNOPUS, floatunsdi, 2, hf)
-  VAR1 (UNOPUS, floatunssi, 2, hf)
-  VAR1 (UNOPUS, floatunshi, 2, hf)
-  BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2)
-  BUILTIN_GPI (UNOP, fix_truncsf, 2)
-  BUILTIN_GPI (UNOP, fix_truncdf, 2)
-  BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2)
-  BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2)
-  BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2)
+  VAR1 (UNOP, floatdi, 2, ALL, hf)
+  VAR1 (UNOP, floatsi, 2, ALL, hf)
+  VAR1 (UNOP, floathi, 2, ALL, hf)
+  VAR1 (UNOPUS, floatunsdi, 2, ALL, hf)
+  VAR1 (UNOPUS, floatunssi, 2, ALL, hf)
+  VAR1 (UNOPUS, floatunshi, 2, ALL, hf)
+  BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2, ALL)
+  BUILTIN_GPI (UNOP, fix_truncsf, 2, ALL)
+  BUILTIN_GPI (UNOP, fix_truncdf, 2, ALL)
+  BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2, ALL)
+  BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2, ALL)
+  BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2, ALL)
 
   /* Implemented by aarch64_sm3ss1qv4si.  */
-  VAR1 (TERNOPU, sm3ss1q, 0, v4si)
+  VAR1 (TERNOPU, sm3ss1q, 0, ALL, v4si)
   /* Implemented by aarch64_sm3tt<sm3tt_op>qv4si.  */
-  VAR1 (QUADOPUI, sm3tt1aq, 0, v4si)
-  VAR1 (QUADOPUI, sm3tt1bq, 0, v4si)
-  VAR1 (QUADOPUI, sm3tt2aq, 0, v4si)
-  VAR1 (QUADOPUI, sm3tt2bq, 0, v4si)
+  VAR1 (QUADOPUI, sm3tt1aq, 0, ALL, v4si)
+  VAR1 (QUADOPUI, sm3tt1bq, 0, ALL, v4si)
+  VAR1 (QUADOPUI, sm3tt2aq, 0, ALL, v4si)
+  VAR1 (QUADOPUI, sm3tt2bq, 0, ALL, v4si)
   /* Implemented by aarch64_sm3partw<sm3part_op>qv4si.  */
-  VAR1 (TERNOPU, sm3partw1q, 0, v4si)
-  VAR1 (TERNOPU, sm3partw2q, 0, v4si)
+  VAR1 (TERNOPU, sm3partw1q, 0, ALL, v4si)
+  VAR1 (TERNOPU, sm3partw2q, 0, ALL, v4si)
   /* Implemented by aarch64_sm4eqv4si.  */
-  VAR1 (BINOPU, sm4eq, 0, v4si)
+  VAR1 (BINOPU, sm4eq, 0, ALL, v4si)
   /* Implemented by aarch64_sm4ekeyqv4si.  */
-  VAR1 (BINOPU, sm4ekeyq, 0, v4si)
+  VAR1 (BINOPU, sm4ekeyq, 0, ALL, v4si)
   /* Implemented by aarch64_crypto_sha512hqv2di.  */
-  VAR1 (TERNOPU, crypto_sha512hq, 0, v2di)
+  VAR1 (TERNOPU, crypto_sha512hq, 0, ALL, v2di)
   /* Implemented by aarch64_sha512h2qv2di.  */
-  VAR1 (TERNOPU, crypto_sha512h2q, 0, v2di)
+  VAR1 (TERNOPU, crypto_sha512h2q, 0, ALL, v2di)
   /* Implemented by aarch64_crypto_sha512su0qv2di.  */
-  VAR1 (BINOPU, crypto_sha512su0q, 0, v2di)
+  VAR1 (BINOPU, crypto_sha512su0q, 0, ALL, v2di)
   /* Implemented by aarch64_crypto_sha512su1qv2di.  */
-  VAR1 (TERNOPU, crypto_sha512su1q, 0, v2di)
+  VAR1 (TERNOPU, crypto_sha512su1q, 0, ALL, v2di)
   /* Implemented by eor3q<mode>4.  */
-  BUILTIN_VQ_I (TERNOPU, eor3q, 4)
-  BUILTIN_VQ_I (TERNOP, eor3q, 4)
+  BUILTIN_VQ_I (TERNOPU, eor3q, 4, ALL)
+  BUILTIN_VQ_I (TERNOP, eor3q, 4, ALL)
   /* Implemented by aarch64_rax1qv2di.  */
-  VAR1 (BINOPU, rax1q, 0, v2di)
+  VAR1 (BINOPU, rax1q, 0, ALL, v2di)
   /* Implemented by aarch64_xarqv2di.  */
-  VAR1 (TERNOPUI, xarq, 0, v2di)
+  VAR1 (TERNOPUI, xarq, 0, ALL, v2di)
   /* Implemented by bcaxq<mode>4.  */
-  BUILTIN_VQ_I (TERNOPU, bcaxq, 4)
-  BUILTIN_VQ_I (TERNOP, bcaxq, 4)
+  BUILTIN_VQ_I (TERNOPU, bcaxq, 4, ALL)
+  BUILTIN_VQ_I (TERNOP, bcaxq, 4, ALL)
 
   /* Implemented by aarch64_fml<f16mac1>l<f16quad>_low<mode>.  */
-  VAR1 (TERNOP, fmlal_low, 0, v2sf)
-  VAR1 (TERNOP, fmlsl_low, 0, v2sf)
-  VAR1 (TERNOP, fmlalq_low, 0, v4sf)
-  VAR1 (TERNOP, fmlslq_low, 0, v4sf)
+  VAR1 (TERNOP, fmlal_low, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlsl_low, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlalq_low, 0, ALL, v4sf)
+  VAR1 (TERNOP, fmlslq_low, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l<f16quad>_high<mode>.  */
-  VAR1 (TERNOP, fmlal_high, 0, v2sf)
-  VAR1 (TERNOP, fmlsl_high, 0, v2sf)
-  VAR1 (TERNOP, fmlalq_high, 0, v4sf)
-  VAR1 (TERNOP, fmlslq_high, 0, v4sf)
+  VAR1 (TERNOP, fmlal_high, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlsl_high, 0, ALL, v2sf)
+  VAR1 (TERNOP, fmlalq_high, 0, ALL, v4sf)
+  VAR1 (TERNOP, fmlslq_high, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l_lane_lowv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_lane_low, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_lane_low, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>l_laneq_lowv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_laneq_low, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_laneq_low, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_laneq_low, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_laneq_low, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>lq_lane_lowv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_lane_low, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_lane_low, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_lane_low, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_lane_low, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>lq_laneq_lowv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_laneq_low, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_laneq_low, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_laneq_low, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_laneq_low, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l_lane_highv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_lane_high, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_lane_high, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_lane_high, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_lane_high, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>l_laneq_highv2sf.  */
-  VAR1 (QUADOP_LANE, fmlal_laneq_high, 0, v2sf)
-  VAR1 (QUADOP_LANE, fmlsl_laneq_high, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlal_laneq_high, 0, ALL, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_laneq_high, 0, ALL, v2sf)
   /* Implemented by aarch64_fml<f16mac1>lq_lane_highv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_lane_high, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_lane_high, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_lane_high, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_lane_high, 0, ALL, v4sf)
   /* Implemented by aarch64_fml<f16mac1>lq_laneq_highv4sf.  */
-  VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, v4sf)
-  VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, ALL, v4sf)
 
   /* Implemented by aarch64_<frintnzs_op><mode>.  */
-  BUILTIN_VSFDF (UNOP, frint32z, 0)
-  BUILTIN_VSFDF (UNOP, frint32x, 0)
-  BUILTIN_VSFDF (UNOP, frint64z, 0)
-  BUILTIN_VSFDF (UNOP, frint64x, 0)
+  BUILTIN_VSFDF (UNOP, frint32z, 0, ALL)
+  BUILTIN_VSFDF (UNOP, frint32x, 0, ALL)
+  BUILTIN_VSFDF (UNOP, frint64z, 0, ALL)
+  BUILTIN_VSFDF (UNOP, frint64x, 0, ALL)
 
   /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
-  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
-  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
-  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
+  VAR2 (TERNOP, bfdot, 0, ALL, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, ALL, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, ALL, v2sf, v4sf)
 
   /* Implemented by aarch64_bfmmlaqv4sf  */
-  VAR1 (TERNOP, bfmmlaq, 0, v4sf)
+  VAR1 (TERNOP, bfmmlaq, 0, ALL, v4sf)
 
   /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf  */
-  VAR1 (TERNOP, bfmlalb, 0, v4sf)
-  VAR1 (TERNOP, bfmlalt, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf)
-  VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf)
+  VAR1 (TERNOP, bfmlalb, 0, ALL, v4sf)
+  VAR1 (TERNOP, bfmlalt, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
 
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
-  VAR1 (TERNOP, simd_smmla, 0, v16qi)
-  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
-  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
+  VAR1 (TERNOP, simd_smmla, 0, ALL, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, ALL, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, ALL, v16qi)
 
   /* Implemented by aarch64_bfcvtn{q}{2}<mode>  */
-  VAR1 (UNOP, bfcvtn, 0, v4bf)
-  VAR1 (UNOP, bfcvtn_q, 0, v8bf)
-  VAR1 (BINOP, bfcvtn2, 0, v8bf)
-  VAR1 (UNOP, bfcvt, 0, bf)
+  VAR1 (UNOP, bfcvtn, 0, ALL, v4bf)
+  VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
+  VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
+  VAR1 (UNOP, bfcvt, 0, ALL, bf)
diff --git a/gcc/config/aarch64/geniterators.sh b/gcc/config/aarch64/geniterators.sh
index a7420964f85..43feb482ce9 100644
--- a/gcc/config/aarch64/geniterators.sh
+++ b/gcc/config/aarch64/geniterators.sh
@@ -70,8 +70,8 @@ iterdef {
 	sub(/ *\]/, "", s)
 
 	n = split(s, a)
-	printf "#define BUILTIN_" a[1] "(T, N, MAP) \\\n"
-	printf "  VAR" (n-1) " (T, N, MAP"
+	printf "#define BUILTIN_" a[1] "(T, N, MAP, FLAG) \\\n"
+	printf "  VAR" (n-1) " (T, N, MAP, FLAG"
 	for (i = 2; i <= n; i++)
 		printf ", "  tolower(a[i])
 	printf ")\n"
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-16 14:05               ` xiezhiheng
@ 2020-07-17  9:03                 ` Richard Sandiford
  2020-07-30  2:43                   ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-07-17  9:03 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Thursday, July 16, 2020 8:42 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Tuesday, July 7, 2020 10:08 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> >> -----Original Message-----
>> >> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> >> Sent: Monday, July 6, 2020 5:31 PM
>> >> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> >> gcc-patches@gcc.gnu.org
>> >> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp
>> instructions
>> >> >> emitted at -O3
>> >> >>
>> >> >> No, this is unfortunately a known bug.  See:
>> >> >>
>> >> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
>> >> >>
>> >> >> (Although the PR is recent, it's been a known bug for longer.)
>> >> >>
>> >> >> As you say, the difficulty is that the correct attributes depend on what
>> >> >> the built-in function does.  Most integer arithmetic is “const”, but
>> >> things
>> >> >> get more complicated for floating-point arithmetic.
>> >> >>
>> >> >> The SVE intrinsics use a three stage process:
>> >> >>
>> >> >> - each function is classified into one of several groups
>> >> >> - each group has a set of flags that describe what functions in the
>> >> >>   group can do
>> >> >> - these flags get converted into attributes based on the current
>> >> >>   command-line options
>> >> >>
>> >> >> I guess we should have something similar for the arm_neon.h built-ins.
>> >> >>
>> >> >> If you're willing to help fix this, that'd be great.  I think a first
>> >> >> step would be to agree a design.
>> >> >>
>> >> >> Thanks,
>> >> >> Richard
>> >> >
>> >> > I'd like to have a try.
>> >>
>> >> Great!
>> >>
>> >> > I have checked the steps in SVE intrinsics.
>> >> > It defines a base class "function_base" and derives different classes
>> >> > to describe several intrinsics for each.  And each class may
>> >> > have its own unique flags described in virtual function "call_properties".
>> >> > The specific attributes will be converted from these flags in
>> >> > "get_attributes" later.
>> >> >
>> >> > I find that there are more than 100 classes in total and if I only
>> >> > need to classify them into different groups by attributes, maybe
>> >> > we does not need so many classes?
>> >>
>> >> Yeah, I agree.
>> >>
>> >> Long term, there might be value in defining arm_neon.h in a similar
>> >> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
>> >> a special compiler pragma.  But that's going to be a lot of work.
>> >>
>> >> I think it's possible to make incremental improvements to the current
>> >> arm_neon.h implementation without that work being thrown away if we
>> >> ever
>> >> did switch to a pragma in future.  And the incremental approach seems
>> >> more practical.
>> >>
>> >> > The difficult thing I think is how to classify neon intrinsics into
>> >> > different groups.  I'm going to follow up the way in SVE intrinsics
>> >> > first now.
>> >>
>> >> For now I'd suggest just giving a name to each combination of flags
>> >> that the intrinsics need, rather than splitting instructions in a
>> >> more fine-grained way.  (It's not at all obvious from the final state
>> >> of the SVE code, but even there, the idea was to have as few groups as
>> >> possible.  I.e. the groups were supposedly only split where necessary.
>> >> As you say, there still ended up being a lot of groups in the end…)
>> >>
>> >> It'd be easier to review if the work was split up into smaller steps.
>> >> E.g. maybe one way would be this, with each number being a single
>> >> patch:
>> >>
>> >> (1) (a) Add a flags field to the built-in function definitions
>> >>         that for now is always zero.
>> >>     (b) Pick a name N to describe the most conservative set of flags.
>> >>     (c) Make every built-in function definition use N.
>> >>
>> >
>> > I have finished the first part.
>> >
>> > (a) I add a new parameter called FLAG to every built-in function macro.
>> >
>> > (b) I define some flags in aarch64-builtins.c
>> > FLAG_NONE for no needed flags
>> > FLAG_READ_FPCR for functions will read FPCR register
>> > FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions
>> > FLAG_READ_MEMORY for functions will read global memory
>> > FLAG_PREFETCH_MEMORY for functions will prefetch data to memory
>> > FLAG_WRITE_MEMORY for functions will write global memory
>> >
>> > FLAG_FP is used for floating-point arithmetic
>> > FLAG_ALL is all flags above
>> >
>> > (c) I add a field in struct aarch64_simd_builtin_datum to record flags
>> > for each built-in function.  But the default flags I set for built-in functions
>> > are FLAG_ALL because by default the built-in functions might do anything.
>> >
>> > And bootstrap and regression are tested ok on aarch64 Linux platform.
>> 
>> This looks great.
>> 
>> The patch is OK for trunk, but could you send a changelog too,
>> so that I can include it in the commit message?
>> 
>> Thanks,
>> Richard
>
> OK, and I add the git commit msg in patch.

Thanks, pushed to master.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-17  9:03                 ` Richard Sandiford
@ 2020-07-30  2:43                   ` xiezhiheng
  2020-07-31  9:02                     ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-07-30  2:43 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2529 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, July 17, 2020 5:04 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
>

Cut...

> 
> Thanks, pushed to master.
> 
> Richard

And I have finished the second part.

In function aarch64_general_add_builtin, I add an argument ATTRS to
pass attributes for each built-in function.

And some new functions are added:
aarch64_call_properties: return flags for each built-in function based
on command-line options.  When the built-in function handles
floating-point values, add the FLAG_FP flag.

aarch64_modifies_global_state_p: True if the function would modify
global states.

aarch64_reads_global_state_p: True if the function would read
global states.

aarch64_could_trap_p: True if the function would raise a signal.

aarch64_add_attribute: Add attributes in ATTRS.

aarch64_get_attributes: return attributes for each built-in function
based on its flags and command-line options.

In function aarch64_init_simd_builtins, attributes are obtained from
the flags and passed to function aarch64_general_add_builtin.


Bootstrap is tested OK on aarch64 Linux platform, but regression
FAIL one test case ---- pr93423.f90.
However, I found that this test case would fail randomly in trunk.
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
Some PRs have tracked it.  After my patch, this test case would
always fail.  I guess the syntax errors in Fortran corrupt some structures,
resulting in illegal memory access, but I can't find exactly what it is.
But I think my patch should have no influence on it.

Have some further suggestions?

Thanks,
Xiezhiheng



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 871b97c8543..8882ec1d59a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2020-07-30  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_general_add_builtin):
+	Add new argument ATTRS.
+	(aarch64_call_properties): New function.
+	(aarch64_modifies_global_state_p): Likewise.
+	(aarch64_reads_global_state_p): Likewise.
+	(aarch64_could_trap_p): Likewise.
+	(aarch64_add_attribute): Likewise.
+	(aarch64_get_attributes): Likewise.
+	(aarch64_init_simd_builtins): Add attributes for each built-in function.
+


[-- Attachment #2: pr94442-v1.patch --]
[-- Type: application/octet-stream, Size: 5075 bytes --]

From edb4971fa4ddfd9ec72bca19f43ca0c28bd1dd7d Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Wed, 29 Jul 2020 21:12:16 -0400
Subject: [PATCH] AArch64: Add attributes according to flags in built-in
 functions [PR94442]

2020-07-30  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (aarch64_general_add_builtin):
	Add new argument ATTRS.
	(aarch64_call_properties): New function.
	(aarch64_modifies_global_state_p): Likewise.
	(aarch64_reads_global_state_p): Likewise.
	(aarch64_could_trap_p): Likewise.
	(aarch64_add_attribute): Likewise.
	(aarch64_get_attributes): Likewise.
	(aarch64_init_simd_builtins): Add attributes for each built-in function.
---
 gcc/config/aarch64/aarch64-builtins.c | 119 +++++++++++++++++++++++++-
 1 file changed, 115 insertions(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index d5fb29048c4..8fc73230761 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -629,13 +629,15 @@ tree aarch64_bf16_ptr_type_node = NULL_TREE;
 
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
-   (relative to AARCH64_BUILTIN_GENERAL).  */
+   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function
+   attributes.  */
 static tree
-aarch64_general_add_builtin (const char *name, tree type, unsigned int code)
+aarch64_general_add_builtin (const char *name, tree type, unsigned int code,
+			     tree attrs = NULL_TREE)
 {
   code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL;
   return add_builtin_function (name, type, code, BUILT_IN_MD,
-			       NULL, NULL_TREE);
+			       NULL, attrs);
 }
 
 static const char *
@@ -892,6 +894,113 @@ aarch64_init_simd_builtin_scalar_types (void)
 					     "__builtin_aarch64_simd_udi");
 }
 
+/* Return a set of FLAG_* flags that describe what the function could do,
+   taking the command-line flags into account.  */
+static unsigned int
+aarch64_call_properties (aarch64_simd_builtin_datum *d)
+{
+  unsigned int flags = d->flags;
+  switch (d->mode)
+    {
+    /* Floating-point.  */
+    case E_BFmode:
+    case E_V4BFmode:
+    case E_V8BFmode:
+    case E_HFmode:
+    case E_V4HFmode:
+    case E_V8HFmode:
+    case E_SFmode:
+    case E_V2SFmode:
+    case E_V4SFmode:
+    case E_DFmode:
+    case E_V1DFmode:
+    case E_V2DFmode:
+      flags |= FLAG_FP;
+      break;
+
+    default:
+      break;
+    }
+
+  /* -fno-trapping-math means that we can assume any FP exceptions
+     are not user-visible.  */
+  if (!flag_trapping_math)
+    flags &= ~FLAG_RAISE_FP_EXCEPTIONS;
+
+  return flags;
+}
+
+/* Return true if calls to the function could modify some form of
+   global state.  */
+static bool
+aarch64_modifies_global_state_p (aarch64_simd_builtin_datum *d)
+{
+  unsigned int flags = aarch64_call_properties (d);
+
+  if (flags & FLAG_RAISE_FP_EXCEPTIONS)
+    return true;
+
+  if (flags & FLAG_PREFETCH_MEMORY)
+    return true;
+
+  return flags & FLAG_WRITE_MEMORY;
+}
+
+/* Return true if calls to the function could read some form of
+   global state.  */
+static bool
+aarch64_reads_global_state_p (aarch64_simd_builtin_datum *d)
+{
+  unsigned int flags = aarch64_call_properties (d);
+
+  if (flags & FLAG_READ_FPCR)
+    return true;
+
+  return flags & FLAG_READ_MEMORY;
+}
+
+/* Return true if calls to the function could raise a signal.  */
+static bool
+aarch64_could_trap_p (aarch64_simd_builtin_datum *d)
+{
+  unsigned int flags = aarch64_call_properties (d);
+
+  if (flags & FLAG_RAISE_FP_EXCEPTIONS)
+    return true;
+
+  if (flags & (FLAG_READ_MEMORY | FLAG_WRITE_MEMORY))
+    return true;
+
+  return false;
+}
+
+/* Add attribute NAME to ATTRS.  */
+static tree
+aarch64_add_attribute (const char *name, tree attrs)
+{
+  return tree_cons (get_identifier (name), NULL_TREE, attrs);
+}
+
+/* Return the appropriate function attributes.  */
+static tree
+aarch64_get_attributes (aarch64_simd_builtin_datum *d)
+{
+  tree attrs = NULL_TREE;
+
+  if (!aarch64_modifies_global_state_p (d))
+    {
+      if (aarch64_reads_global_state_p (d))
+	attrs = aarch64_add_attribute ("pure", attrs);
+      else
+	attrs = aarch64_add_attribute ("const", attrs);
+    }
+
+  if (!flag_non_call_exceptions || !aarch64_could_trap_p (d))
+    attrs = aarch64_add_attribute ("nothrow", attrs);
+
+  return aarch64_add_attribute ("leaf", attrs);
+}
+
 static bool aarch64_simd_builtins_initialized_p = false;
 
 /* Due to the architecture not providing lane variant of the lane instructions
@@ -1045,7 +1154,9 @@ aarch64_init_simd_builtins (void)
 	snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s",
 		  d->name);
 
-      fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode);
+      tree attrs = aarch64_get_attributes (d);
+
+      fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode, attrs);
       aarch64_builtin_decls[fcode] = fndecl;
     }
 
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-30  2:43                   ` xiezhiheng
@ 2020-07-31  9:02                     ` Richard Sandiford
  2020-08-03  2:21                       ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-07-31  9:02 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, July 17, 2020 5:04 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>>
>
> Cut...
>
>> 
>> Thanks, pushed to master.
>> 
>> Richard
>
> And I have finished the second part.
>
> In function aarch64_general_add_builtin, I add an argument ATTRS to
> pass attributes for each built-in function.
>
> And some new functions are added:
> aarch64_call_properties: return flags for each built-in function based
> on command-line options.  When the built-in function handles
> floating-points, add FLAG_FP flag.
>
> aarch64_modifies_global_state_p: True if the function would modify
> global states.
>
> aarch64_reads_global_state_p: True if the function would read
> global states.
>
> aarch64_could_trap_p: True if the function would raise a signal.
>
> aarch64_add_attribute: Add attributes in ATTRS.
>
> aarch64_get_attributes: return attributes for each built-in functons
> based on flags and command-line options.
>
> In function aarch64_init_simd_builtins, attributes are get by flags
> and pass them to function aarch64_general_add_builtin.
>
>
> Bootstrap is tested OK on aarch64 Linux platform, but regression
> FAIL one test case ---- pr93423.f90.
> However, I found that this test case would fail randomly in trunk.
>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
> Some PRs have tracked it.  After my patch, this test case would
> always fail.  I guess the syntax errors in fortran crash some structures
> result in illegal memory access but I can't find what exactly it is.
> But I think my patch should have no influence on it.

Yeah, I agree.  And FWIW, I didn't see this in my testing.

I've pushed the patch with one trivial change: to remove the “and”
before “CODE” in:

>  /* Wrapper around add_builtin_function.  NAME is the name of the built-in
>     function, TYPE is the function type, and CODE is the function subcode
> -   (relative to AARCH64_BUILTIN_GENERAL).  */
> +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function
> +   attributes.  */

BTW, one thing to be careful of in future is that not all FP intrinsics
raise FP exceptions.  So while:

> +  switch (d->mode)
> +    {
> +    /* Floating-point.  */
> +    case E_BFmode:
> +    case E_V4BFmode:
> +    case E_V8BFmode:
> +    case E_HFmode:
> +    case E_V4HFmode:
> +    case E_V8HFmode:
> +    case E_SFmode:
> +    case E_V2SFmode:
> +    case E_V4SFmode:
> +    case E_DFmode:
> +    case E_V1DFmode:
> +    case E_V2DFmode:
> +      flags |= FLAG_FP;
> +      break;
> +
> +    default:
> +      break;
> +    }

is a good, conservatively-correct default, we might need an additional
flag to suppress it for certain intrinsics.

I've just realised that the code above could have used FLOAT_MODE_P,
but I didn't think of that before pushing the patch :-)

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-07-31  9:02                     ` Richard Sandiford
@ 2020-08-03  2:21                       ` xiezhiheng
  2020-08-03 13:55                         ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-03  2:21 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4497 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, July 31, 2020 5:03 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Friday, July 17, 2020 5:04 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >
> > Cut...
> >
> >>
> >> Thanks, pushed to master.
> >>
> >> Richard
> >
> > And I have finished the second part.
> >
> > In function aarch64_general_add_builtin, I add an argument ATTRS to
> > pass attributes for each built-in function.
> >
> > And some new functions are added:
> > aarch64_call_properties: return flags for each built-in function based
> > on command-line options.  When the built-in function handles
> > floating-points, add FLAG_FP flag.
> >
> > aarch64_modifies_global_state_p: True if the function would modify
> > global states.
> >
> > aarch64_reads_global_state_p: True if the function would read
> > global states.
> >
> > aarch64_could_trap_p: True if the function would raise a signal.
> >
> > aarch64_add_attribute: Add attributes in ATTRS.
> >
> > aarch64_get_attributes: return attributes for each built-in functons
> > based on flags and command-line options.
> >
> > In function aarch64_init_simd_builtins, attributes are get by flags
> > and pass them to function aarch64_general_add_builtin.
> >
> >
> > Bootstrap is tested OK on aarch64 Linux platform, but regression
> > FAIL one test case ---- pr93423.f90.
> > However, I found that this test case would fail randomly in trunk.
> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
> > Some PRs have tracked it.  After my patch, this test case would
> > always fail.  I guess the syntax errors in fortran crash some structures
> > result in illegal memory access but I can't find what exactly it is.
> > But I think my patch should have no influence on it.
> 
> Yeah, I agree.  And FWIW, I didn't see this in my testing.
> 
> I've pushed the patch with one trivial change: to remove the “and”
> before “CODE” in:
> 
> >  /* Wrapper around add_builtin_function.  NAME is the name of the
> built-in
> >     function, TYPE is the function type, and CODE is the function subcode
> > -   (relative to AARCH64_BUILTIN_GENERAL).  */
> > +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function
> > +   attributes.  */
> 
> BTW, one thing to be careful of in future is that not all FP intrinsics
> raise FP exceptions.  So while:
> 
> > +  switch (d->mode)
> > +    {
> > +    /* Floating-point.  */
> > +    case E_BFmode:
> > +    case E_V4BFmode:
> > +    case E_V8BFmode:
> > +    case E_HFmode:
> > +    case E_V4HFmode:
> > +    case E_V8HFmode:
> > +    case E_SFmode:
> > +    case E_V2SFmode:
> > +    case E_V4SFmode:
> > +    case E_DFmode:
> > +    case E_V1DFmode:
> > +    case E_V2DFmode:
> > +      flags |= FLAG_FP;
> > +      break;
> > +
> > +    default:
> > +      break;
> > +    }
> 
> is a good, conservatively-correct default, we might need an additional
> flag to suppress it for certain intrinsics.
> 

I agree.

> I've just realised that the code above could have used FLOAT_MODE_P,
> but I didn't think of that before pushing the patch :-)
> 

Sorry, I should have used it.  And I have prepared a patch to use the
FLOAT_MODE_P macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in the future.

Bootstrap and regression are tested ok on aarch64 Linux platform.

Thanks,
Xiezhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 83e41ff737e..a848b1f64f1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-08-03  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
+	Use FLOAT_MODE_P macro instead of enumerating all floating-point
+	modes and add global flag FLAG_SUPPRESS_FP_EXCEPTIONS.
+

> Thanks,
> Richard

[-- Attachment #2: pr94442-v1.patch --]
[-- Type: application/octet-stream, Size: 2149 bytes --]

From 1b50adbd1a7b9e10291a00cd87852efe678e6cdf Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Sun, 2 Aug 2020 21:27:00 -0400
Subject: [PATCH] AArch64: Use FLOAT_MODE_P macro and add
 FLAG_SUPPRESS_FP_EXCEPTIONS [PR94442]

2020-08-03  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
	Use FLOAT_MODE_P macro instead of enumerating all floating-point
	modes and add global flag FLAG_SUPPRESS_FP_EXCEPTIONS.
---
 gcc/config/aarch64/aarch64-builtins.c | 29 +++++++++------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 892ef9495e1..1a4625d356c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -125,6 +125,10 @@ const unsigned int FLAG_READ_MEMORY = 1U << 2;
 const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
 const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
 
+/* Not all FP intrinsics raise FP exceptions, use this flag to
+   suppress it.  */
+const unsigned int FLAG_SUPPRESS_FP_EXCEPTIONS = 1U << 5;
+
 const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
 const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
   | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
@@ -900,27 +904,12 @@ static unsigned int
 aarch64_call_properties (aarch64_simd_builtin_datum *d)
 {
   unsigned int flags = d->flags;
-  switch (d->mode)
-    {
-    /* Floating-point.  */
-    case E_BFmode:
-    case E_V4BFmode:
-    case E_V8BFmode:
-    case E_HFmode:
-    case E_V4HFmode:
-    case E_V8HFmode:
-    case E_SFmode:
-    case E_V2SFmode:
-    case E_V4SFmode:
-    case E_DFmode:
-    case E_V1DFmode:
-    case E_V2DFmode:
-      flags |= FLAG_FP;
-      break;
 
-    default:
-      break;
-    }
+  if (FLOAT_MODE_P (d->mode))
+    flags |= FLAG_FP;
+
+  if (flags & FLAG_SUPPRESS_FP_EXCEPTIONS)
+    flags &= ~FLAG_RAISE_FP_EXCEPTIONS;
 
   /* -fno-trapping-math means that we can assume any FP exceptions
      are not user-visible.  */
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-03  2:21                       ` xiezhiheng
@ 2020-08-03 13:55                         ` Richard Sandiford
  2020-08-04  8:01                           ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-03 13:55 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, July 31, 2020 5:03 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Friday, July 17, 2020 5:04 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >
>> > Cut...
>> >
>> >>
>> >> Thanks, pushed to master.
>> >>
>> >> Richard
>> >
>> > And I have finished the second part.
>> >
>> > In function aarch64_general_add_builtin, I add an argument ATTRS to
>> > pass attributes for each built-in function.
>> >
>> > And some new functions are added:
>> > aarch64_call_properties: return flags for each built-in function based
>> > on command-line options.  When the built-in function handles
>> > floating-points, add FLAG_FP flag.
>> >
>> > aarch64_modifies_global_state_p: True if the function would modify
>> > global states.
>> >
>> > aarch64_reads_global_state_p: True if the function would read
>> > global states.
>> >
>> > aarch64_could_trap_p: True if the function would raise a signal.
>> >
>> > aarch64_add_attribute: Add attributes in ATTRS.
>> >
>> > aarch64_get_attributes: return attributes for each built-in functons
>> > based on flags and command-line options.
>> >
>> > In function aarch64_init_simd_builtins, attributes are get by flags
>> > and pass them to function aarch64_general_add_builtin.
>> >
>> >
>> > Bootstrap is tested OK on aarch64 Linux platform, but regression
>> > FAIL one test case ---- pr93423.f90.
>> > However, I found that this test case would fail randomly in trunk.
>> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
>> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
>> > Some PRs have tracked it.  After my patch, this test case would
>> > always fail.  I guess the syntax errors in fortran crash some structures
>> > result in illegal memory access but I can't find what exactly it is.
>> > But I think my patch should have no influence on it.
>> 
>> Yeah, I agree.  And FWIW, I didn't see this in my testing.
>> 
>> I've pushed the patch with one trivial change: to remove the “and”
>> before “CODE” in:
>> 
>> >  /* Wrapper around add_builtin_function.  NAME is the name of the
>> built-in
>> >     function, TYPE is the function type, and CODE is the function subcode
>> > -   (relative to AARCH64_BUILTIN_GENERAL).  */
>> > +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function
>> > +   attributes.  */
>> 
>> BTW, one thing to be careful of in future is that not all FP intrinsics
>> raise FP exceptions.  So while:
>> 
>> > +  switch (d->mode)
>> > +    {
>> > +    /* Floating-point.  */
>> > +    case E_BFmode:
>> > +    case E_V4BFmode:
>> > +    case E_V8BFmode:
>> > +    case E_HFmode:
>> > +    case E_V4HFmode:
>> > +    case E_V8HFmode:
>> > +    case E_SFmode:
>> > +    case E_V2SFmode:
>> > +    case E_V4SFmode:
>> > +    case E_DFmode:
>> > +    case E_V1DFmode:
>> > +    case E_V2DFmode:
>> > +      flags |= FLAG_FP;
>> > +      break;
>> > +
>> > +    default:
>> > +      break;
>> > +    }
>> 
>> is a good, conservatively-correct default, we might need an additional
>> flag to suppress it for certain intrinsics.
>> 
>
> I agree.
>
>> I've just realised that the code above could have used FLOAT_MODE_P,
>> but I didn't think of that before pushing the patch :-)
>> 
>
> Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P
> macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
> FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.

The same thing is true for reading FPCR as well, so I think the flag
should suppress the FLOAT_MODE_P check, instead of fixing up the flags
afterwards.

I'm struggling to think of a good name though.  How about adding
FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on FLAG_AUTO_FP
being set?

We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
includes FLAG_FP.  Including it in FLAG_ALL wouldn't do any harm
though.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-03 13:55                         ` Richard Sandiford
@ 2020-08-04  8:01                           ` xiezhiheng
  2020-08-04 16:25                             ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-04  8:01 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1640 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Monday, August 3, 2020 9:55 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 

Cut...

> >
> > Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P
> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.
> 
> The same thing is true for reading FPCR as well, so I think the flag
> should suppress the FLOAT_MODE_P check, instead of fixing up the flags
> afterwards.
> 
> I'm struggling to think of a good name though.  How about adding
> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on
> FLAG_AUTO_FP
> being set?
> 
> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do any harm
> though.

I could not think of a better name either.  So in the end I chose to use
FLAG_AUTO_FP to control the FLOAT_MODE_P check.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
XieZhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b834a2c473a..f4a44704926 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-08-04  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
+	Use FLOAT_MODE_P macro instead of enumerating all floating-point
+	modes and add global flag FLAG_AUTO_FP.
+

[-- Attachment #2: pr94442-v2.patch --]
[-- Type: application/octet-stream, Size: 2257 bytes --]

From 6f86a9cafef21c2365f05b219b7c9575ad4d629b Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Mon, 3 Aug 2020 22:09:50 -0400
Subject: [PATCH] AArch64: Use FLOAT_MODE_P macro and add FLAG_AUTO_FP
 [PR94442]

All FP intrinsics are set with FLAG_FP by default, but not all FP intrinsics
raise FP exceptions or read the FPCR register.  So we add a global flag
FLAG_AUTO_FP to suppress the flag FLAG_FP.

2020-08-04  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
	Use FLOAT_MODE_P macro instead of enumerating all floating-point
	modes and add global flag FLAG_AUTO_FP.
---
 gcc/config/aarch64/aarch64-builtins.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 892ef9495e1..4f33dd936c7 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -125,6 +125,10 @@ const unsigned int FLAG_READ_MEMORY = 1U << 2;
 const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
 const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
 
+/* Not all FP intrinsics raise FP exceptions or read FPCR register,
+   use this flag to suppress it.  */
+const unsigned int FLAG_AUTO_FP = 1U << 5;
+
 const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
 const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
   | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
@@ -900,27 +904,9 @@ static unsigned int
 aarch64_call_properties (aarch64_simd_builtin_datum *d)
 {
   unsigned int flags = d->flags;
-  switch (d->mode)
-    {
-    /* Floating-point.  */
-    case E_BFmode:
-    case E_V4BFmode:
-    case E_V8BFmode:
-    case E_HFmode:
-    case E_V4HFmode:
-    case E_V8HFmode:
-    case E_SFmode:
-    case E_V2SFmode:
-    case E_V4SFmode:
-    case E_DFmode:
-    case E_V1DFmode:
-    case E_V2DFmode:
-      flags |= FLAG_FP;
-      break;
 
-    default:
-      break;
-    }
+  if (!(flags & FLAG_AUTO_FP) && FLOAT_MODE_P (d->mode))
+    flags |= FLAG_FP;
 
   /* -fno-trapping-math means that we can assume any FP exceptions
      are not user-visible.  */
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-04  8:01                           ` xiezhiheng
@ 2020-08-04 16:25                             ` Richard Sandiford
  2020-08-17  8:05                               ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-04 16:25 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> > Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P
>> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
>> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.
>> 
>> The same thing is true for reading FPCR as well, so I think the flag
>> should suppress the FLOAT_MODE_P check, instead of fixing up the flags
>> afterwards.
>> 
>> I'm struggling to think of a good name though.  How about adding
>> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on
>> FLAG_AUTO_FP
>> being set?
>> 
>> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
>> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do any harm
>> though.
>
> I could not think of a better name either.  So I choose to use FLAG_AUTO_FP
> to control the check of FLOAT_MODE_P finally.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, pushed to master.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-04 16:25                             ` Richard Sandiford
@ 2020-08-17  8:05                               ` xiezhiheng
  2020-08-19 10:06                                 ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-17  8:05 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2479 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, August 5, 2020 12:26 AM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> > Sorry, I should have used it.  And I prepare a patch to use
> FLOAT_MODE_P
> >> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
> >> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.
> >>
> >> The same thing is true for reading FPCR as well, so I think the flag
> >> should suppress the FLOAT_MODE_P check, instead of fixing up the flags
> >> afterwards.
> >>
> >> I'm struggling to think of a good name though.  How about adding
> >> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on
> >> FLAG_AUTO_FP
> >> being set?
> >>
> >> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
> >> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do any harm
> >> though.
> >
> > I could not think of a better name either.  So I choose to use
> FLAG_AUTO_FP
> > to control the check of FLOAT_MODE_P finally.
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> Thanks, pushed to master.
> 
> Richard

I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
including all the add/sub arithmetic intrinsics.

Something like faddp intrinsic which only handles floating-point operations,
both FP and NONE flags are suitable for it because FLAG_FP will be added
later if the intrinsic handles floating-point operations.  And I prefer FP since
it would be more clear.

But for qadd intrinsics, they would modify FPSR register which is a scenario
I missed before.  And I consider to add an additional flag FLAG_WRITE_FPSR
to represent it.

Bootstrapped and tested on aarch64 Linux platform.

Have any suggestions?

Thanks,
XieZhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9cf1f9733e7..cde50c54d9e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2020-08-17  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_modifies_global_state_p):
	Add flag FLAG_WRITE_FPSR to control attributes.
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAGS
+	for intrinsic functions.
+

[-- Attachment #2: pr94442-v1.patch --]
[-- Type: application/octet-stream, Size: 7727 bytes --]

From ab6edbb4ca65ca3a09b0aec4d3076438e7ae0116 Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Mon, 17 Aug 2020 02:57:09 -0400
Subject: [PATCH] AArch64: Add FLAGS for intrinsic function [PR94442]

Additional flag FLAG_WRITE_FPSR is introduced because some intrinsics
would modify global FPSR register to help describe the attributes of
intrinsics.  And proper attributes are added for part of intrinsics.

2020-08-17  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (aarch64_modifies_global_state_p):
	Add flag FLAG_WRITE_FPSR to control attributes.
	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAGS
	for intrinsic functions.
---
 gcc/config/aarch64/aarch64-builtins.c        |  6 +-
 gcc/config/aarch64/aarch64-simd-builtins.def | 88 ++++++++++----------
 2 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 4f33dd936c7..47504ace24c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -128,10 +128,12 @@ const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
 /* Not all FP intrinsics raise FP exceptions or read FPCR register,
    use this flag to suppress it.  */
 const unsigned int FLAG_AUTO_FP = 1U << 5;
+const unsigned int FLAG_WRITE_FPSR = 1U << 6;
 
 const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
 const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
-  | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
+  | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY
+  | FLAG_WRITE_FPSR;
 
 typedef struct
 {
@@ -929,7 +931,7 @@ aarch64_modifies_global_state_p (aarch64_simd_builtin_datum *d)
   if (flags & FLAG_PREFETCH_MEMORY)
     return true;
 
-  return flags & FLAG_WRITE_MEMORY;
+  return flags & (FLAG_WRITE_MEMORY | FLAG_WRITE_FPSR);
 }
 
 /* Return true if calls to the function could read some form of
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index e8650121cd6..9b182285e19 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -37,15 +37,19 @@
    macro holding the RTL pattern for the intrinsic.  This mapping is:
    0 - CODE_FOR_aarch64_<name><mode>
    1-9 - CODE_FOR_<name><mode><1-9>
-   10 - CODE_FOR_<name><mode>.  */
+   10 - CODE_FOR_<name><mode>.
+
+   Parameter 4 is the 'flag' of the intrinsic.  This is used to
+   help describe the attributes (for example, pure) for the intrinsic
+   function.  */
 
   BUILTIN_VDC (COMBINE, combine, 0, ALL)
   VAR1 (COMBINEP, combine, 0, ALL, di)
   BUILTIN_VB (BINOP, pmul, 0, ALL)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL)
   BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
-  BUILTIN_VD_BHSI (BINOP, addp, 0, ALL)
-  VAR1 (UNOP, addp, 0, ALL, di)
+  BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
+  VAR1 (UNOP, addp, 0, NONE, di)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL)
   BUILTIN_VDQ_BHSI (UNOP, clz, 2, ALL)
   BUILTIN_VS (UNOP, ctz, 2, ALL)
@@ -57,13 +61,13 @@
   BUILTIN_VSDQ_I (BINOP, sqrshl, 0, ALL)
   BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, ALL)
   /* Implemented by aarch64_<su_optab><optab><mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqadd, 0, ALL)
-  BUILTIN_VSDQ_I (BINOPU, uqadd, 0, ALL)
-  BUILTIN_VSDQ_I (BINOP, sqsub, 0, ALL)
-  BUILTIN_VSDQ_I (BINOPU, uqsub, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP, sqadd, 0, WRITE_FPSR)
+  BUILTIN_VSDQ_I (BINOPU, uqadd, 0, WRITE_FPSR)
+  BUILTIN_VSDQ_I (BINOP, sqsub, 0, WRITE_FPSR)
+  BUILTIN_VSDQ_I (BINOPU, uqsub, 0, WRITE_FPSR)
   /* Implemented by aarch64_<sur>qadd<mode>.  */
-  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, ALL)
-  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, ALL)
+  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, WRITE_FPSR)
+  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, WRITE_FPSR)
 
   /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>.  */
   BUILTIN_VDC (GETREG, get_dregoi, 0, ALL)
@@ -119,41 +123,41 @@
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL)
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0, ALL)
 
-  BUILTIN_VQW (BINOP, saddl2, 0, ALL)
-  BUILTIN_VQW (BINOP, uaddl2, 0, ALL)
-  BUILTIN_VQW (BINOP, ssubl2, 0, ALL)
-  BUILTIN_VQW (BINOP, usubl2, 0, ALL)
-  BUILTIN_VQW (BINOP, saddw2, 0, ALL)
-  BUILTIN_VQW (BINOP, uaddw2, 0, ALL)
-  BUILTIN_VQW (BINOP, ssubw2, 0, ALL)
-  BUILTIN_VQW (BINOP, usubw2, 0, ALL)
+  BUILTIN_VQW (BINOP, saddl2, 0, NONE)
+  BUILTIN_VQW (BINOP, uaddl2, 0, NONE)
+  BUILTIN_VQW (BINOP, ssubl2, 0, NONE)
+  BUILTIN_VQW (BINOP, usubl2, 0, NONE)
+  BUILTIN_VQW (BINOP, saddw2, 0, NONE)
+  BUILTIN_VQW (BINOP, uaddw2, 0, NONE)
+  BUILTIN_VQW (BINOP, ssubw2, 0, NONE)
+  BUILTIN_VQW (BINOP, usubw2, 0, NONE)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, uaddl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, ssubl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, usubl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, saddl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, uaddl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, ssubl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, usubl, 0, NONE)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, uaddw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, ssubw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, usubw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, saddw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, uaddw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, ssubw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, usubw, 0, NONE)
   /* Implemented by aarch64_<sur>h<addsub><mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE)
   /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
-  BUILTIN_VQN (BINOP, addhn, 0, ALL)
-  BUILTIN_VQN (BINOP, subhn, 0, ALL)
-  BUILTIN_VQN (BINOP, raddhn, 0, ALL)
-  BUILTIN_VQN (BINOP, rsubhn, 0, ALL)
+  BUILTIN_VQN (BINOP, addhn, 0, NONE)
+  BUILTIN_VQN (BINOP, subhn, 0, NONE)
+  BUILTIN_VQN (BINOP, raddhn, 0, NONE)
+  BUILTIN_VQN (BINOP, rsubhn, 0, NONE)
   /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
-  BUILTIN_VQN (TERNOP, addhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, subhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, raddhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, rsubhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, addhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, subhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, raddhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, rsubhn2, 0, NONE)
 
   BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0, ALL)
   /* Implemented by aarch64_<sur>qmovn<mode>.  */
@@ -238,8 +242,8 @@
   BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0, ALL)
 
   /* Implemented by aarch64_fcadd<rot><mode>.   */
-  BUILTIN_VHSDF (BINOP, fcadd90, 0, ALL)
-  BUILTIN_VHSDF (BINOP, fcadd270, 0, ALL)
+  BUILTIN_VHSDF (BINOP, fcadd90, 0, FP)
+  BUILTIN_VHSDF (BINOP, fcadd270, 0, FP)
 
   /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>.   */
   BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL)
@@ -594,7 +598,7 @@
   BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, ALL)
 
   /* Implemented by aarch64_faddp<mode>.  */
-  BUILTIN_VHSDF (BINOP, faddp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, faddp, 0, FP)
 
   /* Implemented by aarch64_cm<optab><mode>.  */
   BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0, ALL)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-17  8:05                               ` xiezhiheng
@ 2020-08-19 10:06                                 ` Richard Sandiford
  2020-08-20  8:24                                   ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-19 10:06 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
> I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
> including all the add/sub arithmetic intrinsics.
>
> Something like faddp intrinsic which only handles floating-point operations,
> both FP and NONE flags are suitable for it because FLAG_FP will be added
> later if the intrinsic handles floating-point operations.  And I prefer FP since
> it would be more clear.

Sounds good to me.

> But for qadd intrinsics, they would modify FPSR register which is a scenario
> I missed before.  And I consider to add an additional flag FLAG_WRITE_FPSR
> to represent it.

I don't think we make any attempt to guarantee that the Q flag is
meaningful after saturating intrinsics.  To do that, we'd need to model
the modification of the flag in the .md patterns too.

So my preference would be to leave this out and just use NONE for the
saturating forms too.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-19 10:06                                 ` Richard Sandiford
@ 2020-08-20  8:24                                   ` xiezhiheng
  2020-08-20  8:55                                     ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-20  8:24 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2284 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, August 19, 2020 6:06 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
> > including all the add/sub arithmetic intrinsics.
> >
> > Something like faddp intrinsic which only handles floating-point operations,
> > both FP and NONE flags are suitable for it because FLAG_FP will be added
> > later if the intrinsic handles floating-point operations.  And I prefer FP
> since
> > it would be more clear.
> 
> Sounds good to me.
> 
> > But for qadd intrinsics, they would modify FPSR register which is a scenario
> > I missed before.  And I consider to add an additional flag
> FLAG_WRITE_FPSR
> > to represent it.
> 
> I don't think we make any attempt to guarantee that the Q flag is
> meaningful after saturating intrinsics.  To do that, we'd need to model
> the modification of the flag in the .md patterns too.
> 
> So my preference would be to leave this out and just use NONE for the
> saturating forms too.

The problem is that the test case in the attachment has different results under -O0 and -O2.

In gimple phase statement:
  _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
would be treated as dead code if we set NONE flag for saturating intrinsics.
Adding FLAG_WRITE_FPSR would help fix this problem.

Even when we set FLAG_WRITE_FPSR, the uqadd insn: 
  (insn 11 10 12 2 (set (reg:V2SI 97)
        (us_plus:V2SI (reg:V2SI 98)
            (reg:V2SI 99))) {aarch64_uqaddv2si}
     (nil))
could also be eliminated in the RTL phase because this insn will be treated as a dead insn.
So I think we might also need to modify saturating instruction patterns adding the side effect of set the FPSR register.

So if we could use NONE flag for saturating intrinsics, the description of function attributes and patterns are both incorrect. 
I think I can propose another patch to fix the patterns if you agree? 

Thanks,
Xie Zhiheng

[-- Attachment #2: test.c --]
[-- Type: text/plain, Size: 852 bytes --]

#include <arm_neon.h>
#include <stdlib.h>

typedef union {
  struct {
    int _xxx:24;
    unsigned int FZ:1;
    unsigned int DN:1;
    unsigned int AHP:1;
    unsigned int QC:1;
    int V:1;
    int C:1;
    int Z:1;
    int N:1;
  } b;
  unsigned int word;
} _ARM_FPSCR;

static volatile int __read_neon_cumulative_sat (void) {
    _ARM_FPSCR _afpscr_for_qc;
    asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
    return _afpscr_for_qc.b.QC;
}

int main()
{
  uint32x2_t op0, op1, res;

  op0 = vdup_n_u32 ((uint32_t)0xfffffff0);
  op1 = vdup_n_u32 ((uint32_t)0x20);

  _ARM_FPSCR _afpscr_for_qc;
  asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
  _afpscr_for_qc.b.QC = (0);
  asm volatile ("msr fpsr,%0" :  : "r" (_afpscr_for_qc));

  res = vqadd_u32 (op0, op1);
  if (__read_neon_cumulative_sat () != 1)
    abort ();

  return 0;
}

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-20  8:24                                   ` xiezhiheng
@ 2020-08-20  8:55                                     ` Richard Sandiford
  2020-08-20 12:16                                       ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-20  8:55 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Wednesday, August 19, 2020 6:06 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
>> > including all the add/sub arithmetic intrinsics.
>> >
>> > Something like faddp intrinsic which only handles floating-point operations,
>> > both FP and NONE flags are suitable for it because FLAG_FP will be added
>> > later if the intrinsic handles floating-point operations.  And I prefer FP
>> since
>> > it would be more clear.
>> 
>> Sounds good to me.
>> 
>> > But for qadd intrinsics, they would modify FPSR register which is a scenario
>> > I missed before.  And I consider to add an additional flag
>> FLAG_WRITE_FPSR
>> > to represent it.
>> 
>> I don't think we make any attempt to guarantee that the Q flag is
>> meaningful after saturating intrinsics.  To do that, we'd need to model
>> the modification of the flag in the .md patterns too.
>> 
>> So my preference would be to leave this out and just use NONE for the
>> saturating forms too.
>
> The problem is that the test case in the attachment has different results under -O0 and -O2.

Right.  But my point was that I don't think that use case is supported.
If you want to use saturating instructions and read the Q flag afterwards,
the saturating instructions need to be inline asm too.

> In gimple phase statement:
>   _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
> would be treated as dead code if we set NONE flag for saturating intrinsics.
> Adding FLAG_WRITE_FPSR would help fix this problem.
>
> Even when we set FLAG_WRITE_FPSR, the uqadd insn: 
>   (insn 11 10 12 2 (set (reg:V2SI 97)
>         (us_plus:V2SI (reg:V2SI 98)
>             (reg:V2SI 99))) {aarch64_uqaddv2si}
>      (nil))
> could also be eliminated in RTL phase because this insn will be treated as dead insn.
> So I think we might also need to modify saturating instruction patterns adding the side effect of set the FPSR register.

The problem is that FPSR is global state and we don't in general
know who might read it.  So if we modelled the modification of the FPSR,
we'd never be able to fold away saturating arithmetic that does actually
saturate at compile time, because we'd never know whether the program
wanted the effect on the Q flag result to be visible (perhaps to another
function that the compiler can't see).  We'd also be unable to remove
results that really are dead.

So I think this is one of those situations in which we can't keep all
constituents happy.  Catering for people who want to read the Q flag
would make things worse for those who want saturating arithmetic to be
optimised as aggressively as possible.  And the same holds in reverse.

Thanks,
Richard

>
> So if we could use NONE flag for saturating intrinsics, the description of function attributes and patterns are both incorrect. 
> I think I can propose another patch to fix the patterns if you agree? 
>
> Thanks,
> Xie Zhiheng
>
> #include <arm_neon.h>
> #include <stdlib.h>
>
> typedef union {
>   struct {
>     int _xxx:24;
>     unsigned int FZ:1;
>     unsigned int DN:1;
>     unsigned int AHP:1;
>     unsigned int QC:1;
>     int V:1;
>     int C:1;
>     int Z:1;
>     int N:1;
>   } b;
>   unsigned int word;
> } _ARM_FPSCR;
>
> static volatile int __read_neon_cumulative_sat (void) {
>     _ARM_FPSCR _afpscr_for_qc;
>     asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
>     return _afpscr_for_qc.b.QC;
> }
>
> int main()
> {
>   uint32x2_t op0, op1, res;
>
>   op0 = vdup_n_u32 ((uint32_t)0xfffffff0);
>   op1 = vdup_n_u32 ((uint32_t)0x20);
>
>   _ARM_FPSCR _afpscr_for_qc;
>   asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
>   _afpscr_for_qc.b.QC = (0);
>   asm volatile ("msr fpsr,%0" :  : "r" (_afpscr_for_qc));
>
>   res = vqadd_u32 (op0, op1);
>   if (__read_neon_cumulative_sat () != 1)
>     abort ();
>
>   return 0;
> }

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-20  8:55                                     ` Richard Sandiford
@ 2020-08-20 12:16                                       ` xiezhiheng
  2020-08-21  9:02                                         ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-20 12:16 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Thursday, August 20, 2020 4:55 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Wednesday, August 19, 2020 6:06 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a
> try,
> >> > including all the add/sub arithmetic intrinsics.
> >> >
> >> > Something like faddp intrinsic which only handles floating-point
> operations,
> >> > both FP and NONE flags are suitable for it because FLAG_FP will be
> added
> >> > later if the intrinsic handles floating-point operations.  And I prefer FP
> >> since
> >> > it would be more clear.
> >>
> >> Sounds good to me.
> >>
> >> > But for qadd intrinsics, they would modify FPSR register which is a
> scenario
> >> > I missed before.  And I consider to add an additional flag
> >> FLAG_WRITE_FPSR
> >> > to represent it.
> >>
> >> I don't think we make any attempt to guarantee that the Q flag is
> >> meaningful after saturating intrinsics.  To do that, we'd need to model
> >> the modification of the flag in the .md patterns too.
> >>
> >> So my preference would be to leave this out and just use NONE for the
> >> saturating forms too.
> >
> > The problem is that the test case in the attachment has different results
> under -O0 and -O2.
> 
> Right.  But my point was that I don't think that use case is supported.
> If you want to use saturating instructions and read the Q flag afterwards,
> the saturating instructions need to be inline asm too.
> 
> > In gimple phase statement:
> >   _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
> > would be treated as dead code if we set NONE flag for saturating intrinsics.
> > Adding FLAG_WRITE_FPSR would help fix this problem.
> >
> > Even when we set FLAG_WRITE_FPSR, the uqadd insn:
> >   (insn 11 10 12 2 (set (reg:V2SI 97)
> >         (us_plus:V2SI (reg:V2SI 98)
> >             (reg:V2SI 99))) {aarch64_uqaddv2si}
> >      (nil))
> > could also be eliminated in RTL phase because this insn will be treated as
> dead insn.
> > So I think we might also need to modify saturating instruction patterns
> adding the side effect of set the FPSR register.
> 
> The problem is that FPSR is global state and we don't in general
> know who might read it.  So if we modelled the modification of the FPSR,
> we'd never be able to fold away saturating arithmetic that does actually
> saturate at compile time, because we'd never know whether the program
> wanted the effect on the Q flag result to be visible (perhaps to another
> function that the compiler can't see).  We'd also be unable to remove
> results that really are dead.
> 
> So I think this is one of those situations in which we can't keep all
> constituents happy.  Catering for people who want to read the Q flag
> would make things worse for those who want saturating arithmetic to be
> optimised as aggressively as possible.  And the same holds in reverse.

I agree.  The test case is extracted from gcc.target/aarch64/advsimd-intrinsics/vqadd.c
If we set NONE flag for saturating intrinsics, it would fail in regression because some qadd
intrinsics would be treated as dead code and be eliminated.
  Running target unix
  Running ./gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp ...
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  (test for excess errors)
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  execution test

So maybe this test case should only be tested at -O0 level?

Thanks,
Xie Zhiheng

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-20 12:16                                       ` xiezhiheng
@ 2020-08-21  9:02                                         ` Richard Sandiford
  2020-08-25  3:14                                           ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-21  9:02 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Thursday, August 20, 2020 4:55 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Wednesday, August 19, 2020 6:06 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a
>> try,
>> >> > including all the add/sub arithmetic intrinsics.
>> >> >
>> >> > Something like faddp intrinsic which only handles floating-point
>> operations,
>> >> > both FP and NONE flags are suitable for it because FLAG_FP will be
>> added
>> >> > later if the intrinsic handles floating-point operations.  And I prefer FP
>> >> since
>> >> > it would be more clear.
>> >>
>> >> Sounds good to me.
>> >>
>> >> > But for qadd intrinsics, they would modify FPSR register which is a
>> scenario
>> >> > I missed before.  And I consider to add an additional flag
>> >> FLAG_WRITE_FPSR
>> >> > to represent it.
>> >>
>> >> I don't think we make any attempt to guarantee that the Q flag is
>> >> meaningful after saturating intrinsics.  To do that, we'd need to model
>> >> the modification of the flag in the .md patterns too.
>> >>
>> >> So my preference would be to leave this out and just use NONE for the
>> >> saturating forms too.
>> >
>> > The problem is that the test case in the attachment has different results
>> under -O0 and -O2.
>> 
>> Right.  But my point was that I don't think that use case is supported.
>> If you want to use saturating instructions and read the Q flag afterwards,
>> the saturating instructions need to be inline asm too.
>> 
>> > In gimple phase statement:
>> >   _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
>> > would be treated as dead code if we set NONE flag for saturating intrinsics.
>> > Adding FLAG_WRITE_FPSR would help fix this problem.
>> >
>> > Even when we set FLAG_WRITE_FPSR, the uqadd insn:
>> >   (insn 11 10 12 2 (set (reg:V2SI 97)
>> >         (us_plus:V2SI (reg:V2SI 98)
>> >             (reg:V2SI 99))) {aarch64_uqaddv2si}
>> >      (nil))
>> > could also be eliminated in RTL phase because this insn will be treated as
>> dead insn.
>> > So I think we might also need to modify saturating instruction patterns
>> adding the side effect of set the FPSR register.
>> 
>> The problem is that FPSR is global state and we don't in general
>> know who might read it.  So if we modelled the modification of the FPSR,
>> we'd never be able to fold away saturating arithmetic that does actually
>> saturate at compile time, because we'd never know whether the program
>> wanted the effect on the Q flag result to be visible (perhaps to another
>> function that the compiler can't see).  We'd also be unable to remove
>> results that really are dead.
>> 
>> So I think this is one of those situations in which we can't keep all
>> constituents happy.  Catering for people who want to read the Q flag
>> would make things worse for those who want saturating arithmetic to be
>> optimised as aggressively as possible.  And the same holds in reverse.
>
> I agree.  The test case is extracted from gcc.target/aarch64/advsimd-intrinsics/vqadd.c
> If we set NONE flag for saturating intrinsics, it would fail in regression because some qadd
> intrinsics would be treated as dead code and be eliminated.
>   Running target unix
>   Running ./gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp ...
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  (test for excess errors)
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  execution test

Ah, OK.

> So maybe this test case should only be tested at -O0 level?

Looks like the saturating intrinsics might need a bit more thought.
Would you mind submitting the patch with just the other parts?
Those were uncontroversial and it would be a shame to hold them
up over this.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-21  9:02                                         ` Richard Sandiford
@ 2020-08-25  3:14                                           ` xiezhiheng
  2020-08-25 11:07                                             ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-25  3:14 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2612 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, August 21, 2020 5:02 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3

Cut...
 
> Looks like the saturating intrinsics might need a bit more thought.
> Would you mind submitting the patch with just the other parts?
> Those were uncontroversial and it would be a shame to hold them
> up over this.

Okay, I reorganized the existing patch and finished the first half of the intrinsics
except saturating intrinsics and load intrinsics.

Bootstrapped and tested on aarch64 Linux platform.

For load intrinsics, I have one problem when I set FLAG_READ_MEMORY for them,
some test cases like
gcc.target/aarch64/advsimd-intrinsics/vld2_lane_p8_indices_1.c
  #include <arm_neon.h>

  /* { dg-do compile } */
  /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */

  poly8x8x2_t
  f_vld2_lane_p8 (poly8_t * p, poly8x8x2_t v)
  {
    poly8x8x2_t res;
    /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
    res = vld2_lane_p8 (p, v, 8);
    /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
    res = vld2_lane_p8 (p, v, -1);
    return res;
  }
would fail in regression.  Because the first statement
  res = vld2_lane_p8 (p, v, 8);
would be eliminated as dead code in gimple phase but the error message is
generated in expand pass.  So I am going to replace the second statement
  res = vld2_lane_p8 (p, v, -1);
with
  res = vld2_lane_p8 (p, res, -1);
or do you have any other suggestions?

And for test case gcc.target/aarch64/arg-type-diagnostics-1.c, I return the result
to prevent the statement
  result = vrsra_n_s32 (arg1, arg2, a);
from being eliminated by being treated as dead code.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7a71b4367d4..217344d7d1f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-25  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAGS
+	for intrinsic functions.
+

diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index b9562e67883..e10bcc9b28a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-25  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* gcc.target/aarch64/arg-type-diagnostics-1.c: Return result
+	to prevent statement from being eliminated.
+

[-- Attachment #2: pr94442-v1.patch --]
[-- Type: application/octet-stream, Size: 28927 bytes --]

From 7769142a0855d38510869262a9e413dc48766086 Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Mon, 24 Aug 2020 21:35:51 -0400
Subject: [PATCH] AArch64: Add FLAGS for intrinsic functions [PR94442]

2020-08-25  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAGS
	for intrinsic functions.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/arg-type-diagnostics-1.c: Return result
	to prevent statement from being eliminated.
---
 gcc/config/aarch64/aarch64-simd-builtins.def  | 560 +++++++++---------
 .../aarch64/arg-type-diagnostics-1.c          |   3 +-
 2 files changed, 284 insertions(+), 279 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index e8650121cd6..c37c6d561ba 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -37,19 +37,23 @@
    macro holding the RTL pattern for the intrinsic.  This mapping is:
    0 - CODE_FOR_aarch64_<name><mode>
    1-9 - CODE_FOR_<name><mode><1-9>
-   10 - CODE_FOR_<name><mode>.  */
-
-  BUILTIN_VDC (COMBINE, combine, 0, ALL)
-  VAR1 (COMBINEP, combine, 0, ALL, di)
-  BUILTIN_VB (BINOP, pmul, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
-  BUILTIN_VD_BHSI (BINOP, addp, 0, ALL)
-  VAR1 (UNOP, addp, 0, ALL, di)
-  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL)
-  BUILTIN_VDQ_BHSI (UNOP, clz, 2, ALL)
-  BUILTIN_VS (UNOP, ctz, 2, ALL)
-  BUILTIN_VB (UNOP, popcount, 2, ALL)
+   10 - CODE_FOR_<name><mode>.
+
+   Parameter 4 is the 'flag' of the intrinsic.  This is used to
+   help describe the attributes (for example, pure) for the intrinsic
+   function.  */
+
+  BUILTIN_VDC (COMBINE, combine, 0, AUTO_FP)
+  VAR1 (COMBINEP, combine, 0, AUTO_FP, di)
+  BUILTIN_VB (BINOP, pmul, 0, NONE)
+  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
+  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, NONE)
+  BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
+  VAR1 (UNOP, addp, 0, NONE, di)
+  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, NONE)
+  BUILTIN_VDQ_BHSI (UNOP, clz, 2, NONE)
+  BUILTIN_VS (UNOP, ctz, 2, NONE)
+  BUILTIN_VB (UNOP, popcount, 2, NONE)
 
   /* Implemented by aarch64_<sur>q<r>shl<mode>.  */
   BUILTIN_VSDQ_I (BINOP, sqshl, 0, ALL)
@@ -66,94 +70,94 @@
   BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, ALL)
 
   /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>.  */
-  BUILTIN_VDC (GETREG, get_dregoi, 0, ALL)
-  BUILTIN_VDC (GETREG, get_dregci, 0, ALL)
-  BUILTIN_VDC (GETREG, get_dregxi, 0, ALL)
-  VAR1 (GETREGP, get_dregoi, 0, ALL, di)
-  VAR1 (GETREGP, get_dregci, 0, ALL, di)
-  VAR1 (GETREGP, get_dregxi, 0, ALL, di)
+  BUILTIN_VDC (GETREG, get_dregoi, 0, AUTO_FP)
+  BUILTIN_VDC (GETREG, get_dregci, 0, AUTO_FP)
+  BUILTIN_VDC (GETREG, get_dregxi, 0, AUTO_FP)
+  VAR1 (GETREGP, get_dregoi, 0, AUTO_FP, di)
+  VAR1 (GETREGP, get_dregci, 0, AUTO_FP, di)
+  VAR1 (GETREGP, get_dregxi, 0, AUTO_FP, di)
   /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (GETREG, get_qregoi, 0, ALL)
-  BUILTIN_VQ (GETREG, get_qregci, 0, ALL)
-  BUILTIN_VQ (GETREG, get_qregxi, 0, ALL)
-  VAR1 (GETREGP, get_qregoi, 0, ALL, v2di)
-  VAR1 (GETREGP, get_qregci, 0, ALL, v2di)
-  VAR1 (GETREGP, get_qregxi, 0, ALL, v2di)
+  BUILTIN_VQ (GETREG, get_qregoi, 0, AUTO_FP)
+  BUILTIN_VQ (GETREG, get_qregci, 0, AUTO_FP)
+  BUILTIN_VQ (GETREG, get_qregxi, 0, AUTO_FP)
+  VAR1 (GETREGP, get_qregoi, 0, AUTO_FP, v2di)
+  VAR1 (GETREGP, get_qregci, 0, AUTO_FP, v2di)
+  VAR1 (GETREGP, get_qregxi, 0, AUTO_FP, v2di)
   /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (SETREG, set_qregoi, 0, ALL)
-  BUILTIN_VQ (SETREG, set_qregci, 0, ALL)
-  BUILTIN_VQ (SETREG, set_qregxi, 0, ALL)
-  VAR1 (SETREGP, set_qregoi, 0, ALL, v2di)
-  VAR1 (SETREGP, set_qregci, 0, ALL, v2di)
-  VAR1 (SETREGP, set_qregxi, 0, ALL, v2di)
+  BUILTIN_VQ (SETREG, set_qregoi, 0, AUTO_FP)
+  BUILTIN_VQ (SETREG, set_qregci, 0, AUTO_FP)
+  BUILTIN_VQ (SETREG, set_qregxi, 0, AUTO_FP)
+  VAR1 (SETREGP, set_qregoi, 0, AUTO_FP, v2di)
+  VAR1 (SETREGP, set_qregci, 0, AUTO_FP, v2di)
+  VAR1 (SETREGP, set_qregxi, 0, AUTO_FP, v2di)
   /* Implemented by aarch64_ld1x2<VQ:mode>. */
-  BUILTIN_VQ (LOADSTRUCT, ld1x2, 0, ALL)
+  BUILTIN_VQ (LOADSTRUCT, ld1x2, 0, READ_MEMORY)
   /* Implemented by aarch64_ld1x2<VDC:mode>. */
-  BUILTIN_VDC (LOADSTRUCT, ld1x2, 0, ALL)
+  BUILTIN_VDC (LOADSTRUCT, ld1x2, 0, READ_MEMORY)
   /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (LOADSTRUCT, ld2, 0, ALL)
-  BUILTIN_VDC (LOADSTRUCT, ld3, 0, ALL)
-  BUILTIN_VDC (LOADSTRUCT, ld4, 0, ALL)
+  BUILTIN_VDC (LOADSTRUCT, ld2, 0, READ_MEMORY)
+  BUILTIN_VDC (LOADSTRUCT, ld3, 0, READ_MEMORY)
+  BUILTIN_VDC (LOADSTRUCT, ld4, 0, READ_MEMORY)
   /* Implemented by aarch64_ld<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (LOADSTRUCT, ld2, 0, ALL)
-  BUILTIN_VQ (LOADSTRUCT, ld3, 0, ALL)
-  BUILTIN_VQ (LOADSTRUCT, ld4, 0, ALL)
+  BUILTIN_VQ (LOADSTRUCT, ld2, 0, READ_MEMORY)
+  BUILTIN_VQ (LOADSTRUCT, ld3, 0, READ_MEMORY)
+  BUILTIN_VQ (LOADSTRUCT, ld4, 0, READ_MEMORY)
   /* Implemented by aarch64_ld<VSTRUCT:nregs>r<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0, ALL)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0, ALL)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0, READ_MEMORY)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0, READ_MEMORY)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0, READ_MEMORY)
   /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0, ALL)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0, ALL)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0, ALL)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0, READ_MEMORY)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0, READ_MEMORY)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0, READ_MEMORY)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (STORESTRUCT, st2, 0, ALL)
-  BUILTIN_VDC (STORESTRUCT, st3, 0, ALL)
-  BUILTIN_VDC (STORESTRUCT, st4, 0, ALL)
+  BUILTIN_VDC (STORESTRUCT, st2, 0, WRITE_MEMORY)
+  BUILTIN_VDC (STORESTRUCT, st3, 0, WRITE_MEMORY)
+  BUILTIN_VDC (STORESTRUCT, st4, 0, WRITE_MEMORY)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (STORESTRUCT, st2, 0, ALL)
-  BUILTIN_VQ (STORESTRUCT, st3, 0, ALL)
-  BUILTIN_VQ (STORESTRUCT, st4, 0, ALL)
-
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0, ALL)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0, ALL)
-
-  BUILTIN_VQW (BINOP, saddl2, 0, ALL)
-  BUILTIN_VQW (BINOP, uaddl2, 0, ALL)
-  BUILTIN_VQW (BINOP, ssubl2, 0, ALL)
-  BUILTIN_VQW (BINOP, usubl2, 0, ALL)
-  BUILTIN_VQW (BINOP, saddw2, 0, ALL)
-  BUILTIN_VQW (BINOP, uaddw2, 0, ALL)
-  BUILTIN_VQW (BINOP, ssubw2, 0, ALL)
-  BUILTIN_VQW (BINOP, usubw2, 0, ALL)
+  BUILTIN_VQ (STORESTRUCT, st2, 0, WRITE_MEMORY)
+  BUILTIN_VQ (STORESTRUCT, st3, 0, WRITE_MEMORY)
+  BUILTIN_VQ (STORESTRUCT, st4, 0, WRITE_MEMORY)
+
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0, WRITE_MEMORY)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, WRITE_MEMORY)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0, WRITE_MEMORY)
+
+  BUILTIN_VQW (BINOP, saddl2, 0, NONE)
+  BUILTIN_VQW (BINOP, uaddl2, 0, NONE)
+  BUILTIN_VQW (BINOP, ssubl2, 0, NONE)
+  BUILTIN_VQW (BINOP, usubl2, 0, NONE)
+  BUILTIN_VQW (BINOP, saddw2, 0, NONE)
+  BUILTIN_VQW (BINOP, uaddw2, 0, NONE)
+  BUILTIN_VQW (BINOP, ssubw2, 0, NONE)
+  BUILTIN_VQW (BINOP, usubw2, 0, NONE)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, uaddl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, ssubl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, usubl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, saddl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, uaddl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, ssubl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, usubl, 0, NONE)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, uaddw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, ssubw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, usubw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, saddw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, uaddw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, ssubw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, usubw, 0, NONE)
   /* Implemented by aarch64_<sur>h<addsub><mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE)
   /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
-  BUILTIN_VQN (BINOP, addhn, 0, ALL)
-  BUILTIN_VQN (BINOP, subhn, 0, ALL)
-  BUILTIN_VQN (BINOP, raddhn, 0, ALL)
-  BUILTIN_VQN (BINOP, rsubhn, 0, ALL)
+  BUILTIN_VQN (BINOP, addhn, 0, NONE)
+  BUILTIN_VQN (BINOP, subhn, 0, NONE)
+  BUILTIN_VQN (BINOP, raddhn, 0, NONE)
+  BUILTIN_VQN (BINOP, rsubhn, 0, NONE)
   /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
-  BUILTIN_VQN (TERNOP, addhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, subhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, raddhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, rsubhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, addhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, subhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, raddhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, rsubhn2, 0, NONE)
 
   BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0, ALL)
   /* Implemented by aarch64_<sur>qmovn<mode>.  */
@@ -185,20 +189,20 @@
   BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0, ALL)
   BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0, ALL)
 
-  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, ALL)
-  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, NONE)
+  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, NONE)
 
-  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, ALL)
-  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, ALL)
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, NONE)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, NONE)
 
-  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, ALL)
-  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, ALL)
-  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, ALL)
-  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0, ALL)
-  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0, ALL)
-  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0, ALL)
-  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0, ALL)
-  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0, ALL)
+  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, NONE)
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, NONE)
+  BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, NONE)
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_laneq_, 0, NONE)
+  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_lane_, 0, NONE)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_lane_, 0, NONE)
+  BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0, NONE)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0, NONE)
 
   BUILTIN_VSD_HSI (BINOP, sqdmull, 0, ALL)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0, ALL)
@@ -217,63 +221,63 @@
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0, ALL)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0, ALL)
 
-  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3, NONE)
   /* Implemented by aarch64_<sur>shl<mode>.  */
-  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, ALL)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, ALL)
-  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, ALL)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, ALL)
+  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, NONE)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, NONE)
+  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
 
   /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot, 0, ALL)
-  BUILTIN_VB (TERNOPU, udot, 0, ALL)
-  BUILTIN_VB (TERNOP_SSUS, usdot, 0, ALL)
-  BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, ALL)
-  BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, ALL)
-  BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, ALL)
-  BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0, ALL)
-  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0, ALL)
-  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0, ALL)
-  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0, ALL)
-  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0, ALL)
+  BUILTIN_VB (TERNOP, sdot, 0, NONE)
+  BUILTIN_VB (TERNOPU, udot, 0, NONE)
+  BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE)
+  BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
+  BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE)
+  BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE)
+  BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0, NONE)
+  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0, NONE)
+  BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0, NONE)
+  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0, NONE)
+  BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0, NONE)
 
   /* Implemented by aarch64_fcadd<rot><mode>.   */
-  BUILTIN_VHSDF (BINOP, fcadd90, 0, ALL)
-  BUILTIN_VHSDF (BINOP, fcadd270, 0, ALL)
+  BUILTIN_VHSDF (BINOP, fcadd90, 0, FP)
+  BUILTIN_VHSDF (BINOP, fcadd270, 0, FP)
 
   /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>.   */
-  BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL)
-  BUILTIN_VHSDF (TERNOP, fcmla90, 0, ALL)
-  BUILTIN_VHSDF (TERNOP, fcmla180, 0, ALL)
-  BUILTIN_VHSDF (TERNOP, fcmla270, 0, ALL)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0, ALL)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0, ALL)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0, ALL)
-  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0, ALL)
-
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0, ALL)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0, ALL)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0, ALL)
-  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0, ALL)
-
-  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3, ALL)
-  VAR1 (SHIFTIMM, ashr_simd, 0, ALL, di)
-  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3, ALL)
-  VAR1 (USHIFTIMM, lshr_simd, 0, ALL, di)
+  BUILTIN_VHSDF (TERNOP, fcmla0, 0, FP)
+  BUILTIN_VHSDF (TERNOP, fcmla90, 0, FP)
+  BUILTIN_VHSDF (TERNOP, fcmla180, 0, FP)
+  BUILTIN_VHSDF (TERNOP, fcmla270, 0, FP)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0, FP)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0, FP)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0, FP)
+  BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0, FP)
+
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0, FP)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0, FP)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0, FP)
+  BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0, FP)
+
+  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3, NONE)
+  VAR1 (SHIFTIMM, ashr_simd, 0, NONE, di)
+  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3, NONE)
+  VAR1 (USHIFTIMM, lshr_simd, 0, NONE, di)
   /* Implemented by aarch64_<sur>shr_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0, ALL)
-  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0, NONE)
+  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0, NONE)
   /* Implemented by aarch64_<sur>sra_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0, ALL)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0, ALL)
-  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0, ALL)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0, NONE)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0, NONE)
+  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0, NONE)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0, NONE)
   /* Implemented by aarch64_<sur>shll_n<mode>.  */
-  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0, ALL)
-  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0, ALL)
+  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0, NONE)
+  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0, NONE)
   /* Implemented by aarch64_<sur>shll2_n<mode>.  */
-  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0, ALL)
-  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0, ALL)
+  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0, NONE)
+  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0, NONE)
   /* Implemented by aarch64_<sur>q<r>shr<u>n_n<mode>.  */
   BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0, ALL)
   BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0, ALL)
@@ -282,166 +286,166 @@
   BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0, ALL)
   BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0, ALL)
   /* Implemented by aarch64_<sur>s<lr>i_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, ALL)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, ALL)
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, ALL)
-  VAR2 (SHIFTINSERTP, ssli_n, 0, ALL, di, v2di)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, ALL)
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, NONE)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, NONE)
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, NONE)
+  VAR2 (SHIFTINSERTP, ssli_n, 0, NONE, di, v2di)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, NONE)
   /* Implemented by aarch64_<sur>qshl<u>_n<mode>.  */
   BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, ALL)
   BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0, ALL)
   BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
 
   /* Implemented by aarch64_reduc_plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
+  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, NONE)
 
   /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, ALL)
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10, ALL)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, ALL)
-  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10, ALL)
-  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10, ALL)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, NONE)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10, NONE)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, NONE)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, NONE)
+  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10, NONE)
+  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10, NONE)
 
   /* Implemented by <maxmin_uns><mode>3.
      smax variants map to fmaxnm,
      smax_nan variants map to fmax.  */
-  BUILTIN_VDQ_BHSI (BINOP, smax, 3, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, smin, 3, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, umax, 3, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, umin, 3, ALL)
-  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3, ALL)
-  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, smax, 3, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, smin, 3, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, umax, 3, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, umin, 3, NONE)
+  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3, NONE)
+  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3, NONE)
 
   /* Implemented by <maxmin_uns><mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3, FP)
+  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3, FP)
 
   /* Implemented by aarch64_<maxmin_uns>p<mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, sminp, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uminp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, smaxp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, sminp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, smax_nanp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, smin_nanp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, sminp, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uminp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, smaxp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, sminp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, smax_nanp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, smin_nanp, 0, NONE)
 
   /* Implemented by <frint_pattern><mode>2.  */
-  BUILTIN_VHSDF (UNOP, btrunc, 2, ALL)
-  BUILTIN_VHSDF (UNOP, ceil, 2, ALL)
-  BUILTIN_VHSDF (UNOP, floor, 2, ALL)
-  BUILTIN_VHSDF (UNOP, nearbyint, 2, ALL)
-  BUILTIN_VHSDF (UNOP, rint, 2, ALL)
-  BUILTIN_VHSDF (UNOP, round, 2, ALL)
-  BUILTIN_VHSDF_DF (UNOP, frintn, 2, ALL)
-
-  VAR1 (UNOP, btrunc, 2, ALL, hf)
-  VAR1 (UNOP, ceil, 2, ALL, hf)
-  VAR1 (UNOP, floor, 2, ALL, hf)
-  VAR1 (UNOP, frintn, 2, ALL, hf)
-  VAR1 (UNOP, nearbyint, 2, ALL, hf)
-  VAR1 (UNOP, rint, 2, ALL, hf)
-  VAR1 (UNOP, round, 2, ALL, hf)
+  BUILTIN_VHSDF (UNOP, btrunc, 2, FP)
+  BUILTIN_VHSDF (UNOP, ceil, 2, FP)
+  BUILTIN_VHSDF (UNOP, floor, 2, FP)
+  BUILTIN_VHSDF (UNOP, nearbyint, 2, FP)
+  BUILTIN_VHSDF (UNOP, rint, 2, FP)
+  BUILTIN_VHSDF (UNOP, round, 2, FP)
+  BUILTIN_VHSDF_DF (UNOP, frintn, 2, FP)
+
+  VAR1 (UNOP, btrunc, 2, FP, hf)
+  VAR1 (UNOP, ceil, 2, FP, hf)
+  VAR1 (UNOP, floor, 2, FP, hf)
+  VAR1 (UNOP, frintn, 2, FP, hf)
+  VAR1 (UNOP, nearbyint, 2, FP, hf)
+  VAR1 (UNOP, rint, 2, FP, hf)
+  VAR1 (UNOP, round, 2, FP, hf)
 
   /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
-  VAR1 (UNOP, lbtruncv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lbtruncv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lbtruncv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lbtruncv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lbtruncv2df, 2, ALL, v2di)
-
-  VAR1 (UNOPUS, lbtruncuv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lbtruncuv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lbtruncuv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lbtruncuv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lbtruncuv2df, 2, ALL, v2di)
-
-  VAR1 (UNOP, lroundv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lroundv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lroundv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lroundv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lroundv2df, 2, ALL, v2di)
+  VAR1 (UNOP, lbtruncv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lbtruncv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lbtruncv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lbtruncv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lbtruncv2df, 2, FP, v2di)
+
+  VAR1 (UNOPUS, lbtruncuv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lbtruncuv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lbtruncuv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lbtruncuv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lbtruncuv2df, 2, FP, v2di)
+
+  VAR1 (UNOP, lroundv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lroundv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lroundv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lroundv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lroundv2df, 2, FP, v2di)
   /* Implemented by l<fcvt_pattern><su_optab><GPF_F16:mode><GPI:mode>2.  */
-  BUILTIN_GPI_I16 (UNOP, lroundhf, 2, ALL)
-  VAR1 (UNOP, lroundsf, 2, ALL, si)
-  VAR1 (UNOP, lrounddf, 2, ALL, di)
-
-  VAR1 (UNOPUS, lrounduv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lrounduv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lrounduv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lrounduv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lrounduv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2, ALL)
-  VAR1 (UNOPUS, lroundusf, 2, ALL, si)
-  VAR1 (UNOPUS, lroundudf, 2, ALL, di)
-
-  VAR1 (UNOP, lceilv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lceilv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lceilv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lceilv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lceilv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOP, lceilhf, 2, ALL)
-
-  VAR1 (UNOPUS, lceiluv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lceiluv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lceiluv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lceiluv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lceiluv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2, ALL)
-  VAR1 (UNOPUS, lceilusf, 2, ALL, si)
-  VAR1 (UNOPUS, lceiludf, 2, ALL, di)
-
-  VAR1 (UNOP, lfloorv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lfloorv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lfloorv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lfloorv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lfloorv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2, ALL)
-
-  VAR1 (UNOPUS, lflooruv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lflooruv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lflooruv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lflooruv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lflooruv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2, ALL)
-  VAR1 (UNOPUS, lfloorusf, 2, ALL, si)
-  VAR1 (UNOPUS, lfloorudf, 2, ALL, di)
-
-  VAR1 (UNOP, lfrintnv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lfrintnv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lfrintnv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lfrintnv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lfrintnv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2, ALL)
-  VAR1 (UNOP, lfrintnsf, 2, ALL, si)
-  VAR1 (UNOP, lfrintndf, 2, ALL, di)
-
-  VAR1 (UNOPUS, lfrintnuv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lfrintnuv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lfrintnuv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lfrintnuv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lfrintnuv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2, ALL)
-  VAR1 (UNOPUS, lfrintnusf, 2, ALL, si)
-  VAR1 (UNOPUS, lfrintnudf, 2, ALL, di)
+  BUILTIN_GPI_I16 (UNOP, lroundhf, 2, FP)
+  VAR1 (UNOP, lroundsf, 2, FP, si)
+  VAR1 (UNOP, lrounddf, 2, FP, di)
+
+  VAR1 (UNOPUS, lrounduv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lrounduv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lrounduv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lrounduv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lrounduv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2, FP)
+  VAR1 (UNOPUS, lroundusf, 2, FP, si)
+  VAR1 (UNOPUS, lroundudf, 2, FP, di)
+
+  VAR1 (UNOP, lceilv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lceilv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lceilv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lceilv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lceilv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOP, lceilhf, 2, FP)
+
+  VAR1 (UNOPUS, lceiluv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lceiluv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lceiluv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lceiluv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lceiluv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2, FP)
+  VAR1 (UNOPUS, lceilusf, 2, FP, si)
+  VAR1 (UNOPUS, lceiludf, 2, FP, di)
+
+  VAR1 (UNOP, lfloorv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lfloorv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lfloorv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lfloorv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lfloorv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2, FP)
+
+  VAR1 (UNOPUS, lflooruv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lflooruv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lflooruv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lflooruv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lflooruv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2, FP)
+  VAR1 (UNOPUS, lfloorusf, 2, FP, si)
+  VAR1 (UNOPUS, lfloorudf, 2, FP, di)
+
+  VAR1 (UNOP, lfrintnv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lfrintnv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lfrintnv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lfrintnv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lfrintnv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2, FP)
+  VAR1 (UNOP, lfrintnsf, 2, FP, si)
+  VAR1 (UNOP, lfrintndf, 2, FP, di)
+
+  VAR1 (UNOPUS, lfrintnuv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lfrintnuv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lfrintnuv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lfrintnuv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lfrintnuv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2, FP)
+  VAR1 (UNOPUS, lfrintnusf, 2, FP, si)
+  VAR1 (UNOPUS, lfrintnudf, 2, FP, di)
 
   /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
-  VAR1 (UNOP, floatv4hi, 2, ALL, v4hf)
-  VAR1 (UNOP, floatv8hi, 2, ALL, v8hf)
-  VAR1 (UNOP, floatv2si, 2, ALL, v2sf)
-  VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
-  VAR1 (UNOP, floatv2di, 2, ALL, v2df)
+  VAR1 (UNOP, floatv4hi, 2, FP, v4hf)
+  VAR1 (UNOP, floatv8hi, 2, FP, v8hf)
+  VAR1 (UNOP, floatv2si, 2, FP, v2sf)
+  VAR1 (UNOP, floatv4si, 2, FP, v4sf)
+  VAR1 (UNOP, floatv2di, 2, FP, v2df)
 
-  VAR1 (UNOP, floatunsv4hi, 2, ALL, v4hf)
-  VAR1 (UNOP, floatunsv8hi, 2, ALL, v8hf)
-  VAR1 (UNOP, floatunsv2si, 2, ALL, v2sf)
-  VAR1 (UNOP, floatunsv4si, 2, ALL, v4sf)
-  VAR1 (UNOP, floatunsv2di, 2, ALL, v2df)
+  VAR1 (UNOP, floatunsv4hi, 2, FP, v4hf)
+  VAR1 (UNOP, floatunsv8hi, 2, FP, v8hf)
+  VAR1 (UNOP, floatunsv2si, 2, FP, v2sf)
+  VAR1 (UNOP, floatunsv4si, 2, FP, v4sf)
+  VAR1 (UNOP, floatunsv2di, 2, FP, v2df)
 
-  VAR5 (UNOPU, bswap, 2, ALL, v4hi, v8hi, v2si, v4si, v2di)
+  VAR5 (UNOPU, bswap, 2, NONE, v4hi, v8hi, v2si, v4si, v2di)
 
-  BUILTIN_VB (UNOP, rbit, 0, ALL)
+  BUILTIN_VB (UNOP, rbit, 0, NONE)
 
   /* Implemented by
      aarch64_<PERMUTE:perm_insn><mode>.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/arg-type-diagnostics-1.c b/gcc/testsuite/gcc.target/aarch64/arg-type-diagnostics-1.c
index a7b7cd3bd8d..ef70f75e75e 100644
--- a/gcc/testsuite/gcc.target/aarch64/arg-type-diagnostics-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/arg-type-diagnostics-1.c
@@ -3,7 +3,7 @@
 
 #include "arm_neon.h"
 
-void foo (int a)
+int32x2_t foo (int a)
 {
   int32x2_t arg1;
   int32x2_t arg2;
@@ -15,4 +15,5 @@ void foo (int a)
      we have to tell dg-error to ignore the line number.  */
   result = vrsra_n_s32 (arg1, arg2, a);
   /* { dg-error "must be a constant immediate" "" { target *-*-* } 0 } */
+  return result;
 }
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-25  3:14                                           ` xiezhiheng
@ 2020-08-25 11:07                                             ` Richard Sandiford
  2020-08-26  1:39                                               ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-25 11:07 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, August 21, 2020 5:02 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>
> Cut...
>  
>> Looks like the saturating intrinsics might need a bit more thought.
>> Would you mind submitting the patch with just the other parts?
>> Those were uncontroversial and it would be a shame to hold them
>> up over this.
>
> Okay, I reorganized the existing patch and finished the first half of the intrinsics
> except saturating intrinsics and load intrinsics.
>
> Bootstrapped and tested on aarch64 Linux platform.

I know this'll be frustrating, sorry, but could you post the
2020-08-17 patch without the saturation changes?  It's going to be
easier to track and review if each patch deals with similar intrinsics.
The non-saturating part of the 2020-08-17 patch was good because it was
dealing purely with arithmetic operations.  Loads should really be a
separate change.

BTW, for something like this, it's OK to test and submit several patches
at once, so separating the patches doesn't need to mean longer test cycles.
It's just that for review purposes, it's easier if one patch does one thing.

> For load intrinsics, I have one problem when I set FLAG_READ_MEMORY for them,
> some test cases like
> gcc.target/aarch64/advsimd-intrinsics/vld2_lane_p8_indices_1.c
>   #include <arm_neon.h>
>
>   /* { dg-do compile } */
>   /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>
>   poly8x8x2_t
>   f_vld2_lane_p8 (poly8_t * p, poly8x8x2_t v)
>   {
>     poly8x8x2_t res;
>     /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
>     res = vld2_lane_p8 (p, v, 8);
>     /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
>     res = vld2_lane_p8 (p, v, -1);
>     return res;
>   }
> would fail in regression, because the first statement
>   res = vld2_lane_p8 (p, v, 8);
> would be eliminated as dead code in gimple phase but the error message is
> generated in expand pass.  So I am going to replace the second statement
>   res = vld2_lane_p8 (p, v, -1);
> with
>   res = vld2_lane_p8 (p, res, -1);
> or do you have any other suggestions?

The test is valid as-is, so it would be better not to change it.

I guess this means that we should leave the _lane loads and stores until
we implement the range checks in a different way.  This is somewhat
related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95969 ,
although your example shows that the “dummy const function” approach
might not work.

So to start with, could you just patch the non-lane loads?

> And for test case gcc.target/aarch64/arg-type-diagnostics-1.c, I return the result
> to prevent the statement
>   result = vrsra_n_s32 (arg1, arg2, a);
> from being eliminated by being treated as dead code.

Hmm.  Here too I think the test is valid as-is.  I think we need
to ensure that the range check still happens even if the call is
dead code (similar to PR95969 above).

So I guess here too, it might be better to leave the _n forms to
a separate patch.

That doesn't mean we shouldn't fix the _lane and _n cases (or the
previous saturating cases).  It's just that each time we find a group
of functions that's awkward for some reason, it'd be better to deal
with those functions separately.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-25 11:07                                             ` Richard Sandiford
@ 2020-08-26  1:39                                               ` xiezhiheng
  2020-08-26 10:14                                                 ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-26  1:39 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4756 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, August 25, 2020 7:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Friday, August 21, 2020 5:02 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >
> > Cut...
> >
> >> Looks like the saturating intrinsics might need a bit more thought.
> >> Would you mind submitting the patch with just the other parts?
> >> Those were uncontroversial and it would be a shame to hold them
> >> up over this.
> >
> > Okay, I reorganized the existing patch and finished the first half of the
> intrinsics
> > except saturating intrinsics and load intrinsics.
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> I know this'll be frustrating, sorry, but could you post the
> 2020-08-17 patch without the saturation changes?  It's going to be
> easier to track and review if each patch deals with similar intrinsics.
> The non-saturating part of the 2020-08-17 patch was good because it was
> dealing purely with arithmetic operations.  Loads should really be a
> separate change.
> 
> BTW, for something like this, it's OK to test and submit several patches
> at once, so separating the patches doesn't need to mean longer test cycles.
> It's just that for review purposes, it's easier if one patch does one thing.
> 

That's true.  And I finished the patch to add FLAG for add/sub arithmetic
intrinsics except saturating intrinsics.  Later I will try to separate the rest
into several subsets to fix.

Bootstrapped and tested on aarch64 Linux platform.


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7a71b4367d4..a93712ae0a5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-26  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for add/sub arithmetic intrinsics.
+

> > For load intrinsics, I have one problem when I set FLAG_READ_MEMORY
> for them,
> > some test cases like
> > gcc.target/aarch64/advsimd-intrinsics/vld2_lane_p8_indices_1.c
> >   #include <arm_neon.h>
> >
> >   /* { dg-do compile } */
> >   /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> >
> >   poly8x8x2_t
> >   f_vld2_lane_p8 (poly8_t * p, poly8x8x2_t v)
> >   {
> >     poly8x8x2_t res;
> >     /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
> >     res = vld2_lane_p8 (p, v, 8);
> >     /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
> >     res = vld2_lane_p8 (p, v, -1);
> >     return res;
> >   }
> > would fail in regression, because the first statement
> >   res = vld2_lane_p8 (p, v, 8);
> > would be eliminated as dead code in gimple phase but the error message is
> > generated in expand pass.  So I am going to replace the second statement
> >   res = vld2_lane_p8 (p, v, -1);
> > with
> >   res = vld2_lane_p8 (p, res, -1);
> > or do you have any other suggestions?
> 
> The test is valid as-is, so it would be better not to change it.
> 
> I guess this means that we should leave the _lane loads and stores until
> we implement the range checks in a different way.  This is somewhat
> related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95969 ,
> although your example shows that the “dummy const function” approach
> might not work.
> 
> So to start with, could you just patch the non-lane loads?

Okay.

> 
> > And for test case gcc.target/aarch64/arg-type-diagnostics-1.c, I return the
> result
> > to prevent the statement
> >   result = vrsra_n_s32 (arg1, arg2, a);
> > from being eliminated by being treated as dead code.
> 
> Hmm.  Here too I think the test is valid as-is.  I think we need
> to ensure that the range check still happens even if the call is
> dead code (similar to PR95969 above).

I agree.  That would be more reasonable.

> 
> So I guess here too, it might be better to leave the _n forms to
> a separate patch.
> 
> That doesn't mean we shouldn't fix the _lane and _n cases (or the
> previous saturating cases).  It's just that each time we find a group
> of functions that's awkward for some reason, it'd be better to deal
> with those functions separately.
> 
> Thanks,
> Richard

[-- Attachment #2: pr94442-v2.patch --]
[-- Type: application/octet-stream, Size: 5362 bytes --]

From 918022728cc039378290a745ffa0dcbdc8e0f5c7 Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Tue, 25 Aug 2020 07:53:55 -0400
Subject: [PATCH] AArch64: Add FLAG for add/sub arithmetic intrinsics [PR94442]

2020-08-26  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for add/sub arithmetic intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 76 ++++++++++----------
 1 file changed, 40 insertions(+), 36 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index e8650121cd6..502b83cb27f 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -37,15 +37,19 @@
    macro holding the RTL pattern for the intrinsic.  This mapping is:
    0 - CODE_FOR_aarch64_<name><mode>
    1-9 - CODE_FOR_<name><mode><1-9>
-   10 - CODE_FOR_<name><mode>.  */
+   10 - CODE_FOR_<name><mode>.
+
+   Parameter 4 is the 'flag' of the intrinsic.  This is used to
+   help describe the attributes (for example, pure) for the intrinsic
+   function.  */
 
   BUILTIN_VDC (COMBINE, combine, 0, ALL)
   VAR1 (COMBINEP, combine, 0, ALL, di)
   BUILTIN_VB (BINOP, pmul, 0, ALL)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL)
   BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
-  BUILTIN_VD_BHSI (BINOP, addp, 0, ALL)
-  VAR1 (UNOP, addp, 0, ALL, di)
+  BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
+  VAR1 (UNOP, addp, 0, NONE, di)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL)
   BUILTIN_VDQ_BHSI (UNOP, clz, 2, ALL)
   BUILTIN_VS (UNOP, ctz, 2, ALL)
@@ -119,41 +123,41 @@
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL)
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0, ALL)
 
-  BUILTIN_VQW (BINOP, saddl2, 0, ALL)
-  BUILTIN_VQW (BINOP, uaddl2, 0, ALL)
-  BUILTIN_VQW (BINOP, ssubl2, 0, ALL)
-  BUILTIN_VQW (BINOP, usubl2, 0, ALL)
-  BUILTIN_VQW (BINOP, saddw2, 0, ALL)
-  BUILTIN_VQW (BINOP, uaddw2, 0, ALL)
-  BUILTIN_VQW (BINOP, ssubw2, 0, ALL)
-  BUILTIN_VQW (BINOP, usubw2, 0, ALL)
+  BUILTIN_VQW (BINOP, saddl2, 0, NONE)
+  BUILTIN_VQW (BINOP, uaddl2, 0, NONE)
+  BUILTIN_VQW (BINOP, ssubl2, 0, NONE)
+  BUILTIN_VQW (BINOP, usubl2, 0, NONE)
+  BUILTIN_VQW (BINOP, saddw2, 0, NONE)
+  BUILTIN_VQW (BINOP, uaddw2, 0, NONE)
+  BUILTIN_VQW (BINOP, ssubw2, 0, NONE)
+  BUILTIN_VQW (BINOP, usubw2, 0, NONE)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, uaddl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, ssubl, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, usubl, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, saddl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, uaddl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, ssubl, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, usubl, 0, NONE)
   /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, uaddw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, ssubw, 0, ALL)
-  BUILTIN_VD_BHSI (BINOP, usubw, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, saddw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, uaddw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, ssubw, 0, NONE)
+  BUILTIN_VD_BHSI (BINOP, usubw, 0, NONE)
   /* Implemented by aarch64_<sur>h<addsub><mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, shadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, shsub, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE)
   /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
-  BUILTIN_VQN (BINOP, addhn, 0, ALL)
-  BUILTIN_VQN (BINOP, subhn, 0, ALL)
-  BUILTIN_VQN (BINOP, raddhn, 0, ALL)
-  BUILTIN_VQN (BINOP, rsubhn, 0, ALL)
+  BUILTIN_VQN (BINOP, addhn, 0, NONE)
+  BUILTIN_VQN (BINOP, subhn, 0, NONE)
+  BUILTIN_VQN (BINOP, raddhn, 0, NONE)
+  BUILTIN_VQN (BINOP, rsubhn, 0, NONE)
   /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
-  BUILTIN_VQN (TERNOP, addhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, subhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, raddhn2, 0, ALL)
-  BUILTIN_VQN (TERNOP, rsubhn2, 0, ALL)
+  BUILTIN_VQN (TERNOP, addhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, subhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, raddhn2, 0, NONE)
+  BUILTIN_VQN (TERNOP, rsubhn2, 0, NONE)
 
   BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0, ALL)
   /* Implemented by aarch64_<sur>qmovn<mode>.  */
@@ -238,8 +242,8 @@
   BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0, ALL)
 
   /* Implemented by aarch64_fcadd<rot><mode>.   */
-  BUILTIN_VHSDF (BINOP, fcadd90, 0, ALL)
-  BUILTIN_VHSDF (BINOP, fcadd270, 0, ALL)
+  BUILTIN_VHSDF (BINOP, fcadd90, 0, FP)
+  BUILTIN_VHSDF (BINOP, fcadd270, 0, FP)
 
   /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>.   */
   BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL)
@@ -594,7 +598,7 @@
   BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, ALL)
 
   /* Implemented by aarch64_faddp<mode>.  */
-  BUILTIN_VHSDF (BINOP, faddp, 0, ALL)
+  BUILTIN_VHSDF (BINOP, faddp, 0, FP)
 
   /* Implemented by aarch64_cm<optab><mode>.  */
   BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0, ALL)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-26  1:39                                               ` xiezhiheng
@ 2020-08-26 10:14                                                 ` Richard Sandiford
  2020-08-27  2:50                                                   ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-26 10:14 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, August 25, 2020 7:08 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Friday, August 21, 2020 5:02 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >
>> > Cut...
>> >
>> >> Looks like the saturating intrinsics might need a bit more thought.
>> >> Would you mind submitting the patch with just the other parts?
>> >> Those were uncontroversial and it would be a shame to hold them
>> >> up over this.
>> >
>> > Okay, I reorganized the existing patch and finished the first half of the
>> intrinsics
>> > except saturating intrinsics and load intrinsics.
>> >
>> > Bootstrapped and tested on aarch64 Linux platform.
>> 
>> I know this'll be frustrating, sorry, but could you post the
>> 2020-08-17 patch without the saturation changes?  It's going to be
>> easier to track and review if each patch deals with similar intrinsics.
>> The non-saturating part of the 2020-08-17 patch was good because it was
>> dealing purely with arithmetic operations.  Loads should really be a
>> separate change.
>> 
>> BTW, for something like this, it's OK to test and submit several patches
>> at once, so separating the patches doesn't need to mean longer test cycles.
>> It's just that for review purposes, it's easier if one patch does one thing.
>> 
>
> That's true.  And I finished the patch to add FLAG for add/sub arithmetic
> intrinsics except saturating intrinsics.  Later I will try to separate the rest
> into several subsets to fix.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, looks great.  Pushed to master.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-26 10:14                                                 ` Richard Sandiford
@ 2020-08-27  2:50                                                   ` xiezhiheng
  2020-08-27  8:08                                                     ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-08-27  2:50 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1258 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, August 26, 2020 6:14 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 

Cut...

> 
> Thanks, looks great.  Pushed to master.
> 
> Richard

I made two separate patches for these two groups for review purposes.

Note: Patch for min/max intrinsics should be applied before the patch for rounding intrinsics

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f6605eae08c..939aae71ecd 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-27  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for min/max intrinsics.
+


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f6605eae08c..b0d3ec6cf19 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-27  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for rounding intrinsics.
+

[-- Attachment #2: min_max-v1.patch --]
[-- Type: application/octet-stream, Size: 3275 bytes --]

From 40646d7731fe5bf7e5cf7c79fee19fbc63f57689 Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Wed, 26 Aug 2020 07:47:34 -0400
Subject: [PATCH] AArch64: Add FLAG for min/max intrinsics [PR94442]

2020-08-27  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for min/max intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 44 ++++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 502b83cb27f..a9b47e4ea1f 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -300,36 +300,36 @@
   BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
 
   /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, ALL)
-  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10, ALL)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, ALL)
-  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10, ALL)
-  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10, ALL)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, NONE)
+  BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10, NONE)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, NONE)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10, NONE)
+  BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10, NONE)
+  BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10, NONE)
 
   /* Implemented by <maxmin_uns><mode>3.
      smax variants map to fmaxnm,
      smax_nan variants map to fmax.  */
-  BUILTIN_VDQ_BHSI (BINOP, smax, 3, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, smin, 3, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, umax, 3, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, umin, 3, ALL)
-  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3, ALL)
-  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, smax, 3, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, smin, 3, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, umax, 3, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, umin, 3, NONE)
+  BUILTIN_VHSDF_DF (BINOP, smax_nan, 3, NONE)
+  BUILTIN_VHSDF_DF (BINOP, smin_nan, 3, NONE)
 
   /* Implemented by <maxmin_uns><mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fmax, 3, FP)
+  BUILTIN_VHSDF_HSDF (BINOP, fmin, 3, FP)
 
   /* Implemented by aarch64_<maxmin_uns>p<mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, sminp, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0, ALL)
-  BUILTIN_VDQ_BHSI (BINOP, uminp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, smaxp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, sminp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, smax_nanp, 0, ALL)
-  BUILTIN_VHSDF (BINOP, smin_nanp, 0, ALL)
+  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, sminp, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOP, uminp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, smaxp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, sminp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, smax_nanp, 0, NONE)
+  BUILTIN_VHSDF (BINOP, smin_nanp, 0, NONE)
 
   /* Implemented by <frint_pattern><mode>2.  */
   BUILTIN_VHSDF (UNOP, btrunc, 2, ALL)
-- 
2.19.1


[-- Attachment #3: rounding-v1.patch --]
[-- Type: application/octet-stream, Size: 8073 bytes --]

From 520997201e8cf5da1e5661b256d1b76ab1a1a15b Mon Sep 17 00:00:00 2001
From: xiezhiheng <xiezhiheng@huawei.com>
Date: Wed, 26 Aug 2020 08:04:07 -0400
Subject: [PATCH] AArch64: Add FLAG for rounding intrinsics [PR94442]

2020-08-27  Zhiheng Xie  <xiezhiheng@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for rounding intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 188 +++++++++----------
 1 file changed, 94 insertions(+), 94 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a9b47e4ea1f..d1b21102b2f 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -332,103 +332,103 @@
   BUILTIN_VHSDF (BINOP, smin_nanp, 0, NONE)
 
   /* Implemented by <frint_pattern><mode>2.  */
-  BUILTIN_VHSDF (UNOP, btrunc, 2, ALL)
-  BUILTIN_VHSDF (UNOP, ceil, 2, ALL)
-  BUILTIN_VHSDF (UNOP, floor, 2, ALL)
-  BUILTIN_VHSDF (UNOP, nearbyint, 2, ALL)
-  BUILTIN_VHSDF (UNOP, rint, 2, ALL)
-  BUILTIN_VHSDF (UNOP, round, 2, ALL)
-  BUILTIN_VHSDF_DF (UNOP, frintn, 2, ALL)
-
-  VAR1 (UNOP, btrunc, 2, ALL, hf)
-  VAR1 (UNOP, ceil, 2, ALL, hf)
-  VAR1 (UNOP, floor, 2, ALL, hf)
-  VAR1 (UNOP, frintn, 2, ALL, hf)
-  VAR1 (UNOP, nearbyint, 2, ALL, hf)
-  VAR1 (UNOP, rint, 2, ALL, hf)
-  VAR1 (UNOP, round, 2, ALL, hf)
+  BUILTIN_VHSDF (UNOP, btrunc, 2, FP)
+  BUILTIN_VHSDF (UNOP, ceil, 2, FP)
+  BUILTIN_VHSDF (UNOP, floor, 2, FP)
+  BUILTIN_VHSDF (UNOP, nearbyint, 2, FP)
+  BUILTIN_VHSDF (UNOP, rint, 2, FP)
+  BUILTIN_VHSDF (UNOP, round, 2, FP)
+  BUILTIN_VHSDF_DF (UNOP, frintn, 2, FP)
+
+  VAR1 (UNOP, btrunc, 2, FP, hf)
+  VAR1 (UNOP, ceil, 2, FP, hf)
+  VAR1 (UNOP, floor, 2, FP, hf)
+  VAR1 (UNOP, frintn, 2, FP, hf)
+  VAR1 (UNOP, nearbyint, 2, FP, hf)
+  VAR1 (UNOP, rint, 2, FP, hf)
+  VAR1 (UNOP, round, 2, FP, hf)
 
   /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
-  VAR1 (UNOP, lbtruncv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lbtruncv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lbtruncv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lbtruncv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lbtruncv2df, 2, ALL, v2di)
-
-  VAR1 (UNOPUS, lbtruncuv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lbtruncuv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lbtruncuv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lbtruncuv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lbtruncuv2df, 2, ALL, v2di)
-
-  VAR1 (UNOP, lroundv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lroundv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lroundv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lroundv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lroundv2df, 2, ALL, v2di)
+  VAR1 (UNOP, lbtruncv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lbtruncv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lbtruncv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lbtruncv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lbtruncv2df, 2, FP, v2di)
+
+  VAR1 (UNOPUS, lbtruncuv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lbtruncuv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lbtruncuv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lbtruncuv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lbtruncuv2df, 2, FP, v2di)
+
+  VAR1 (UNOP, lroundv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lroundv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lroundv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lroundv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lroundv2df, 2, FP, v2di)
   /* Implemented by l<fcvt_pattern><su_optab><GPF_F16:mode><GPI:mode>2.  */
-  BUILTIN_GPI_I16 (UNOP, lroundhf, 2, ALL)
-  VAR1 (UNOP, lroundsf, 2, ALL, si)
-  VAR1 (UNOP, lrounddf, 2, ALL, di)
-
-  VAR1 (UNOPUS, lrounduv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lrounduv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lrounduv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lrounduv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lrounduv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2, ALL)
-  VAR1 (UNOPUS, lroundusf, 2, ALL, si)
-  VAR1 (UNOPUS, lroundudf, 2, ALL, di)
-
-  VAR1 (UNOP, lceilv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lceilv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lceilv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lceilv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lceilv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOP, lceilhf, 2, ALL)
-
-  VAR1 (UNOPUS, lceiluv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lceiluv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lceiluv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lceiluv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lceiluv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2, ALL)
-  VAR1 (UNOPUS, lceilusf, 2, ALL, si)
-  VAR1 (UNOPUS, lceiludf, 2, ALL, di)
-
-  VAR1 (UNOP, lfloorv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lfloorv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lfloorv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lfloorv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lfloorv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2, ALL)
-
-  VAR1 (UNOPUS, lflooruv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lflooruv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lflooruv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lflooruv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lflooruv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2, ALL)
-  VAR1 (UNOPUS, lfloorusf, 2, ALL, si)
-  VAR1 (UNOPUS, lfloorudf, 2, ALL, di)
-
-  VAR1 (UNOP, lfrintnv4hf, 2, ALL, v4hi)
-  VAR1 (UNOP, lfrintnv8hf, 2, ALL, v8hi)
-  VAR1 (UNOP, lfrintnv2sf, 2, ALL, v2si)
-  VAR1 (UNOP, lfrintnv4sf, 2, ALL, v4si)
-  VAR1 (UNOP, lfrintnv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2, ALL)
-  VAR1 (UNOP, lfrintnsf, 2, ALL, si)
-  VAR1 (UNOP, lfrintndf, 2, ALL, di)
-
-  VAR1 (UNOPUS, lfrintnuv4hf, 2, ALL, v4hi)
-  VAR1 (UNOPUS, lfrintnuv8hf, 2, ALL, v8hi)
-  VAR1 (UNOPUS, lfrintnuv2sf, 2, ALL, v2si)
-  VAR1 (UNOPUS, lfrintnuv4sf, 2, ALL, v4si)
-  VAR1 (UNOPUS, lfrintnuv2df, 2, ALL, v2di)
-  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2, ALL)
-  VAR1 (UNOPUS, lfrintnusf, 2, ALL, si)
-  VAR1 (UNOPUS, lfrintnudf, 2, ALL, di)
+  BUILTIN_GPI_I16 (UNOP, lroundhf, 2, FP)
+  VAR1 (UNOP, lroundsf, 2, FP, si)
+  VAR1 (UNOP, lrounddf, 2, FP, di)
+
+  VAR1 (UNOPUS, lrounduv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lrounduv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lrounduv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lrounduv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lrounduv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2, FP)
+  VAR1 (UNOPUS, lroundusf, 2, FP, si)
+  VAR1 (UNOPUS, lroundudf, 2, FP, di)
+
+  VAR1 (UNOP, lceilv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lceilv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lceilv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lceilv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lceilv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOP, lceilhf, 2, FP)
+
+  VAR1 (UNOPUS, lceiluv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lceiluv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lceiluv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lceiluv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lceiluv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2, FP)
+  VAR1 (UNOPUS, lceilusf, 2, FP, si)
+  VAR1 (UNOPUS, lceiludf, 2, FP, di)
+
+  VAR1 (UNOP, lfloorv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lfloorv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lfloorv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lfloorv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lfloorv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfloorhf, 2, FP)
+
+  VAR1 (UNOPUS, lflooruv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lflooruv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lflooruv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lflooruv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lflooruv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2, FP)
+  VAR1 (UNOPUS, lfloorusf, 2, FP, si)
+  VAR1 (UNOPUS, lfloorudf, 2, FP, di)
+
+  VAR1 (UNOP, lfrintnv4hf, 2, FP, v4hi)
+  VAR1 (UNOP, lfrintnv8hf, 2, FP, v8hi)
+  VAR1 (UNOP, lfrintnv2sf, 2, FP, v2si)
+  VAR1 (UNOP, lfrintnv4sf, 2, FP, v4si)
+  VAR1 (UNOP, lfrintnv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2, FP)
+  VAR1 (UNOP, lfrintnsf, 2, FP, si)
+  VAR1 (UNOP, lfrintndf, 2, FP, di)
+
+  VAR1 (UNOPUS, lfrintnuv4hf, 2, FP, v4hi)
+  VAR1 (UNOPUS, lfrintnuv8hf, 2, FP, v8hi)
+  VAR1 (UNOPUS, lfrintnuv2sf, 2, FP, v2si)
+  VAR1 (UNOPUS, lfrintnuv4sf, 2, FP, v4si)
+  VAR1 (UNOPUS, lfrintnuv2df, 2, FP, v2di)
+  BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2, FP)
+  VAR1 (UNOPUS, lfrintnusf, 2, FP, si)
+  VAR1 (UNOPUS, lfrintnudf, 2, FP, di)
 
   /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
   VAR1 (UNOP, floatv4hi, 2, ALL, v4hf)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-27  2:50                                                   ` xiezhiheng
@ 2020-08-27  8:08                                                     ` Richard Sandiford
  2020-10-09  9:32                                                       ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-08-27  8:08 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
> I made two separate patches for these two groups for review purposes.
>
> Note: Patch for min/max intrinsics should be applied before the patch for rounding intrinsics
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, LGTM.  Pushed to master.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-08-27  8:08                                                     ` Richard Sandiford
@ 2020-10-09  9:32                                                       ` xiezhiheng
  2020-10-13  8:07                                                         ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-10-09  9:32 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1520 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Thursday, August 27, 2020 4:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> > I made two separate patches for these two groups for review purposes.
> >
> > Note: Patch for min/max intrinsics should be applied before the patch for
> rounding intrinsics
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> Thanks, LGTM.  Pushed to master.
> 
> Richard

I made the patch for the multiply and multiply-accumulate intrinsics.

Note that the bfmmlaq intrinsic is special because its instruction ignores the FPCR and does not update the FPSR exception status.
  https://developer.arm.com/docs/ddi0596/h/simd-and-floating-point-instructions-alphabetic-order/bfmmla-bfloat16-floating-point-matrix-multiply-accumulate-into-2x2-matrix
So I set it to the AUTO_FP flag.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 75b62b590e2..8ca9746189a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-09  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for mul/mla/mls intrinsics.
+

[-- Attachment #2: pr94442-v1.patch --]
[-- Type: application/octet-stream, Size: 5024 bytes --]

From 24554e35b228d5265b84689975b4dc843a3c33e1 Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Sat, 10 Oct 2020 00:51:04 +0800
Subject: [PATCH] AArch64: Add FLAG for mul/mla/mls intrinsics [PR94442]

2020-10-09  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for mul/mla/mls intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 52 ++++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 3554fb0e0dc..4c23328a575 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -45,8 +45,8 @@
 
   BUILTIN_VDC (COMBINE, combine, 0, ALL)
   VAR1 (COMBINEP, combine, 0, ALL, di)
-  BUILTIN_VB (BINOP, pmul, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, ALL)
+  BUILTIN_VB (BINOP, pmul, 0, NONE)
+  BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
   BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
   BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
   VAR1 (UNOP, addp, 0, NONE, di)
@@ -189,11 +189,11 @@
   BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0, ALL)
   BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0, ALL)
 
-  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, ALL)
-  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, ALL)
+  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0, NONE)
+  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0, NONE)
 
-  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, ALL)
-  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, ALL)
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, NONE)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, NONE)
 
   BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, ALL)
   BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, ALL)
@@ -246,10 +246,10 @@
   BUILTIN_VHSDF (BINOP, fcadd270, 0, FP)
 
   /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>.   */
-  BUILTIN_VHSDF (TERNOP, fcmla0, 0, ALL)
-  BUILTIN_VHSDF (TERNOP, fcmla90, 0, ALL)
-  BUILTIN_VHSDF (TERNOP, fcmla180, 0, ALL)
-  BUILTIN_VHSDF (TERNOP, fcmla270, 0, ALL)
+  BUILTIN_VHSDF (TERNOP, fcmla0, 0, FP)
+  BUILTIN_VHSDF (TERNOP, fcmla90, 0, FP)
+  BUILTIN_VHSDF (TERNOP, fcmla180, 0, FP)
+  BUILTIN_VHSDF (TERNOP, fcmla270, 0, FP)
   BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0, ALL)
   BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0, ALL)
   BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0, ALL)
@@ -534,8 +534,8 @@
   VAR1 (TERNOPU, crypto_sha256su1, 0, ALL, v4si)
 
   /* Implemented by aarch64_crypto_pmull<mode>.  */
-  VAR1 (BINOPP, crypto_pmull, 0, ALL, di)
-  VAR1 (BINOPP, crypto_pmull, 0, ALL, v2di)
+  VAR1 (BINOPP, crypto_pmull, 0, NONE, di)
+  VAR1 (BINOPP, crypto_pmull, 0, NONE, v2di)
 
   /* Implemented by aarch64_tbl3<mode>.  */
   VAR1 (BINOP, tbl3, 0, ALL, v8qi)
@@ -666,15 +666,15 @@
   BUILTIN_VQ_I (TERNOP, bcaxq, 4, ALL)
 
   /* Implemented by aarch64_fml<f16mac1>l<f16quad>_low<mode>.  */
-  VAR1 (TERNOP, fmlal_low, 0, ALL, v2sf)
-  VAR1 (TERNOP, fmlsl_low, 0, ALL, v2sf)
-  VAR1 (TERNOP, fmlalq_low, 0, ALL, v4sf)
-  VAR1 (TERNOP, fmlslq_low, 0, ALL, v4sf)
+  VAR1 (TERNOP, fmlal_low, 0, FP, v2sf)
+  VAR1 (TERNOP, fmlsl_low, 0, FP, v2sf)
+  VAR1 (TERNOP, fmlalq_low, 0, FP, v4sf)
+  VAR1 (TERNOP, fmlslq_low, 0, FP, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l<f16quad>_high<mode>.  */
-  VAR1 (TERNOP, fmlal_high, 0, ALL, v2sf)
-  VAR1 (TERNOP, fmlsl_high, 0, ALL, v2sf)
-  VAR1 (TERNOP, fmlalq_high, 0, ALL, v4sf)
-  VAR1 (TERNOP, fmlslq_high, 0, ALL, v4sf)
+  VAR1 (TERNOP, fmlal_high, 0, FP, v2sf)
+  VAR1 (TERNOP, fmlsl_high, 0, FP, v2sf)
+  VAR1 (TERNOP, fmlalq_high, 0, FP, v4sf)
+  VAR1 (TERNOP, fmlslq_high, 0, FP, v4sf)
   /* Implemented by aarch64_fml<f16mac1>l_lane_lowv2sf.  */
   VAR1 (QUADOP_LANE, fmlal_lane_low, 0, ALL, v2sf)
   VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, ALL, v2sf)
@@ -712,20 +712,20 @@
   VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, ALL, v2sf, v4sf)
 
   /* Implemented by aarch64_bfmmlaqv4sf  */
-  VAR1 (TERNOP, bfmmlaq, 0, ALL, v4sf)
+  VAR1 (TERNOP, bfmmlaq, 0, AUTO_FP, v4sf)
 
   /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf  */
-  VAR1 (TERNOP, bfmlalb, 0, ALL, v4sf)
-  VAR1 (TERNOP, bfmlalt, 0, ALL, v4sf)
+  VAR1 (TERNOP, bfmlalb, 0, FP, v4sf)
+  VAR1 (TERNOP, bfmlalt, 0, FP, v4sf)
   VAR1 (QUADOP_LANE, bfmlalb_lane, 0, ALL, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane, 0, ALL, v4sf)
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
 
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
-  VAR1 (TERNOP, simd_smmla, 0, ALL, v16qi)
-  VAR1 (TERNOPU, simd_ummla, 0, ALL, v16qi)
-  VAR1 (TERNOP_SSUS, simd_usmmla, 0, ALL, v16qi)
+  VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, NONE, v16qi)
 
   /* Implemented by aarch64_bfcvtn{q}{2}<mode>  */
   VAR1 (UNOP, bfcvtn, 0, ALL, v4bf)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-09  9:32                                                       ` xiezhiheng
@ 2020-10-13  8:07                                                         ` Richard Sandiford
  2020-10-19  9:21                                                           ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-10-13  8:07 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Thursday, August 27, 2020 4:08 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> > I made two separate patches for these two groups for review purposes.
>> >
>> > Note: Patch for min/max intrinsics should be applied before the patch for
>> rounding intrinsics
>> >
>> > Bootstrapped and tested on aarch64 Linux platform.
>> 
>> Thanks, LGTM.  Pushed to master.
>> 
>> Richard
>
> I made the patch for multiply and multiply accumulator intrinsics.
>
> Note that bfmmlaq intrinsic is special because this instruction ignores the FPCR and does not update the FPSR exception status.
>   https://developer.arm.com/docs/ddi0596/h/simd-and-floating-point-instructions-alphabetic-order/bfmmla-bfloat16-floating-point-matrix-multiply-accumulate-into-2x2-matrix
> So I set it to the AUTO_FP flag.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, LGTM.  Pushed to trunk.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-13  8:07                                                         ` Richard Sandiford
@ 2020-10-19  9:21                                                           ` xiezhiheng
  2020-10-20 16:53                                                             ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-10-19  9:21 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1321 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, October 13, 2020 4:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 

Cut ...

> 
> Thanks, LGTM.  Pushed to trunk.
> 

I made two separate patches for these two groups: the get/set register intrinsics and the store intrinsics.

Note: It does not matter which patch is applied first.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d1ce634eb2b..8828cc5929d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-19  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for get/set reg intrinsics.
+

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d1ce634eb2b..bab5c1faf3c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-19  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for store intrinsics.
+

[-- Attachment #2: get_set-reg-v1.patch --]
[-- Type: application/octet-stream, Size: 2848 bytes --]

From 2cb5ac199f9430d8ab5e2bc5b25e585ae9488cca Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Mon, 19 Oct 2020 23:37:48 +0800
Subject: [PATCH] AArch64: Add FLAG for get/set reg intrinsics [PR94442]

2020-10-19  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for get/set reg intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 36 ++++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4c23328a575..5bc596dbffc 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -70,26 +70,26 @@
   BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, ALL)
 
   /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>.  */
-  BUILTIN_VDC (GETREG, get_dregoi, 0, ALL)
-  BUILTIN_VDC (GETREG, get_dregci, 0, ALL)
-  BUILTIN_VDC (GETREG, get_dregxi, 0, ALL)
-  VAR1 (GETREGP, get_dregoi, 0, ALL, di)
-  VAR1 (GETREGP, get_dregci, 0, ALL, di)
-  VAR1 (GETREGP, get_dregxi, 0, ALL, di)
+  BUILTIN_VDC (GETREG, get_dregoi, 0, AUTO_FP)
+  BUILTIN_VDC (GETREG, get_dregci, 0, AUTO_FP)
+  BUILTIN_VDC (GETREG, get_dregxi, 0, AUTO_FP)
+  VAR1 (GETREGP, get_dregoi, 0, AUTO_FP, di)
+  VAR1 (GETREGP, get_dregci, 0, AUTO_FP, di)
+  VAR1 (GETREGP, get_dregxi, 0, AUTO_FP, di)
   /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (GETREG, get_qregoi, 0, ALL)
-  BUILTIN_VQ (GETREG, get_qregci, 0, ALL)
-  BUILTIN_VQ (GETREG, get_qregxi, 0, ALL)
-  VAR1 (GETREGP, get_qregoi, 0, ALL, v2di)
-  VAR1 (GETREGP, get_qregci, 0, ALL, v2di)
-  VAR1 (GETREGP, get_qregxi, 0, ALL, v2di)
+  BUILTIN_VQ (GETREG, get_qregoi, 0, AUTO_FP)
+  BUILTIN_VQ (GETREG, get_qregci, 0, AUTO_FP)
+  BUILTIN_VQ (GETREG, get_qregxi, 0, AUTO_FP)
+  VAR1 (GETREGP, get_qregoi, 0, AUTO_FP, v2di)
+  VAR1 (GETREGP, get_qregci, 0, AUTO_FP, v2di)
+  VAR1 (GETREGP, get_qregxi, 0, AUTO_FP, v2di)
   /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (SETREG, set_qregoi, 0, ALL)
-  BUILTIN_VQ (SETREG, set_qregci, 0, ALL)
-  BUILTIN_VQ (SETREG, set_qregxi, 0, ALL)
-  VAR1 (SETREGP, set_qregoi, 0, ALL, v2di)
-  VAR1 (SETREGP, set_qregci, 0, ALL, v2di)
-  VAR1 (SETREGP, set_qregxi, 0, ALL, v2di)
+  BUILTIN_VQ (SETREG, set_qregoi, 0, AUTO_FP)
+  BUILTIN_VQ (SETREG, set_qregci, 0, AUTO_FP)
+  BUILTIN_VQ (SETREG, set_qregxi, 0, AUTO_FP)
+  VAR1 (SETREGP, set_qregoi, 0, AUTO_FP, v2di)
+  VAR1 (SETREGP, set_qregci, 0, AUTO_FP, v2di)
+  VAR1 (SETREGP, set_qregxi, 0, AUTO_FP, v2di)
   /* Implemented by aarch64_ld1x2<VQ:mode>. */
   BUILTIN_VQ (LOADSTRUCT, ld1x2, 0, ALL)
   /* Implemented by aarch64_ld1x2<VDC:mode>. */
-- 
2.19.1


[-- Attachment #3: store-v1.patch --]
[-- Type: application/octet-stream, Size: 2738 bytes --]

From 85159cd259a3debf8fc410c7abac4a969178ba22 Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Tue, 20 Oct 2020 00:32:15 +0800
Subject: [PATCH] AArch64: Add FLAG for store intrinsics [PR94442]

2020-10-19  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for store intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 22 ++++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 5bc596dbffc..a81654effb3 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -111,13 +111,13 @@
   BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0, ALL)
   BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0, ALL)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (STORESTRUCT, st2, 0, ALL)
-  BUILTIN_VDC (STORESTRUCT, st3, 0, ALL)
-  BUILTIN_VDC (STORESTRUCT, st4, 0, ALL)
+  BUILTIN_VDC (STORESTRUCT, st2, 0, WRITE_MEMORY)
+  BUILTIN_VDC (STORESTRUCT, st3, 0, WRITE_MEMORY)
+  BUILTIN_VDC (STORESTRUCT, st4, 0, WRITE_MEMORY)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (STORESTRUCT, st2, 0, ALL)
-  BUILTIN_VQ (STORESTRUCT, st3, 0, ALL)
-  BUILTIN_VQ (STORESTRUCT, st4, 0, ALL)
+  BUILTIN_VQ (STORESTRUCT, st2, 0, WRITE_MEMORY)
+  BUILTIN_VQ (STORESTRUCT, st3, 0, WRITE_MEMORY)
+  BUILTIN_VQ (STORESTRUCT, st4, 0, WRITE_MEMORY)
 
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0, ALL)
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL)
@@ -482,8 +482,8 @@
   VAR1(STORE1P, ld1, 0, ALL, v2di)
 
   /* Implemented by aarch64_st1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (STORE1, st1, 0, ALL)
-  VAR1(STORE1P, st1, 0, ALL, v2di)
+  BUILTIN_VALL_F16 (STORE1, st1, 0, WRITE_MEMORY)
+  VAR1 (STORE1P, st1, 0, WRITE_MEMORY, v2di)
 
   /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
   BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0, ALL)
@@ -492,13 +492,13 @@
   BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0, ALL)
 
   /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0, WRITE_MEMORY)
 
   /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0, WRITE_MEMORY)
 
   /* Implemented by aarch64_st1x4<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, WRITE_MEMORY)
 
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4, ALL)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-19  9:21                                                           ` xiezhiheng
@ 2020-10-20 16:53                                                             ` Richard Sandiford
  2020-10-22  9:16                                                               ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-10-20 16:53 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
> I made two separate patches for these two groups, get/set register intrinsics and store intrinsics.
>
> Note: It does not matter which patch is applied first.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks.  I pushed the get/set patch.  For the store patch, I think
we should have:

const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY | FLAG_AUTO_FP;

since the FP forms don't (for example) read the FPCR.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-20 16:53                                                             ` Richard Sandiford
@ 2020-10-22  9:16                                                               ` xiezhiheng
  2020-10-26 13:03                                                                 ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-10-22  9:16 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1424 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, October 21, 2020 12:54 AM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> > I made two separate patches for these two groups, get/set register
> intrinsics and store intrinsics.
> >
> > Note: It does not matter which patch is applied first.
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> Thanks.  I pushed the get/set patch.  For the store patch, I think
> we should have:
> 
> const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY | FLAG_AUTO_FP;
> 
> since the FP forms don't (for example) read the FPCR.
> 

That's true.  I added FLAG_STORE for the store intrinsics and made the patch for them.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 59fa1ad4d5d..26edaa309c8 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2020-10-22  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c: Add FLAG STORE.
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for store intrinsics.
+

[-- Attachment #2: store-v2.patch --]
[-- Type: application/octet-stream, Size: 3349 bytes --]

From 0171ceffa9b9ecdb3c2ca482edb6ee48425d265e Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Fri, 23 Oct 2020 00:31:19 +0800
Subject: [PATCH] AArch64: Add FLAG for store intrinsics [PR94442]

2020-10-22  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c: Add FLAG STORE.
	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for store intrinsics.
---
 gcc/config/aarch64/aarch64-builtins.c        |  1 +
 gcc/config/aarch64/aarch64-simd-builtins.def | 22 ++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 732a4dcbcc3..9d5e8c75c55 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -132,6 +132,7 @@ const unsigned int FLAG_AUTO_FP = 1U << 5;
 const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
 const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
   | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
+const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY | FLAG_AUTO_FP;
 
 typedef struct
 {
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 5bc596dbffc..bae7a048b72 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -111,13 +111,13 @@
   BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0, ALL)
   BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0, ALL)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (STORESTRUCT, st2, 0, ALL)
-  BUILTIN_VDC (STORESTRUCT, st3, 0, ALL)
-  BUILTIN_VDC (STORESTRUCT, st4, 0, ALL)
+  BUILTIN_VDC (STORESTRUCT, st2, 0, STORE)
+  BUILTIN_VDC (STORESTRUCT, st3, 0, STORE)
+  BUILTIN_VDC (STORESTRUCT, st4, 0, STORE)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (STORESTRUCT, st2, 0, ALL)
-  BUILTIN_VQ (STORESTRUCT, st3, 0, ALL)
-  BUILTIN_VQ (STORESTRUCT, st4, 0, ALL)
+  BUILTIN_VQ (STORESTRUCT, st2, 0, STORE)
+  BUILTIN_VQ (STORESTRUCT, st3, 0, STORE)
+  BUILTIN_VQ (STORESTRUCT, st4, 0, STORE)
 
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0, ALL)
   BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0, ALL)
@@ -482,8 +482,8 @@
   VAR1(STORE1P, ld1, 0, ALL, v2di)
 
   /* Implemented by aarch64_st1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (STORE1, st1, 0, ALL)
-  VAR1(STORE1P, st1, 0, ALL, v2di)
+  BUILTIN_VALL_F16 (STORE1, st1, 0, STORE)
+  VAR1 (STORE1P, st1, 0, STORE, v2di)
 
   /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
   BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0, ALL)
@@ -492,13 +492,13 @@
   BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0, ALL)
 
   /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0, STORE)
 
   /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0, STORE)
 
   /* Implemented by aarch64_st1x4<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, ALL)
+  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, STORE)
 
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4, ALL)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-22  9:16                                                               ` xiezhiheng
@ 2020-10-26 13:03                                                                 ` Richard Sandiford
  2020-10-30  6:41                                                                   ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-10-26 13:03 UTC (permalink / raw)
  To: xiezhiheng; +Cc: Richard Biener, gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Wednesday, October 21, 2020 12:54 AM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> > I made two separate patches for these two groups, get/set register
>> intrinsics and store intrinsics.
>> >
>> > Note: It does not matter which patch is applied first.
>> >
>> > Bootstrapped and tested on aarch64 Linux platform.
>> 
>> Thanks.  I pushed the get/set patch.  For the store patch, I think
>> we should have:
>> 
>> const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY | FLAG_AUTO_FP;
>> 
>> since the FP forms don't (for example) read the FPCR.
>> 
>
> That's true.  I added FLAG_STORE for the store intrinsics and made the patch for them.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, pushed to trunk.

Sorry for the delayed response.

Richard

>
> Thanks,
> Xie Zhiheng
>
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 59fa1ad4d5d..26edaa309c8 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,10 @@
> +2020-10-22  Zhiheng Xie  <xiezhiheng@huawei.com>
> +	    Nannan Zheng  <zhengnannan@huawei.com>
> +
> +	* config/aarch64/aarch64-builtins.c: Add FLAG STORE.
> +	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
> +	for store intrinsics.
> +

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-26 13:03                                                                 ` Richard Sandiford
@ 2020-10-30  6:41                                                                   ` xiezhiheng
  2020-10-30 10:23                                                                     ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-10-30  6:41 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 862 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Monday, October 26, 2020 9:03 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
>
> Thanks, pushed to trunk.
>

Thanks, and I made the patch for float conversion intrinsics.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 58ed7b12850..af910066ba0 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-30  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for conversion intrinsics.
+

[-- Attachment #2: float_conversion-v1.patch --]
[-- Type: application/octet-stream, Size: 4790 bytes --]

From 9ec96dea2300150f6cfe3b88998d25f0d2b7e7fb Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Fri, 30 Oct 2020 04:50:58 +0800
Subject: [PATCH] AArch64: Add FLAG for float conversion intrinsics [PR94442]

2020-10-30  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for conversion intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 74 ++++++++++----------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index bae7a048b72..eb8e6f7b3d8 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -430,17 +430,17 @@
   VAR1 (UNOPUS, lfrintnudf, 2, FP, di)
 
   /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
-  VAR1 (UNOP, floatv4hi, 2, ALL, v4hf)
-  VAR1 (UNOP, floatv8hi, 2, ALL, v8hf)
-  VAR1 (UNOP, floatv2si, 2, ALL, v2sf)
-  VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
-  VAR1 (UNOP, floatv2di, 2, ALL, v2df)
-
-  VAR1 (UNOP, floatunsv4hi, 2, ALL, v4hf)
-  VAR1 (UNOP, floatunsv8hi, 2, ALL, v8hf)
-  VAR1 (UNOP, floatunsv2si, 2, ALL, v2sf)
-  VAR1 (UNOP, floatunsv4si, 2, ALL, v4sf)
-  VAR1 (UNOP, floatunsv2di, 2, ALL, v2df)
+  VAR1 (UNOP, floatv4hi, 2, FP, v4hf)
+  VAR1 (UNOP, floatv8hi, 2, FP, v8hf)
+  VAR1 (UNOP, floatv2si, 2, FP, v2sf)
+  VAR1 (UNOP, floatv4si, 2, FP, v4sf)
+  VAR1 (UNOP, floatv2di, 2, FP, v2df)
+
+  VAR1 (UNOP, floatunsv4hi, 2, FP, v4hf)
+  VAR1 (UNOP, floatunsv8hi, 2, FP, v8hf)
+  VAR1 (UNOP, floatunsv2si, 2, FP, v2sf)
+  VAR1 (UNOP, floatunsv4si, 2, FP, v4sf)
+  VAR1 (UNOP, floatunsv2di, 2, FP, v2df)
 
   VAR5 (UNOPU, bswap, 2, ALL, v4hi, v8hi, v2si, v4si, v2di)
 
@@ -469,13 +469,13 @@
   BUILTIN_VHSDF (UNOP, abs, 2, ALL)
   VAR1 (UNOP, abs, 2, ALL, hf)
 
-  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10, ALL)
-  VAR1 (BINOP, float_truncate_hi_, 0, ALL, v4sf)
-  VAR1 (BINOP, float_truncate_hi_, 0, ALL, v8hf)
+  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10, FP)
+  VAR1 (BINOP, float_truncate_hi_, 0, FP, v4sf)
+  VAR1 (BINOP, float_truncate_hi_, 0, FP, v8hf)
 
-  VAR1 (UNOP, float_extend_lo_, 0, ALL, v2df)
-  VAR1 (UNOP, float_extend_lo_,  0, ALL, v4sf)
-  BUILTIN_VDF (UNOP, float_truncate_lo_, 0, ALL)
+  VAR1 (UNOP, float_extend_lo_, 0, FP, v2df)
+  VAR1 (UNOP, float_extend_lo_,  0, FP, v4sf)
+  BUILTIN_VDF (UNOP, float_truncate_lo_, 0, FP)
 
   /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
   BUILTIN_VALL_F16 (LOAD1, ld1, 0, ALL)
@@ -619,18 +619,18 @@
   VAR1 (UNOP, sqrt, 2, ALL, hf)
 
   /* Implemented by <optab><mode>hf2.  */
-  VAR1 (UNOP, floatdi, 2, ALL, hf)
-  VAR1 (UNOP, floatsi, 2, ALL, hf)
-  VAR1 (UNOP, floathi, 2, ALL, hf)
-  VAR1 (UNOPUS, floatunsdi, 2, ALL, hf)
-  VAR1 (UNOPUS, floatunssi, 2, ALL, hf)
-  VAR1 (UNOPUS, floatunshi, 2, ALL, hf)
-  BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2, ALL)
-  BUILTIN_GPI (UNOP, fix_truncsf, 2, ALL)
-  BUILTIN_GPI (UNOP, fix_truncdf, 2, ALL)
-  BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2, ALL)
-  BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2, ALL)
-  BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2, ALL)
+  VAR1 (UNOP, floatdi, 2, FP, hf)
+  VAR1 (UNOP, floatsi, 2, FP, hf)
+  VAR1 (UNOP, floathi, 2, FP, hf)
+  VAR1 (UNOPUS, floatunsdi, 2, FP, hf)
+  VAR1 (UNOPUS, floatunssi, 2, FP, hf)
+  VAR1 (UNOPUS, floatunshi, 2, FP, hf)
+  BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2, FP)
+  BUILTIN_GPI (UNOP, fix_truncsf, 2, FP)
+  BUILTIN_GPI (UNOP, fix_truncdf, 2, FP)
+  BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2, FP)
+  BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2, FP)
+  BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2, FP)
 
   /* Implemented by aarch64_sm3ss1qv4si.  */
   VAR1 (TERNOPU, sm3ss1q, 0, ALL, v4si)
@@ -701,10 +701,10 @@
   VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, ALL, v4sf)
 
   /* Implemented by aarch64_<frintnzs_op><mode>.  */
-  BUILTIN_VSFDF (UNOP, frint32z, 0, ALL)
-  BUILTIN_VSFDF (UNOP, frint32x, 0, ALL)
-  BUILTIN_VSFDF (UNOP, frint64z, 0, ALL)
-  BUILTIN_VSFDF (UNOP, frint64x, 0, ALL)
+  BUILTIN_VSFDF (UNOP, frint32z, 0, FP)
+  BUILTIN_VSFDF (UNOP, frint32x, 0, FP)
+  BUILTIN_VSFDF (UNOP, frint64z, 0, FP)
+  BUILTIN_VSFDF (UNOP, frint64x, 0, FP)
 
   /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
   VAR2 (TERNOP, bfdot, 0, ALL, v2sf, v4sf)
@@ -728,7 +728,7 @@
   VAR1 (TERNOP_SSUS, simd_usmmla, 0, NONE, v16qi)
 
   /* Implemented by aarch64_bfcvtn{q}{2}<mode>  */
-  VAR1 (UNOP, bfcvtn, 0, ALL, v4bf)
-  VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
-  VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
-  VAR1 (UNOP, bfcvt, 0, ALL, bf)
+  VAR1 (UNOP, bfcvtn, 0, FP, v4bf)
+  VAR1 (UNOP, bfcvtn_q, 0, FP, v8bf)
+  VAR1 (BINOP, bfcvtn2, 0, FP, v8bf)
+  VAR1 (UNOP, bfcvt, 0, FP, bf)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-30  6:41                                                                   ` xiezhiheng
@ 2020-10-30 10:23                                                                     ` Richard Sandiford
  2020-11-03 11:59                                                                       ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-10-30 10:23 UTC (permalink / raw)
  To: xiezhiheng; +Cc: gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Monday, October 26, 2020 9:03 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>>
>> Thanks, pushed to trunk.
>>
>
> Thanks, and I made the patch for float conversion intrinsics.

LGTM, thanks.  Pushed.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-10-30 10:23                                                                     ` Richard Sandiford
@ 2020-11-03 11:59                                                                       ` xiezhiheng
  2020-11-03 13:57                                                                         ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-11-03 11:59 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1827 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, October 30, 2020 6:24 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Monday, October 26, 2020 9:03 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> Thanks, pushed to trunk.
> >>
> >
> > Thanks, and I made the patch for float conversion intrinsics.
> 
> LGTM, thanks.  Pushed.
> 

Thanks.  And I made two separate patches for these two groups, compare intrinsics
and encryption algorithm (AES/SHA/SM3/SM4) intrinsics.

Note: It does not matter which patch is applied first.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9f743ecc89a..ba5e3dc7c55 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-03  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for compare intrinsics.
+

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9f743ecc89a..d6b943fc0df 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-03  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for AES/SHA/SM3/SM4 intrinsics.
+

[-- Attachment #2: compare-v1.patch --]
[-- Type: application/octet-stream, Size: 1980 bytes --]

From d270dc206d609d314ca76956b0e685cf621c454f Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Wed, 4 Nov 2020 00:34:06 +0800
Subject: [PATCH] AArch64: Add FLAG for compare intrinsics [PR94442]

2020-11-03  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for compare intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index eb8e6f7b3d8..e8e8af0e4ba 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -600,20 +600,20 @@
   BUILTIN_VHSDF (BINOP, faddp, 0, FP)
 
   /* Implemented by aarch64_cm<optab><mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0, FP)
 
   /* Implemented by neg<mode>2.  */
   BUILTIN_VHSDF_HSDF (UNOP, neg, 2, ALL)
 
   /* Implemented by aarch64_fac<optab><mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP)
 
   /* Implemented by sqrt<mode>2.  */
   VAR1 (UNOP, sqrt, 2, ALL, hf)
-- 
2.19.1


[-- Attachment #3: encryption-v1.patch --]
[-- Type: application/octet-stream, Size: 4306 bytes --]

From 33f40483ed7047883612a6f7be1154e83ce9b51a Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Wed, 4 Nov 2020 00:53:10 +0800
Subject: [PATCH] AArch64: Add FLAG for AES/SHA/SM3/SM4 intrinsics [PR94442]

2020-11-03  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for AES/SHA/SM3/SM4 intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 54 ++++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index e8e8af0e4ba..fc11ff32371 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -514,24 +514,24 @@
   BUILTIN_VALLDIF (BSL_S, simd_bsl, 0, ALL)
 
   /* Implemented by aarch64_crypto_aes<op><mode>.  */
-  VAR1 (BINOPU, crypto_aese, 0, ALL, v16qi)
-  VAR1 (BINOPU, crypto_aesd, 0, ALL, v16qi)
-  VAR1 (UNOPU, crypto_aesmc, 0, ALL, v16qi)
-  VAR1 (UNOPU, crypto_aesimc, 0, ALL, v16qi)
+  VAR1 (BINOPU, crypto_aese, 0, NONE, v16qi)
+  VAR1 (BINOPU, crypto_aesd, 0, NONE, v16qi)
+  VAR1 (UNOPU, crypto_aesmc, 0, NONE, v16qi)
+  VAR1 (UNOPU, crypto_aesimc, 0, NONE, v16qi)
 
   /* Implemented by aarch64_crypto_sha1<op><mode>.  */
-  VAR1 (UNOPU, crypto_sha1h, 0, ALL, si)
-  VAR1 (BINOPU, crypto_sha1su1, 0, ALL, v4si)
-  VAR1 (TERNOPU, crypto_sha1c, 0, ALL, v4si)
-  VAR1 (TERNOPU, crypto_sha1m, 0, ALL, v4si)
-  VAR1 (TERNOPU, crypto_sha1p, 0, ALL, v4si)
-  VAR1 (TERNOPU, crypto_sha1su0, 0, ALL, v4si)
+  VAR1 (UNOPU, crypto_sha1h, 0, NONE, si)
+  VAR1 (BINOPU, crypto_sha1su1, 0, NONE, v4si)
+  VAR1 (TERNOPU, crypto_sha1c, 0, NONE, v4si)
+  VAR1 (TERNOPU, crypto_sha1m, 0, NONE, v4si)
+  VAR1 (TERNOPU, crypto_sha1p, 0, NONE, v4si)
+  VAR1 (TERNOPU, crypto_sha1su0, 0, NONE, v4si)
 
   /* Implemented by aarch64_crypto_sha256<op><mode>.  */
-  VAR1 (TERNOPU, crypto_sha256h, 0, ALL, v4si)
-  VAR1 (TERNOPU, crypto_sha256h2, 0, ALL, v4si)
-  VAR1 (BINOPU, crypto_sha256su0, 0, ALL, v4si)
-  VAR1 (TERNOPU, crypto_sha256su1, 0, ALL, v4si)
+  VAR1 (TERNOPU, crypto_sha256h, 0, NONE, v4si)
+  VAR1 (TERNOPU, crypto_sha256h2, 0, NONE, v4si)
+  VAR1 (BINOPU, crypto_sha256su0, 0, NONE, v4si)
+  VAR1 (TERNOPU, crypto_sha256su1, 0, NONE, v4si)
 
   /* Implemented by aarch64_crypto_pmull<mode>.  */
   VAR1 (BINOPP, crypto_pmull, 0, NONE, di)
@@ -633,27 +633,27 @@
   BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2, FP)
 
   /* Implemented by aarch64_sm3ss1qv4si.  */
-  VAR1 (TERNOPU, sm3ss1q, 0, ALL, v4si)
+  VAR1 (TERNOPU, sm3ss1q, 0, NONE, v4si)
   /* Implemented by aarch64_sm3tt<sm3tt_op>qv4si.  */
-  VAR1 (QUADOPUI, sm3tt1aq, 0, ALL, v4si)
-  VAR1 (QUADOPUI, sm3tt1bq, 0, ALL, v4si)
-  VAR1 (QUADOPUI, sm3tt2aq, 0, ALL, v4si)
-  VAR1 (QUADOPUI, sm3tt2bq, 0, ALL, v4si)
+  VAR1 (QUADOPUI, sm3tt1aq, 0, NONE, v4si)
+  VAR1 (QUADOPUI, sm3tt1bq, 0, NONE, v4si)
+  VAR1 (QUADOPUI, sm3tt2aq, 0, NONE, v4si)
+  VAR1 (QUADOPUI, sm3tt2bq, 0, NONE, v4si)
   /* Implemented by aarch64_sm3partw<sm3part_op>qv4si.  */
-  VAR1 (TERNOPU, sm3partw1q, 0, ALL, v4si)
-  VAR1 (TERNOPU, sm3partw2q, 0, ALL, v4si)
+  VAR1 (TERNOPU, sm3partw1q, 0, NONE, v4si)
+  VAR1 (TERNOPU, sm3partw2q, 0, NONE, v4si)
   /* Implemented by aarch64_sm4eqv4si.  */
-  VAR1 (BINOPU, sm4eq, 0, ALL, v4si)
+  VAR1 (BINOPU, sm4eq, 0, NONE, v4si)
   /* Implemented by aarch64_sm4ekeyqv4si.  */
-  VAR1 (BINOPU, sm4ekeyq, 0, ALL, v4si)
+  VAR1 (BINOPU, sm4ekeyq, 0, NONE, v4si)
   /* Implemented by aarch64_crypto_sha512hqv2di.  */
-  VAR1 (TERNOPU, crypto_sha512hq, 0, ALL, v2di)
+  VAR1 (TERNOPU, crypto_sha512hq, 0, NONE, v2di)
   /* Implemented by aarch64_sha512h2qv2di.  */
-  VAR1 (TERNOPU, crypto_sha512h2q, 0, ALL, v2di)
+  VAR1 (TERNOPU, crypto_sha512h2q, 0, NONE, v2di)
   /* Implemented by aarch64_crypto_sha512su0qv2di.  */
-  VAR1 (BINOPU, crypto_sha512su0q, 0, ALL, v2di)
+  VAR1 (BINOPU, crypto_sha512su0q, 0, NONE, v2di)
   /* Implemented by aarch64_crypto_sha512su1qv2di.  */
-  VAR1 (TERNOPU, crypto_sha512su1q, 0, ALL, v2di)
+  VAR1 (TERNOPU, crypto_sha512su1q, 0, NONE, v2di)
   /* Implemented by eor3q<mode>4.  */
   BUILTIN_VQ_I (TERNOPU, eor3q, 4, ALL)
   BUILTIN_VQ_I (TERNOP, eor3q, 4, ALL)
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-11-03 11:59                                                                       ` xiezhiheng
@ 2020-11-03 13:57                                                                         ` Richard Sandiford
  2020-11-09  3:27                                                                           ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-11-03 13:57 UTC (permalink / raw)
  To: xiezhiheng; +Cc: gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, October 30, 2020 6:24 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Monday, October 26, 2020 9:03 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> Thanks, pushed to trunk.
>> >>
>> >
>> > Thanks, and I made the patch for float conversion intrinsics.
>> 
>> LGTM, thanks.  Pushed.
>> 
>
> Thanks.  And I made two separate patches for these two groups, compare intrinsics
> and encryption algorithm (AES/SHA/SM3/SM4) intrinsics.
>
> Note: It does not matter which patch is applied first.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, I pushed both patches to trunk.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-11-03 13:57                                                                         ` Richard Sandiford
@ 2020-11-09  3:27                                                                           ` xiezhiheng
  2020-11-10 11:53                                                                             ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-11-09  3:27 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1350 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, November 3, 2020 9:57 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> Thanks, I pushed both patches to trunk.
> 

Thanks.  And I made two separate patches for these two groups, tbl/tbx intrinsics and
the rest of the arithmetic operation intrinsics.

Note: It does not matter which patch is applied first.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index eab26b5f3a9..4f81c86fc76 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-09  Zhiheng Xie  <xiezhiheng@huawei.com>
+           Nannan Zheng  <zhengnannan@huawei.com>
+
+       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+       for tbl/tbx intrinsics.
+

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index eab26b5f3a9..193fbe4cf7d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-09  Zhiheng Xie  <xiezhiheng@huawei.com>
+           Nannan Zheng  <zhengnannan@huawei.com>
+
+       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+       for arithmetic operation intrinsics.
+

[-- Attachment #2: arithmetic-operation-v1.patch --]
[-- Type: application/octet-stream, Size: 4740 bytes --]

From 2564c05b3d22edc25f0a549aed3a1b3198c70fcb Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Sat, 7 Nov 2020 02:55:24 +0800
Subject: [PATCH] AArch64: Add FLAG for arithmetic operation intrinsics
 [PR94442]

2020-11-09  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for arithmetic operation intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 44 ++++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 968ddf026bf..98cdd97342b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -47,7 +47,7 @@
   VAR1 (COMBINEP, combine, 0, ALL, di)
   BUILTIN_VB (BINOP, pmul, 0, NONE)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
+  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP)
   BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
   VAR1 (UNOP, addp, 0, NONE, di)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL)
@@ -229,9 +229,9 @@
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, ALL)
 
   /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot, 0, ALL)
-  BUILTIN_VB (TERNOPU, udot, 0, ALL)
-  BUILTIN_VB (TERNOP_SSUS, usdot, 0, ALL)
+  BUILTIN_VB (TERNOP, sdot, 0, NONE)
+  BUILTIN_VB (TERNOPU, udot, 0, NONE)
+  BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE)
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, ALL)
   BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, ALL)
   BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, ALL)
@@ -297,7 +297,7 @@
   BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
 
   /* Implemented by aarch64_reduc_plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
+  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, FP)
 
   /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
   BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, NONE)
@@ -455,19 +455,19 @@
   BUILTIN_VALL (BINOP, trn1, 0, ALL)
   BUILTIN_VALL (BINOP, trn2, 0, ALL)
 
-  BUILTIN_GPF_F16 (UNOP, frecpe, 0, ALL)
-  BUILTIN_GPF_F16 (UNOP, frecpx, 0, ALL)
+  BUILTIN_GPF_F16 (UNOP, frecpe, 0, FP)
+  BUILTIN_GPF_F16 (UNOP, frecpx, 0, FP)
 
-  BUILTIN_VDQ_SI (UNOP, urecpe, 0, ALL)
+  BUILTIN_VDQ_SI (UNOP, urecpe, 0, NONE)
 
-  BUILTIN_VHSDF (UNOP, frecpe, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0, ALL)
+  BUILTIN_VHSDF (UNOP, frecpe, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0, FP)
 
   /* Implemented by a mixture of abs2 patterns.  Note the DImode builtin is
      only ever used for the int64x1_t intrinsic, there is no scalar version.  */
-  BUILTIN_VSDQ_I_DI (UNOP, abs, 0, ALL)
-  BUILTIN_VHSDF (UNOP, abs, 2, ALL)
-  VAR1 (UNOP, abs, 2, ALL, hf)
+  BUILTIN_VSDQ_I_DI (UNOP, abs, 0, AUTO_FP)
+  BUILTIN_VHSDF (UNOP, abs, 2, AUTO_FP)
+  VAR1 (UNOP, abs, 2, AUTO_FP, hf)
 
   BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10, FP)
   VAR1 (BINOP, float_truncate_hi_, 0, FP, v4sf)
@@ -501,11 +501,11 @@
   BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, STORE)
 
   /* Implemented by fma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fma, 4, ALL)
-  VAR1 (TERNOP, fma, 4, ALL, hf)
+  BUILTIN_VHSDF (TERNOP, fma, 4, FP)
+  VAR1 (TERNOP, fma, 4, FP, hf)
   /* Implemented by fnma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fnma, 4, ALL)
-  VAR1 (TERNOP, fnma, 4, ALL, hf)
+  BUILTIN_VHSDF (TERNOP, fnma, 4, FP)
+  VAR1 (TERNOP, fnma, 4, FP, hf)
 
   /* Implemented by aarch64_simd_bsl<mode>.  */
   BUILTIN_VDQQH (BSL_P, simd_bsl, 0, ALL)
@@ -588,13 +588,13 @@
   BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3, ALL)
 
   /* Implemented by aarch64_rsqrte<mode>.  */
-  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0, ALL)
+  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0, FP)
 
   /* Implemented by aarch64_rsqrts<mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0, FP)
 
   /* Implemented by fabd<mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, FP)
 
   /* Implemented by aarch64_faddp<mode>.  */
   BUILTIN_VHSDF (BINOP, faddp, 0, FP)
@@ -616,7 +616,7 @@
   BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP)
 
   /* Implemented by sqrt<mode>2.  */
-  VAR1 (UNOP, sqrt, 2, ALL, hf)
+  VAR1 (UNOP, sqrt, 2, FP, hf)
 
   /* Implemented by <optab><mode>hf2.  */
   VAR1 (UNOP, floatdi, 2, FP, hf)
@@ -707,7 +707,7 @@
   BUILTIN_VSFDF (UNOP, frint64x, 0, FP)
 
   /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
-  VAR2 (TERNOP, bfdot, 0, ALL, v2sf, v4sf)
+  VAR2 (TERNOP, bfdot, 0, AUTO_FP, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, ALL, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, ALL, v2sf, v4sf)
 
-- 
2.19.1


[-- Attachment #3: tbl_tbx-v1.patch --]
[-- Type: application/octet-stream, Size: 2131 bytes --]

From e957f2a9d03fc46fe4978d5014897b0d95e9de6e Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Sat, 7 Nov 2020 02:40:46 +0800
Subject: [PATCH] AArch64: Add FLAG for tbl/tbx intrinsics [PR94442]

2020-11-09  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for tbl/tbx intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 24 ++++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 6d898d4503a..968ddf026bf 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -538,28 +538,28 @@
   VAR1 (BINOPP, crypto_pmull, 0, NONE, v2di)
 
   /* Implemented by aarch64_tbl3<mode>.  */
-  VAR1 (BINOP, tbl3, 0, ALL, v8qi)
-  VAR1 (BINOP, tbl3, 0, ALL, v16qi)
+  VAR1 (BINOP, tbl3, 0, NONE, v8qi)
+  VAR1 (BINOP, tbl3, 0, NONE, v16qi)
 
   /* Implemented by aarch64_qtbl3<mode>.  */
-  VAR1 (BINOP, qtbl3, 0, ALL, v8qi)
-  VAR1 (BINOP, qtbl3, 0, ALL, v16qi)
+  VAR1 (BINOP, qtbl3, 0, NONE, v8qi)
+  VAR1 (BINOP, qtbl3, 0, NONE, v16qi)
 
   /* Implemented by aarch64_qtbl4<mode>.  */
-  VAR1 (BINOP, qtbl4, 0, ALL, v8qi)
-  VAR1 (BINOP, qtbl4, 0, ALL, v16qi)
+  VAR1 (BINOP, qtbl4, 0, NONE, v8qi)
+  VAR1 (BINOP, qtbl4, 0, NONE, v16qi)
 
   /* Implemented by aarch64_tbx4<mode>.  */
-  VAR1 (TERNOP, tbx4, 0, ALL, v8qi)
-  VAR1 (TERNOP, tbx4, 0, ALL, v16qi)
+  VAR1 (TERNOP, tbx4, 0, NONE, v8qi)
+  VAR1 (TERNOP, tbx4, 0, NONE, v16qi)
 
   /* Implemented by aarch64_qtbx3<mode>.  */
-  VAR1 (TERNOP, qtbx3, 0, ALL, v8qi)
-  VAR1 (TERNOP, qtbx3, 0, ALL, v16qi)
+  VAR1 (TERNOP, qtbx3, 0, NONE, v8qi)
+  VAR1 (TERNOP, qtbx3, 0, NONE, v16qi)
 
   /* Implemented by aarch64_qtbx4<mode>.  */
-  VAR1 (TERNOP, qtbx4, 0, ALL, v8qi)
-  VAR1 (TERNOP, qtbx4, 0, ALL, v16qi)
+  VAR1 (TERNOP, qtbx4, 0, NONE, v8qi)
+  VAR1 (TERNOP, qtbx4, 0, NONE, v16qi)
 
   /* Builtins for ARMv8.1-A Adv.SIMD instructions.  */
 
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-11-09  3:27                                                                           ` xiezhiheng
@ 2020-11-10 11:53                                                                             ` Richard Sandiford
  2020-11-11  7:59                                                                               ` xiezhiheng
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Sandiford @ 2020-11-10 11:53 UTC (permalink / raw)
  To: xiezhiheng; +Cc: gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, November 3, 2020 9:57 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> Thanks, I pushed both patches to trunk.
>> 
>
> Thanks.  And I made two separate patches for these two groups, tbl/tbx intrinsics and
> the rest of the arithmetic operation intrinsics.
>
> Note: It does not matter which patch is applied first.

I pushed the TBL/TBX one, but on the other patch:

> @@ -297,7 +297,7 @@
>    BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
>  
>    /* Implemented by aarch64_reduc_plus_<mode>.  */
> -  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
> +  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, FP)

This is defined for integer and FP modes, so I think it should be
NONE instead of FP.  We'll automatically add FLAGS_FP based on the
mode where necessary.

Otherwise it looks good, thanks.

Richard

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-11-10 11:53                                                                             ` Richard Sandiford
@ 2020-11-11  7:59                                                                               ` xiezhiheng
  2020-11-11 10:59                                                                                 ` Richard Sandiford
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-11-11  7:59 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1917 bytes --]

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, November 10, 2020 7:54 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Tuesday, November 3, 2020 9:57 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> Thanks, I pushed both patches to trunk.
> >>
> >
> > Thanks.  And I made two separate patches for these two groups, tbl/tbx
> intrinsics and
> > the rest of the arithmetic operation intrinsics.
> >
> > Note: It does not matter which patch is applied first.
> 
> I pushed the TBL/TBX one, but on the other patch:
> 
> > @@ -297,7 +297,7 @@
> >    BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
> >
> >    /* Implemented by aarch64_reduc_plus_<mode>.  */
> > -  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
> > +  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, FP)
> 
> This is defined for integer and FP modes, so I think it should be
> NONE instead of FP.  We'll automatically add FLAGS_FP based on the
> mode where necessary.
> 

Sorry, and I have revised a new patch.
Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 75092451216..d6a49d65214 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-11  Zhiheng Xie  <xiezhiheng@huawei.com>
+           Nannan Zheng  <zhengnannan@huawei.com>
+
+       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+       for arithmetic operation intrinsics.
+


[-- Attachment #2: arithmetic-operation-v2.patch --]
[-- Type: application/octet-stream, Size: 4743 bytes --]

From ee92cbeca751cdeb4242a66f749597aa840fce5e Mon Sep 17 00:00:00 2001
From: zhengnannan <zhengnannan@huawei.com>
Date: Wed, 11 Nov 2020 04:26:12 +0800
Subject: [PATCH] AArch64: Add FLAG for arithmetic operation intrinsics
 [PR94442]

2020-11-11  Zhiheng Xie  <xiezhiheng@huawei.com>
	    Nannan Zheng  <zhengnannan@huawei.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
	for arithmetic operation intrinsics.
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 44 ++++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index cb05aad77fb..b70056aa185 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -47,7 +47,7 @@
   VAR1 (COMBINEP, combine, 0, ALL, di)
   BUILTIN_VB (BINOP, pmul, 0, NONE)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, ALL)
+  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP)
   BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
   VAR1 (UNOP, addp, 0, NONE, di)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, ALL)
@@ -229,9 +229,9 @@
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, ALL)
 
   /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot, 0, ALL)
-  BUILTIN_VB (TERNOPU, udot, 0, ALL)
-  BUILTIN_VB (TERNOP_SSUS, usdot, 0, ALL)
+  BUILTIN_VB (TERNOP, sdot, 0, NONE)
+  BUILTIN_VB (TERNOPU, udot, 0, NONE)
+  BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE)
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, ALL)
   BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, ALL)
   BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, ALL)
@@ -304,7 +304,7 @@
   BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
 
   /* Implemented by aarch64_reduc_plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
+  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, NONE)
 
   /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
   BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10, NONE)
@@ -462,19 +462,19 @@
   BUILTIN_VALL (BINOP, trn1, 0, ALL)
   BUILTIN_VALL (BINOP, trn2, 0, ALL)
 
-  BUILTIN_GPF_F16 (UNOP, frecpe, 0, ALL)
-  BUILTIN_GPF_F16 (UNOP, frecpx, 0, ALL)
+  BUILTIN_GPF_F16 (UNOP, frecpe, 0, FP)
+  BUILTIN_GPF_F16 (UNOP, frecpx, 0, FP)
 
-  BUILTIN_VDQ_SI (UNOP, urecpe, 0, ALL)
+  BUILTIN_VDQ_SI (UNOP, urecpe, 0, NONE)
 
-  BUILTIN_VHSDF (UNOP, frecpe, 0, ALL)
-  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0, ALL)
+  BUILTIN_VHSDF (UNOP, frecpe, 0, FP)
+  BUILTIN_VHSDF_HSDF (BINOP, frecps, 0, FP)
 
   /* Implemented by a mixture of abs2 patterns.  Note the DImode builtin is
      only ever used for the int64x1_t intrinsic, there is no scalar version.  */
-  BUILTIN_VSDQ_I_DI (UNOP, abs, 0, ALL)
-  BUILTIN_VHSDF (UNOP, abs, 2, ALL)
-  VAR1 (UNOP, abs, 2, ALL, hf)
+  BUILTIN_VSDQ_I_DI (UNOP, abs, 0, AUTO_FP)
+  BUILTIN_VHSDF (UNOP, abs, 2, AUTO_FP)
+  VAR1 (UNOP, abs, 2, AUTO_FP, hf)
 
   BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10, FP)
   VAR1 (BINOP, float_truncate_hi_, 0, FP, v4sf)
@@ -508,11 +508,11 @@
   BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0, STORE)
 
   /* Implemented by fma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fma, 4, ALL)
-  VAR1 (TERNOP, fma, 4, ALL, hf)
+  BUILTIN_VHSDF (TERNOP, fma, 4, FP)
+  VAR1 (TERNOP, fma, 4, FP, hf)
   /* Implemented by fnma<mode>4.  */
-  BUILTIN_VHSDF (TERNOP, fnma, 4, ALL)
-  VAR1 (TERNOP, fnma, 4, ALL, hf)
+  BUILTIN_VHSDF (TERNOP, fnma, 4, FP)
+  VAR1 (TERNOP, fnma, 4, FP, hf)
 
   /* Implemented by aarch64_simd_bsl<mode>.  */
   BUILTIN_VDQQH (BSL_P, simd_bsl, 0, ALL)
@@ -595,13 +595,13 @@
   BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3, ALL)
 
   /* Implemented by aarch64_rsqrte<mode>.  */
-  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0, ALL)
+  BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0, FP)
 
   /* Implemented by aarch64_rsqrts<mode>.  */
-  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0, FP)
 
   /* Implemented by fabd<mode>3.  */
-  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, ALL)
+  BUILTIN_VHSDF_HSDF (BINOP, fabd, 3, FP)
 
   /* Implemented by aarch64_faddp<mode>.  */
   BUILTIN_VHSDF (BINOP, faddp, 0, FP)
@@ -623,7 +623,7 @@
   BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP)
 
   /* Implemented by sqrt<mode>2.  */
-  VAR1 (UNOP, sqrt, 2, ALL, hf)
+  VAR1 (UNOP, sqrt, 2, FP, hf)
 
   /* Implemented by <optab><mode>hf2.  */
   VAR1 (UNOP, floatdi, 2, FP, hf)
@@ -714,7 +714,7 @@
   BUILTIN_VSFDF (UNOP, frint64x, 0, FP)
 
   /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
-  VAR2 (TERNOP, bfdot, 0, ALL, v2sf, v4sf)
+  VAR2 (TERNOP, bfdot, 0, AUTO_FP, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, ALL, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, ALL, v2sf, v4sf)
 
-- 
2.19.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-11-11  7:59                                                                               ` xiezhiheng
@ 2020-11-11 10:59                                                                                 ` Richard Sandiford
  0 siblings, 0 replies; 44+ messages in thread
From: Richard Sandiford @ 2020-11-11 10:59 UTC (permalink / raw)
  To: xiezhiheng; +Cc: gcc-patches

xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, November 10, 2020 7:54 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Tuesday, November 3, 2020 9:57 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> Thanks, I pushed both patches to trunk.
>> >>
>> >
>> > Thanks.  And I made two separate patches for these two groups, tbl/tbx
>> intrinsics and
>> > the rest of the arithmetic operation intrinsics.
>> >
>> > Note: It does not matter which patch is applied first.
>> 
>> I pushed the TBL/TBX one, but on the other patch:
>> 
>> > @@ -297,7 +297,7 @@
>> >    BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
>> >
>> >    /* Implemented by aarch64_reduc_plus_<mode>.  */
>> > -  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
>> > +  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, FP)
>> 
>> This is defined for integer and FP modes, so I think it should be
>> NONE instead of FP.  We'll automatically add FLAGS_FP based on the
>> mode where necessary.
>> 
>
> Sorry, and I have revised a new patch.
> Bootstrapped and tested on aarch64 Linux platform.

LGTM, thanks.  Pushed to trunk.

Richard

> Thanks,
> Xie Zhiheng
>
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 75092451216..d6a49d65214 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,9 @@
> +2020-11-11  Zhiheng Xie  <xiezhiheng@huawei.com>
> +           Nannan Zheng  <zhengnannan@huawei.com>
> +
> +       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
> +       for arithmetic operation intrinsics.
> +

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
  2020-04-02  6:35 xiezhiheng
@ 2020-06-09 20:40 ` Jeff Law
  0 siblings, 0 replies; 44+ messages in thread
From: Jeff Law @ 2020-06-09 20:40 UTC (permalink / raw)
  To: xiezhiheng, gcc-patches

On Thu, 2020-04-02 at 06:35 +0000, xiezhiheng wrote:
> Hi,
>   I've created a bug for this issue: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94442
> 
>   And I'm going to solve this problem by propagating def's insn to its use
>   when they are at the same loop in fwprop pass.
>   I mean something like:
> diff --git a/gcc/fwprop.c b/gcc/fwprop.c
> index 705d2885aae..0edbbc65047 100644
> --- a/gcc/fwprop.c
> +++ b/gcc/fwprop.c
> @@ -416,7 +416,7 @@ should_replace_address (rtx old_rtx, rtx new_rtx, machine_mode mode,
>      gain = (set_src_cost (new_rtx, VOIDmode, speed)
>             - set_src_cost (old_rtx, VOIDmode, speed));
> 
> -  return (gain > 0);
> +  return (gain >= 0);
>  }
> 
> 
> @@ -1573,10 +1573,14 @@ fwprop (bool fwprop_addr_p)
>        df_ref use = DF_USES_GET (i);
>        if (use)
>         {
> +         df_ref def = get_def_for_use (use);
>           if (DF_REF_TYPE (use) == DF_REF_REG_USE
>               || DF_REF_BB (use)->loop_father == NULL
>               /* The outer most loop is not really a loop.  */
> -             || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
> +             || loop_outer (DF_REF_BB (use)->loop_father) == NULL
> +             || (def && (DF_REF_BB (def)->loop_father == DF_REF_BB (use)->loop_father
> +                         || flow_loop_nested_p (DF_REF_BB(use)->loop_father,
> +                                                DF_REF_BB(def)->loop_father))))
>             forward_propagate_into (use, fwprop_addr_p);
> 
>           else if (fwprop_addr_p)
> 
> Any suggestions?
ISTM this is really either a gimple issue or a gimple->rtl expansion issue. Yea,
we *could* fix this via propagation, but ISTM it's better to generate reasonable
code from the start rather than try to optimize it later.

jeff


^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3
@ 2020-04-02  6:35 xiezhiheng
  2020-06-09 20:40 ` Jeff Law
  0 siblings, 1 reply; 44+ messages in thread
From: xiezhiheng @ 2020-04-02  6:35 UTC (permalink / raw)
  To: gcc-patches

Hi,
  I've created a bug for this issue: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94442

  And I'm going to solve this problem by propagating def's insn to its use
  when they are at the same loop in fwprop pass.
  I mean something like:
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 705d2885aae..0edbbc65047 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -416,7 +416,7 @@ should_replace_address (rtx old_rtx, rtx new_rtx, machine_mode mode,
     gain = (set_src_cost (new_rtx, VOIDmode, speed)
            - set_src_cost (old_rtx, VOIDmode, speed));

-  return (gain > 0);
+  return (gain >= 0);
 }


@@ -1573,10 +1573,14 @@ fwprop (bool fwprop_addr_p)
       df_ref use = DF_USES_GET (i);
       if (use)
        {
+         df_ref def = get_def_for_use (use);
          if (DF_REF_TYPE (use) == DF_REF_REG_USE
              || DF_REF_BB (use)->loop_father == NULL
              /* The outer most loop is not really a loop.  */
-             || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+             || loop_outer (DF_REF_BB (use)->loop_father) == NULL
+             || (def && (DF_REF_BB (def)->loop_father == DF_REF_BB (use)->loop_father
+                         || flow_loop_nested_p (DF_REF_BB(use)->loop_father,
+                                                DF_REF_BB(def)->loop_father))))
            forward_propagate_into (use, fwprop_addr_p);

          else if (fwprop_addr_p)

Any suggestions?

Best regards
Xie Zhiheng

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2020-11-11 10:59 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-02 13:22 [PATCH PR94442] [AArch64] Redundant ldp/stp instructions emitted at -O3 xiezhiheng
2020-07-02 14:45 ` Richard Biener
2020-07-06  9:10   ` xiezhiheng
2020-07-06  9:31     ` Richard Sandiford
2020-07-07 12:49       ` xiezhiheng
2020-07-07 14:07         ` Richard Sandiford
2020-07-15  8:49           ` xiezhiheng
2020-07-16 12:41             ` Richard Sandiford
2020-07-16 14:05               ` xiezhiheng
2020-07-17  9:03                 ` Richard Sandiford
2020-07-30  2:43                   ` xiezhiheng
2020-07-31  9:02                     ` Richard Sandiford
2020-08-03  2:21                       ` xiezhiheng
2020-08-03 13:55                         ` Richard Sandiford
2020-08-04  8:01                           ` xiezhiheng
2020-08-04 16:25                             ` Richard Sandiford
2020-08-17  8:05                               ` xiezhiheng
2020-08-19 10:06                                 ` Richard Sandiford
2020-08-20  8:24                                   ` xiezhiheng
2020-08-20  8:55                                     ` Richard Sandiford
2020-08-20 12:16                                       ` xiezhiheng
2020-08-21  9:02                                         ` Richard Sandiford
2020-08-25  3:14                                           ` xiezhiheng
2020-08-25 11:07                                             ` Richard Sandiford
2020-08-26  1:39                                               ` xiezhiheng
2020-08-26 10:14                                                 ` Richard Sandiford
2020-08-27  2:50                                                   ` xiezhiheng
2020-08-27  8:08                                                     ` Richard Sandiford
2020-10-09  9:32                                                       ` xiezhiheng
2020-10-13  8:07                                                         ` Richard Sandiford
2020-10-19  9:21                                                           ` xiezhiheng
2020-10-20 16:53                                                             ` Richard Sandiford
2020-10-22  9:16                                                               ` xiezhiheng
2020-10-26 13:03                                                                 ` Richard Sandiford
2020-10-30  6:41                                                                   ` xiezhiheng
2020-10-30 10:23                                                                     ` Richard Sandiford
2020-11-03 11:59                                                                       ` xiezhiheng
2020-11-03 13:57                                                                         ` Richard Sandiford
2020-11-09  3:27                                                                           ` xiezhiheng
2020-11-10 11:53                                                                             ` Richard Sandiford
2020-11-11  7:59                                                                               ` xiezhiheng
2020-11-11 10:59                                                                                 ` Richard Sandiford
  -- strict thread matches above, loose matches on Subject: below --
2020-04-02  6:35 xiezhiheng
2020-06-09 20:40 ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).