[AArch64] Remove backend support for widen-sub

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [AArch64] Remove backend support for widen-sub
@ 2021-01-21 11:36 Joel Hutton
  2021-01-21 13:06 ` Richard Biener
  2021-01-21 13:19 ` Richard Sandiford
  0 siblings, 2 replies; 6+ messages in thread
From: Joel Hutton @ 2021-01-21 11:36 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1154 bytes --]

Hi all, 

This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.

In the following example:

#include <stdint.h>
extern void wdiff( int16_t d[16], uint8_t *restrict pix1, uint8_t *restrict pix2)
{
   for( int y = 0; y < 4; y++ )
  {    
    for( int x = 0; x < 4; x++ )
      d[x + y*4] = pix1[x] - pix2[x];
    pix1 += 16;  
    pix2 += 16;
 }

The widening minus pattern is recognized and substituted, but cannot be used due to the input vector type chosen in slp vectorization. This results in an attempt to do an 8 byte->8 short widening subtract operation, which is not supported. 

The issue is documented in PR 98772.


[AArch64] Remove backend support for widen-sub

This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md    
        (vec_widen_<su>subl_lo_<mode>): Removed.
        (vec_widen_<su>subl_hi_<mode>): Removed.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vect-widen-sub.c: Removed.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: remove-widen-sub.patch --]
[-- Type: text/x-patch; name="remove-widen-sub.patch", Size: 3942 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 41071b668fd0982f55f9e48510403b9f50fe0f60..c685c512e06917f9cf6bdcffcc41dd091dabfb4e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3478,30 +3478,6 @@
   DONE;
 })
 
-(define_expand "vec_widen_<su>subl_lo_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand")
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
-  "TARGET_SIMD"
-{
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
-  emit_insn (gen_aarch64_<su>subl<mode>_lo_internal (operands[0], operands[1],
-						     operands[2], p));
-  DONE;
-})
-
-(define_expand "vec_widen_<su>subl_hi_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand")
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
-  "TARGET_SIMD"
-{
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
-  emit_insn (gen_aarch64_<su>subl<mode>_hi_internal (operands[0], operands[1],
-						     operands[2], p));
-  DONE;
-})
-
 (define_expand "aarch64_saddl2<mode>"
   [(match_operand:<VWIDE> 0 "register_operand")
    (match_operand:VQW 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c b/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c
deleted file mode 100644
index a2bed63affbd091977df95a126da1f5b8c1d41d2..0000000000000000000000000000000000000000
--- a/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/* { dg-do run } */
-/* { dg-options "-O3 -save-temps" } */
-#include <stdint.h>
-#include <string.h>
-
-#pragma GCC target "+nosve"
-
-#define ARR_SIZE 1024
-
-/* Should produce an usubl */
-void usub_opt (uint32_t *foo, uint16_t *a, uint16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-__attribute__((optimize (0)))
-void usub_nonopt (uint32_t *foo, uint16_t *a, uint16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-/* Should produce an ssubl */
-void ssub_opt (int32_t *foo, int16_t *a, int16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-__attribute__((optimize (0)))
-void ssub_nonopt (int32_t *foo, int16_t *a, int16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-
-void __attribute__((optimize (0)))
-init(uint16_t *a, uint16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE;i++)
-    {
-      a[i] = i;
-      b[i] = 2*i;
-    }
-}
-
-int __attribute__((optimize (0)))
-main()
-{
-    uint32_t foo_arr[ARR_SIZE];
-    uint32_t bar_arr[ARR_SIZE];
-    uint16_t a[ARR_SIZE];
-    uint16_t b[ARR_SIZE];
-
-    init(a, b);
-    usub_opt(foo_arr, a, b);
-    usub_nonopt(bar_arr, a, b);
-    if (memcmp(foo_arr, bar_arr, ARR_SIZE) != 0)
-      return 1;
-    ssub_opt((int32_t*) foo_arr, (int16_t*) a, (int16_t*) b);
-    ssub_nonopt((int32_t*) bar_arr, (int16_t*) a, (int16_t*) b);
-    if (memcmp(foo_arr, bar_arr, ARR_SIZE) != 0)
-      return 1;
-    return 0;
-}
-
-/* { dg-final { scan-assembler-times {\tusubl\t} 1} } */
-/* { dg-final { scan-assembler-times {\tusubl2\t} 1} } */
-/* { dg-final { scan-assembler-times {\tssubl\t} 1} } */
-/* { dg-final { scan-assembler-times {\tssubl2\t} 1} } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [AArch64] Remove backend support for widen-sub
  2021-01-21 11:36 [AArch64] Remove backend support for widen-sub Joel Hutton
@ 2021-01-21 13:06 ` Richard Biener
  2021-01-21 13:19 ` Richard Sandiford
  1 sibling, 0 replies; 6+ messages in thread
From: Richard Biener @ 2021-01-21 13:06 UTC (permalink / raw)
  To: Joel Hutton; +Cc: GCC Patches

On Thu, Jan 21, 2021 at 12:37 PM Joel Hutton via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi all,
>
> This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
>
> In the following example:
>
> #include <stdint.h>
> extern void wdiff( int16_t d[16], uint8_t *restrict pix1, uint8_t *restrict pix2)
> {
>    for( int y = 0; y < 4; y++ )
>   {
>     for( int x = 0; x < 4; x++ )
>       d[x + y*4] = pix1[x] - pix2[x];
>     pix1 += 16;
>     pix2 += 16;
>  }
>
> The widening minus pattern is recognized and substituted, but cannot be used due to the input vector type chosen in slp vectorization. This results in an attempt to do an 8 byte->8 short widening subtract operation, which is not supported.
>
> The issue is documented in PR 98772.

But it's not analyzed.  The fix is clearly not to remove the support
for this pattern but fix
what goes wrong in detecting it.

Richard.

>
> [AArch64] Remove backend support for widen-sub
>
> This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md
>         (vec_widen_<su>subl_lo_<mode>): Removed.
>         (vec_widen_<su>subl_hi_<mode>): Removed.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vect-widen-sub.c: Removed.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [AArch64] Remove backend support for widen-sub
  2021-01-21 11:36 [AArch64] Remove backend support for widen-sub Joel Hutton
  2021-01-21 13:06 ` Richard Biener
@ 2021-01-21 13:19 ` Richard Sandiford
  2021-01-21 13:31   ` Richard Biener
  1 sibling, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2021-01-21 13:19 UTC (permalink / raw)
  To: Joel Hutton via Gcc-patches; +Cc: Joel Hutton, rguenther

Joel Hutton via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi all, 
>
> This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
>
> In the following example:
>
> #include <stdint.h>
> extern void wdiff( int16_t d[16], uint8_t *restrict pix1, uint8_t *restrict pix2)
> {
>    for( int y = 0; y < 4; y++ )
>   {    
>     for( int x = 0; x < 4; x++ )
>       d[x + y*4] = pix1[x] - pix2[x];
>     pix1 += 16;  
>     pix2 += 16;
>  }
>
> The widening minus pattern is recognized and substituted, but cannot be used due to the input vector type chosen in slp vectorization. This results in an attempt to do an 8 byte->8 short widening subtract operation, which is not supported. 
>
> The issue is documented in PR 98772.

IMO removing just the sub patterns is too arbitrary.  Like you say,
the PR affects all widening instructions. but it happens that the
first encountered regression used subtraction.

I think we should try to fix the PR instead.  The widening operations
can only be used for SLP if the group_size is at least double the
number of elements in the vectype, so one idea (not worked through)
is to make the vect_build_slp_tree family of routines undo pattern
recognition for widening operations if the group size is less than that.

Richi would know better than me though.

Thanks,
Richard

>
>
> [AArch64] Remove backend support for widen-sub
>
> This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md    
>         (vec_widen_<su>subl_lo_<mode>): Removed.
>         (vec_widen_<su>subl_hi_<mode>): Removed.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vect-widen-sub.c: Removed.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [AArch64] Remove backend support for widen-sub
  2021-01-21 13:19 ` Richard Sandiford
@ 2021-01-21 13:31   ` Richard Biener
  2021-01-21 13:40     ` Richard Sandiford
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Biener @ 2021-01-21 13:31 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Joel Hutton via Gcc-patches, Joel Hutton

On Thu, 21 Jan 2021, Richard Sandiford wrote:

> Joel Hutton via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > Hi all, 
> >
> > This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
> >
> > In the following example:
> >
> > #include <stdint.h>
> > extern void wdiff( int16_t d[16], uint8_t *restrict pix1, uint8_t *restrict pix2)
> > {
> >    for( int y = 0; y < 4; y++ )
> >   {    
> >     for( int x = 0; x < 4; x++ )
> >       d[x + y*4] = pix1[x] - pix2[x];
> >     pix1 += 16;  
> >     pix2 += 16;
> >  }
> >
> > The widening minus pattern is recognized and substituted, but cannot be used due to the input vector type chosen in slp vectorization. This results in an attempt to do an 8 byte->8 short widening subtract operation, which is not supported. 
> >
> > The issue is documented in PR 98772.
> 
> IMO removing just the sub patterns is too arbitrary.  Like you say,
> the PR affects all widening instructions. but it happens that the
> first encountered regression used subtraction.
> 
> I think we should try to fix the PR instead.  The widening operations
> can only be used for SLP if the group_size is at least double the
> number of elements in the vectype, so one idea (not worked through)
> is to make the vect_build_slp_tree family of routines undo pattern
> recognition for widening operations if the group size is less than that.
> 
> Richi would know better than me though.

Why should the widen ops be only usable with such constraints?
As I read md.texi they for example do v8qi -> v8hi operations
(where the v8qi is either lo or hi part of a v16qi vector).

The dumps show we use a VF of 4 which makes us have two v8hi
vectors and one v16qi which vectorizable_conversion should
be able to handle by emitting hi/lo widened subtracts.

Of course the dumps also show we fail vector construction
because of the permute.  If as I say you force strieded code-gen
we get optimal

wdiff:
.LFB0:
        .cfi_startproc
        lsl     w3, w3, 4
        ldr     s0, [x1]
        ldr     s1, [x2]
        sxtw    x4, w3
        ldr     s3, [x1, w3, sxtw]
        add     x5, x2, x4
        ldr     s2, [x2, w3, sxtw]
        add     x1, x1, x4
        add     x2, x1, x4
        add     x4, x5, x4
        ins     v0.s[1], v3.s[0]
        ldr     s4, [x5, w3, sxtw]
        ins     v1.s[1], v2.s[0]
        ldr     s5, [x1, w3, sxtw]
        ldr     s2, [x4, w3, sxtw]
        ldr     s3, [x2, w3, sxtw]
        ins     v0.s[2], v5.s[0]
        ins     v1.s[2], v4.s[0]
        ins     v0.s[3], v3.s[0]
        ins     v1.s[3], v2.s[0]
        usubl   v2.8h, v0.8b, v1.8b
        usubl2  v0.8h, v0.16b, v1.16b
        stp     q2, q0, [x0]
        ret

from

void wdiff( short d[16], unsigned char *restrict pix1, unsigned char 
*restrict pix2, int s)
{
   for( int y = 0; y < 4; y++ )
  {
    for( int x = 0; x < 4; x++ )
      d[x + y*4] = pix1[x] - pix2[x];
    pix1 += 16*s;
    pix2 += 16*s;
 }
}

so the fix, if, is to fix the bug that mentions this issue
and appropriately classify / vectorize the load.

Richard

> Thanks,
> Richard
> 
> >
> >
> > [AArch64] Remove backend support for widen-sub
> >
> > This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64-simd.md    
> >         (vec_widen_<su>subl_lo_<mode>): Removed.
> >         (vec_widen_<su>subl_hi_<mode>): Removed.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/vect-widen-sub.c: Removed.
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [AArch64] Remove backend support for widen-sub
  2021-01-21 13:31   ` Richard Biener
@ 2021-01-21 13:40     ` Richard Sandiford
  2021-01-21 14:09       ` Joel Hutton
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2021-01-21 13:40 UTC (permalink / raw)
  To: Richard Biener; +Cc: Joel Hutton via Gcc-patches, Joel Hutton

Richard Biener <rguenther@suse.de> writes:
> On Thu, 21 Jan 2021, Richard Sandiford wrote:
>
>> Joel Hutton via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> > Hi all, 
>> >
>> > This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
>> >
>> > In the following example:
>> >
>> > #include <stdint.h>
>> > extern void wdiff( int16_t d[16], uint8_t *restrict pix1, uint8_t *restrict pix2)
>> > {
>> >    for( int y = 0; y < 4; y++ )
>> >   {    
>> >     for( int x = 0; x < 4; x++ )
>> >       d[x + y*4] = pix1[x] - pix2[x];
>> >     pix1 += 16;  
>> >     pix2 += 16;
>> >  }
>> >
>> > The widening minus pattern is recognized and substituted, but cannot be used due to the input vector type chosen in slp vectorization. This results in an attempt to do an 8 byte->8 short widening subtract operation, which is not supported. 
>> >
>> > The issue is documented in PR 98772.
>> 
>> IMO removing just the sub patterns is too arbitrary.  Like you say,
>> the PR affects all widening instructions. but it happens that the
>> first encountered regression used subtraction.
>> 
>> I think we should try to fix the PR instead.  The widening operations
>> can only be used for SLP if the group_size is at least double the
>> number of elements in the vectype, so one idea (not worked through)
>> is to make the vect_build_slp_tree family of routines undo pattern
>> recognition for widening operations if the group size is less than that.
>> 
>> Richi would know better than me though.
>
> Why should the widen ops be only usable with such constraints?
> As I read md.texi they for example do v8qi -> v8hi operations
> (where the v8qi is either lo or hi part of a v16qi vector).

They're v16qi->v8hi operations rather than v8qi->v8hi.  The lo
operations operate on one half of the v16qi and the hi operations
operate on the other half.  They're supposed to be used together
to produce v16qi->v8hi+v8hi, so for BB SLP we need a group size
of 16 to feed them.  (Loop-aware SLP is fine as-is because we can
just increase the unroll factor.)

In the testcase, the store end of the SLP graph is operating on
8 shorts, which further up the graph are converted from 8 chars.
To use WIDEN_MINUS_EXPR at v8hi we'd need 16 shorts and 16 chars.

> The dumps show we use a VF of 4 which makes us have two v8hi
> vectors and one v16qi which vectorizable_conversion should
> be able to handle by emitting hi/lo widened subtracts.

AIUI the problem is with slp1.  I think the loop side is fine.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [AArch64] Remove backend support for widen-sub
  2021-01-21 13:40     ` Richard Sandiford
@ 2021-01-21 14:09       ` Joel Hutton
  0 siblings, 0 replies; 6+ messages in thread
From: Joel Hutton @ 2021-01-21 14:09 UTC (permalink / raw)
  To: Richard Sandiford, Richard Biener; +Cc: Joel Hutton via Gcc-patches

> I think we should try to fix the PR instead.  The widening operations
> can only be used for SLP if the group_size is at least double the
> number of elements in the vectype, so one idea (not worked through)
> is to make the vect_build_slp_tree family of routines undo pattern
> recognition for widening operations if the group size is less than that. >

This seems reasonable, how do I go about 'undoing' the pattern recognition.

Ideally the patterns wouldn't be substituted in the first place, but group size is chosen after pattern substitution.
________________________________
From: Richard Sandiford <richard.sandiford@arm.com>
Sent: 21 January 2021 13:40
To: Richard Biener <rguenther@suse.de>
Cc: Joel Hutton via Gcc-patches <gcc-patches@gcc.gnu.org>; Joel Hutton <Joel.Hutton@arm.com>
Subject: Re: [AArch64] Remove backend support for widen-sub

Richard Biener <rguenther@suse.de> writes:
> On Thu, 21 Jan 2021, Richard Sandiford wrote:
>
>> Joel Hutton via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> > Hi all,
>> >
>> > This patch removes support for the widening subtract operation in the aarch64 backend as it is causing a performance regression.
>> >
>> > In the following example:
>> >
>> > #include <stdint.h>
>> > extern void wdiff( int16_t d[16], uint8_t *restrict pix1, uint8_t *restrict pix2)
>> > {
>> >    for( int y = 0; y < 4; y++ )
>> >   {
>> >     for( int x = 0; x < 4; x++ )
>> >       d[x + y*4] = pix1[x] - pix2[x];
>> >     pix1 += 16;
>> >     pix2 += 16;
>> >  }
>> >
>> > The widening minus pattern is recognized and substituted, but cannot be used due to the input vector type chosen in slp vectorization. This results in an attempt to do an 8 byte->8 short widening subtract operation, which is not supported.
>> >
>> > The issue is documented in PR 98772.
>>
>> IMO removing just the sub patterns is too arbitrary.  Like you say,
>> the PR affects all widening instructions. but it happens that the
>> first encountered regression used subtraction.
>>
>> I think we should try to fix the PR instead.  The widening operations
>> can only be used for SLP if the group_size is at least double the
>> number of elements in the vectype, so one idea (not worked through)
>> is to make the vect_build_slp_tree family of routines undo pattern
>> recognition for widening operations if the group size is less than that.
>>
>> Richi would know better than me though.
>
> Why should the widen ops be only usable with such constraints?
> As I read md.texi they for example do v8qi -> v8hi operations
> (where the v8qi is either lo or hi part of a v16qi vector).

They're v16qi->v8hi operations rather than v8qi->v8hi.  The lo
operations operate on one half of the v16qi and the hi operations
operate on the other half.  They're supposed to be used together
to produce v16qi->v8hi+v8hi, so for BB SLP we need a group size
of 16 to feed them.  (Loop-aware SLP is fine as-is because we can
just increase the unroll factor.)

In the testcase, the store end of the SLP graph is operating on
8 shorts, which further up the graph are converted from 8 chars.
To use WIDEN_MINUS_EXPR at v8hi we'd need 16 shorts and 16 chars.

> The dumps show we use a VF of 4 which makes us have two v8hi
> vectors and one v16qi which vectorizable_conversion should
> be able to handle by emitting hi/lo widened subtracts.

AIUI the problem is with slp1.  I think the loop side is fine.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-01-21 14:09 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-21 11:36 [AArch64] Remove backend support for widen-sub Joel Hutton
2021-01-21 13:06 ` Richard Biener
2021-01-21 13:19 ` Richard Sandiford
2021-01-21 13:31   ` Richard Biener
2021-01-21 13:40     ` Richard Sandiford
2021-01-21 14:09       ` Joel Hutton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).