[Bug tree-optimization/106352] New: SLP seems to need temporary variables

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

From: "eochoa at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/106352] New: SLP seems to need temporary variables
Date: Tue, 19 Jul 2022 09:48:22 +0000	[thread overview]
Message-ID: <bug-106352-4@http.gcc.gnu.org/bugzilla/> (raw)

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106352

            Bug ID: 106352
           Summary: SLP seems to need temporary variables
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: eochoa at gcc dot gnu.org
  Target Milestone: ---

Hi,

I am looking at how SLP works and have found an interesting case which I
believe is a bug.

I've added all examples to this compiler explorer link:
https://godbolt.org/z/zrPKEYvds

Compiling the following function with `-O3 -fverbose-asm -march=armv8.4-a
-fno-tree-loop-vectorize -ftree-slp-vectorize`:

typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;

/*
 * Reproducer, variant 1: results are stored straight into dst.
 *
 * For each of i_height rows, writes src1[k] + src2[k] + 1 to dst[k] for
 * k = 0..7, then advances dst, src1 and src2 by their respective strides.
 * i_width is accepted but never read.
 *
 * The report states that this form is compiled to scalar code.
 */
void foo ( uint8_t *dst,  int i_dst_stride,
                              uint8_t *src1, int i_src1_stride,
                              uint8_t *src2, int i_src2_stride,
                              int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {

        /*
         * Each store to dst[k] is issued before the loads for element k+1,
         * so loads and stores are interleaved in program order.
         * NOTE(review): presumably the possible overlap of dst with
         * src1/src2 is what blocks SLP here -- confirm against the thread.
         */
        dst[0] = ( src1[0] + src2[0] + 1 );
        dst[1] = ( src1[1] + src2[1] + 1 );
        dst[2] = ( src1[2] + src2[2] + 1 );
        dst[3] = ( src1[3] + src2[3] + 1 );
        dst[4] = ( src1[4] + src2[4] + 1 );
        dst[5] = ( src1[5] + src2[5] + 1 );
        dst[6] = ( src1[6] + src2[6] + 1 );
        dst[7] = ( src1[7] + src2[7] + 1 );
        /* Step all three pointers to the next row. */
        dst  += i_dst_stride;
        src1 += i_src1_stride;
        src2 += i_src2_stride;
    }
}

Produces the following scalar code:

        ldrb    w9, [x4]        // MEM[(uint8_t *)src2_68], MEM[(uint8_t
*)src2_68]
        add     w3, w3, 1 // y, y,
        ldrb    w1, [x2]        //, MEM[(uint8_t *)src1_67]
        add     w1, w1, w9        // tmp138, MEM[(uint8_t *)src1_67],
MEM[(uint8_t *)src2_68]
        add     w1, w1, 1 // tmp141, tmp138,
        strb    w1, [x0]        // tmp141, MEM[(uint8_t *)dst_66]
        ldrb    w9, [x4, 1]     // MEM[(uint8_t *)src2_68 + 1B], MEM[(uint8_t
*)src2_68 + 1B]
        ldrb    w1, [x2, 1]     //, MEM[(uint8_t *)src1_67 + 1B]
        add     w1, w1, w9        // tmp145, MEM[(uint8_t *)src1_67 + 1B],
MEM[(uint8_t *)src2_68 + 1B]
        add     w1, w1, 1 // tmp148, tmp145,
        strb    w1, [x0, 1]     // tmp148, MEM[(uint8_t *)dst_66 + 1B]
        ldrb    w9, [x4, 2]     // MEM[(uint8_t *)src2_68 + 2B], MEM[(uint8_t
*)src2_68 + 2B]
        ldrb    w1, [x2, 2]     //, MEM[(uint8_t *)src1_67 + 2B]
        add     w1, w1, w9        // tmp152, MEM[(uint8_t *)src1_67 + 2B],
MEM[(uint8_t *)src2_68 + 2B]
        add     w1, w1, 1 // tmp155, tmp152,
        strb    w1, [x0, 2]     // tmp155, MEM[(uint8_t *)dst_66 + 2B]
        ldrb    w9, [x4, 3]     // MEM[(uint8_t *)src2_68 + 3B], MEM[(uint8_t
*)src2_68 + 3B]
        ldrb    w1, [x2, 3]     //, MEM[(uint8_t *)src1_67 + 3B]
        add     w1, w1, w9        // tmp159, MEM[(uint8_t *)src1_67 + 3B],
MEM[(uint8_t *)src2_68 + 3B]
        add     w1, w1, 1 // tmp162, tmp159,
        strb    w1, [x0, 3]     // tmp162, MEM[(uint8_t *)dst_66 + 3B]
        ldrb    w9, [x4, 4]     // MEM[(uint8_t *)src2_68 + 4B], MEM[(uint8_t
*)src2_68 + 4B]
        ldrb    w1, [x2, 4]     //, MEM[(uint8_t *)src1_67 + 4B]
        add     w1, w1, w9        // tmp166, MEM[(uint8_t *)src1_67 + 4B],
MEM[(uint8_t *)src2_68 + 4B]
        add     w1, w1, 1 // tmp169, tmp166,
        strb    w1, [x0, 4]     // tmp169, MEM[(uint8_t *)dst_66 + 4B]
        ldrb    w9, [x4, 5]     // MEM[(uint8_t *)src2_68 + 5B], MEM[(uint8_t
*)src2_68 + 5B]
        ldrb    w1, [x2, 5]     //, MEM[(uint8_t *)src1_67 + 5B]
        add     w1, w1, w9        // tmp173, MEM[(uint8_t *)src1_67 + 5B],
MEM[(uint8_t *)src2_68 + 5B]
        add     w1, w1, 1 // tmp176, tmp173,
        strb    w1, [x0, 5]     // tmp176, MEM[(uint8_t *)dst_66 + 5B]
        ldrb    w9, [x4, 6]     // MEM[(uint8_t *)src2_68 + 6B], MEM[(uint8_t
*)src2_68 + 6B]
        ldrb    w1, [x2, 6]     //, MEM[(uint8_t *)src1_67 + 6B]
        add     w1, w1, w9        // tmp180, MEM[(uint8_t *)src1_67 + 6B],
MEM[(uint8_t *)src2_68 + 6B]
        add     w1, w1, 1 // tmp183, tmp180,
        strb    w1, [x0, 6]     // tmp183, MEM[(uint8_t *)dst_66 + 6B]
        ldrb    w1, [x2, 7]     //, MEM[(uint8_t *)src1_67 + 7B]
        add     x2, x2, x6        // src1, src1, _62
        ldrb    w9, [x4, 7]     // MEM[(uint8_t *)src2_68 + 7B], MEM[(uint8_t
*)src2_68 + 7B]
        add     x4, x4, x5        // src2, src2, _64
        add     w1, w1, w9        // tmp187, MEM[(uint8_t *)src1_67 + 7B],
MEM[(uint8_t *)src2_68 + 7B]
        add     w1, w1, 1 // tmp190, tmp187,
        strb    w1, [x0, 7]     // tmp190, MEM[(uint8_t *)dst_66 + 7B]
        add     x0, x0, x8        // dst, dst, _61
        cmp     w7, w3    // i_height, y
        bne     .L3             //,

However, adding temporary variables between the arithmetic and the stores to
dst, like so:

typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;


/*
 * Reproducer, variant 2: results go through uint8_t temporaries.
 *
 * For each of i_height rows, computes src1[k] + src2[k] + 1 for k = 0..7
 * into eight uint8_t locals, then copies those locals to dst[0..7] and
 * advances dst, src1 and src2 by their respective strides.
 * i_width is accepted but never read.
 *
 * The report states that this form is SLP-vectorized.
 */
void foo ( uint8_t *dst,  int i_dst_stride,
                              uint8_t *src1, int i_src1_stride,
                              uint8_t *src2, int i_src2_stride,
                              int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        /* All sixteen loads and eight sums happen before any store to dst. */
        uint8_t temp0 = ( src1[0] + src2[0] + 1 );
        uint8_t temp1 = ( src1[1] + src2[1] + 1 );
        uint8_t temp2 = ( src1[2] + src2[2] + 1 );
        uint8_t temp3 = ( src1[3] + src2[3] + 1 );
        uint8_t temp4 = ( src1[4] + src2[4] + 1 );
        uint8_t temp5 = ( src1[5] + src2[5] + 1 );
        uint8_t temp6 = ( src1[6] + src2[6] + 1 );
        uint8_t temp7 = ( src1[7] + src2[7] + 1 );

        /* The eight stores to dst form one contiguous group. */
        dst[0] = temp0;
        dst[1] = temp1;
        dst[2] = temp2;
        dst[3] = temp3;
        dst[4] = temp4;
        dst[5] = temp5;
        dst[6] = temp6;
        dst[7] = temp7;

        /* Step all three pointers to the next row. */
        dst  += i_dst_stride;
        src1 += i_src1_stride;
        src2 += i_src2_stride;
    }
}

will use SLP and vectorize the basic block:

        ldr     d1, [x2]  // vect__1.6, MEM <vector(8) unsigned char> [(uint8_t
*)src1_67]
        add     w1, w1, 1 // y, y,
        ldr     d0, [x4]  // vect__2.9, MEM <vector(8) unsigned char> [(uint8_t
*)src2_68]
        add     x2, x2, x6        // src1, src1, _111
        add     x4, x4, x3        // src2, src2, _112
        add     v0.8b, v0.8b, v1.8b       // vect__3.10, vect__2.9, vect__1.6
        add     v0.8b, v0.8b, v2.8b       // vect_temp0_38.11, vect__3.10,
tmp112
        str     d0, [x0]  // vect_temp0_38.11, MEM <vector(8) unsigned char>
[(uint8_t *)dst_66]
        add     x0, x0, x8        // dst, dst, _110
        cmp     w7, w1    // i_height, y
        bne     .L3             //,

Thanks!

next             reply	other threads:[~2022-07-19  9:48 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-19  9:48 eochoa at gcc dot gnu.org [this message]
2022-07-19 12:40 ` [Bug tree-optimization/106352] " rguenth at gcc dot gnu.org
2022-07-19 14:51 ` pinskia at gcc dot gnu.org
2022-07-19 14:53 ` pinskia at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-106352-4@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).