public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
From: "eochoa at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org> To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/106352] New: SLP seems to need temporary variables Date: Tue, 19 Jul 2022 09:48:22 +0000 [thread overview] Message-ID: <bug-106352-4@http.gcc.gnu.org/bugzilla/> (raw) https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106352 Bug ID: 106352 Summary: SLP seems to need temporary variables Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: eochoa at gcc dot gnu.org Target Milestone: --- Hi, I am looking at how SLP works and I'm finding this interesting case which I believe is a bug. I've added all examples to this compiler explorer link: https://godbolt.org/z/zrPKEYvds Compiling the following function with `-O3 -fverbose-asm -march=armv8.4-a -fno-tree-loop-vectorize -ftree-slp-vectorize`: typedef unsigned char uint8_t; typedef unsigned short uint16_t; typedef unsigned int uint32_t; void foo ( uint8_t *dst, int i_dst_stride, uint8_t *src1, int i_src1_stride, uint8_t *src2, int i_src2_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { dst[0] = ( src1[0] + src2[0] + 1 ); dst[1] = ( src1[1] + src2[1] + 1 ); dst[2] = ( src1[2] + src2[2] + 1 ); dst[3] = ( src1[3] + src2[3] + 1 ); dst[4] = ( src1[4] + src2[4] + 1 ); dst[5] = ( src1[5] + src2[5] + 1 ); dst[6] = ( src1[6] + src2[6] + 1 ); dst[7] = ( src1[7] + src2[7] + 1 ); dst += i_dst_stride; src1 += i_src1_stride; src2 += i_src2_stride; } } Produces the following scalar code: ldrb w9, [x4] // MEM[(uint8_t *)src2_68], MEM[(uint8_t *)src2_68] add w3, w3, 1 // y, y, ldrb w1, [x2] //, MEM[(uint8_t *)src1_67] add w1, w1, w9 // tmp138, MEM[(uint8_t *)src1_67], MEM[(uint8_t *)src2_68] add w1, w1, 1 // tmp141, tmp138, strb w1, [x0] // tmp141, MEM[(uint8_t *)dst_66] ldrb w9, [x4, 1] // MEM[(uint8_t *)src2_68 + 1B], MEM[(uint8_t *)src2_68 + 1B] ldrb w1, [x2, 1] //, MEM[(uint8_t *)src1_67 + 1B] add 
w1, w1, w9 // tmp145, MEM[(uint8_t *)src1_67 + 1B], MEM[(uint8_t *)src2_68 + 1B] add w1, w1, 1 // tmp148, tmp145, strb w1, [x0, 1] // tmp148, MEM[(uint8_t *)dst_66 + 1B] ldrb w9, [x4, 2] // MEM[(uint8_t *)src2_68 + 2B], MEM[(uint8_t *)src2_68 + 2B] ldrb w1, [x2, 2] //, MEM[(uint8_t *)src1_67 + 2B] add w1, w1, w9 // tmp152, MEM[(uint8_t *)src1_67 + 2B], MEM[(uint8_t *)src2_68 + 2B] add w1, w1, 1 // tmp155, tmp152, strb w1, [x0, 2] // tmp155, MEM[(uint8_t *)dst_66 + 2B] ldrb w9, [x4, 3] // MEM[(uint8_t *)src2_68 + 3B], MEM[(uint8_t *)src2_68 + 3B] ldrb w1, [x2, 3] //, MEM[(uint8_t *)src1_67 + 3B] add w1, w1, w9 // tmp159, MEM[(uint8_t *)src1_67 + 3B], MEM[(uint8_t *)src2_68 + 3B] add w1, w1, 1 // tmp162, tmp159, strb w1, [x0, 3] // tmp162, MEM[(uint8_t *)dst_66 + 3B] ldrb w9, [x4, 4] // MEM[(uint8_t *)src2_68 + 4B], MEM[(uint8_t *)src2_68 + 4B] ldrb w1, [x2, 4] //, MEM[(uint8_t *)src1_67 + 4B] add w1, w1, w9 // tmp166, MEM[(uint8_t *)src1_67 + 4B], MEM[(uint8_t *)src2_68 + 4B] add w1, w1, 1 // tmp169, tmp166, strb w1, [x0, 4] // tmp169, MEM[(uint8_t *)dst_66 + 4B] ldrb w9, [x4, 5] // MEM[(uint8_t *)src2_68 + 5B], MEM[(uint8_t *)src2_68 + 5B] ldrb w1, [x2, 5] //, MEM[(uint8_t *)src1_67 + 5B] add w1, w1, w9 // tmp173, MEM[(uint8_t *)src1_67 + 5B], MEM[(uint8_t *)src2_68 + 5B] add w1, w1, 1 // tmp176, tmp173, strb w1, [x0, 5] // tmp176, MEM[(uint8_t *)dst_66 + 5B] ldrb w9, [x4, 6] // MEM[(uint8_t *)src2_68 + 6B], MEM[(uint8_t *)src2_68 + 6B] ldrb w1, [x2, 6] //, MEM[(uint8_t *)src1_67 + 6B] add w1, w1, w9 // tmp180, MEM[(uint8_t *)src1_67 + 6B], MEM[(uint8_t *)src2_68 + 6B] add w1, w1, 1 // tmp183, tmp180, strb w1, [x0, 6] // tmp183, MEM[(uint8_t *)dst_66 + 6B] ldrb w1, [x2, 7] //, MEM[(uint8_t *)src1_67 + 7B] add x2, x2, x6 // src1, src1, _62 ldrb w9, [x4, 7] // MEM[(uint8_t *)src2_68 + 7B], MEM[(uint8_t *)src2_68 + 7B] add x4, x4, x5 // src2, src2, _64 add w1, w1, w9 // tmp187, MEM[(uint8_t *)src1_67 + 7B], MEM[(uint8_t *)src2_68 + 7B] add w1, w1, 1 // tmp190, tmp187, 
strb w1, [x0, 7] // tmp190, MEM[(uint8_t *)dst_66 + 7B] add x0, x0, x8 // dst, dst, _61 cmp w7, w3 // i_height, y bne .L3 //, However, adding a temporary variable between the arithmetic and the dst variables (like so): typedef unsigned char uint8_t; typedef unsigned short uint16_t; typedef unsigned int uint32_t; void foo ( uint8_t *dst, int i_dst_stride, uint8_t *src1, int i_src1_stride, uint8_t *src2, int i_src2_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { uint8_t temp0 = ( src1[0] + src2[0] + 1 ); uint8_t temp1 = ( src1[1] + src2[1] + 1 ); uint8_t temp2 = ( src1[2] + src2[2] + 1 ); uint8_t temp3 = ( src1[3] + src2[3] + 1 ); uint8_t temp4 = ( src1[4] + src2[4] + 1 ); uint8_t temp5 = ( src1[5] + src2[5] + 1 ); uint8_t temp6 = ( src1[6] + src2[6] + 1 ); uint8_t temp7 = ( src1[7] + src2[7] + 1 ); dst[0] = temp0; dst[1] = temp1; dst[2] = temp2; dst[3] = temp3; dst[4] = temp4; dst[5] = temp5; dst[6] = temp6; dst[7] = temp7; dst += i_dst_stride; src1 += i_src1_stride; src2 += i_src2_stride; } } will use SLP and vectorize the basic block: ldr d1, [x2] // vect__1.6, MEM <vector(8) unsigned char> [(uint8_t *)src1_67] add w1, w1, 1 // y, y, ldr d0, [x4] // vect__2.9, MEM <vector(8) unsigned char> [(uint8_t *)src2_68] add x2, x2, x6 // src1, src1, _111 add x4, x4, x3 // src2, src2, _112 add v0.8b, v0.8b, v1.8b // vect__3.10, vect__2.9, vect__1.6 add v0.8b, v0.8b, v2.8b // vect_temp0_38.11, vect__3.10, tmp112 str d0, [x0] // vect_temp0_38.11, MEM <vector(8) unsigned char> [(uint8_t *)dst_66] add x0, x0, x8 // dst, dst, _110 cmp w7, w1 // i_height, y bne .L3 //, Thanks!
next reply other threads:[~2022-07-19 9:48 UTC|newest] Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-07-19 9:48 eochoa at gcc dot gnu.org [this message] 2022-07-19 12:40 ` [Bug tree-optimization/106352] " rguenth at gcc dot gnu.org 2022-07-19 14:51 ` pinskia at gcc dot gnu.org 2022-07-19 14:53 ` pinskia at gcc dot gnu.org
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-106352-4@http.gcc.gnu.org/bugzilla/ \ --to=gcc-bugzilla@gcc.gnu.org \ --cc=gcc-bugs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).