public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/106352] New: SLP seems to need temporary variables
@ 2022-07-19 9:48 eochoa at gcc dot gnu.org
2022-07-19 12:40 ` [Bug tree-optimization/106352] " rguenth at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: eochoa at gcc dot gnu.org @ 2022-07-19 9:48 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106352
Bug ID: 106352
Summary: SLP seems to need temporary variables
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: eochoa at gcc dot gnu.org
Target Milestone: ---
Hi,
I am looking at how SLP works and I'm finding this interesting case which I
believe is a bug.
I've added all examples to this compiler explorer link:
https://godbolt.org/z/zrPKEYvds
Compiling the following function with `-O3 -fverbose-asm -march=armv8.4-a
-fno-tree-loop-vectorize -ftree-slp-vectorize`:
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
void foo ( uint8_t *dst, int i_dst_stride,
uint8_t *src1, int i_src1_stride,
uint8_t *src2, int i_src2_stride,
int i_width, int i_height )
{
for( int y = 0; y < i_height; y++ )
{
dst[0] = ( src1[0] + src2[0] + 1 );
dst[1] = ( src1[1] + src2[1] + 1 );
dst[2] = ( src1[2] + src2[2] + 1 );
dst[3] = ( src1[3] + src2[3] + 1 );
dst[4] = ( src1[4] + src2[4] + 1 );
dst[5] = ( src1[5] + src2[5] + 1 );
dst[6] = ( src1[6] + src2[6] + 1 );
dst[7] = ( src1[7] + src2[7] + 1 );
dst += i_dst_stride;
src1 += i_src1_stride;
src2 += i_src2_stride;
}
}
Produces the following scalar code:
ldrb w9, [x4] // MEM[(uint8_t *)src2_68], MEM[(uint8_t
*)src2_68]
add w3, w3, 1 // y, y,
ldrb w1, [x2] //, MEM[(uint8_t *)src1_67]
add w1, w1, w9 // tmp138, MEM[(uint8_t *)src1_67],
MEM[(uint8_t *)src2_68]
add w1, w1, 1 // tmp141, tmp138,
strb w1, [x0] // tmp141, MEM[(uint8_t *)dst_66]
ldrb w9, [x4, 1] // MEM[(uint8_t *)src2_68 + 1B], MEM[(uint8_t
*)src2_68 + 1B]
ldrb w1, [x2, 1] //, MEM[(uint8_t *)src1_67 + 1B]
add w1, w1, w9 // tmp145, MEM[(uint8_t *)src1_67 + 1B],
MEM[(uint8_t *)src2_68 + 1B]
add w1, w1, 1 // tmp148, tmp145,
strb w1, [x0, 1] // tmp148, MEM[(uint8_t *)dst_66 + 1B]
ldrb w9, [x4, 2] // MEM[(uint8_t *)src2_68 + 2B], MEM[(uint8_t
*)src2_68 + 2B]
ldrb w1, [x2, 2] //, MEM[(uint8_t *)src1_67 + 2B]
add w1, w1, w9 // tmp152, MEM[(uint8_t *)src1_67 + 2B],
MEM[(uint8_t *)src2_68 + 2B]
add w1, w1, 1 // tmp155, tmp152,
strb w1, [x0, 2] // tmp155, MEM[(uint8_t *)dst_66 + 2B]
ldrb w9, [x4, 3] // MEM[(uint8_t *)src2_68 + 3B], MEM[(uint8_t
*)src2_68 + 3B]
ldrb w1, [x2, 3] //, MEM[(uint8_t *)src1_67 + 3B]
add w1, w1, w9 // tmp159, MEM[(uint8_t *)src1_67 + 3B],
MEM[(uint8_t *)src2_68 + 3B]
add w1, w1, 1 // tmp162, tmp159,
strb w1, [x0, 3] // tmp162, MEM[(uint8_t *)dst_66 + 3B]
ldrb w9, [x4, 4] // MEM[(uint8_t *)src2_68 + 4B], MEM[(uint8_t
*)src2_68 + 4B]
ldrb w1, [x2, 4] //, MEM[(uint8_t *)src1_67 + 4B]
add w1, w1, w9 // tmp166, MEM[(uint8_t *)src1_67 + 4B],
MEM[(uint8_t *)src2_68 + 4B]
add w1, w1, 1 // tmp169, tmp166,
strb w1, [x0, 4] // tmp169, MEM[(uint8_t *)dst_66 + 4B]
ldrb w9, [x4, 5] // MEM[(uint8_t *)src2_68 + 5B], MEM[(uint8_t
*)src2_68 + 5B]
ldrb w1, [x2, 5] //, MEM[(uint8_t *)src1_67 + 5B]
add w1, w1, w9 // tmp173, MEM[(uint8_t *)src1_67 + 5B],
MEM[(uint8_t *)src2_68 + 5B]
add w1, w1, 1 // tmp176, tmp173,
strb w1, [x0, 5] // tmp176, MEM[(uint8_t *)dst_66 + 5B]
ldrb w9, [x4, 6] // MEM[(uint8_t *)src2_68 + 6B], MEM[(uint8_t
*)src2_68 + 6B]
ldrb w1, [x2, 6] //, MEM[(uint8_t *)src1_67 + 6B]
add w1, w1, w9 // tmp180, MEM[(uint8_t *)src1_67 + 6B],
MEM[(uint8_t *)src2_68 + 6B]
add w1, w1, 1 // tmp183, tmp180,
strb w1, [x0, 6] // tmp183, MEM[(uint8_t *)dst_66 + 6B]
ldrb w1, [x2, 7] //, MEM[(uint8_t *)src1_67 + 7B]
add x2, x2, x6 // src1, src1, _62
ldrb w9, [x4, 7] // MEM[(uint8_t *)src2_68 + 7B], MEM[(uint8_t
*)src2_68 + 7B]
add x4, x4, x5 // src2, src2, _64
add w1, w1, w9 // tmp187, MEM[(uint8_t *)src1_67 + 7B],
MEM[(uint8_t *)src2_68 + 7B]
add w1, w1, 1 // tmp190, tmp187,
strb w1, [x0, 7] // tmp190, MEM[(uint8_t *)dst_66 + 7B]
add x0, x0, x8 // dst, dst, _61
cmp w7, w3 // i_height, y
bne .L3 //,
However, adding a temporary variable between the arithmetic and the dst
variables (like such:)
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
void foo ( uint8_t *dst, int i_dst_stride,
uint8_t *src1, int i_src1_stride,
uint8_t *src2, int i_src2_stride,
int i_width, int i_height )
{
for( int y = 0; y < i_height; y++ )
{
uint8_t temp0 = ( src1[0] + src2[0] + 1 );
uint8_t temp1 = ( src1[1] + src2[1] + 1 );
uint8_t temp2 = ( src1[2] + src2[2] + 1 );
uint8_t temp3 = ( src1[3] + src2[3] + 1 );
uint8_t temp4 = ( src1[4] + src2[4] + 1 );
uint8_t temp5 = ( src1[5] + src2[5] + 1 );
uint8_t temp6 = ( src1[6] + src2[6] + 1 );
uint8_t temp7 = ( src1[7] + src2[7] + 1 );
dst[0] = temp0;
dst[1] = temp1;
dst[2] = temp2;
dst[3] = temp3;
dst[4] = temp4;
dst[5] = temp5;
dst[6] = temp6;
dst[7] = temp7;
dst += i_dst_stride;
src1 += i_src1_stride;
src2 += i_src2_stride;
}
}
will use SLP and vectorize the basic block:
ldr d1, [x2] // vect__1.6, MEM <vector(8) unsigned char> [(uint8_t
*)src1_67]
add w1, w1, 1 // y, y,
ldr d0, [x4] // vect__2.9, MEM <vector(8) unsigned char> [(uint8_t
*)src2_68]
add x2, x2, x6 // src1, src1, _111
add x4, x4, x3 // src2, src2, _112
add v0.8b, v0.8b, v1.8b // vect__3.10, vect__2.9, vect__1.6
add v0.8b, v0.8b, v2.8b // vect_temp0_38.11, vect__3.10,
tmp112
str d0, [x0] // vect_temp0_38.11, MEM <vector(8) unsigned char>
[(uint8_t *)dst_66]
add x0, x0, x8 // dst, dst, _110
cmp w7, w1 // i_height, y
bne .L3 //,
Thanks!
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-07-19 14:53 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-19 9:48 [Bug tree-optimization/106352] New: SLP seems to need temporary variables eochoa at gcc dot gnu.org
2022-07-19 12:40 ` [Bug tree-optimization/106352] " rguenth at gcc dot gnu.org
2022-07-19 14:51 ` pinskia at gcc dot gnu.org
2022-07-19 14:53 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).