From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 22195 invoked by alias); 31 Oct 2012 21:33:01 -0000 Received: (qmail 22131 invoked by uid 48); 31 Oct 2012 21:32:39 -0000 From: "sgunderson at bigfoot dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores Date: Wed, 31 Oct 2012 21:33:00 -0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: sgunderson at bigfoot dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Changed-Fields: Message-ID: X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org X-SW-Source: 2012-10/txt/msg02992.txt.bz2 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D55155 Bug #: 55155 Summary: Autovectorization does not use unaligned loads/stores Classification: Unclassified Product: gcc Version: 4.7.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization AssignedTo: unassigned@gcc.gnu.org ReportedBy: sgunderson@bigfoot.com Hi, I am on gcc version 4.7.1 (Debian 4.7.1-7)=20 and a project of mine had code that looked like this: beklager:~> cat example.cpp void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) { float *pf =3D (float *)__builtin_assume_aligned(prod_features, 16); float *gpf =3D (float *)__builtin_assume_aligned(grad_prod_features, 16= ); for (unsigned i =3D 0; i < num_prods * 16; ++i) { prod_features[i] -=3D alpha * 
grad_prod_features[i]; //pf[i] -=3D alpha * gpf[i]; } } This would seem like a great case for autovectorization, so I tried: beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp=20 example.cpp: In function =E2=80=98void func(float*, float*, float, unsigned= int)=E2=80=99: example.cpp:2:9: warning: unused variable =E2=80=98pf=E2=80=99 [-Wunused-va= riable] example.cpp:3:9: warning: unused variable =E2=80=98gpf=E2=80=99 [-Wunused-v= ariable] The resulting code, however, is a train wreck: beklager:~> objdump --disassemble --demangle example.o=20=20=20=20=20=20=20= =20=20 example.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 : 0: 55 push %rbp 1: c1 e2 04 shl $0x4,%edx 4: 85 d2 test %edx,%edx 6: 53 push %rbx 7: 0f 84 ef 00 00 00 je fc d: 49 89 f8 mov %rdi,%r8 10: 41 83 e0 0f and $0xf,%r8d 14: 49 c1 e8 02 shr $0x2,%r8 18: 49 f7 d8 neg %r8 1b: 41 83 e0 03 and $0x3,%r8d 1f: 44 39 c2 cmp %r8d,%edx 22: 44 0f 42 c2 cmovb %edx,%r8d 26: 83 fa 04 cmp $0x4,%edx 29: 0f 87 d0 00 00 00 ja ff 2f: 41 89 d0 mov %edx,%r8d 32: 31 c0 xor %eax,%eax 34: 0f 1f 40 00 nopl 0x0(%rax) 38: f3 0f 10 14 86 movss (%rsi,%rax,4),%xmm2 3d: 8d 48 01 lea 0x1(%rax),%ecx 40: f3 0f 59 d0 mulss %xmm0,%xmm2 44: f3 0f 10 0c 87 movss (%rdi,%rax,4),%xmm1 49: f3 0f 5c ca subss %xmm2,%xmm1 4d: f3 0f 11 0c 87 movss %xmm1,(%rdi,%rax,4) 52: 48 83 c0 01 add $0x1,%rax 56: 41 39 c0 cmp %eax,%r8d 59: 77 dd ja 38 5b: 44 39 c2 cmp %r8d,%edx 5e: 0f 84 98 00 00 00 je fc 64: 89 d5 mov %edx,%ebp 66: 45 89 c1 mov %r8d,%r9d 69: 44 29 c5 sub %r8d,%ebp 6c: 41 89 eb mov %ebp,%r11d 6f: 41 c1 eb 02 shr $0x2,%r11d 73: 42 8d 1c 9d 00 00 00 lea 0x0(,%r11,4),%ebx 7a: 00=20 7b: 85 db test %ebx,%ebx 7d: 74 59 je d8 7f: 0f 28 c8 movaps %xmm0,%xmm1 82: 49 c1 e1 02 shl $0x2,%r9 86: 0f 57 db xorps %xmm3,%xmm3 89: 4e 8d 14 0f lea (%rdi,%r9,1),%r10 8d: 0f c6 c9 00 shufps $0x0,%xmm1,%xmm1 91: 49 01 f1 add %rsi,%r9 94: 31 c0 xor %eax,%eax 96: 45 31 c0 xor %r8d,%r8d 99: 0f 28 e1 movaps %xmm1,%xmm4 9c: 0f 
1f 40 00 nopl 0x0(%rax) a0: 0f 28 cb movaps %xmm3,%xmm1 a3: 41 83 c0 01 add $0x1,%r8d a7: 41 0f 28 14 02 movaps (%r10,%rax,1),%xmm2 ac: 41 0f 12 0c 01 movlps (%r9,%rax,1),%xmm1 b1: 41 0f 16 4c 01 08 movhps 0x8(%r9,%rax,1),%xmm1 b7: 0f 59 cc mulps %xmm4,%xmm1 ba: 0f 5c d1 subps %xmm1,%xmm2 bd: 41 0f 29 14 02 movaps %xmm2,(%r10,%rax,1) c2: 48 83 c0 10 add $0x10,%rax c6: 45 39 d8 cmp %r11d,%r8d c9: 72 d5 jb a0 cb: 01 d9 add %ebx,%ecx cd: 39 dd cmp %ebx,%ebp cf: 74 2b je fc d1: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) d8: 41 89 c8 mov %ecx,%r8d db: 83 c1 01 add $0x1,%ecx de: f3 42 0f 10 14 86 movss (%rsi,%r8,4),%xmm2 e4: 4a 8d 04 87 lea (%rdi,%r8,4),%rax e8: 39 ca cmp %ecx,%edx ea: f3 0f 59 d0 mulss %xmm0,%xmm2 ee: f3 0f 10 08 movss (%rax),%xmm1 f2: f3 0f 5c ca subss %xmm2,%xmm1 f6: f3 0f 11 08 movss %xmm1,(%rax) fa: 77 dc ja d8 fc: 5b pop %rbx fd: 5d pop %rbp fe: c3 retq=20=20=20 ff: 45 85 c0 test %r8d,%r8d 102: 0f 85 2a ff ff ff jne 32 108: 31 c9 xor %ecx,%ecx 10a: e9 55 ff ff ff jmpq 64 There are two potential issues here: 1. It knows that my two arrays are not necessarily 16-byte aligned, so it emits a huge body of code around it. (If I comment out the line in the inner loop and uncomment the one next to it, much of this code disappears.) It should simply write the loop using unaligned loads/stores (movups) instead of trying to piece together packed scalars with movlps and movhps itself. 2. For some reason, it doesn't understand that (num_prods * 16) is divisible by four, so it has extra code to handle that case. If I change num_prods to a constant (e.g.
64), and use the variables that are assumed 16-aligned, the output is the much more sane beklager:~> cat example.cpp=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20= =20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20 void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) { float *pf =3D (float *)__builtin_assume_aligned(prod_features, 16); float *gpf =3D (float *)__builtin_assume_aligned(grad_prod_features, 16= ); for (unsigned i =3D 0; i < 64 * 16; ++i) { //prod_features[i] -=3D alpha * grad_prod_features[i]; pf[i] -=3D alpha * gpf[i]; } } beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp=20 beklager:~> objdump --disassemble --demangle example.o=20=20=20=20=20=20=20= =20=20=20=20=20=20 example.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 : 0: 0f 28 c8 movaps %xmm0,%xmm1 3: 31 c0 xor %eax,%eax 5: 0f c6 c9 00 shufps $0x0,%xmm1,%xmm1 9: 0f 28 d1 movaps %xmm1,%xmm2 c: 0f 1f 40 00 nopl 0x0(%rax) 10: 0f 28 0c 06 movaps (%rsi,%rax,1),%xmm1 14: 0f 59 ca mulps %xmm2,%xmm1 17: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0 1b: 0f 5c c1 subps %xmm1,%xmm0 1e: 0f 29 04 07 movaps %xmm0,(%rdi,%rax,1) 22: 48 83 c0 10 add $0x10,%rax 26: 48 3d 00 10 00 00 cmp $0x1000,%rax 2c: 75 e2 jne 10 2e: f3 c3 repz retq=20 although in this case, one could argue that it should have fused the movaps+subps+movaps to a single subps from memory.