From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugs-return-405316-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 22195 invoked by alias); 31 Oct 2012 21:33:01 -0000
Received: (qmail 22131 invoked by uid 48); 31 Oct 2012 21:32:39 -0000
From: "sgunderson at bigfoot dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores
Date: Wed, 31 Oct 2012 21:33:00 -0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Keywords:
X-Bugzilla-Severity: normal
X-Bugzilla-Who: sgunderson at bigfoot dot com
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Changed-Fields:
Message-ID: <bug-55155-4@http.gcc.gnu.org/bugzilla/>
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
MIME-Version: 1.0
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
X-SW-Source: 2012-10/txt/msg02992.txt.bz2


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D55155

             Bug #: 55155
           Summary: Autovectorization does not use unaligned loads/stores
    Classification: Unclassified
           Product: gcc
           Version: 4.7.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned@gcc.gnu.org
        ReportedBy: sgunderson@bigfoot.com


Hi,

I am on

  gcc version 4.7.1 (Debian 4.7.1-7)=20

and a project of mine had code that looked like this:

beklager:~> cat example.cpp
void func(float * __restrict prod_features, float * __restrict
grad_prod_features, float alpha, unsigned num_prods) {
    float *pf =3D (float *)__builtin_assume_aligned(prod_features, 16);
    float *gpf =3D (float *)__builtin_assume_aligned(grad_prod_features, 16=
);
    for (unsigned i =3D 0; i < num_prods * 16; ++i) {
        prod_features[i] -=3D alpha * grad_prod_features[i];
        //pf[i] -=3D alpha * gpf[i];
    }
}

This would seem like a great case for autovectorization, so I tried:

beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp=20
example.cpp: In function =E2=80=98void func(float*, float*, float, unsigned=
 int)=E2=80=99:
example.cpp:2:9: warning: unused variable =E2=80=98pf=E2=80=99 [-Wunused-va=
riable]
example.cpp:3:9: warning: unused variable =E2=80=98gpf=E2=80=99 [-Wunused-v=
ariable]

The resulting code, however, is a train wreck:
beklager:~> objdump --disassemble --demangle example.o=20=20=20=20=20=20=20=
=20=20

example.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:    55                       push   %rbp
   1:    c1 e2 04                 shl    $0x4,%edx
   4:    85 d2                    test   %edx,%edx
   6:    53                       push   %rbx
   7:    0f 84 ef 00 00 00        je     fc <func(float*, float*, float,
unsigned int)+0xfc>
   d:    49 89 f8                 mov    %rdi,%r8
  10:    41 83 e0 0f              and    $0xf,%r8d
  14:    49 c1 e8 02              shr    $0x2,%r8
  18:    49 f7 d8                 neg    %r8
  1b:    41 83 e0 03              and    $0x3,%r8d
  1f:    44 39 c2                 cmp    %r8d,%edx
  22:    44 0f 42 c2              cmovb  %edx,%r8d
  26:    83 fa 04                 cmp    $0x4,%edx
  29:    0f 87 d0 00 00 00        ja     ff <func(float*, float*, float,
unsigned int)+0xff>
  2f:    41 89 d0                 mov    %edx,%r8d
  32:    31 c0                    xor    %eax,%eax
  34:    0f 1f 40 00              nopl   0x0(%rax)
  38:    f3 0f 10 14 86           movss  (%rsi,%rax,4),%xmm2
  3d:    8d 48 01                 lea    0x1(%rax),%ecx
  40:    f3 0f 59 d0              mulss  %xmm0,%xmm2
  44:    f3 0f 10 0c 87           movss  (%rdi,%rax,4),%xmm1
  49:    f3 0f 5c ca              subss  %xmm2,%xmm1
  4d:    f3 0f 11 0c 87           movss  %xmm1,(%rdi,%rax,4)
  52:    48 83 c0 01              add    $0x1,%rax
  56:    41 39 c0                 cmp    %eax,%r8d
  59:    77 dd                    ja     38 <func(float*, float*, float,
unsigned int)+0x38>
  5b:    44 39 c2                 cmp    %r8d,%edx
  5e:    0f 84 98 00 00 00        je     fc <func(float*, float*, float,
unsigned int)+0xfc>
  64:    89 d5                    mov    %edx,%ebp
  66:    45 89 c1                 mov    %r8d,%r9d
  69:    44 29 c5                 sub    %r8d,%ebp
  6c:    41 89 eb                 mov    %ebp,%r11d
  6f:    41 c1 eb 02              shr    $0x2,%r11d
  73:    42 8d 1c 9d 00 00 00     lea    0x0(,%r11,4),%ebx
  7a:    00=20
  7b:    85 db                    test   %ebx,%ebx
  7d:    74 59                    je     d8 <func(float*, float*, float,
unsigned int)+0xd8>
  7f:    0f 28 c8                 movaps %xmm0,%xmm1
  82:    49 c1 e1 02              shl    $0x2,%r9
  86:    0f 57 db                 xorps  %xmm3,%xmm3
  89:    4e 8d 14 0f              lea    (%rdi,%r9,1),%r10
  8d:    0f c6 c9 00              shufps $0x0,%xmm1,%xmm1
  91:    49 01 f1                 add    %rsi,%r9
  94:    31 c0                    xor    %eax,%eax
  96:    45 31 c0                 xor    %r8d,%r8d
  99:    0f 28 e1                 movaps %xmm1,%xmm4
  9c:    0f 1f 40 00              nopl   0x0(%rax)
  a0:    0f 28 cb                 movaps %xmm3,%xmm1
  a3:    41 83 c0 01              add    $0x1,%r8d
  a7:    41 0f 28 14 02           movaps (%r10,%rax,1),%xmm2
  ac:    41 0f 12 0c 01           movlps (%r9,%rax,1),%xmm1
  b1:    41 0f 16 4c 01 08        movhps 0x8(%r9,%rax,1),%xmm1
  b7:    0f 59 cc                 mulps  %xmm4,%xmm1
  ba:    0f 5c d1                 subps  %xmm1,%xmm2
  bd:    41 0f 29 14 02           movaps %xmm2,(%r10,%rax,1)
  c2:    48 83 c0 10              add    $0x10,%rax
  c6:    45 39 d8                 cmp    %r11d,%r8d
  c9:    72 d5                    jb     a0 <func(float*, float*, float,
unsigned int)+0xa0>
  cb:    01 d9                    add    %ebx,%ecx
  cd:    39 dd                    cmp    %ebx,%ebp
  cf:    74 2b                    je     fc <func(float*, float*, float,
unsigned int)+0xfc>
  d1:    0f 1f 80 00 00 00 00     nopl   0x0(%rax)
  d8:    41 89 c8                 mov    %ecx,%r8d
  db:    83 c1 01                 add    $0x1,%ecx
  de:    f3 42 0f 10 14 86        movss  (%rsi,%r8,4),%xmm2
  e4:    4a 8d 04 87              lea    (%rdi,%r8,4),%rax
  e8:    39 ca                    cmp    %ecx,%edx
  ea:    f3 0f 59 d0              mulss  %xmm0,%xmm2
  ee:    f3 0f 10 08              movss  (%rax),%xmm1
  f2:    f3 0f 5c ca              subss  %xmm2,%xmm1
  f6:    f3 0f 11 08              movss  %xmm1,(%rax)
  fa:    77 dc                    ja     d8 <func(float*, float*, float,
unsigned int)+0xd8>
  fc:    5b                       pop    %rbx
  fd:    5d                       pop    %rbp
  fe:    c3                       retq=20=20=20
  ff:    45 85 c0                 test   %r8d,%r8d
 102:    0f 85 2a ff ff ff        jne    32 <func(float*, float*, float,
unsigned int)+0x32>
 108:    31 c9                    xor    %ecx,%ecx
 10a:    e9 55 ff ff ff           jmpq   64 <func(float*, float*, float,
unsigned int)+0x64>

There are two potential issues here:

1. It cannot assume that my two arrays are 16-byte aligned, so it emits a
huge body of code around the vectorized loop. (If I comment out the line
in the inner loop and uncomment the one next to it, much of this code
disappears.) It should simply write the loop using unaligned loads/stores
(movups) instead of trying to piece together packed scalars with movlps
and movhps itself.
2. For some reason, it doesn't understand that (num_prods * 16) is
divisible by four, so it has extra code to handle the case where it is
not.

If I change num_prods to a constant (e.g. 64), and use the variables that a=
re
assumed 16-aligned, the output is the much more sane
beklager:~> cat example.cpp=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=
=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20
void func(float * __restrict prod_features, float * __restrict
grad_prod_features, float alpha, unsigned num_prods) {
    float *pf =3D (float *)__builtin_assume_aligned(prod_features, 16);
    float *gpf =3D (float *)__builtin_assume_aligned(grad_prod_features, 16=
);
    for (unsigned i =3D 0; i < 64 * 16; ++i) {
        //prod_features[i] -=3D alpha * grad_prod_features[i];
        pf[i] -=3D alpha * gpf[i];
    }
}

beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp=20
beklager:~> objdump --disassemble --demangle example.o=20=20=20=20=20=20=20=
=20=20=20=20=20=20

example.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:    0f 28 c8                 movaps %xmm0,%xmm1
   3:    31 c0                    xor    %eax,%eax
   5:    0f c6 c9 00              shufps $0x0,%xmm1,%xmm1
   9:    0f 28 d1                 movaps %xmm1,%xmm2
   c:    0f 1f 40 00              nopl   0x0(%rax)
  10:    0f 28 0c 06              movaps (%rsi,%rax,1),%xmm1
  14:    0f 59 ca                 mulps  %xmm2,%xmm1
  17:    0f 28 04 07              movaps (%rdi,%rax,1),%xmm0
  1b:    0f 5c c1                 subps  %xmm1,%xmm0
  1e:    0f 29 04 07              movaps %xmm0,(%rdi,%rax,1)
  22:    48 83 c0 10              add    $0x10,%rax
  26:    48 3d 00 10 00 00        cmp    $0x1000,%rax
  2c:    75 e2                    jne    10 <func(float*, float*, float,
unsigned int)+0x10>
  2e:    f3 c3                    repz retq=20

although in this case, one could argue that it should have fused the
movaps+subps+movaps to a single subps from memory.