From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugs-return-458743-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 17376 invoked by alias); 18 Aug 2014 22:47:05 -0000
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
Received: (qmail 17337 invoked by uid 48); 18 Aug 2014 22:47:01 -0000
From: "spop at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/62178] New: [AArch64] Performance regression on matrix matrix multiply due to r211211
Date: Mon, 18 Aug 2014 22:47:00 -0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 5.0
X-Bugzilla-Keywords:
X-Bugzilla-Severity: normal
X-Bugzilla-Who: spop at gcc dot gnu.org
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags:
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter
Message-ID: <bug-62178-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-SW-Source: 2014-08/txt/msg01240.txt.bz2

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62178

            Bug ID: 62178
           Summary: [AArch64] Performance regression on matrix matrix
                    multiply due to r211211
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: spop at gcc dot gnu.org

/* Three 31x31 int matrices.  The loops below only use indices 1..30,
   so row 0 and column 0 of each matrix are never accessed here. */
int a[30 +1][30 +1], b[30 +1][30 +1], r[30 +1][30 +1];

/* Integer matrix multiply: r = a * b over the index range 1..30.
   The `run` parameter is not used in the body. */
void Intmm (int run) {
  int i, j, k;

  for ( i = 1; i <= 30; i++ )
    for ( j = 1; j <= 30; j++ ) {
      r[i][j] = 0;
      /* Dot product of row i of a with column j of b.  a[i][k] advances
         by one int per iteration, while b[k][j] advances by a whole row
         (31 ints) per iteration. */
      for(k = 1; k <= 30; k++ )
        r[i][j] += a[i][k]*b[k][j];
    }
}

Compile this at -O3 with the last good compiler (r211210) and with the first
bad compiler (r211211), then diff the assembly:

--- good.s    2014-08-18 17:44:26.179506000 -0500
+++ bad.s    2014-08-18 17:44:26.213807000 -0500
@@ -6,45 +6,44 @@
     .type    Intmm, %function
 Intmm:
     movi    v3.2s, 0
-    adrp    x6, a+128
-    adrp    x8, r+128
-    adrp    x10, r+3848
-    adrp    x9, b+128
-    adrp    x7, b+248
-    add    x6, x6, :lo12:a+128
-    add    x8, x8, :lo12:r+128
-    add    x10, x10, :lo12:r+3848
-    add    x9, x9, :lo12:b+128
-    add    x7, x7, :lo12:b+248
+    adrp    x6, r+128
+    adrp    x4, a+124
+    adrp    x8, r+3848
+    adrp    x7, b
+    add    x6, x6, :lo12:r+128
+    add    x4, x4, :lo12:a+124
+    add    x8, x8, :lo12:r+3848
+    add    x7, x7, :lo12:b
 .L2:
-    mov    x5, x8
-    mov    x4, x8
-    mov    x3, x9
+    mov    x5, 0
 .L4:
-    str    d3, [x4]
-    add    x2, x3, 3720
-    movi    v0.2s, 0
-    mov    x1, x6
-    mov    x0, x3
+    str    d3, [x6, x5]
+    add    x3, x5, 128
+    movi    v1.2s, 0
+    add    x3, x3, x7
+    mov    x0, 0
 .L3:
-    ldr    d1, [x0]
-    add    x0, x0, 124
-    ld1r    {v2.2s}, [x1], 4
-    cmp    x0, x2
-    mla    v0.2s, v2.2s, v1.2s
+    add    x1, x4, x0
+    lsl    x2, x0, 5
+    sub    x2, x2, x0
+    add    x0, x0, 4
+    cmp    x0, 120
+    ldr    w1, [x1, 4]
+    ldr    d2, [x3, x2]
+    dup    v0.2s, w1
+    mla    v1.2s, v0.2s, v2.2s
     bne    .L3
-    str    d0, [x5], 8
-    add    x3, x3, 8
-    cmp    x3, x7
-    add    x4, x4, 8
+    str    d1, [x6, x5]
+    add    x5, x5, 8
+    cmp    x5, 120
     bne    .L4
-    add    x8, x8, 124
     add    x6, x6, 124
-    cmp    x8, x10
+    add    x4, x4, 124
+    cmp    x6, x8
     bne    .L2
     ret
     .size    Intmm, .-Intmm
     .comm    r,3844,8
     .comm    b,3844,8
     .comm    a,3844,8

Note that the innermost loop .L3 contains 4 more instructions with the bad
compiler (9 instead of 5 before the branch), due to more scalar computations
for the addressing modes:

 .L3:
-    ldr    d1, [x0]
-    add    x0, x0, 124
-    ld1r    {v2.2s}, [x1], 4
-    cmp    x0, x2
-    mla    v0.2s, v2.2s, v1.2s
+    add    x1, x4, x0
+    lsl    x2, x0, 5
+    sub    x2, x2, x0
+    add    x0, x0, 4
+    cmp    x0, 120
+    ldr    w1, [x1, 4]
+    ldr    d2, [x3, x2]
+    dup    v0.2s, w1
+    mla    v1.2s, v0.2s, v2.2s
     bne    .L3