From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugs-return-485624-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 89271 invoked by alias); 6 May 2015 11:51:15 -0000
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
Received: (qmail 89231 invoked by uid 48); 6 May 2015 11:51:10 -0000
From: "rguenth at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/66002] paq8p benchmark 50% slower than clang on sandybridge
Date: Wed, 06 May 2015 11:51:00 -0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 6.0
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: normal
X-Bugzilla-Who: rguenth at gcc dot gnu.org
X-Bugzilla-Status: ASSIGNED
X-Bugzilla-Resolution:
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: rguenth at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags:
X-Bugzilla-Changed-Fields:
Message-ID: <bug-66002-4-dnlv4Vj6vA@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-66002-4@http.gcc.gnu.org/bugzilla/>
References: <bug-66002-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-SW-Source: 2015-05/txt/msg00464.txt.bz2

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66002
--- Comment #7 from Richard Biener <rguenth at gcc dot gnu.org> ---
https://gcc.gnu.org/ml/gcc-patches/2015-05/msg00214.html

regresses

FAIL: gcc.dg/tree-ssa/pr21559.c scan-tree-dump-times vrp1 "Threaded jump" 3

(a real missed optimization - a redundant if remains)

and also

FAIL: gcc.dg/graphite/scop-dsyr2k.c scan-tree-dump-times graphite "number of SCoPs: 1" 1
FAIL: gcc.dg/graphite/scop-dsyrk.c scan-tree-dump-times graphite "number of SCoPs: 1" 1

on 32-bit targets; not investigated yet.

So for the first regression it seems that VRP (or at least the jump threading
performed by VRP) somehow depends on mergephi.  IL difference before VRP:

@@ -20,16 +19,19 @@

   <bb 4>:
   if (bytes_11 < 0)
-    goto <bb 7>;
+    goto <bb 6>;
   else
     goto <bb 8>;

   <bb 5>:
   toread_12 = toread_1 - bytes_11;

+  <bb 6>:
+  # toread_9 = PHI <toread_12(5), toread_1(4)>
+
   <bb 7>:
-  # toread_1 = PHI <toread_1(4), 4096(2), toread_12(5)>
-  # bytes_2 = PHI <bytes_11(4), 1(2), bytes_11(5)>
+  # toread_1 = PHI <toread_9(6), 4096(2)>
+  # bytes_2 = PHI <bytes_11(6), 1(2)>
   if (toread_1 != 0)
     goto <bb 3>;
   else

and then VRP gets

-fix_loop_structure: fixing up loops for function
-Disambiguating loop 1 with multiple latches
-Merged latch edges of loop 1
 ;; 2 loops found
 ;;
 ;; Loop 0
 ;;  header 0, latch 1
 ;;  depth 0, outer -1
-;;  nodes: 0 1 2 3 4 5 7 11 8 9 10
+;;  nodes: 0 1 2 3 4 5 6 7 8 9 10
 ;;
 ;; Loop 1
-;;  header 11, latch 7
+;;  header 7, latch 6
 ;;  depth 1, outer 0
-;;  nodes: 11 7 4 5 3
-;; 2 succs { 11 }
+;;  nodes: 7 6 5 4 3
+;; 2 succs { 7 }
 ;; 3 succs { 4 5 }
-;; 4 succs { 7 8 }
-;; 5 succs { 7 }
-;; 7 succs { 11 }
-;; 11 succs { 3 8 }
+;; 4 succs { 6 8 }
+;; 5 succs { 6 }
+;; 6 succs { 7 }
+;; 7 succs { 3 8 }
 ;; 8 succs { 9 10 }
 ;; 9 succs { 10 }
 ;; 10 succs { 1 }

which might already be the whole story - it splits the merged PHI again,
but in a different way, ending up with

-  <bb 7>:
-  # toread_9 = PHI <toread_15(12), toread_12(5)>
-  # bytes_8 = PHI <bytes_16(12), bytes_19(5)>

-  <bb 11>:
-  # toread_1 = PHI <toread_9(7), 4096(2)>
-  # bytes_2 = PHI <bytes_8(7), 1(2)>

instead of the following (without mergephi and re-splitting):

+  <bb 6>:
+  # toread_9 = PHI <toread_12(5), toread_8(11)>

+  <bb 7>:
+  # toread_1 = PHI <toread_9(6), 4096(2)>
+  # bytes_2 = PHI <bytes_11(6), 1(2)>

and as final result of VRP:

-bytes_2: ~[0, 0]
+bytes_2: VARYING

and that's the usual issue of VRP not inserting asserts at CFG merges
(it doesn't insert PHIs...).  mergephi effectively inserting a PHI
for bytes_11 in BB 6 is pure luck :/

.optimized code difference:

 foo ()
 {
   static char eof_reached = 0;
@@ -13,8 +15,8 @@
   <bb 2>:

   <bb 3>:
-  # toread_22 = PHI <toread_9(6), 4096(2)>
-  bytes_11 = bar (toread_22);
+  # toread_18 = PHI <toread_9(6), 4096(2)>
+  bytes_11 = bar (toread_18);
   if (bytes_11 <= 0)
     goto <bb 4>;
   else
@@ -27,21 +29,26 @@
     goto <bb 8>;

   <bb 5>:
-  toread_12 = toread_22 - bytes_11;
+  toread_12 = toread_18 - bytes_11;

   <bb 6>:
-  # toread_9 = PHI <toread_22(4), toread_12(5)>
+  # toread_9 = PHI <toread_12(5), toread_18(4)>
   if (toread_9 != 0)
     goto <bb 3>;
   else
     goto <bb 7>;

   <bb 7>:
-  return;
+  if (bytes_11 == 0)
+    goto <bb 8>;
+  else
+    goto <bb 9>;

   <bb 8>:
   eof_reached = 1;
-  goto <bb 7>;
+
+  <bb 9>:
+  return;

 }

I'm inclined to XFAIL the testcase, but ...