public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim
@ 2015-05-04  8:32 amker at gcc dot gnu.org
  2015-05-04 11:41 ` [Bug tree-optimization/66003] " rguenth at gcc dot gnu.org
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: amker at gcc dot gnu.org @ 2015-05-04  8:32 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003

            Bug ID: 66003
           Summary: missed cse opportunity in addr expressions because of
                    tree pre/lim
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: amker at gcc dot gnu.org
  Target Milestone: ---

The simple test case below is reduced from SPEC:
typedef struct
{
  int x;
  int y;
} coord;

extern unsigned short **org;
extern coord *c;
void bar (unsigned short *ptr);
void foo (int s, int n)
{
  unsigned short arr[256], *ptr = arr;
  int x, y;

  for (y = c->y; y < c->y + 16; y++)
    for (x = c->x; x < c->x + 16; x++)
      *ptr++ = org [y][x];

  bar (ptr);
}

When compiling with the two command lines below:
A: $gcc -Ofast -S test.c -o x.S
B: $gcc -Ofast -S test.c -o y.S -fno-tree-pre -fno-tree-loop-im

The assembly difference is as below:

$ diff  x.S y.S
12,14c12,34
<       subq    $520, %rsp
<       .cfi_def_cfa_offset 528
<       movq    c(%rip), %rdx
---
> 	pushq	%r15
> 	.cfi_def_cfa_offset 16
> 	.cfi_offset 15, -16
> 	pushq	%r14
> 	.cfi_def_cfa_offset 24
> 	.cfi_offset 14, -24
> 	pushq	%r13
> 	.cfi_def_cfa_offset 32
> 	.cfi_offset 13, -32
> 	pushq	%r12
> 	.cfi_def_cfa_offset 40
> 	.cfi_offset 12, -40
> 	pushq	%rbp
> 	.cfi_def_cfa_offset 48
> 	.cfi_offset 6, -48
> 	pushq	%rbx
> 	.cfi_def_cfa_offset 56
> 	.cfi_offset 3, -56
> 	subq	$568, %rsp
> 	.cfi_def_cfa_offset 624
> 	movq	c(%rip), %rax
> 	movslq	(%rax), %rsi
> 	movslq	4(%rax), %rdx
16,20c36,58
<       movslq  4(%rdx), %rcx
<       leaq    (%rax,%rcx,8), %rsi
<       movslq  (%rdx), %rcx
<       movq    %rsp, %rax
<       addq    %rcx, %rcx
---
> 	addq	%rsi, %rsi
> 	leaq	24(%rsi), %rcx
> 	leaq	22(%rsi), %rdi
> 	leaq	2(%rsi), %r15
> 	leaq	4(%rsi), %r14
> 	leaq	6(%rsi), %r13
> 	leaq	8(%rsi), %r12
> 	movq	%rcx, 8(%rsp)
> 	leaq	26(%rsi), %rcx
> 	leaq	10(%rsi), %rbp
> 	leaq	12(%rsi), %rbx
> 	leaq	14(%rsi), %r11
> 	leaq	16(%rsi), %r10
> 	movq	%rcx, 16(%rsp)
> 	leaq	28(%rsi), %rcx
> 	leaq	18(%rsi), %r9
> 	leaq	20(%rsi), %r8
> 	movq	%rdi, 40(%rsp)
> 	movq	%rcx, 24(%rsp)
> 	leaq	30(%rsi), %rcx
> 	movq	%rcx, 32(%rsp)
> 	leaq	(%rax,%rdx,8), %rcx
> 	leaq	48(%rsp), %rax
24c62
<       movq    (%rsi), %rdx
---
> 	movq	(%rcx), %rdx
26,27c64,65
<       addq    $8, %rsi
<       movzwl  (%rdx,%rcx), %edi
---
> 	addq	$8, %rcx
> 	movzwl	(%rdx,%rsi), %edi
29c67
<       movzwl  2(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r15), %edi
31c69
<       movzwl  4(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r14), %edi
33c71
<       movzwl  6(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r13), %edi
35c73
<       movzwl  8(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r12), %edi
37c75
<       movzwl  10(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%rbp), %edi
39c77
<       movzwl  12(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%rbx), %edi
41c79
<       movzwl  14(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r11), %edi
43c81
<       movzwl  16(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r10), %edi
45c83
<       movzwl  18(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r9), %edi
47c85
<       movzwl  20(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r8), %edi
49c87,88
<       movzwl  22(%rdx,%rcx), %edi
---
> 	movq	40(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
51c90,91
<       movzwl  24(%rdx,%rcx), %edi
---
> 	movq	8(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
53c93,94
<       movzwl  26(%rdx,%rcx), %edi
---
> 	movq	16(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
55c96,97
<       movzwl  28(%rdx,%rcx), %edi
---
> 	movq	24(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
57c99,100
<       movzwl  30(%rdx,%rcx), %edx
---
> 	movq	32(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edx
59c102
<       leaq    512(%rsp), %rdx
---
> 	leaq	560(%rsp), %rdx
64c107,119
<       addq    $520, %rsp
---
> 	addq	$568, %rsp
> 	.cfi_def_cfa_offset 56
> 	popq	%rbx
> 	.cfi_def_cfa_offset 48
> 	popq	%rbp
> 	.cfi_def_cfa_offset 40
> 	popq	%r12
> 	.cfi_def_cfa_offset 32
> 	popq	%r13
> 	.cfi_def_cfa_offset 24
> 	popq	%r14
> 	.cfi_def_cfa_offset 16
> 	popq	%r15

The tree-pre dump is as below:

  <bb 2>:
  c.0_8 = c;
  y_9 = c.0_8->y;
  _47 = y_9 + 15;
  pretmp_112 = c.0_8->x;
  pretmp_128 = org;
  pretmp_144 = (long unsigned int) pretmp_112;
  pretmp_159 = pretmp_144 * 2;
  pretmp_160 = pretmp_112 + 1;
  pretmp_175 = (long unsigned int) pretmp_160;
  pretmp_176 = pretmp_175 * 2;
  pretmp_191 = pretmp_112 + 2;
  pretmp_192 = (long unsigned int) pretmp_191;
  pretmp_207 = pretmp_192 * 2;
  pretmp_208 = pretmp_112 + 3;
  pretmp_223 = (long unsigned int) pretmp_208;
  pretmp_224 = pretmp_223 * 2;
  pretmp_239 = pretmp_112 + 4;
  pretmp_240 = (long unsigned int) pretmp_239;
  pretmp_255 = pretmp_240 * 2;
  pretmp_256 = pretmp_112 + 5;
  pretmp_271 = (long unsigned int) pretmp_256;
  pretmp_283 = pretmp_271 * 2;
  pretmp_12 = pretmp_112 + 6;
  pretmp_50 = (long unsigned int) pretmp_12;
  pretmp_51 = pretmp_50 * 2;
  pretmp_52 = pretmp_112 + 7;
  pretmp_53 = (long unsigned int) pretmp_52;
  pretmp_65 = pretmp_53 * 2;
  pretmp_66 = pretmp_112 + 8;
  pretmp_67 = (long unsigned int) pretmp_66;
  pretmp_68 = pretmp_67 * 2;
  pretmp_69 = pretmp_112 + 9;
  pretmp_81 = (long unsigned int) pretmp_69;
  pretmp_82 = pretmp_81 * 2;
  pretmp_83 = pretmp_112 + 10;
  pretmp_84 = (long unsigned int) pretmp_83;
  pretmp_85 = pretmp_84 * 2;
  pretmp_97 = pretmp_112 + 11;
  pretmp_98 = (long unsigned int) pretmp_97;
  pretmp_99 = pretmp_98 * 2;
  pretmp_100 = pretmp_112 + 12;
  pretmp_101 = (long unsigned int) pretmp_100;
  pretmp_113 = pretmp_101 * 2;
  pretmp_114 = pretmp_112 + 13;
  pretmp_115 = (long unsigned int) pretmp_114;
  pretmp_116 = pretmp_115 * 2;
  pretmp_117 = pretmp_112 + 14;
  pretmp_129 = (long unsigned int) pretmp_117;
  pretmp_130 = pretmp_129 * 2;
  pretmp_131 = pretmp_112 + 15;
  pretmp_132 = (long unsigned int) pretmp_131;
  pretmp_133 = pretmp_132 * 2;

  <bb 3>:
  # ptr_48 = PHI <&arr(2), ptr_272(3)>
  # y_64 = PHI <y_9(2), y_25(3)>
  _34 = (long unsigned int) y_64;
  _35 = _34 * 8;
  _36 = pretmp_128 + _35;
  _37 = *_36;
  _40 = _37 + pretmp_159;
  _41 = *_40;
  *ptr_48 = _41;
  _56 = _37 + pretmp_176;
  _57 = *_56;
  MEM[(short unsigned int *)ptr_48 + 2B] = _57;
  _72 = _37 + pretmp_207;
  _73 = *_72;
  MEM[(short unsigned int *)ptr_48 + 4B] = _73;
  _88 = _37 + pretmp_224;
  _89 = *_88;
  MEM[(short unsigned int *)ptr_48 + 6B] = _89;
  _104 = _37 + pretmp_255;
  _105 = *_104;
  MEM[(short unsigned int *)ptr_48 + 8B] = _105;
  _120 = _37 + pretmp_283;
  _121 = *_120;
  MEM[(short unsigned int *)ptr_48 + 10B] = _121;
  _136 = _37 + pretmp_51;
  _137 = *_136;
  MEM[(short unsigned int *)ptr_48 + 12B] = _137;
  _152 = _37 + pretmp_65;
  _153 = *_152;
  MEM[(short unsigned int *)ptr_48 + 14B] = _153;
  _168 = _37 + pretmp_68;
  _169 = *_168;
  MEM[(short unsigned int *)ptr_48 + 16B] = _169;
  _184 = _37 + pretmp_82;
  _185 = *_184;
  MEM[(short unsigned int *)ptr_48 + 18B] = _185;
  _200 = _37 + pretmp_85;
  _201 = *_200;
  MEM[(short unsigned int *)ptr_48 + 20B] = _201;
  _216 = _37 + pretmp_99;
  _217 = *_216;
  MEM[(short unsigned int *)ptr_48 + 22B] = _217;
  _232 = _37 + pretmp_113;
  _233 = *_232;
  MEM[(short unsigned int *)ptr_48 + 24B] = _233;
  _248 = _37 + pretmp_116;
  _249 = *_248;
  MEM[(short unsigned int *)ptr_48 + 26B] = _249;
  _264 = _37 + pretmp_130;
  _265 = *_264;
  MEM[(short unsigned int *)ptr_48 + 28B] = _265;
  ptr_272 = &MEM[(void *)ptr_48 + 32B];
  _280 = _37 + pretmp_133;
  _281 = *_280;
  MEM[(short unsigned int *)ptr_48 + 30B] = _281;
  y_25 = y_64 + 1;
  if (y_25 > _47)
    goto <bb 4>;
  else
    goto <bb 3>;

PRE hoists the index part of the address expression "base + (reg + i) * 2" out
of the first loop.  This increases register pressure and prevents GCC from
using the powerful addressing modes available on x86.

On other targets such as ARM, only the register pressure issue may apply.

Both PRE and LIM perform the same transformation.
>From gcc-bugs-return-485330-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org Mon May 04 08:37:45 2015
Return-Path: <gcc-bugs-return-485330-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Delivered-To: listarch-gcc-bugs@gcc.gnu.org
Received: (qmail 55774 invoked by alias); 4 May 2015 08:37:45 -0000
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
Delivered-To: mailing list gcc-bugs@gcc.gnu.org
Received: (qmail 55724 invoked by uid 48); 4 May 2015 08:37:41 -0000
From: "tschwinge at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug libgomp/65993] [6 Regression] Numerous libgomp.oacc failures seen in r222712
Date: Mon, 04 May 2015 08:37:00 -0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: libgomp
X-Bugzilla-Version: 5.0
X-Bugzilla-Keywords: openacc
X-Bugzilla-Severity: normal
X-Bugzilla-Who: tschwinge at gcc dot gnu.org
X-Bugzilla-Status: ASSIGNED
X-Bugzilla-Resolution:
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: tschwinge at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags:
X-Bugzilla-Changed-Fields: bug_status cf_reconfirmed_on assigned_to everconfirmed
Message-ID: <bug-65993-4-hry9aranJf@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-65993-4@http.gcc.gnu.org/bugzilla/>
References: <bug-65993-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-SW-Source: 2015-05/txt/msg00170.txt.bz2
Content-length: 682

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65993

Thomas Schwinge <tschwinge at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |ASSIGNED
   Last reconfirmed|                            |2015-05-04
           Assignee|unassigned at gcc dot gnu.org      |tschwinge at gcc dot gnu.org
     Ever confirmed|0                           |1

--- Comment #2 from Thomas Schwinge <tschwinge at gcc dot gnu.org> ---
Patch posted:
<http://news.gmane.org/find-root.php?message_id=%3C87pp6gvj3v.fsf%40kepler.schwinge.homeip.net%3E>.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/66003] missed cse opportunity in addr expressions because of tree pre/lim
  2015-05-04  8:32 [Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim amker at gcc dot gnu.org
@ 2015-05-04 11:41 ` rguenth at gcc dot gnu.org
  2015-05-05  1:31 ` amker at gcc dot gnu.org
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2015-05-04 11:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Keywords|                            |missed-optimization

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Hmm, I think IVOPTs should be able to undo this code motion?


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/66003] missed cse opportunity in addr expressions because of tree pre/lim
  2015-05-04  8:32 [Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim amker at gcc dot gnu.org
  2015-05-04 11:41 ` [Bug tree-optimization/66003] " rguenth at gcc dot gnu.org
@ 2015-05-05  1:31 ` amker at gcc dot gnu.org
  2015-05-05  1:48 ` pinskia at gcc dot gnu.org
  2024-03-18  6:46 ` pinskia at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: amker at gcc dot gnu.org @ 2015-05-05  1:31 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003

--- Comment #2 from amker at gcc dot gnu.org ---
(In reply to Richard Biener from comment #1)
> Hmm, I think IVOPTs should be able to undo this code motion?

It can't.  The addresses of all pointer dereferences except the first one are
not even induction variables.  The base address is loaded from memory by the
first dereference.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/66003] missed cse opportunity in addr expressions because of tree pre/lim
  2015-05-04  8:32 [Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim amker at gcc dot gnu.org
  2015-05-04 11:41 ` [Bug tree-optimization/66003] " rguenth at gcc dot gnu.org
  2015-05-05  1:31 ` amker at gcc dot gnu.org
@ 2015-05-05  1:48 ` pinskia at gcc dot gnu.org
  2024-03-18  6:46 ` pinskia at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2015-05-05  1:48 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003

--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
slsr does a good job of cleaning up the code a little bit.  I would have
suspected slsr would have generated the addresses correctly and moved the add
inside the loop.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/66003] missed cse opportunity in addr expressions because of tree pre/lim
  2015-05-04  8:32 [Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim amker at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2015-05-05  1:48 ` pinskia at gcc dot gnu.org
@ 2024-03-18  6:46 ` pinskia at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-03-18  6:46 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
             Status|UNCONFIRMED                 |NEW
             Target|x86_64                      |x86_64 aarch64
           Severity|normal                      |enhancement
   Last reconfirmed|                            |2024-03-18

--- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Confirmed.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-03-18  6:47 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-04  8:32 [Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim amker at gcc dot gnu.org
2015-05-04 11:41 ` [Bug tree-optimization/66003] " rguenth at gcc dot gnu.org
2015-05-05  1:31 ` amker at gcc dot gnu.org
2015-05-05  1:48 ` pinskia at gcc dot gnu.org
2024-03-18  6:46 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).