public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/55190] New: [SH] ivopts causes loop setup bloat
@ 2012-11-03 12:16 olegendo at gcc dot gnu.org
  2013-02-16 11:13 ` [Bug target/55190] " olegendo at gcc dot gnu.org
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2012-11-03 12:16 UTC (permalink / raw)
  To: gcc-bugs


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

             Bug #: 55190
           Summary: [SH] ivopts causes loop setup bloat
    Classification: Unclassified
           Product: gcc
           Version: 4.8.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: target
        AssignedTo: unassigned@gcc.gnu.org
        ReportedBy: olegendo@gcc.gnu.org
            Target: sh*-*-*


The following code:

struct X
{
  int a, b, c, d, e;
};

int test (X* x, unsigned int c)
{
  int s = 0;
  unsigned int i;
  for (i = 0; i < c; ++i)
    s += x[i].b;
  return s;
}

results in:
        tst     r5,r5
        bt/s    .L4
        mov     r5,r1
        shll2   r1
        add     r5,r1
        mov.l   .L9,r2
        shll2   r1
        add     #-20,r1
        shlr2   r1
        mul.l   r2,r1
        mov.l   .L10,r2
        add     #4,r4
        mov     #0,r0
        sts     macl,r1
        and     r2,r1
        add     #1,r1
.L3:
        mov.l   @r4,r2
        dt      r1
        add     #20,r4
        bf/s    .L3
        add     r2,r0
        rts
        nop
.L4:
        rts
        mov     #0,r0
.L11:
        .align 2
.L9:
        .long   214748365
.L10:
        .long   1073741823

In such cases, the loop counter setup code that seems to be produced by ivopts
can be left out, which would result in something like:

        tst     r5,r5
        bt/s    .L15
        add     #4,r4
        mov     #0,r0
.L14:
        mov.l   @r4,r1
        dt      r5
        add     #20,r4
        bf/s    .L14
        add     r1,r0
        rts
        nop
.L15:
        rts
        mov     #0,r0


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug target/55190] [SH] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
@ 2013-02-16 11:13 ` olegendo at gcc dot gnu.org
  2013-06-17  0:13 ` [Bug rtl-optimization/55190] " olegendo at gcc dot gnu.org
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2013-02-16 11:13 UTC (permalink / raw)
  To: gcc-bugs


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

Oleg Endo <olegendo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2013-02-16
     Ever Confirmed|0                           |1

--- Comment #1 from Oleg Endo <olegendo at gcc dot gnu.org> 2013-02-16 11:12:47 UTC ---
As of rev. 196091 this problem still exists.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] [SH] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
  2013-02-16 11:13 ` [Bug target/55190] " olegendo at gcc dot gnu.org
@ 2013-06-17  0:13 ` olegendo at gcc dot gnu.org
  2013-09-30  7:16 ` amker.cheng at gmail dot com
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2013-06-17  0:13 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

Oleg Endo <olegendo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|target                      |rtl-optimization

--- Comment #2 from Oleg Endo <olegendo at gcc dot gnu.org> ---
Looking at a simpler case (with -O2) ....

int test_0 (int* x, int y)
{
  int sum = 0;

  for (int i = 0; i < y; ++i)
    sum += x[i];

  return sum;
}

        cmp/pl  r5
        bf/s    .L6
        mov     #0,r0

        shll2   r5
        add     #-4,r5
        shlr2   r5
        add     #1,r5    // r5 = ((((unsigned int)y << 2) - 4) >> 2) + 1

        .align 2
.L3:
        mov.l   @r4+,r1
        dt      r5
        bf/s    .L3
        add     r1,r0
.L6:
        rts
        nop

In this case, if 'y' initially has the value '0x7FFFFFFF' the resulting loop
count is truncated to '0x3FFFFFFF'.  This is sort of OK, since the resulting
address would overflow and that is undefined behavior.
On the other hand, if an unlucky address for 'x' is passed in the first place,
the resulting address might overflow much earlier than that.  Thus the loop
counter fiddling seems pointless.

The tree-ssa-ivopts pass converts the loop to this:

Replacing exit test: if (y_3(D) > i_11)
int test_0(int*, int) (int * x, int y)
{
  unsigned int ivtmp.6;
  int i;
  int sum;
  unsigned int i.0;
  unsigned int _1;
  void * _2;
  int _9;
  unsigned int _19;
  unsigned int _20;
  unsigned int _21;

  <bb 2>:
  if (y_3(D) > 0)
    goto <bb 3>;
  else
    goto <bb 7>;

  <bb 3>:
  ivtmp.6_12 = (unsigned int) x_6(D);
  _1 = (unsigned int) y_3(D);
  _21 = _1 * 4;
  _20 = (unsigned int) x_6(D);
  _19 = _20 + _21;

  <bb 4>:
  # sum_16 = PHI <sum_10(6), 0(3)>
  # ivtmp.6_15 = PHI <ivtmp.6_14(6), ivtmp.6_12(3)>
  _2 = (void *) ivtmp.6_15;
  _9 = MEM[base: _2, offset: 0B];
  sum_10 = _9 + sum_16;
  ivtmp.6_14 = ivtmp.6_15 + 4;
  if (ivtmp.6_14 != _19)
    goto <bb 6>;
  else
    goto <bb 5>;

  <bb 5>:
  # sum_18 = PHI <sum_10(4)>
  goto <bb 7>;

  <bb 6>:
  goto <bb 4>;

  <bb 7>:
  # sum_13 = PHI <sum_18(5), 0(2)>
  return sum_13;

}

... which uses address '(x + y * 4)' as the loop exit test.

It is expanded to RTL as '(x + (y << 2))'


;; Generating RTL for gimple basic block 3

;; ivtmp.6_12 = (unsigned int) x_6(D);

(insn 38 37 0 (set (reg:SI 190 [ ivtmp.6 ])
        (reg/v/f:SI 194 [ x ])) -1
     (nil))

;; _19 = ivtmp.6_12 + _21;

(insn 39 38 40 (set (reg:SI 196 [ D.1617 ])
        (ashift:SI (reg/v:SI 195 [ y ])
            (const_int 2 [0x2]))) -1
     (nil))

(insn 40 39 0 (set (reg:SI 191 [ D.1617 ])
        (plus:SI (reg:SI 190 [ ivtmp.6 ])
            (reg:SI 196 [ D.1617 ]))) -1
     (nil))

... and remains until the loop2_doloop RTL pass, which converts the whole thing
into a decrement-and-test loop and adds the other loop counter modifications:

Analyzing operand (reg:SI 191 [ D.1617 ]) of insn (insn 45 44 46 4 (set (reg:SI
147 t)
        (eq:SI (reg:SI 190 [ ivtmp.6 ])
            (reg:SI 191 [ D.1617 ]))) sh_tmp.cpp:5 17 {cmpeqsi_t}
     (expr_list:REG_DEAD (reg:SI 191 [ D.1617 ])
        (nil)))
  invariant (reg:SI 191 [ D.1617 ]) (in SI)
;; improved upper bound by one.
;; Determined upper bound -2.
Loop 1 is simple:
  simple exit 4 -> 7
  number of iterations: (lshiftrt:SI (plus:SI (minus:SI (reg:SI 191 [ D.1617 ])
            (reg:SI 190 [ ivtmp.6 ]))
        (const_int -4 [0xfffffffffffffffc]))
    (const_int 2 [0x2]))
  upper bound: 1073741822
  realistic bound: -1


The code in loop-iv.c works out the correct loop count if it gets the actual
loop count upper bound instead of the truncated address upper bound if the
tree-ssa-ivopts pass is turned off via -fno-ivopts.

I have also tried out the same code on ARM:

        cmp     r1, #0
        ble     .L4
        mov     r3, r0
        add     r1, r0, r1, asl #2
        mov     r0, #0
.L3:
        ldr     r2, [r3], #4
        cmp     r3, r1
        add     r0, r0, r2
        bne     .L3
        bx      lr
.L4:
        mov     r0, #0
        bx      lr

Since there is no doloop pattern on ARM, the code is left as it was output by
the tree-ssa-ivopts pass, i.e. the exit test uses 'x + (y << 2)'.

So this doesn't seem to be a SH only issue.  However, I'm not sure whether this
is more a problem of tree-ssa-ivopts or loop-iv.
If the tree-ssa-ivopts pass left the loop counter alone, the doloop pass would
pick it up and the upper bound calculations in this case would go away. 
However, targets that can do better without doloop (such as ARM) would probably
suffer.  So probably it would be better to handle this overflow case in
loop-iv.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] [SH] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
  2013-02-16 11:13 ` [Bug target/55190] " olegendo at gcc dot gnu.org
  2013-06-17  0:13 ` [Bug rtl-optimization/55190] " olegendo at gcc dot gnu.org
@ 2013-09-30  7:16 ` amker.cheng at gmail dot com
  2013-10-03 10:00 ` olegendo at gcc dot gnu.org
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: amker.cheng at gmail dot com @ 2013-09-30  7:16 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

bin.cheng <amker.cheng at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amker.cheng at gmail dot com

--- Comment #3 from bin.cheng <amker.cheng at gmail dot com> ---
ARM can benefit from doloop structure too, but it is implemented in different
way. ARM backend defines special addsi_compare pattern and let combine pass
combine decrement and comparison instruction, thus saving the comparison
instruction.

IVOPT can be improved to select two iv candidates for the example loop, with
auto-increment one for the memory access and decrement one for loop exit check.
 This is especially good for target supports both doloop and auto-increment
instructions like ARM and SH.

BUT most hand-written loops have incremental basic iv, so IVOPT depends on
previous pass ivcanon to rewrite it into decremental iv, like below:

for (i = 0; i < 100; i++)
  //loop body

---->
for (i = 100; i > 0; i--)
  //modified loop body

Unfortunately, ivcanon pass only do such loop transformation for loop which
iterates constant number times.

It seems difficult for RTL loop passes to revert decision made by IVOPT, so I
think it should be done in GIMPLE IVOPT. I will give it a try.

Thanks.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] [SH] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2013-09-30  7:16 ` amker.cheng at gmail dot com
@ 2013-10-03 10:00 ` olegendo at gcc dot gnu.org
  2014-07-31 18:03 ` olegendo at gcc dot gnu.org
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2013-10-03 10:00 UTC (permalink / raw)
  To: gcc-bugs

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

--- Comment #4 from Oleg Endo <olegendo at gcc dot gnu.org> ---
(In reply to bin.cheng from comment #3)
> ARM can benefit from doloop structure too, but it is implemented in
> different way. ARM backend defines special addsi_compare pattern and let
> combine pass combine decrement and comparison instruction, thus saving the
> comparison instruction.
> 
> IVOPT can be improved to select two iv candidates for the example loop, with
> auto-increment one for the memory access and decrement one for loop exit
> check.  This is especially good for target supports both doloop and
> auto-increment instructions like ARM and SH.

Yes, probably addressing mode selection for memories inside the loop can be
covered somehow by ivopt.  However, this PR is about the calculations of the
loop counter and the handling of overflows.

> 
> BUT most hand-written loops have incremental basic iv, so IVOPT depends on
> previous pass ivcanon to rewrite it into decremental iv, like below:
> 
> for (i = 0; i < 100; i++)
>   //loop body
> 
> ---->
> for (i = 100; i > 0; i--)
>   //modified loop body
> 
> Unfortunately, ivcanon pass only do such loop transformation for loop which
> iterates constant number times.
> 
> It seems difficult for RTL loop passes to revert decision made by IVOPT, so
> I think it should be done in GIMPLE IVOPT. I will give it a try.

I think this is a another story of missed ivopt opportunities.  Maybe open a
new PR for this?


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] [SH] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2013-10-03 10:00 ` olegendo at gcc dot gnu.org
@ 2014-07-31 18:03 ` olegendo at gcc dot gnu.org
  2015-04-04 11:46 ` olegendo at gcc dot gnu.org
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2014-07-31 18:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

Oleg Endo <olegendo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
      Known to fail|                            |4.9.0

--- Comment #5 from Oleg Endo <olegendo at gcc dot gnu.org> ---
As of r213381 the issue still persists.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] [SH] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2014-07-31 18:03 ` olegendo at gcc dot gnu.org
@ 2015-04-04 11:46 ` olegendo at gcc dot gnu.org
  2015-04-07  7:27 ` [Bug rtl-optimization/55190] " olegendo at gcc dot gnu.org
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2015-04-04 11:46 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

--- Comment #6 from Oleg Endo <olegendo at gcc dot gnu.org> ---
Another example:

const char* test (const char* s0, int c, int* rout)
{
  int r = 0;

  for (int i = 0; i < c; ++i)
//    r += s0[i];
    r += *s0++;

  *rout = r;
  return s0;
}

On SH compiles to:

_test:
        cmp/pl    r5
        bf    .L4
        mov    r4,r3       // r3 = s0
        add    r5,r4       // r4 = s0 + c
        mov    r4,r1       // r1 = s0 + c
        mov    #0,r2
        sub    r3,r1       // r1 = (s0 + c) - s0 = c  (d'uh)
    .align 2
.L3:
        mov.b   @r3+,r7
        dt      r1
        bf/s    .L3
        add     r7,r2
.L2:
        mov.l   r2,@r6
        rts    
        mov     r4,r0
    .align 1
.L4:
        bra     .L2
        mov     #0,r2


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
                   ` (5 preceding siblings ...)
  2015-04-04 11:46 ` olegendo at gcc dot gnu.org
@ 2015-04-07  7:27 ` olegendo at gcc dot gnu.org
  2021-05-04 12:31 ` rguenth at gcc dot gnu.org
  2023-07-07  6:49 ` olegendo at gcc dot gnu.org
  8 siblings, 0 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2015-04-07  7:27 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

--- Comment #8 from Oleg Endo <olegendo at gcc dot gnu.org> ---
I've adjusted the title, since the issues here are not SH specific.  There is
also a similar PR 62233.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
                   ` (6 preceding siblings ...)
  2015-04-07  7:27 ` [Bug rtl-optimization/55190] " olegendo at gcc dot gnu.org
@ 2021-05-04 12:31 ` rguenth at gcc dot gnu.org
  2023-07-07  6:49 ` olegendo at gcc dot gnu.org
  8 siblings, 0 replies; 10+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-05-04 12:31 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |ASSIGNED

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Bug rtl-optimization/55190] ivopts causes loop setup bloat
  2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
                   ` (7 preceding siblings ...)
  2021-05-04 12:31 ` rguenth at gcc dot gnu.org
@ 2023-07-07  6:49 ` olegendo at gcc dot gnu.org
  8 siblings, 0 replies; 10+ messages in thread
From: olegendo at gcc dot gnu.org @ 2023-07-07  6:49 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55190

--- Comment #12 from Oleg Endo <olegendo at gcc dot gnu.org> ---
(In reply to Oleg Endo from comment #0)
> The following code:
> 
> struct X
> {
>   int a, b, c, d, e;
> };
> 
> int test (X* x, unsigned int c)
> {
>   int s = 0;
>   unsigned int i;
>   for (i = 0; i < c; ++i)
>     s += x[i].b;
>   return s;
> }
> 
> results in:
>         tst     r5,r5
>         bt/s    .L4
>         mov     r5,r1
>         shll2   r1
>         add     r5,r1
>         mov.l   .L9,r2
>         shll2   r1
>         add     #-20,r1
>         shlr2   r1
>         mul.l   r2,r1
>         mov.l   .L10,r2
>         add     #4,r4
>         mov     #0,r0
>         sts     macl,r1
>         and     r2,r1
>         add     #1,r1
> .L3:
>         mov.l   @r4,r2
>         dt      r1
>         add     #20,r4
>         bf/s    .L3
>         add     r2,r0
>         rts
>         nop
> .L4:
>         rts
>         mov     #0,r0
> .L11:
>         .align 2
> .L9:
>         .long   214748365
> .L10:
>         .long   1073741823
> 

As of GCC 13, this still produces the same code with loop header bloat.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-07-07  6:49 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-11-03 12:16 [Bug target/55190] New: [SH] ivopts causes loop setup bloat olegendo at gcc dot gnu.org
2013-02-16 11:13 ` [Bug target/55190] " olegendo at gcc dot gnu.org
2013-06-17  0:13 ` [Bug rtl-optimization/55190] " olegendo at gcc dot gnu.org
2013-09-30  7:16 ` amker.cheng at gmail dot com
2013-10-03 10:00 ` olegendo at gcc dot gnu.org
2014-07-31 18:03 ` olegendo at gcc dot gnu.org
2015-04-04 11:46 ` olegendo at gcc dot gnu.org
2015-04-07  7:27 ` [Bug rtl-optimization/55190] " olegendo at gcc dot gnu.org
2021-05-04 12:31 ` rguenth at gcc dot gnu.org
2023-07-07  6:49 ` olegendo at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).