public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/94269] New: widening_mul should consider block frequency
@ 2020-03-23  9:47 felix.yang at huawei dot com
  2020-03-23 14:14 ` [Bug tree-optimization/94269] " rguenth at gcc dot gnu.org
  2020-03-26  7:36 ` cvs-commit at gcc dot gnu.org
  0 siblings, 2 replies; 3+ messages in thread
From: felix.yang at huawei dot com @ 2020-03-23  9:47 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269

            Bug ID: 94269
           Summary: widening_mul should consider block frequency
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: felix.yang at huawei dot com
  Target Milestone: ---

Test case:

float
calc(long n, float *x, int inc_x,
             float *y, int inc_y)
{
  float dot = 0.0;
  int ix = 0, iy = 0;

  if (n < 0) {
    return dot;
  }

  int i = 0;
  while (i < n) {
    dot += y[iy] * x[ix];
    ix  += inc_x;
    iy  += inc_y;
    i++;
  }

  return dot;
}

Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize
-funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c

calc:
.LFB0:
        .cfi_startproc
        cmp     x0, 0
        ble     .L4
        mov     w7, w0
        mov     x5, x3
        mov     w6, 32
        mov     x3, x1
        mov     x1, 0
        index   z4.s, #0, w4
        index   z3.s, #0, w2
        whilelo p0.s, wzr, w0
        mov     z0.s, #0
        .p2align 3,,7
.L3:
        ld1w    z1.s, p0/z, [x5, z4.s, sxtw 2]
        ld1w    z2.s, p0/z, [x3, z3.s, sxtw 2]
        add     x1, x1, 8
        fmla    z0.s, p0/m, z1.s, z2.s
        smaddl  x5, w4, w6, x5           <==============
        whilelo p0.s, w1, w7
        smaddl  x3, w2, w6, x3           <==============
        b.any   .L3
        ptrue   p0.b, vl32
        faddv   s0, p0, z0.s
        ret

Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize
-funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c
-fdisable-tree-widening_mul

calc:
.LFB0:
        .cfi_startproc
        cmp     x0, 0
        ble     .L4
        sbfiz   x8, x4, 5, 32
        sbfiz   x7, x2, 5, 32
        mov     w6, w0
        mov     x5, x3
        mov     x3, x1
        mov     x1, 0
        index   z4.s, #0, w4
        index   z3.s, #0, w2
        whilelo p0.s, wzr, w0
        mov     z0.s, #0
        ptrue   p1.b, vl32
        .p2align 3,,7
.L3:
        ld1w    z1.s, p0/z, [x5, z4.s, sxtw 2]
        ld1w    z2.s, p0/z, [x3, z3.s, sxtw 2]
        add     x1, x1, 8
        fmul    z1.s, z1.s, z2.s
        add     x5, x5, x8             <=============
        fadd    z0.s, p0/m, z0.s, z1.s
        add     x3, x3, x7             <=============
        whilelo p0.s, w1, w6
        b.any   .L3
        faddv   s0, p1, z0.s
        ret

widening_mul phase moves the two multiply instructions from outside the loop to
inside the loop, merging with the two add instructions separately.  This
increases the cost of the loop.  

I think widening_mul should consider block frequency when doing such a
combination.
I mean something like:
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 54ba035..4439452 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -2721,7 +2721,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi,
gimple *stmt,
     {
       if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
-                                 &type2, &mult_rhs2))
+                                 &type2, &mult_rhs2)
+         || (gimple_bb (rhs1_stmt) != gimple_bb (stmt)
+             && gimple_bb (rhs1_stmt)->count.to_frequency(cfun)
+                < gimple_bb (stmt)->count.to_frequency(cfun)))
        return false;
       add_rhs = rhs2;
       conv_stmt = conv1_stmt;
@@ -2730,7 +2733,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi,
gimple *stmt,
     {
       if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
-                                 &type2, &mult_rhs2))
+                                 &type2, &mult_rhs2)
+         || (gimple_bb (rhs2_stmt) != gimple_bb (stmt)
+             && gimple_bb (rhs2_stmt)->count.to_frequency(cfun)
+                < gimple_bb (stmt)->count.to_frequency(cfun)))
        return false;
       add_rhs = rhs1;
       conv_stmt = conv2_stmt;

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [Bug tree-optimization/94269] widening_mul should consider block frequency
  2020-03-23  9:47 [Bug tree-optimization/94269] New: widening_mul should consider block frequency felix.yang at huawei dot com
@ 2020-03-23 14:14 ` rguenth at gcc dot gnu.org
  2020-03-26  7:36 ` cvs-commit at gcc dot gnu.org
  1 sibling, 0 replies; 3+ messages in thread
From: rguenth at gcc dot gnu.org @ 2020-03-23 14:14 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Last reconfirmed|                            |2020-03-23
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Or even simpler like FMA detection which restricts ops to be defined in the
same basic-block:

      /* For now restrict this operations to single basic blocks.  In theory
         we would want to support sinking the multiplication in
         m = a*b;
         if ()
           ma = m + c;
         else
           d = m;
         to form a fma in the then block and sink the multiplication to the
         else block.  */
      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
        return false;

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [Bug tree-optimization/94269] widening_mul should consider block frequency
  2020-03-23  9:47 [Bug tree-optimization/94269] New: widening_mul should consider block frequency felix.yang at huawei dot com
  2020-03-23 14:14 ` [Bug tree-optimization/94269] " rguenth at gcc dot gnu.org
@ 2020-03-26  7:36 ` cvs-commit at gcc dot gnu.org
  1 sibling, 0 replies; 3+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2020-03-26  7:36 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269

--- Comment #2 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:d21dff5b4fee51ae432143065bededfc763dc344

commit r10-7391-gd21dff5b4fee51ae432143065bededfc763dc344
Author: Richard Biener <rguenther@suse.de>
Date:   Thu Mar 26 08:33:57 2020 +0100

    widening_mul: restrict ops to be defined in the same basic-block when
convert plusminus to widen

    In the testcase for PR94269, widening_mul moves two multiply
    instructions from outside the loop to inside
    the loop, merging with two add instructions separately.  This
    increases the cost of the loop.  Like FMA detection
    in the same pass, simply restrict ops to be defined in the same
    basic-block to avoid possibly moving multiply
    to a different block with a higher execution frequency.

    2020-03-26  Felix Yang  <felix.yang@huawei.com>

            PR tree-optimization/94269
            * tree-ssa-math-opts.c (convert_plusminus_to_widen): Restrict
            this
            operation to single basic block.

            * gcc.dg/pr94269.c: New test.

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-03-26  7:36 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-23  9:47 [Bug tree-optimization/94269] New: widening_mul should consider block frequency felix.yang at huawei dot com
2020-03-23 14:14 ` [Bug tree-optimization/94269] " rguenth at gcc dot gnu.org
2020-03-26  7:36 ` cvs-commit at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).