public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [SVE] [fwprop] PR88833 - Redundant moves for WHILELO-based loops
@ 2019-06-21  9:52 Prathamesh Kulkarni
  2019-06-24  9:29 ` Richard Sandiford
  0 siblings, 1 reply; 18+ messages in thread
From: Prathamesh Kulkarni @ 2019-06-21  9:52 UTC (permalink / raw)
  To: gcc Patches, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 2415 bytes --]

Hi,
The attached patch tries to fix PR88833.
For the following test-case:

subroutine foo(x)
  real :: x(100)
  x = x + 10
end subroutine foo

Assembly with -O3 -march=armv8.2-a+sve:

foo_:
.LFB0:
        .cfi_startproc
        mov     w2, 100
        mov     w3, w2
        mov     x1, 0
        whilelo p0.s, wzr, w2
        fmov    z1.s, #1.0e+1
        .p2align 3,,7
.L2:
        ld1w    z0.s, p0/z, [x0, x1, lsl 2]
        fadd    z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x1, lsl 2]
        incw    x1
        whilelo p0.s, w1, w3
        bne     .L2
        ret

As we can see, it generates extra mov w3, w2. Instead it could have reused w2 in
both whilelo's.

expand produces:
insn 7: reg:SI 97 = 100
insn 8: use (reg:SI 97)
insn 22: reg:SI 105 = 100
insn 23: use (reg:SI 105)

Both reg:SI 97 and reg:SI 105 have only single definitions (and also
single use).

cse2 then replaces 100 with reg:SI 97 in insn 22, which becomes:
insn 22: reg:SI 105 = reg:SI 97.

sched1 then reorders instructions, and insn 7 and insn 22 end up falling
in same basic block Looking at reload dump:

         Choosing alt 3 in insn 7:  (0) r  (1) M {*movsi_aarch64}
          alt=0,overall=0,losers=0,rld_nregs=0
         Choosing alt 0 in insn 2:  (0) =r  (1) r {*movdi_aarch64}
          alt=0,overall=0,losers=0,rld_nregs=0
         Choosing alt 0 in insn 22:  (0) =r  (1) r {*movsi_aarch64}
            1 Non-pseudo reload: reject+=2
            1 Non input pseudo reload: reject++
            Cycle danger: overall += LRA_MAX_REJECT
          alt=0,overall=609,losers=1,rld_nregs=1

which shows, it ends up taking extra register.

The issue here is that cse2 pass is leaving opportunities for
propagating register copies.
To address this, the patch makes following changes to fwprop.c:
(a) Add support for handling UNSPEC in propagate_rtx_1 in a similar
manner to simplify_replace_fn_rtx.
(b) Allow propagating def inside a loop if source of def is a register
in forward_propagate_into.
AFAIU, replacing register by another register shouldn't increase cost.
(c) Integrate fwprop and fwprop_addr, and make fwprop_addr propagate
register copies.

With the patch, fwprop_addr propagates reg:SI 97 in insn 23 and deletes insn 22,
which eliminates the redundant mov.

Does this patch look OK ?
Bootstrapped + tested on x86_64-unknown-linux-gnu and aarch64-linux-gnu.
Cross-testing with SVE in progress.

Thanks,
Prathamesh

[-- Attachment #2: pr88833-5.diff --]
[-- Type: text/x-patch, Size: 5647 bytes --]

diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 45703fe5f01..93a1a10c9a6 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags)
 	  tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),
 				     SUBREG_BYTE (x));
 	}
+
+      else
+	{
+	  rtvec vec;
+	  rtvec newvec;
+	  const char *fmt = GET_RTX_FORMAT (code);
+	  rtx op;
+
+	  for (int i = 0; fmt[i]; i++)
+	    switch (fmt[i])
+	      {
+	      case 'E':
+		vec = XVEC (x, i);
+		newvec = vec;
+		for (int j = 0; j < GET_NUM_ELEM (vec); j++)
+		  {
+		    op = RTVEC_ELT (vec, j);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != RTVEC_ELT (vec, j))
+		      {
+			if (newvec == vec)
+			  {
+			    newvec = shallow_copy_rtvec (vec);
+			    if (!tem)
+			      tem = shallow_copy_rtx (x);
+			    XVEC (tem, i) = newvec;
+			  }
+			RTVEC_ELT (newvec, j) = op;
+		      }
+		  }
+	      break;
+
+	      case 'e':
+		if (XEXP (x, i))
+		  {
+		    op = XEXP (x, i);
+		    valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags);
+		    if (op != XEXP (x, i))
+		      {
+			if (!tem)
+			  tem = shallow_copy_rtx (x);
+			XEXP (tem, i) = op;
+		      }
+		  }
+	      break;
+	      }
+	}
+
       break;
 
     case RTX_OBJ:
@@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set)
 
 /* Given a use USE of an insn, if it has a single reaching
    definition, try to forward propagate it into that insn.
-   Return true if cfg cleanup will be needed.  */
+   Return true if cfg cleanup will be needed.
+   REG_PROP_ONLY is true if we should only propagate register copies.  */
 
 static bool
-forward_propagate_into (df_ref use)
+forward_propagate_into (df_ref use, bool reg_prop_only = false)
 {
   df_ref def;
   rtx_insn *def_insn, *use_insn;
@@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use)
   if (DF_REF_IS_ARTIFICIAL (def))
     return false;
 
-  /* Do not propagate loop invariant definitions inside the loop.  */
-  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)
-    return false;
-
   /* Check if the use is still present in the insn!  */
   use_insn = DF_REF_INSN (use);
   if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
@@ -1415,6 +1460,16 @@ forward_propagate_into (df_ref use)
   if (!def_set)
     return false;
 
+  if (reg_prop_only && !REG_P (SET_SRC (def_set)))
+    return false;
+
+  /* Allow propagating def inside loop only if source of def_set is
+     reg, since replacing reg by source reg shouldn't increase cost.  */
+
+  if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father
+      && !REG_P (SET_SRC (def_set)))
+    return false;
+
   /* Only try one kind of propagation.  If two are possible, we'll
      do it on the following iterations.  */
   if (forward_propagate_and_simplify (use, def_insn, def_set)
@@ -1483,7 +1538,7 @@ gate_fwprop (void)
 }
 
 static unsigned int
-fwprop (void)
+fwprop (bool fwprop_addr_p)
 {
   unsigned i;
 
@@ -1502,11 +1557,16 @@ fwprop (void)
 
       df_ref use = DF_USES_GET (i);
       if (use)
-	if (DF_REF_TYPE (use) == DF_REF_REG_USE
-	    || DF_REF_BB (use)->loop_father == NULL
-	    /* The outer most loop is not really a loop.  */
-	    || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
-	  forward_propagate_into (use);
+	{
+	  if (DF_REF_TYPE (use) == DF_REF_REG_USE
+	      || DF_REF_BB (use)->loop_father == NULL
+	      /* The outer most loop is not really a loop.  */
+	      || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+	    forward_propagate_into (use, fwprop_addr_p);
+
+	  else if (fwprop_addr_p)
+	    forward_propagate_into (use, false);
+	}
     }
 
   fwprop_done ();
@@ -1537,7 +1597,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop (); }
+  virtual unsigned int execute (function *) { return fwprop (false); }
 
 }; // class pass_rtl_fwprop
 
@@ -1549,33 +1609,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt)
   return new pass_rtl_fwprop (ctxt);
 }
 
-static unsigned int
-fwprop_addr (void)
-{
-  unsigned i;
-
-  fwprop_init ();
-
-  /* Go through all the uses.  df_uses_create will create new ones at the
-     end, and we'll go through them as well.  */
-  for (i = 0; i < DF_USES_TABLE_SIZE (); i++)
-    {
-      if (!propagations_left)
-	break;
-
-      df_ref use = DF_USES_GET (i);
-      if (use)
-	if (DF_REF_TYPE (use) != DF_REF_REG_USE
-	    && DF_REF_BB (use)->loop_father != NULL
-	    /* The outer most loop is not really a loop.  */
-	    && loop_outer (DF_REF_BB (use)->loop_father) != NULL)
-	  forward_propagate_into (use);
-    }
-
-  fwprop_done ();
-  return 0;
-}
-
 namespace {
 
 const pass_data pass_data_rtl_fwprop_addr =
@@ -1600,7 +1633,7 @@ public:
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return gate_fwprop (); }
-  virtual unsigned int execute (function *) { return fwprop_addr (); }
+  virtual unsigned int execute (function *) { return fwprop (true); }
 
 }; // class pass_rtl_fwprop_addr
 
diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90
new file mode 100644
index 00000000000..224e6ce5f3d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr88833.f90
@@ -0,0 +1,9 @@
+! { dg-do assemble { target aarch64_asm_sve_ok } }
+! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" }
+
+subroutine foo(x)
+  real :: x(100)
+  x = x + 10
+end subroutine foo
+
+! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2019-07-04  6:50 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-21  9:52 [SVE] [fwprop] PR88833 - Redundant moves for WHILELO-based loops Prathamesh Kulkarni
2019-06-24  9:29 ` Richard Sandiford
2019-06-24 11:37   ` Prathamesh Kulkarni
2019-06-24 14:21     ` Richard Sandiford
2019-06-24 16:12       ` Prathamesh Kulkarni
2019-06-25 12:06         ` Prathamesh Kulkarni
2019-06-25 14:35           ` Richard Sandiford
2019-06-26 10:10             ` Prathamesh Kulkarni
2019-06-26 10:35               ` Richard Sandiford
2019-06-26 13:55                 ` Prathamesh Kulkarni
2019-06-26 18:15                   ` Richard Sandiford
2019-07-02 11:20                     ` Prathamesh Kulkarni
2019-07-02 11:29                       ` Richard Sandiford
2019-07-02 12:10                         ` Prathamesh Kulkarni
2019-07-02 12:52                           ` Richard Sandiford
2019-07-03 10:46                             ` Prathamesh Kulkarni
2019-07-03 11:36                               ` Richard Sandiford
2019-07-04  6:50                                 ` Prathamesh Kulkarni

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).