public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: "augustine.sterling@gmail.com" <augustine.sterling@gmail.com>
To: "Yangfei (Felix)" <felix.yang@huawei.com>
Cc: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>,
	Andrew Pinski <pinskia@gmail.com>,
		Felix Yang <fei.yang0953@gmail.com>
Subject: Re: [PATCH] Add zero-overhead looping for xtensa backend
Date: Thu, 30 Oct 2014 22:02:00 -0000	[thread overview]
Message-ID: <CAGSvup_FD91Sbg-9haJOVNuf=oy1FQyuBSA-5K5HDcsFu3kMpQ@mail.gmail.com> (raw)
In-Reply-To: <DA41BE1DDCA941489001C7FBD7A8820E5554C822@szxema507-mbx.china.huawei.com>

[-- Attachment #1: Type: text/plain, Size: 1395 bytes --]

On Tue, Oct 28, 2014 at 5:22 AM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> Hi Sterling,
>   How do you think about this issue?
>   As c6x/bfin port handles this the same way, is it OK for the patch to be applied?
>   Thanks.

I have committed this patch as attached. I made a couple of minor
cleanups, plus some small fixes to the ChangeLog entry.

The new code generated is better than the old code, but I'm not
particularly happy with the result. In particular, it is way too
conservative around spilling the trip count, and it still maintains
the trip count inside the loop, in spite of the trip count being dead
at that point. It won't always be dead, but when it is, it should be
eliminated.

So there is quite a bit of performance that could still be gained here.

Thanks for the patch, Felix.

Sterling

2014-10-10  Felix Yang  <felix.yang@huawei.com>

    * config/xtensa/xtensa.h (TARGET_LOOPS): New macro.
    * config/xtensa/xtensa.c: Include dumpfile.h and hw-doloop.h.
    (xtensa_reorg, xtensa_reorg_loops): New.
    (xtensa_can_use_doloop_p, xtensa_invalid_within_doloop): New.
    (hwloop_optimize, hwloop_fail, hwloop_pattern_reg): New.
    (xtensa_emit_loop_end): Emit the zero-overhead loop end label.
    (xtensa_doloop_hooks): Define.
    * config/xtensa/xtensa.md (doloop_end, loop_end): New.
    (zero_cost_loop_start): Rewritten.
    (zero_cost_loop_end): Likewise.

[-- Attachment #2: loop-patch.diff --]
[-- Type: text/plain, Size: 13288 bytes --]

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c	(revision 216943)
+++ gcc/config/xtensa/xtensa.c	(working copy)
@@ -74,6 +74,8 @@
 #include "gimplify.h"
 #include "df.h"
 #include "builtins.h"
+#include "dumpfile.h"
+#include "hw-doloop.h"
 #include "rtl-iter.h"
 
 
@@ -200,6 +202,10 @@
 
 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const rtx_insn *);
 
 static bool xtensa_member_type_forces_blk (const_tree,
 					   machine_mode mode);
@@ -326,6 +332,15 @@
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 \f
@@ -1690,7 +1705,7 @@
         }
     }
 
-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }
 
 
@@ -3720,4 +3735,236 @@
   return !xtensa_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                         unsigned int loop_depth, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop
+     for innermost loops which must be entered from the top.  */
+  if (loop_depth > 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* Return NULL if INSN is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
+    return "Return from a call instruction in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx iter_reg;
+  rtx_insn *insn, *seq, *entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test;
+  rtx_insn *insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx_insn *insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"
Index: gcc/config/xtensa/xtensa.h
===================================================================
--- gcc/config/xtensa/xtensa.h	(revision 216943)
+++ gcc/config/xtensa/xtensa.h	(working copy)
@@ -65,6 +65,7 @@
 #define TARGET_S32C1I		XCHAL_HAVE_S32C1I
 #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 #define TARGET_THREADPTR	XCHAL_HAVE_THREADPTR
+#define TARGET_LOOPS	        XCHAL_HAVE_LOOPS
 
 #define TARGET_DEFAULT \
   ((XCHAL_HAVE_L32R	? 0 : MASK_CONST16) |				\
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md	(revision 216943)
+++ gcc/config/xtensa/xtensa.md	(working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL	9)
   (UNSPEC_TP		10)
   (UNSPEC_MEMW		11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)
 
   (UNSPECV_SET_FP	1)
   (UNSPECV_ENTRY	2)
@@ -1279,21 +1281,25 @@
    (set_attr "length"	"3")])
 
 
+;; Zero-overhead looping support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.
 
 (define_insn "zero_cost_loop_start"
   [(set (pc)
-	(if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-			  (const_int 0))
-		      (label_ref (match_operand 1 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (match_dup 0) (const_int -1)))]
-  ""
-  "loopnez\t%0, %l1"
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
+  "TARGET_LOOPS && optimize"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"3")])
@@ -1300,20 +1306,95 @@
 
 (define_insn "zero_cost_loop_end"
   [(set (pc)
-	(if_then_else (ne (reg:SI 19) (const_int 0))
-		      (label_ref (match_operand 0 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (reg:SI 19) (const_int -1)))]
-  ""
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "2,2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "=a,m")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch:SI 3 "=X,&r"))]
+  "TARGET_LOOPS && optimize"
+  "#"
+  [(set_attr "type"	"jump")
+   (set_attr "mode"	"none")
+   (set_attr "length"	"0")])
+
+(define_insn "loop_end"
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
+  "TARGET_LOOPS && optimize"
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
+(define_split
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "")
+        (plus:SI (match_dup 0)
+                 (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch 3))]
+  "TARGET_LOOPS && optimize && reload_completed"
+  [(const_int 0)]
+{
+  if (!REG_P (operands[0]))
+    {
+      rtx test;
+
+      /* Fall back to a normal conditional branch insn.  */
+      emit_move_insn (operands[3], operands[0]);
+      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
+      emit_move_insn (operands[0], operands[3]);
+      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
+      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
+                                      const0_rtx, operands[1]));
+    }
+  else
+    {
+      emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2]));
+    }
+
+  DONE;
+})
+
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+              (clobber (match_dup 2))])] ; match_scratch
+  "TARGET_LOOPS && optimize"
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+  operands[2] = gen_rtx_SCRATCH (SImode);
+})
+
 \f
 ;; Setting a register from a comparison.
 

      reply	other threads:[~2014-10-30 21:41 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-01-08 16:27 Felix Yang
2014-01-08 16:49 ` Sterling Augustine
2014-01-09 15:08   ` Felix Yang
2014-01-09 23:51     ` Felix Yang
2014-01-10  3:49       ` Yangfei (Felix)
2014-01-13 17:24         ` Sterling Augustine
2014-10-09 11:04           ` Felix Yang
2014-10-10 14:01             ` Felix Yang
2014-10-11  9:32               ` [PING] [PATCH, xtensa] " Yangfei (Felix)
2014-10-13 16:09               ` [PATCH] " augustine.sterling
2014-10-13 16:30                 ` Felix Yang
2014-10-14 15:43                   ` Felix Yang
2014-10-15 19:51                     ` augustine.sterling
2014-10-16  4:52                       ` Yangfei (Felix)
2014-10-21 14:57                         ` augustine.sterling
2014-10-22  2:20                           ` Yangfei (Felix)
2014-10-22  5:20                           ` Yangfei (Felix)
2014-10-23 17:51                             ` augustine.sterling
2014-10-24  1:49                               ` Yangfei (Felix)
2014-10-24  4:32                               ` Yangfei (Felix)
2014-10-24  6:28                                 ` augustine.sterling
2014-10-24  6:33                                   ` Yangfei (Felix)
2014-10-24  6:40                                     ` augustine.sterling
2014-10-24  6:43                                       ` Yangfei (Felix)
2014-10-24  6:49                                         ` augustine.sterling
2014-10-24  6:53                                           ` Yangfei (Felix)
2014-10-24  7:15                                             ` Andrew Pinski
2014-10-28 12:24                                               ` Yangfei (Felix)
2014-10-30 22:02                                                 ` augustine.sterling [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAGSvup_FD91Sbg-9haJOVNuf=oy1FQyuBSA-5K5HDcsFu3kMpQ@mail.gmail.com' \
    --to=augustine.sterling@gmail.com \
    --cc=fei.yang0953@gmail.com \
    --cc=felix.yang@huawei.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=pinskia@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).