public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Add zero-overhead looping for xtensa backend
@ 2014-01-08 16:27 Felix Yang
  2014-01-08 16:49 ` Sterling Augustine
  0 siblings, 1 reply; 29+ messages in thread
From: Felix Yang @ 2014-01-08 16:27 UTC (permalink / raw)
  To: augustine.sterling, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 12187 bytes --]

Hi Sterling,

  This patch implements zero-overhead looping for xtensa backend using
hw-doloop facility.
  If OK for trunk, please apply it for me. Thanks.


Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog    (revision 206431)
+++ gcc/ChangeLog    (working copy)
@@ -1,3 +1,18 @@
+2014-01-08  Felix Yang  <fei.yang0953@gmail.com>
+
+    * config/xtensa/xtensa.c (xtensa_reorg): New.
+    (xtensa_reorg_loops): New.
+    (xtensa_can_use_doloop_p): New.
+    (xtensa_invalid_within_doloop): New.
+    (hwloop_optimize): New.
+    (hwloop_fail): New.
+    (hwloop_pattern_reg): New.
+    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+    (xtensa_doloop_hooks): Define.
+    * config/xtensa/xtensa.md (doloop_end): New.
+    (zero_cost_loop_start): Rewritten.
+    (zero_cost_loop_end): Rewritten.
+
 2014-01-08  Marek Polacek  <polacek@redhat.com>

     PR middle-end/59669
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md    (revision 206431)
+++ gcc/config/xtensa/xtensa.md    (working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL    9)
   (UNSPEC_TP        10)
   (UNSPEC_MEMW        11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)

   (UNSPECV_SET_FP    1)
   (UNSPECV_ENTRY    2)
@@ -1289,6 +1291,8 @@
    (set_attr "length"    "3")])


+;; Hardware loop support.
+
 ;; Define the loop insns used by bct optimization to represent the
 ;; start and end of a zero-overhead loop (in loop.c).  This start
 ;; template generates the loop insn; the end template doesn't generate
@@ -1296,34 +1300,58 @@

 (define_insn "zero_cost_loop_start"
   [(set (pc)
-    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-              (const_int 0))
-              (label_ref (match_operand 1 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "3")])

 (define_insn "zero_cost_loop_end"
   [(set (pc)
-    (if_then_else (ne (reg:SI 19) (const_int 0))
-              (label_ref (match_operand 0 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "0")])

+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+

 ;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c    (revision 206431)
+++ gcc/config/xtensa/xtensa.c    (working copy)
@@ -1,6 +1,7 @@
 /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
    Copyright (C) 2001-2014 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+   Zero-overhead looping support by Felix Yang (felix.yang0953@gmail.com).

 This file is part of GCC.

@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple.h"
 #include "gimplify.h"
 #include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"

-
 /* Enumeration for all of the relational tests, so that we can build
    arrays indexed by the test type, and not worry about the order
    of EQ, NE, etc.  */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,

 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);

 static bool xtensa_member_type_forces_blk (const_tree,
                        enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
         }
     }

-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }


@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }

+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+                         unsigned int level, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop for innermost loops
+     which must be entered from the top.  */
+  if (level != 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx insn, seq, iter_reg, entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test, insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (true, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"

Cheers,
Felix

[-- Attachment #2: xtensa-zcl.diff --]
[-- Type: text/plain, Size: 11880 bytes --]

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 206431)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,18 @@
+2014-01-09  Felix Yang  <fei.yang0953@gmail.com>
+
+	* config/xtensa/xtensa.c (xtensa_reorg): New.
+	(xtensa_reorg_loops): New.
+	(xtensa_can_use_doloop_p): New.
+	(xtensa_invalid_within_doloop): New.
+	(hwloop_optimize): New.
+	(hwloop_fail): New.
+	(hwloop_pattern_reg): New.
+	(xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+	(xtensa_doloop_hooks): Define.
+	* config/xtensa/xtensa.md (doloop_end): New.
+	(zero_cost_loop_start): Rewritten.
+	(zero_cost_loop_end): Rewritten.
+
 2014-01-08  Marek Polacek  <polacek@redhat.com>
 
 	PR middle-end/59669
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md	(revision 206431)
+++ gcc/config/xtensa/xtensa.md	(working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL	9)
   (UNSPEC_TP		10)
   (UNSPEC_MEMW		11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)
 
   (UNSPECV_SET_FP	1)
   (UNSPECV_ENTRY	2)
@@ -1289,6 +1291,8 @@
    (set_attr "length"	"3")])
 
 
+;; Hardware loop support.
+
 ;; Define the loop insns used by bct optimization to represent the
 ;; start and end of a zero-overhead loop (in loop.c).  This start
 ;; template generates the loop insn; the end template doesn't generate
@@ -1296,34 +1300,58 @@
 
 (define_insn "zero_cost_loop_start"
   [(set (pc)
-	(if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-			  (const_int 0))
-		      (label_ref (match_operand 1 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"3")])
 
 (define_insn "zero_cost_loop_end"
   [(set (pc)
-	(if_then_else (ne (reg:SI 19) (const_int 0))
-		      (label_ref (match_operand 0 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+
 \f
 ;; Setting a register from a comparison.
 
Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c	(revision 206431)
+++ gcc/config/xtensa/xtensa.c	(working copy)
@@ -1,6 +1,7 @@
 /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
    Copyright (C) 2001-2014 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+   Zero-overhead looping support by Felix Yang (felix.yang0953@gmail.com).
 
 This file is part of GCC.
 
@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple.h"
 #include "gimplify.h"
 #include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"
 
-
 /* Enumeration for all of the relational tests, so that we can build
    arrays indexed by the test type, and not worry about the order
    of EQ, NE, etc.  */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
 
 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);
 
 static bool xtensa_member_type_forces_blk (const_tree,
 					   enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 \f
@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
         }
     }
 
-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }
 
 
@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+                         unsigned int level, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop for innermost loops
+     which must be entered from the top.  */
+  if (level != 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx insn, seq, iter_reg, entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence (); 
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test, insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa 
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (true, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-01-08 16:27 [PATCH] Add zero-overhead looping for xtensa backend Felix Yang
@ 2014-01-08 16:49 ` Sterling Augustine
  2014-01-09 15:08   ` Felix Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Sterling Augustine @ 2014-01-08 16:49 UTC (permalink / raw)
  To: Felix Yang; +Cc: gcc-patches

On Wed, Jan 8, 2014 at 8:27 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
> Hi Sterling,
>
>   This patch implements zero-overhead looping for xtensa backend using
> hw-doloop facility.
>   If OK for trunk, please apply it for me. Thanks.

Hi Felix,

I last worked on zero-overhead loops for Xtensa in the gcc 4.3
timeframe, but when I did, I ran into several problems related to
later optimizations rearranging the code which I didn't have time to
address.

I'm sure much of that experience is completely stale now, but I would
appreciate a detail of the testing you have done with this patch (in
particular, a description of the different xtensa configurations you
tested it against, especially the ones with and without loop
instructions) before I approve it. Please be sure the assembler can
relax the loops it generates as well. I don't see any particular
problem, but there are many, many gotchas when dealing with xtensa
loop instructions.

It also appears that Tensilica has stopped posting test results for
Xtensa, which makes it difficult to evaluate the quality of this
patch.

Thanks,

Sterling

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-01-08 16:49 ` Sterling Augustine
@ 2014-01-09 15:08   ` Felix Yang
  2014-01-09 23:51     ` Felix Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Felix Yang @ 2014-01-09 15:08 UTC (permalink / raw)
  To: Sterling Augustine; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 14752 bytes --]

Hi Sterling,

    Attached please find version 2 of the patch.

    I applied this updated patch (with small adaptations) to gcc-4.8.2
and carried out some tests.
    I can execute the testcases in a simulator that supports
zero-overhead looping instructions.

    First of all, I can successfully build libgcc, libstdc++ and
newlib for xtensa with this patch.
    The newly built xtensa gcc also passed the testsuite that comes with newlib.
    I also tested the cases under the gcc/testsuite/gcc.c-torture/execute/
directory. More than 800 cases were tested.
    The test results show no new failures with this patch, compared
with the original gcc version.
    Is that OK?

    I also double-checked the loop relaxation issue with binutils-2.24
(the latest version).
    The results show that the assembler can do loop relaxation when the
loop target is too far away (> 256 bytes).
    This is the reason why I don't check the size of the loop.


Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog    (revision 206463)
+++ gcc/ChangeLog    (working copy)
@@ -1,3 +1,18 @@
+2014-01-09  Felix Yang  <fei.yang0953@gmail.com>
+
+    * config/xtensa/xtensa.c (xtensa_reorg): New.
+    (xtensa_reorg_loops): New.
+    (xtensa_can_use_doloop_p): New.
+    (xtensa_invalid_within_doloop): New.
+    (hwloop_optimize): New.
+    (hwloop_fail): New.
+    (hwloop_pattern_reg): New.
+    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+    (xtensa_doloop_hooks): Define.
+    * config/xtensa/xtensa.md (doloop_end): New.
+    (zero_cost_loop_start): Rewritten.
+    (zero_cost_loop_end): Rewritten.
+
 2014-01-09  Richard Biener  <rguenther@suse.de>

     PR tree-optimization/59715
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md    (revision 206463)
+++ gcc/config/xtensa/xtensa.md    (working copy)
@@ -1,6 +1,7 @@
 ;; GCC machine description for Tensilica's Xtensa architecture.
 ;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
 ;; Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+;; Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).

 ;; This file is part of GCC.

@@ -35,6 +36,8 @@
   (UNSPEC_TLS_CALL    9)
   (UNSPEC_TP        10)
   (UNSPEC_MEMW        11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)

   (UNSPECV_SET_FP    1)
   (UNSPECV_ENTRY    2)
@@ -1289,41 +1292,67 @@
    (set_attr "length"    "3")])


+;; Hardware loop support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.

 (define_insn "zero_cost_loop_start"
   [(set (pc)
-    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-              (const_int 0))
-              (label_ref (match_operand 1 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "3")])

 (define_insn "zero_cost_loop_end"
   [(set (pc)
-    (if_then_else (ne (reg:SI 19) (const_int 0))
-              (label_ref (match_operand 0 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "0")])

+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+

 ;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c    (revision 206463)
+++ gcc/config/xtensa/xtensa.c    (working copy)
@@ -1,6 +1,7 @@
 /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
    Copyright (C) 2001-2014 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+   Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).

 This file is part of GCC.

@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple.h"
 #include "gimplify.h"
 #include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"

-
 /* Enumeration for all of the relational tests, so that we can build
    arrays indexed by the test type, and not worry about the order
    of EQ, NE, etc.  */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,

 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);

 static bool xtensa_member_type_forces_blk (const_tree,
                        enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
         }
     }

-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }


@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }

+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+                         unsigned int level, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop for innermost loops
+     which must be entered from the top.  */
+  if (level != 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx insn, seq, iter_reg, entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test, insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"
Cheers,
Felix


On Thu, Jan 9, 2014 at 12:49 AM, Sterling Augustine
<augustine.sterling@gmail.com> wrote:
> On Wed, Jan 8, 2014 at 8:27 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
>> Hi Sterling,
>>
>>   This patch implements zero-overhead looping for xtensa backend using
>> hw-doloop facility.
>>   If OK for trunk, please apply it for me. Thanks.
>
> Hi Felix,
>
> I last worked on zero-overhead loops for Xtensa in the gcc 4.3
> timeframe, but when I did, I ran into several problems related to
> later optimizations rearranging the code which I didn't have time to
> address.
>
> I'm sure much of that experience is completely stale now, but I would
> appreciate a detail of the testing you have done with this patch (in
> particular, a description of the different xtensa configurations you
> tested it against, especially the ones with and without loop
> instructions) before I approve it. Please be sure the assembler can
> relax the loops it generates as well. I don't see any particular
> problem, but there are many, many gotchas when dealing with xtensa
> loop instructions.
>
> It also appears that Tensilica has stopped posting test results for
> Xtensa, which makes it difficult to evaluate the quality of this
> patch.
>
> Thanks,
>
> Sterling

[-- Attachment #2: xtensa-zcl-v2.diff --]
[-- Type: text/plain, Size: 12413 bytes --]

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 206463)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,18 @@
+2014-01-09  Felix Yang  <fei.yang0953@gmail.com>
+
+	* config/xtensa/xtensa.c (xtensa_reorg): New.
+	(xtensa_reorg_loops): New.
+	(xtensa_can_use_doloop_p): New.
+	(xtensa_invalid_within_doloop): New.
+	(hwloop_optimize): New.
+	(hwloop_fail): New.
+	(hwloop_pattern_reg): New.
+	(xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+	(xtensa_doloop_hooks): Define.
+	* config/xtensa/xtensa.md (doloop_end): New.
+	(zero_cost_loop_start): Rewritten.
+	(zero_cost_loop_end): Rewritten.
+
 2014-01-09  Richard Biener  <rguenther@suse.de>
 
 	PR tree-optimization/59715
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md	(revision 206463)
+++ gcc/config/xtensa/xtensa.md	(working copy)
@@ -1,6 +1,7 @@
 ;; GCC machine description for Tensilica's Xtensa architecture.
 ;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
 ;; Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+;; Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).
 
 ;; This file is part of GCC.
 
@@ -35,6 +36,8 @@
   (UNSPEC_TLS_CALL	9)
   (UNSPEC_TP		10)
   (UNSPEC_MEMW		11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)
 
   (UNSPECV_SET_FP	1)
   (UNSPECV_ENTRY	2)
@@ -1289,41 +1292,67 @@
    (set_attr "length"	"3")])
 
 
+;; Hardware loop support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.
 
 (define_insn "zero_cost_loop_start"
   [(set (pc)
-	(if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-			  (const_int 0))
-		      (label_ref (match_operand 1 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"3")])
 
 (define_insn "zero_cost_loop_end"
   [(set (pc)
-	(if_then_else (ne (reg:SI 19) (const_int 0))
-		      (label_ref (match_operand 0 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+
 \f
 ;; Setting a register from a comparison.
 
Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c	(revision 206463)
+++ gcc/config/xtensa/xtensa.c	(working copy)
@@ -1,6 +1,7 @@
 /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
    Copyright (C) 2001-2014 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+   Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).
 
 This file is part of GCC.
 
@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple.h"
 #include "gimplify.h"
 #include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"
 
-
 /* Enumeration for all of the relational tests, so that we can build
    arrays indexed by the test type, and not worry about the order
    of EQ, NE, etc.  */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
 
 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);
 
 static bool xtensa_member_type_forces_blk (const_tree,
 					   enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 \f
@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
         }
     }
 
-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }
 
 
@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+                         unsigned int level, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop for innermost loops
+     which must be entered from the top.  */
+  if (level != 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx insn, seq, iter_reg, entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence (); 
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test, insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa 
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-01-09 15:08   ` Felix Yang
@ 2014-01-09 23:51     ` Felix Yang
  2014-01-10  3:49       ` Yangfei (Felix)
  0 siblings, 1 reply; 29+ messages in thread
From: Felix Yang @ 2014-01-09 23:51 UTC (permalink / raw)
  To: Sterling Augustine; +Cc: gcc-patches

Hi Sterling,

    Please note that version 2 of the patch is for gcc trunk, not for
the gcc-4.8 branch.
    Since the doloop_end pattern format has changed, this patch needs
a small adaptation in order for it to work on gcc-4.8.
    Although I tested it on gcc-4.8, I think the test results still
hold for trunk.
Cheers,
Felix


On Thu, Jan 9, 2014 at 11:08 PM, Felix Yang <fei.yang0953@gmail.com> wrote:
> Hi Sterling,
>
>     Attached please find version 2 of the patch.
>
>     I applied this updated patch (with small adaptations) to gcc-4.8.2
> and carried out some tests.
>     I can execute the testcases in a simulator, which supports
> zero-overhead looping instructions.
>
>     First of all, I can successfully build libgcc, libstdc++ and
> newlib for xtensa with this patch.
>     The newly built xtensa gcc also passed the testsuite which comes
> with newlib.
>     I also tested the cases under the gcc/testsuite/gcc.c-torture/execute/
> directory. More than 800 cases were tested.
>     The test results show no new failures with this patch, compared
> with the original gcc version.
>     Is that OK?
>
>     I also double checked the loop relaxation issue with binutils-2.24
> (the latest version).
>     The result shows that the assembler can do loop relaxation when the
> loop target is too far away (> 256 bytes).
>     And this is the reason why I don't check the size of the loop.
>
>
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog    (revision 206463)
> +++ gcc/ChangeLog    (working copy)
> @@ -1,3 +1,18 @@
> +2014-01-09  Felix Yang  <fei.yang0953@gmail.com>
> +
> +    * config/xtensa/xtensa.c (xtensa_reorg): New.
> +    (xtensa_reorg_loops): New.
> +    (xtensa_can_use_doloop_p): New.
> +    (xtensa_invalid_within_doloop): New.
> +    (hwloop_optimize): New.
> +    (hwloop_fail): New.
> +    (hwloop_pattern_reg): New.
> +    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
> +    (xtensa_doloop_hooks): Define.
> +    * config/xtensa/xtensa.md (doloop_end): New.
> +    (zero_cost_loop_start): Rewritten.
> +    (zero_cost_loop_end): Rewritten.
> +
>  2014-01-09  Richard Biener  <rguenther@suse.de>
>
>      PR tree-optimization/59715
> Index: gcc/config/xtensa/xtensa.md
> ===================================================================
> --- gcc/config/xtensa/xtensa.md    (revision 206463)
> +++ gcc/config/xtensa/xtensa.md    (working copy)
> @@ -1,6 +1,7 @@
>  ;; GCC machine description for Tensilica's Xtensa architecture.
>  ;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
>  ;; Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
> +;; Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).
>
>  ;; This file is part of GCC.
>
> @@ -35,6 +36,8 @@
>    (UNSPEC_TLS_CALL    9)
>    (UNSPEC_TP        10)
>    (UNSPEC_MEMW        11)
> +  (UNSPEC_LSETUP_START  12)
> +  (UNSPEC_LSETUP_END    13)
>
>    (UNSPECV_SET_FP    1)
>    (UNSPECV_ENTRY    2)
> @@ -1289,41 +1292,67 @@
>     (set_attr "length"    "3")])
>
>
> +;; Hardware loop support.
> +
>  ;; Define the loop insns used by bct optimization to represent the
> -;; start and end of a zero-overhead loop (in loop.c).  This start
> -;; template generates the loop insn; the end template doesn't generate
> -;; any instructions since loop end is handled in hardware.
> +;; start and end of a zero-overhead loop.  This start template generates
> +;; the loop insn; the end template doesn't generate any instructions since
> +;; loop end is handled in hardware.
>
>  (define_insn "zero_cost_loop_start"
>    [(set (pc)
> -    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
> -              (const_int 0))
> -              (label_ref (match_operand 1 "" ""))
> -              (pc)))
> -   (set (reg:SI 19)
> -    (plus:SI (match_dup 0) (const_int -1)))]
> +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "register_operand" "+a0")
> +        (plus (match_dup 2)
> +              (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
>    ""
> -  "loopnez\t%0, %l1"
> +  "loop\t%0, %l1_LEND"
>    [(set_attr "type"    "jump")
>     (set_attr "mode"    "none")
>     (set_attr "length"    "3")])
>
>  (define_insn "zero_cost_loop_end"
>    [(set (pc)
> -    (if_then_else (ne (reg:SI 19) (const_int 0))
> -              (label_ref (match_operand 0 "" ""))
> -              (pc)))
> -   (set (reg:SI 19)
> -    (plus:SI (reg:SI 19) (const_int -1)))]
> +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "register_operand" "+a0")
> +        (plus (match_dup 2)
> +              (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
>    ""
>  {
> -    xtensa_emit_loop_end (insn, operands);
> -    return "";
> +  xtensa_emit_loop_end (insn, operands);
> +  return "";
>  }
>    [(set_attr "type"    "jump")
>     (set_attr "mode"    "none")
>     (set_attr "length"    "0")])
>
> +; operand 0 is the loop count pseudo register
> +; operand 1 is the label to jump to at the top of the loop
> +(define_expand "doloop_end"
> +  [(parallel [(set (pc) (if_then_else
> +                          (ne (match_operand:SI 0 "" "")
> +                              (const_int 1))
> +                          (label_ref (match_operand 1 "" ""))
> +                          (pc)))
> +              (set (match_dup 0)
> +                   (plus:SI (match_dup 0)
> +                            (const_int -1)))
> +              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
> +  ""
> +{
> +  /* The loop optimizer doesn't check the predicates... */
> +  if (GET_MODE (operands[0]) != SImode)
> +    FAIL;
> +})
> +
>
>  ;; Setting a register from a comparison.
>
> Index: gcc/config/xtensa/xtensa.c
> ===================================================================
> --- gcc/config/xtensa/xtensa.c    (revision 206463)
> +++ gcc/config/xtensa/xtensa.c    (working copy)
> @@ -1,6 +1,7 @@
>  /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
>     Copyright (C) 2001-2014 Free Software Foundation, Inc.
>     Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
> +   Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).
>
>  This file is part of GCC.
>
> @@ -61,8 +62,9 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimple.h"
>  #include "gimplify.h"
>  #include "df.h"
> +#include "hw-doloop.h"
> +#include "dumpfile.h"
>
> -
>  /* Enumeration for all of the relational tests, so that we can build
>     arrays indexed by the test type, and not worry about the order
>     of EQ, NE, etc.  */
> @@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
>
>  static bool constantpool_address_p (const_rtx addr);
>  static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
> +static void xtensa_reorg (void);
> +static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
> +                                     unsigned int, bool);
> +static const char *xtensa_invalid_within_doloop (const_rtx);
>
>  static bool xtensa_member_type_forces_blk (const_tree,
>                         enum machine_mode mode);
> @@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
>  #undef TARGET_LEGITIMATE_CONSTANT_P
>  #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
>
> +#undef TARGET_MACHINE_DEPENDENT_REORG
> +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
> +
> +#undef TARGET_CAN_USE_DOLOOP_P
> +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
> +
> +#undef TARGET_INVALID_WITHIN_DOLOOP
> +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>
> @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
>          }
>      }
>
> -  output_asm_insn ("# loop end for %0", operands);
> +  output_asm_insn ("%1_LEND:", operands);
>  }
>
>
> @@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
>    return !xtensa_tls_referenced_p (x);
>  }
>
> +/* Implement TARGET_CAN_USE_DOLOOP_P.  */
> +
> +static bool
> +xtensa_can_use_doloop_p (double_int, double_int,
> +                         unsigned int level, bool entered_at_top)
> +{
> +  /* Considering limitations in the hardware, only use doloop for innermost loops
> +     which must be entered from the top.  */
> +  if (level != 1 || !entered_at_top)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* NULL if INSN insn is valid within a low-overhead loop.
> +   Otherwise return why doloop cannot be applied.  */
> +
> +static const char *
> +xtensa_invalid_within_doloop (const_rtx insn)
> +{
> +  if (CALL_P (insn))
> +    return "Function call in the loop.";
> +
> +  return NULL;
> +}
> +
> +/* Optimize LOOP.  */
> +
> +static bool
> +hwloop_optimize (hwloop_info loop)
> +{
> +  int i;
> +  edge entry_edge;
> +  basic_block entry_bb;
> +  rtx insn, seq, iter_reg, entry_after;
> +
> +  if (loop->depth > 1)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
> +      return false;
> +    }
> +
> +  if (!loop->incoming_dest)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has more than one entry\n",
> loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->incoming_dest != loop->head)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not entered from head\n",
> loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->has_call || loop->has_asm)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Scan all the blocks to make sure they don't use iter_reg.  */
> +  if (loop->iter_reg_used || loop->iter_reg_used_outside)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Check if start_label appears before doloop_end.  */
> +  insn = loop->start_label;
> +  while (insn && insn != loop->loop_end)
> +    insn = NEXT_INSN (insn);
> +
> +  if (!insn)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Get the loop iteration register.  */
> +  iter_reg = loop->iter_reg;
> +
> +  gcc_assert (REG_P (iter_reg));
> +
> +  entry_edge = NULL;
> +
> +  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
> +    if (entry_edge->flags & EDGE_FALLTHRU)
> +      break;
> +
> +  if (entry_edge == NULL)
> +    return false;
> +
> +  /* Place the zero_cost_loop_start instruction before the loop.  */
> +  entry_bb = entry_edge->src;
> +
> +  start_sequence ();
> +
> +  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
> +                                              loop->start_label,
> +                                              loop->iter_reg));
> +
> +  seq = get_insns ();
> +
> +  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
> +    {
> +      basic_block new_bb;
> +      edge e;
> +      edge_iterator ei;
> +
> +      emit_insn_before (seq, BB_HEAD (loop->head));
> +      seq = emit_label_before (gen_label_rtx (), seq);
> +
> +      new_bb = create_basic_block (seq, insn, entry_bb);
> +      FOR_EACH_EDGE (e, ei, loop->incoming)
> +        {
> +          if (!(e->flags & EDGE_FALLTHRU))
> +            redirect_edge_and_branch_force (e, new_bb);
> +          else
> +            redirect_edge_succ (e, new_bb);
> +        }
> +      make_edge (new_bb, loop->head, 0);
> +    }
> +  else
> +    {
> +      entry_after = BB_END (entry_bb);
> +      while (DEBUG_INSN_P (entry_after)
> +             || (NOTE_P (entry_after)
> +                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
> +        entry_after = PREV_INSN (entry_after);
> +      emit_insn_after (seq, entry_after);
> +    }
> +
> +  end_sequence ();
> +
> +  return true;
> +}
> +
> +/* A callback for the hw-doloop pass.  Called when a loop we have discovered
> +   turns out not to be optimizable; we have to split the loop_end pattern into
> +   a subtract and a test.  */
> +
> +static void
> +hwloop_fail (hwloop_info loop)
> +{
> +  rtx test, insn = loop->loop_end;
> +
> +  emit_insn_before (gen_addsi3 (loop->iter_reg,
> +                                loop->iter_reg,
> +                                constm1_rtx),
> +                    loop->loop_end);
> +
> +  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
> +  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
> +                                                loop->iter_reg, const0_rtx,
> +                                                loop->start_label),
> +                                loop->loop_end);
> +
> +  JUMP_LABEL (insn) = loop->start_label;
> +  LABEL_NUSES (loop->start_label)++;
> +  delete_insn (loop->loop_end);
> +}
> +
> +/* A callback for the hw-doloop pass.  This function examines INSN; if
> +   it is a doloop_end pattern we recognize, return the reg rtx for the
> +   loop counter.  Otherwise, return NULL_RTX.  */
> +
> +static rtx
> +hwloop_pattern_reg (rtx insn)
> +{
> +  rtx reg;
> +
> +  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
> +    return NULL_RTX;
> +
> +  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
> +  if (!REG_P (reg))
> +    return NULL_RTX;
> +  return reg;
> +}
> +
> +
> +static struct hw_doloop_hooks xtensa_doloop_hooks =
> +{
> +  hwloop_pattern_reg,
> +  hwloop_optimize,
> +  hwloop_fail
> +};
> +
> +/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
> +   and tries to rewrite the RTL of these loops so that proper Xtensa
> +   hardware loops are generated.  */
> +
> +static void
> +xtensa_reorg_loops (void)
> +{
> +  reorg_loops (false, &xtensa_doloop_hooks);
> +}
> +
> +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
> +
> +static void
> +xtensa_reorg (void)
> +{
> +  /* We are freeing block_for_insn in the toplev to keep compatibility
> +     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
> +  compute_bb_for_insn ();
> +
> +  df_analyze ();
> +
> +  /* Doloop optimization.  */
> +  xtensa_reorg_loops ();
> +}
> +
>  #include "gt-xtensa.h"
> Cheers,
> Felix
>
>
> On Thu, Jan 9, 2014 at 12:49 AM, Sterling Augustine
> <augustine.sterling@gmail.com> wrote:
>> On Wed, Jan 8, 2014 at 8:27 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
>>> Hi Sterling,
>>>
>>>   This patch implements zero-overhead looping for xtensa backend using
>>> hw-doloop facility.
>>>   If OK for trunk, please apply it for me. Thanks.
>>
>> Hi Felix,
>>
>> I last worked on zero-overhead loops for Xtensa in the gcc 4.3
>> timeframe, but when I did, I ran into several problems related to
>> later optimizations rearranging the code which I didn't have time to
>> address.
>>
>> I'm sure much of that experience is completely stale now, but I would
>> appreciate a detail of the testing you have done with this patch (in
>> particular, a description of the different xtensa configurations you
>> tested it against, especially the ones with and without loop
>> instructions) before I approve it. Please be sure the assembler can
>> relax the loops it generates as well. I don't see any particular
>> problem, but there are many, many gotchas when dealing with xtensa
>> loop instructions.
>>
>> It also appears that Tensilica has stopped posting test results for
>> Xtensa, which makes it difficult to evaluate the quality of this
>> patch.
>>
>> Thanks,
>>
>> Sterling

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH] Add zero-overhead looping for xtensa backend
  2014-01-09 23:51     ` Felix Yang
@ 2014-01-10  3:49       ` Yangfei (Felix)
  2014-01-13 17:24         ` Sterling Augustine
  0 siblings, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-01-10  3:49 UTC (permalink / raw)
  To: Felix Yang, Sterling Augustine; +Cc: gcc-patches

And here is the xtensa configuration tested (include/xtensa-config.h): 

#define XCHAL_HAVE_BE		0
#define XCHAL_HAVE_LOOPS		1


> 
> Hi Sterling,
> 
>     Please note that version 2 of the patch is for gcc trunk, not for
> gcc-4.8 branch.
>     Since the doloop_end pattern format has changed, this patch needs a small
> adaptation in order for it to work on gcc-4.8.
>     Although I tested it on gcc-4.8, I think the testing result still holds for
> trunk.
> Cheers,
> Felix
> 
> 
> On Thu, Jan 9, 2014 at 11:08 PM, Felix Yang <fei.yang0953@gmail.com> wrote:
> > Hi Sterling,
> >
> >     Attached please find version 2 of the patch.
> >
> >     I applied this updated patch (with small adaptations) to gcc-4.8.2
> > and carried out some tests.
> >     I can execute the testcases in a simulator, which support
> > zero-overhead looping instructions.
> >
> >     First of all, I can successfully build libgcc, libstdc++ and
> > newlibc for xtensa with this patch.
> >     The newly built xtensa gcc also passed testsuite which comes with
> newlibc.
> >     I also tested the cases under gcc/testsuite/gcc.c-torture/execute/
> > directory. There are about 800+ cases tested.
> >     Test result shows no new failed case with this patch, compared
> > with the original gcc version.
> >     Is that OK?
> >
> >     I also double checked the loop relaxation issue with binutils-2.24
> > (the latest version).
> >     The result shows that the assembler can do loop relaxation when the
> > loop target is too far ( > 256 Byte).
> >     And this is the reason why I don't check the size of the loop.
> >
> >
> > Index: gcc/ChangeLog
> >
> ================================================================
> ===
> > --- gcc/ChangeLog    (revision 206463)
> > +++ gcc/ChangeLog    (working copy)
> > @@ -1,3 +1,18 @@
> > +2014-01-09  Felix Yang  <fei.yang0953@gmail.com>
> > +
> > +    * config/xtensa/xtensa.c (xtensa_reorg): New.
> > +    (xtensa_reorg_loops): New.
> > +    (xtensa_can_use_doloop_p): New.
> > +    (xtensa_invalid_within_doloop): New.
> > +    (hwloop_optimize): New.
> > +    (hwloop_fail): New.
> > +    (hwloop_pattern_reg): New.
> > +    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end
> label.
> > +    (xtensa_doloop_hooks): Define.
> > +    * config/xtensa/xtensa.md (doloop_end): New.
> > +    (zero_cost_loop_start): Rewritten.
> > +    (zero_cost_loop_end): Rewritten.
> > +
> >  2014-01-09  Richard Biener  <rguenther@suse.de>
> >
> >      PR tree-optimization/59715
> > Index: gcc/config/xtensa/xtensa.md
> >
> ================================================================
> ===
> > --- gcc/config/xtensa/xtensa.md    (revision 206463)
> > +++ gcc/config/xtensa/xtensa.md    (working copy)
> > @@ -1,6 +1,7 @@
> >  ;; GCC machine description for Tensilica's Xtensa architecture.
> >  ;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
> >  ;; Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
> > +;; Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).
> >
> >  ;; This file is part of GCC.
> >
> > @@ -35,6 +36,8 @@
> >    (UNSPEC_TLS_CALL    9)
> >    (UNSPEC_TP        10)
> >    (UNSPEC_MEMW        11)
> > +  (UNSPEC_LSETUP_START  12)
> > +  (UNSPEC_LSETUP_END    13)
> >
> >    (UNSPECV_SET_FP    1)
> >    (UNSPECV_ENTRY    2)
> > @@ -1289,41 +1292,67 @@
> >     (set_attr "length"    "3")])
> >
> >
> > +;; Hardware loop support.
> > +
> >  ;; Define the loop insns used by bct optimization to represent the
> > -;; start and end of a zero-overhead loop (in loop.c).  This start
> > -;; template generates the loop insn; the end template doesn't generate
> > -;; any instructions since loop end is handled in hardware.
> > +;; start and end of a zero-overhead loop.  This start template generates
> > +;; the loop insn; the end template doesn't generate any instructions since
> > +;; loop end is handled in hardware.
> >
> >  (define_insn "zero_cost_loop_start"
> >    [(set (pc)
> > -    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
> > -              (const_int 0))
> > -              (label_ref (match_operand 1 "" ""))
> > -              (pc)))
> > -   (set (reg:SI 19)
> > -    (plus:SI (match_dup 0) (const_int -1)))]
> > +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> > +                          (const_int 1))
> > +                      (label_ref (match_operand 1 "" ""))
> > +                      (pc)))
> > +   (set (match_operand:SI 2 "register_operand" "+a0")
> > +        (plus (match_dup 2)
> > +              (const_int -1)))
> > +   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
> >    ""
> > -  "loopnez\t%0, %l1"
> > +  "loop\t%0, %l1_LEND"
> >    [(set_attr "type"    "jump")
> >     (set_attr "mode"    "none")
> >     (set_attr "length"    "3")])
> >
> >  (define_insn "zero_cost_loop_end"
> >    [(set (pc)
> > -    (if_then_else (ne (reg:SI 19) (const_int 0))
> > -              (label_ref (match_operand 0 "" ""))
> > -              (pc)))
> > -   (set (reg:SI 19)
> > -    (plus:SI (reg:SI 19) (const_int -1)))]
> > +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> > +                          (const_int 1))
> > +                      (label_ref (match_operand 1 "" ""))
> > +                      (pc)))
> > +   (set (match_operand:SI 2 "register_operand" "+a0")
> > +        (plus (match_dup 2)
> > +              (const_int -1)))
> > +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
> >    ""
> >  {
> > -    xtensa_emit_loop_end (insn, operands);
> > -    return "";
> > +  xtensa_emit_loop_end (insn, operands);  return "";
> >  }
> >    [(set_attr "type"    "jump")
> >     (set_attr "mode"    "none")
> >     (set_attr "length"    "0")])
> >
> > +; operand 0 is the loop count pseudo register
> > +; operand 1 is the label to jump to at the top of the loop
> > +(define_expand "doloop_end"
> > +  [(parallel [(set (pc) (if_then_else
> > +                          (ne (match_operand:SI 0 "" "")
> > +                              (const_int 1))
> > +                          (label_ref (match_operand 1 "" ""))
> > +                          (pc)))
> > +              (set (match_dup 0)
> > +                   (plus:SI (match_dup 0)
> > +                            (const_int -1)))
> > +              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
> > +  ""
> > +{
> > +  /* The loop optimizer doesn't check the predicates... */
> > +  if (GET_MODE (operands[0]) != SImode)
> > +    FAIL;
> > +})
> > +
> >
> >  ;; Setting a register from a comparison.
> >
> > Index: gcc/config/xtensa/xtensa.c
> >
> ================================================================
> ===
> > --- gcc/config/xtensa/xtensa.c    (revision 206463)
> > +++ gcc/config/xtensa/xtensa.c    (working copy)
> > @@ -1,6 +1,7 @@
> >  /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
> >     Copyright (C) 2001-2014 Free Software Foundation, Inc.
> >     Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
> > +   Zero-overhead looping support by Felix Yang (fei.yang0953@gmail.com).
> >
> >  This file is part of GCC.
> >
> > @@ -61,8 +62,9 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "gimple.h"
> >  #include "gimplify.h"
> >  #include "df.h"
> > +#include "hw-doloop.h"
> > +#include "dumpfile.h"
> >
> > -
> >  /* Enumeration for all of the relational tests, so that we can build
> >     arrays indexed by the test type, and not worry about the order
> >     of EQ, NE, etc.  */
> > @@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
> >
> >  static bool constantpool_address_p (const_rtx addr);
> >  static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
> > +static void xtensa_reorg (void);
> > +static bool xtensa_can_use_doloop_p (double_int, double_int
> iterations_max,
> > +                                     unsigned int, bool);
> > +static const char *xtensa_invalid_within_doloop (const_rtx);
> >
> >  static bool xtensa_member_type_forces_blk (const_tree,
> >                         enum machine_mode mode);
> > @@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
> >  #undef TARGET_LEGITIMATE_CONSTANT_P
> >  #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
> >
> > +#undef TARGET_MACHINE_DEPENDENT_REORG
> > +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
> > +
> > +#undef TARGET_CAN_USE_DOLOOP_P
> > +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
> > +
> > +#undef TARGET_INVALID_WITHIN_DOLOOP
> > +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
> > +
> >  struct gcc_target targetm = TARGET_INITIALIZER;
> >
> >
> > @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
> >          }
> >      }
> >
> > -  output_asm_insn ("# loop end for %0", operands);
> > +  output_asm_insn ("%1_LEND:", operands);
> >  }
> >
> >
> > @@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum
> machine_mode mo
> >    return !xtensa_tls_referenced_p (x);
> >  }
> >
> > +/* Implement TARGET_CAN_USE_DOLOOP_P.  */
> > +
> > +static bool
> > +xtensa_can_use_doloop_p (double_int, double_int,
> > +                         unsigned int level, bool entered_at_top)
> > +{
> > +  /* Considering limitations in the hardware, only use doloop for
> > innermost loops
> > +     which must be entered from the top.  */
> > +  if (level != 1 || !entered_at_top)
> > +    return false;
> > +
> > +  return true;
> > +}
> > +
> > +/* NULL if INSN insn is valid within a low-overhead loop.
> > +   Otherwise return why doloop cannot be applied.  */
> > +
> > +static const char *
> > +xtensa_invalid_within_doloop (const_rtx insn)
> > +{
> > +  if (CALL_P (insn))
> > +    return "Function call in the loop.";
> > +
> > +  return NULL;
> > +}
> > +
> > +/* Optimize LOOP.  */
> > +
> > +static bool
> > +hwloop_optimize (hwloop_info loop)
> > +{
> > +  int i;
> > +  edge entry_edge;
> > +  basic_block entry_bb;
> > +  rtx insn, seq, iter_reg, entry_after;
> > +
> > +  if (loop->depth > 1)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  if (!loop->incoming_dest)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d has more than one entry\n",
> > loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  if (loop->incoming_dest != loop->head)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d is not entered from head\n",
> > loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  if (loop->has_call || loop->has_asm)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  /* Scan all the blocks to make sure they don't use iter_reg.  */
> > +  if (loop->iter_reg_used || loop->iter_reg_used_outside)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  /* Check if start_label appears before doloop_end.  */
> > +  insn = loop->start_label;
> > +  while (insn && insn != loop->loop_end)
> > +    insn = NEXT_INSN (insn);
> > +
> > +  if (!insn)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
> > +                 loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  /* Get the loop iteration register.  */
> > +  iter_reg = loop->iter_reg;
> > +
> > +  gcc_assert (REG_P (iter_reg));
> > +
> > +  entry_edge = NULL;
> > +
> > +  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
> > +    if (entry_edge->flags & EDGE_FALLTHRU)
> > +      break;
> > +
> > +  if (entry_edge == NULL)
> > +    return false;
> > +
> > +  /* Place the zero_cost_loop_start instruction before the loop.  */
> > +  entry_bb = entry_edge->src;
> > +
> > +  start_sequence ();
> > +
> > +  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
> > +                                              loop->start_label,
> > +                                              loop->iter_reg));
> > +
> > +  seq = get_insns ();
> > +
> > +  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
> > +    {
> > +      basic_block new_bb;
> > +      edge e;
> > +      edge_iterator ei;
> > +
> > +      emit_insn_before (seq, BB_HEAD (loop->head));
> > +      seq = emit_label_before (gen_label_rtx (), seq);
> > +
> > +      new_bb = create_basic_block (seq, insn, entry_bb);
> > +      FOR_EACH_EDGE (e, ei, loop->incoming)
> > +        {
> > +          if (!(e->flags & EDGE_FALLTHRU))
> > +            redirect_edge_and_branch_force (e, new_bb);
> > +          else
> > +            redirect_edge_succ (e, new_bb);
> > +        }
> > +      make_edge (new_bb, loop->head, 0);
> > +    }
> > +  else
> > +    {
> > +      entry_after = BB_END (entry_bb);
> > +      while (DEBUG_INSN_P (entry_after)
> > +             || (NOTE_P (entry_after)
> > +                 && NOTE_KIND (entry_after) !=
> NOTE_INSN_BASIC_BLOCK))
> > +        entry_after = PREV_INSN (entry_after);
> > +      emit_insn_after (seq, entry_after);
> > +    }
> > +
> > +  end_sequence ();
> > +
> > +  return true;
> > +}
> > +
> > +/* A callback for the hw-doloop pass.  Called when a loop we have
> discovered
> > +   turns out not to be optimizable; we have to split the loop_end pattern
> into
> > +   a subtract and a test.  */
> > +
> > +static void
> > +hwloop_fail (hwloop_info loop)
> > +{
> > +  rtx test, insn = loop->loop_end;
> > +
> > +  emit_insn_before (gen_addsi3 (loop->iter_reg,
> > +                                loop->iter_reg,
> > +                                constm1_rtx),
> > +                    loop->loop_end);
> > +
> > +  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
> > +  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
> > +                                                loop->iter_reg,
> const0_rtx,
> > +
> loop->start_label),
> > +                                loop->loop_end);
> > +
> > +  JUMP_LABEL (insn) = loop->start_label;
> > +  LABEL_NUSES (loop->start_label)++;
> > +  delete_insn (loop->loop_end);
> > +}
> > +
> > +/* A callback for the hw-doloop pass.  This function examines INSN; if
> > +   it is a doloop_end pattern we recognize, return the reg rtx for the
> > +   loop counter.  Otherwise, return NULL_RTX.  */
> > +
> > +static rtx
> > +hwloop_pattern_reg (rtx insn)
> > +{
> > +  rtx reg;
> > +
> > +  if (!JUMP_P (insn) || recog_memoized (insn) !=
> CODE_FOR_zero_cost_loop_end)
> > +    return NULL_RTX;
> > +
> > +  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
> > +  if (!REG_P (reg))
> > +    return NULL_RTX;
> > +  return reg;
> > +}
> > +
> > +
> > +static struct hw_doloop_hooks xtensa_doloop_hooks =
> > +{
> > +  hwloop_pattern_reg,
> > +  hwloop_optimize,
> > +  hwloop_fail
> > +};
> > +
> > +/* Run from machine_dependent_reorg, this pass looks for doloop_end
> insns
> > +   and tries to rewrite the RTL of these loops so that proper Xtensa
> > +   hardware loops are generated.  */
> > +
> > +static void
> > +xtensa_reorg_loops (void)
> > +{
> > +  reorg_loops (false, &xtensa_doloop_hooks);
> > +}
> > +
> > +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
> > +
> > +static void
> > +xtensa_reorg (void)
> > +{
> > +  /* We are freeing block_for_insn in the toplev to keep compatibility
> > +     with old MDEP_REORGS that are not CFG based.  Recompute it now.
> */
> > +  compute_bb_for_insn ();
> > +
> > +  df_analyze ();
> > +
> > +  /* Doloop optimization.  */
> > +  xtensa_reorg_loops ();
> > +}
> > +
> >  #include "gt-xtensa.h"
> > Cheers,
> > Felix
> >
> >
> > On Thu, Jan 9, 2014 at 12:49 AM, Sterling Augustine
> > <augustine.sterling@gmail.com> wrote:
> >> On Wed, Jan 8, 2014 at 8:27 AM, Felix Yang <fei.yang0953@gmail.com>
> wrote:
> >>> Hi Sterling,
> >>>
> >>>   This patch implements zero-overhead looping for xtensa backend using
> >>> hw-doloop facility.
> >>>   If OK for trunk, please apply it for me. Thanks.
> >>
> >> Hi Felix,
> >>
> >> I last worked on zero-overhead loops for Xtensa in the gcc 4.3
> >> timeframe, but when I did, I ran into several problems related to
> >> later optimizations rearranging the code which I didn't have time to
> >> address.
> >>
> >> I'm sure much of that experience is completely stale now, but I would
> >> appreciate a detail of the testing you have done with this patch (in
> >> particular, a description of the different xtensa configurations you
> >> tested it against, especially the ones with and without loop
> >> instructions) before I approve it. Please be sure the assembler can
> >> relax the loops it generates as well. I don't see any particular
> >> problem, but there are many, many gotchas when dealing with xtensa
> >> loop instructions.
> >>
> >> It also appears that Tensilica has stopped posting test results for
> >> Xtensa, which makes it difficult to evaluate the quality of this
> >> patch.
> >>
> >> Thanks,
> >>
> >> Sterling

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-01-10  3:49       ` Yangfei (Felix)
@ 2014-01-13 17:24         ` Sterling Augustine
  2014-10-09 11:04           ` Felix Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Sterling Augustine @ 2014-01-13 17:24 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: Felix Yang, gcc-patches

On Thu, Jan 9, 2014 at 7:48 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> And here is the xtensa configuration tested (include/xtensa-config.h):
>
> #define XCHAL_HAVE_BE           0
> #define XCHAL_HAVE_LOOPS                1


Hi Felix,

I like this patch, and expect I will approve it. However, I would like
you to do two more things before I do:

1. Ensure it doesn't generate zcl's when:

#define XCHAL_HAVE_LOOPS 0

2. Ensure it doesn't produce loop bodies that contain ret, retw,
ret.n or retw.n as the last instruction. It might be easier to just
disallow them in loop bodies entirely though.

Thanks!

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-01-13 17:24         ` Sterling Augustine
@ 2014-10-09 11:04           ` Felix Yang
  2014-10-10 14:01             ` Felix Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Felix Yang @ 2014-10-09 11:04 UTC (permalink / raw)
  To: Sterling Augustine; +Cc: Yangfei (Felix), gcc-patches

[-- Attachment #1: Type: text/plain, Size: 13984 bytes --]

Hello Sterling,

     My paperwork with the FSF has been completed, and we can now move
forward with this patch :-)
     I rebased the patch on the latest trunk. Attached please find
version 3 of the patch.
     The enclosed patch also addresses the two points you raised; does
it look OK to you?
     Regression tested with "make check" using xtensa-elf-gcc built from trunk
with this patch.
     OK to apply?

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog    (revision 216036)
+++ gcc/ChangeLog    (working copy)
@@ -1,3 +1,19 @@
+2014-10-09  Felix Yang  <felix.yang@huawei.com>
+
+    * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
+    * config/xtensa/xtensa.c (xtensa_reorg): New.
+    (xtensa_reorg_loops): New.
+    (xtensa_can_use_doloop_p): New.
+    (xtensa_invalid_within_doloop): New.
+    (hwloop_optimize): New.
+    (hwloop_fail): New.
+    (hwloop_pattern_reg): New.
+    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+    (xtensa_doloop_hooks): Define.
+    * config/xtensa/xtensa.md (doloop_end): New.
+    (zero_cost_loop_start): Rewritten.
+    (zero_cost_loop_end): Rewritten.
+
 2014-10-09  Joern Rennecke  <joern.rennecke@embecosm.com>

     * config/avr/avr.opt (mmcu=): Change to have a string value.
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md    (revision 216036)
+++ gcc/config/xtensa/xtensa.md    (working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL    9)
   (UNSPEC_TP        10)
   (UNSPEC_MEMW        11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)

   (UNSPECV_SET_FP    1)
   (UNSPECV_ENTRY    2)
@@ -1289,41 +1291,67 @@
    (set_attr "length"    "3")])


+;; Zero-overhead looping support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.

 (define_insn "zero_cost_loop_start"
   [(set (pc)
-    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-              (const_int 0))
-              (label_ref (match_operand 1 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "3")])

 (define_insn "zero_cost_loop_end"
   [(set (pc)
-    (if_then_else (ne (reg:SI 19) (const_int 0))
-              (label_ref (match_operand 0 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "0")])

+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+

 ;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c    (revision 216036)
+++ gcc/config/xtensa/xtensa.c    (working copy)
@@ -61,6 +61,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimplify.h"
 #include "df.h"
 #include "builtins.h"
+#include "dumpfile.h"
+#include "hw-doloop.h"


 /* Enumeration for all of the relational tests, so that we can build
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,

 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const rtx_insn *);

 static bool xtensa_member_type_forces_blk (const_tree,
                        enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand
         }
     }

-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }


@@ -3712,4 +3727,239 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }

+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                         unsigned int loop_depth, bool entered_at_top)
+{
+  if (!TARGET_LOOPS)
+    return false;
+
+  /* Considering limitations in the hardware, only use doloop
+     for innermost loops which must be entered from the top.  */
+  if (loop_depth > 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
+    return "Return from a call instruction in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx iter_reg;
+  rtx_insn *insn, *seq, *entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test;
+  rtx_insn *insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx_insn *insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"
Index: gcc/config/xtensa/xtensa.h
===================================================================
--- gcc/config/xtensa/xtensa.h    (revision 216036)
+++ gcc/config/xtensa/xtensa.h    (working copy)
@@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
 #define TARGET_S32C1I        XCHAL_HAVE_S32C1I
 #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 #define TARGET_THREADPTR    XCHAL_HAVE_THREADPTR
+#define TARGET_LOOPS            XCHAL_HAVE_LOOPS

 #define TARGET_DEFAULT \
   ((XCHAL_HAVE_L32R    ? 0 : MASK_CONST16) |                \
Cheers,
Felix


On Tue, Jan 14, 2014 at 1:23 AM, Sterling Augustine
<augustine.sterling@gmail.com> wrote:
> On Thu, Jan 9, 2014 at 7:48 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
>> And here is the xtensa configuration tested (include/xtensa-config.h):
>>
>> #define XCHAL_HAVE_BE           0
>> #define XCHAL_HAVE_LOOPS                1
>
>
> Hi Felix,
>
> I like this patch, and expect I will approve it. However, I would like
> you to do two more things before I do:
>
> 1. Ensure it doesn't generate zcl's when:
>
> #define XCHAL_HAVE_LOOPS 0
>
> 2. Ensure it doesn't produce loop bodies that contain ret, retw,
> ret.n or retw.n as the last instruction. It might be easier to just
> disallow them in loop bodies entirely, though.
>
> Thanks!

[-- Attachment #2: xtensa-zcl-v3.diff --]
[-- Type: text/plain, Size: 12660 bytes --]

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 216036)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,19 @@
+2014-10-09  Felix Yang  <felix.yang@huawei.com>
+
+	* config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
+	* config/xtensa/xtensa.c (xtensa_reorg): New.
+	(xtensa_reorg_loops): New.
+	(xtensa_can_use_doloop_p): New.
+	(xtensa_invalid_within_doloop): New.
+	(hwloop_optimize): New.
+	(hwloop_fail): New.
+	(hwloop_pattern_reg): New.
+	(xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+	(xtensa_doloop_hooks): Define.
+	* config/xtensa/xtensa.md (doloop_end): New.
+	(zero_cost_loop_start): Rewritten.
+	(zero_cost_loop_end): Rewritten.
+
 2014-10-09  Joern Rennecke  <joern.rennecke@embecosm.com>
 
 	* config/avr/avr.opt (mmcu=): Change to have a string value.
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md	(revision 216036)
+++ gcc/config/xtensa/xtensa.md	(working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL	9)
   (UNSPEC_TP		10)
   (UNSPEC_MEMW		11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)
 
   (UNSPECV_SET_FP	1)
   (UNSPECV_ENTRY	2)
@@ -1289,41 +1291,67 @@
    (set_attr "length"	"3")])
 
 
+;; Zero-overhead looping support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.
 
 (define_insn "zero_cost_loop_start"
   [(set (pc)
-	(if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-			  (const_int 0))
-		      (label_ref (match_operand 1 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"3")])
 
 (define_insn "zero_cost_loop_end"
   [(set (pc)
-	(if_then_else (ne (reg:SI 19) (const_int 0))
-		      (label_ref (match_operand 0 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "+a0")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+
 \f
 ;; Setting a register from a comparison.
 
Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c	(revision 216036)
+++ gcc/config/xtensa/xtensa.c	(working copy)
@@ -61,6 +61,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimplify.h"
 #include "df.h"
 #include "builtins.h"
+#include "dumpfile.h"
+#include "hw-doloop.h"
 
 
 /* Enumeration for all of the relational tests, so that we can build
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
 
 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const rtx_insn *);
 
 static bool xtensa_member_type_forces_blk (const_tree,
 					   enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 \f
@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand
         }
     }
 
-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }
 
 
@@ -3712,4 +3727,239 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                         unsigned int loop_depth, bool entered_at_top)
+{
+  if (!TARGET_LOOPS)
+    return false;
+
+  /* Considering limitations in the hardware, only use doloop
+     for innermost loops which must be entered from the top.  */
+  if (loop_depth > 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
+    return "Return from a call instruction in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx iter_reg;
+  rtx_insn *insn, *seq, *entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test;
+  rtx_insn *insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx_insn *insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"
Index: gcc/config/xtensa/xtensa.h
===================================================================
--- gcc/config/xtensa/xtensa.h	(revision 216036)
+++ gcc/config/xtensa/xtensa.h	(working copy)
@@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
 #define TARGET_S32C1I		XCHAL_HAVE_S32C1I
 #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 #define TARGET_THREADPTR	XCHAL_HAVE_THREADPTR
+#define TARGET_LOOPS	        XCHAL_HAVE_LOOPS
 
 #define TARGET_DEFAULT \
   ((XCHAL_HAVE_L32R	? 0 : MASK_CONST16) |				\

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-09 11:04           ` Felix Yang
@ 2014-10-10 14:01             ` Felix Yang
  2014-10-11  9:32               ` [PING] [PATCH, xtensa] " Yangfei (Felix)
  2014-10-13 16:09               ` [PATCH] " augustine.sterling
  0 siblings, 2 replies; 29+ messages in thread
From: Felix Yang @ 2014-10-10 14:01 UTC (permalink / raw)
  To: Sterling Augustine; +Cc: Yangfei (Felix), gcc-patches

[-- Attachment #1: Type: text/plain, Size: 30223 bytes --]

Hi Sterling,

    I made some improvements to the patch. Two changes:
    1. TARGET_LOOPS is now used as a condition of the doloop-related
patterns, which is more elegant.
    2. As the trip count register of the zero-cost loop may
potentially be spilled, we need to change the patterns in order to handle
this issue. The solution is similar to the one adopted by the c6x backend:
if the register turns out to have been spilled once reload is completed,
just turn the zero-cost loop into a regular loop.
    Please find version 4 of the patch attached. It has been regression
tested with "make check" using xtensa-elf-gcc and the simulator.
    OK for trunk?

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog    (revision 216079)
+++ gcc/ChangeLog    (working copy)
@@ -1,3 +1,20 @@
+2014-10-10  Felix Yang  <felix.yang@huawei.com>
+
+    * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
+    * config/xtensa/xtensa.c (xtensa_reorg): New.
+    (xtensa_reorg_loops): New.
+    (xtensa_can_use_doloop_p): New.
+    (xtensa_invalid_within_doloop): New.
+    (hwloop_optimize): New.
+    (hwloop_fail): New.
+    (hwloop_pattern_reg): New.
+    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+    (xtensa_doloop_hooks): Define.
+    * config/xtensa/xtensa.md (doloop_end): New.
+    (loop_end): New.
+    (zero_cost_loop_start): Rewritten.
+    (zero_cost_loop_end): Rewritten.
+
 2014-10-10  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * configure.ac: Add --enable-fix-cortex-a53-835769 option.
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md    (revision 216079)
+++ gcc/config/xtensa/xtensa.md    (working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL    9)
   (UNSPEC_TP        10)
   (UNSPEC_MEMW        11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)

   (UNSPECV_SET_FP    1)
   (UNSPECV_ENTRY    2)
@@ -1289,41 +1291,120 @@
    (set_attr "length"    "3")])


+;; Zero-overhead looping support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.

 (define_insn "zero_cost_loop_start"
   [(set (pc)
-    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-              (const_int 0))
-              (label_ref (match_operand 1 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (match_dup 0) (const_int -1)))]
-  ""
-  "loopnez\t%0, %l1"
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
+  "TARGET_LOOPS && optimize"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "3")])

 (define_insn "zero_cost_loop_end"
   [(set (pc)
-    (if_then_else (ne (reg:SI 19) (const_int 0))
-              (label_ref (match_operand 0 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (reg:SI 19) (const_int -1)))]
-  ""
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "2,2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "=a,m")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch:SI 3 "=X,&r"))]
+  "TARGET_LOOPS && optimize"
+  "#"
+  [(set_attr "type"    "jump")
+   (set_attr "mode"    "none")
+   (set_attr "length"    "0")])
+
+(define_insn "loop_end"
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
+  "TARGET_LOOPS && optimize"
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "0")])

+(define_split
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "")
+        (plus:SI (match_dup 0)
+                 (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch 3))]
+  "TARGET_LOOPS && optimize && reload_completed"
+  [(const_int 0)]
+{
+  if (!REG_P (operands[0]))
+    {
+      rtx test;
+
+      /* Fallback into a normal conditional branch insn.  */
+      emit_move_insn (operands[3], operands[0]);
+      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
+      emit_move_insn (operands[0], operands[3]);
+      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
+      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
+                                      const0_rtx, operands[1]));
+    }
+  else
+    {
+      emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2]));
+    }
+
+  DONE;
+})
+
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+              (clobber (match_dup 2))])] ; match_scratch
+  "TARGET_LOOPS && optimize"
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+  operands[2] = gen_rtx_SCRATCH (SImode);
+})
+

 ;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c    (revision 216079)
+++ gcc/config/xtensa/xtensa.c    (working copy)
@@ -61,6 +61,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimplify.h"
 #include "df.h"
 #include "builtins.h"
+#include "dumpfile.h"
+#include "hw-doloop.h"


 /* Enumeration for all of the relational tests, so that we can build
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,

 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const rtx_insn *);

 static bool xtensa_member_type_forces_blk (const_tree,
                        enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand
         }
     }

-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }


@@ -3712,4 +3727,236 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }

+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                         unsigned int loop_depth, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop
+     for innermost loops which must be entered from the top.  */
+  if (loop_depth > 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* Return NULL if INSN is valid within a low-overhead loop.
+   Otherwise return the reason why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
+    return "Return from a call instruction in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx iter_reg;
+  rtx_insn *insn, *seq, *entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test;
+  rtx_insn *insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx_insn *insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"
Index: gcc/config/xtensa/xtensa.h
===================================================================
--- gcc/config/xtensa/xtensa.h    (revision 216079)
+++ gcc/config/xtensa/xtensa.h    (working copy)
@@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
 #define TARGET_S32C1I        XCHAL_HAVE_S32C1I
 #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 #define TARGET_THREADPTR    XCHAL_HAVE_THREADPTR
+#define TARGET_LOOPS            XCHAL_HAVE_LOOPS

 #define TARGET_DEFAULT \
   ((XCHAL_HAVE_L32R    ? 0 : MASK_CONST16) |                \

Cheers,
Felix


On Thu, Oct 9, 2014 at 6:52 PM, Felix Yang <fei.yang0953@gmail.com> wrote:
> Hello Sterling,
>
>      My paperwork with the FSF is finished, and we can now move
> forward with this patch :-)
>      I rebased the patch on the latest trunk. Attached please find
> version 3 of the patch.
>      The enclosed patch also addresses the two points you raised;
> does it look OK to you?
>      Regression tested with "make check" using an xtensa-elf-gcc built
> from trunk with this patch.
>      OK to apply?
>
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog    (revision 216036)
> +++ gcc/ChangeLog    (working copy)
> @@ -1,3 +1,19 @@
> +2014-10-09  Felix Yang  <felix.yang@huawei.com>
> +
> +    * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
> +    * config/xtensa/xtensa.c (xtensa_reorg): New.
> +    (xtensa_reorg_loops): New.
> +    (xtensa_can_use_doloop_p): New.
> +    (xtensa_invalid_within_doloop): New.
> +    (hwloop_optimize): New.
> +    (hwloop_fail): New.
> +    (hwloop_pattern_reg): New.
> +    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
> +    (xtensa_doloop_hooks): Define.
> +    * config/xtensa/xtensa.md (doloop_end): New.
> +    (zero_cost_loop_start): Rewritten.
> +    (zero_cost_loop_end): Rewritten.
> +
>  2014-10-09  Joern Rennecke  <joern.rennecke@embecosm.com>
>
>      * config/avr/avr.opt (mmcu=): Change to have a string value.
> Index: gcc/config/xtensa/xtensa.md
> ===================================================================
> --- gcc/config/xtensa/xtensa.md    (revision 216036)
> +++ gcc/config/xtensa/xtensa.md    (working copy)
> @@ -35,6 +35,8 @@
>    (UNSPEC_TLS_CALL    9)
>    (UNSPEC_TP        10)
>    (UNSPEC_MEMW        11)
> +  (UNSPEC_LSETUP_START  12)
> +  (UNSPEC_LSETUP_END    13)
>
>    (UNSPECV_SET_FP    1)
>    (UNSPECV_ENTRY    2)
> @@ -1289,41 +1291,67 @@
>     (set_attr "length"    "3")])
>
>
> +;; Zero-overhead looping support.
> +
>  ;; Define the loop insns used by bct optimization to represent the
> -;; start and end of a zero-overhead loop (in loop.c).  This start
> -;; template generates the loop insn; the end template doesn't generate
> -;; any instructions since loop end is handled in hardware.
> +;; start and end of a zero-overhead loop.  This start template generates
> +;; the loop insn; the end template doesn't generate any instructions since
> +;; loop end is handled in hardware.
>
>  (define_insn "zero_cost_loop_start"
>    [(set (pc)
> -    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
> -              (const_int 0))
> -              (label_ref (match_operand 1 "" ""))
> -              (pc)))
> -   (set (reg:SI 19)
> -    (plus:SI (match_dup 0) (const_int -1)))]
> +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "register_operand" "+a0")
> +        (plus (match_dup 2)
> +              (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
>    ""
> -  "loopnez\t%0, %l1"
> +  "loop\t%0, %l1_LEND"
>    [(set_attr "type"    "jump")
>     (set_attr "mode"    "none")
>     (set_attr "length"    "3")])
>
>  (define_insn "zero_cost_loop_end"
>    [(set (pc)
> -    (if_then_else (ne (reg:SI 19) (const_int 0))
> -              (label_ref (match_operand 0 "" ""))
> -              (pc)))
> -   (set (reg:SI 19)
> -    (plus:SI (reg:SI 19) (const_int -1)))]
> +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "register_operand" "+a0")
> +        (plus (match_dup 2)
> +              (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
>    ""
>  {
> -    xtensa_emit_loop_end (insn, operands);
> -    return "";
> +  xtensa_emit_loop_end (insn, operands);
> +  return "";
>  }
>    [(set_attr "type"    "jump")
>     (set_attr "mode"    "none")
>     (set_attr "length"    "0")])
>
> +; operand 0 is the loop count pseudo register
> +; operand 1 is the label to jump to at the top of the loop
> +(define_expand "doloop_end"
> +  [(parallel [(set (pc) (if_then_else
> +                          (ne (match_operand:SI 0 "" "")
> +                              (const_int 1))
> +                          (label_ref (match_operand 1 "" ""))
> +                          (pc)))
> +              (set (match_dup 0)
> +                   (plus:SI (match_dup 0)
> +                            (const_int -1)))
> +              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
> +  ""
> +{
> +  /* The loop optimizer doesn't check the predicates... */
> +  if (GET_MODE (operands[0]) != SImode)
> +    FAIL;
> +})
> +
>
>  ;; Setting a register from a comparison.
>
> Index: gcc/config/xtensa/xtensa.c
> ===================================================================
> --- gcc/config/xtensa/xtensa.c    (revision 216036)
> +++ gcc/config/xtensa/xtensa.c    (working copy)
> @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimplify.h"
>  #include "df.h"
>  #include "builtins.h"
> +#include "dumpfile.h"
> +#include "hw-doloop.h"
>
>
>  /* Enumeration for all of the relational tests, so that we can build
> @@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
>
>  static bool constantpool_address_p (const_rtx addr);
>  static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
> +static void xtensa_reorg (void);
> +static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
> +                                     unsigned int, bool);
> +static const char *xtensa_invalid_within_doloop (const rtx_insn *);
>
>  static bool xtensa_member_type_forces_blk (const_tree,
>                         enum machine_mode mode);
> @@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
>  #undef TARGET_LEGITIMATE_CONSTANT_P
>  #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
>
> +#undef TARGET_MACHINE_DEPENDENT_REORG
> +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
> +
> +#undef TARGET_CAN_USE_DOLOOP_P
> +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
> +
> +#undef TARGET_INVALID_WITHIN_DOLOOP
> +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>
> @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand
>          }
>      }
>
> -  output_asm_insn ("# loop end for %0", operands);
> +  output_asm_insn ("%1_LEND:", operands);
>  }
>
>
> @@ -3712,4 +3727,239 @@ xtensa_legitimate_constant_p (enum machine_mode mo
>    return !xtensa_tls_referenced_p (x);
>  }
>
> +/* Implement TARGET_CAN_USE_DOLOOP_P.  */
> +
> +static bool
> +xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
> +                         unsigned int loop_depth, bool entered_at_top)
> +{
> +  if (!TARGET_LOOPS)
> +    return false;
> +
> +  /* Considering limitations in the hardware, only use doloop
> +     for innermost loops which must be entered from the top.  */
> +  if (loop_depth > 1 || !entered_at_top)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* Return NULL if INSN is valid within a low-overhead loop.
> +   Otherwise return the reason why doloop cannot be applied.  */
> +
> +static const char *
> +xtensa_invalid_within_doloop (const rtx_insn *insn)
> +{
> +  if (CALL_P (insn))
> +    return "Function call in the loop.";
> +
> +  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
> +    return "Return from a call instruction in the loop.";
> +
> +  return NULL;
> +}
> +
> +/* Optimize LOOP.  */
> +
> +static bool
> +hwloop_optimize (hwloop_info loop)
> +{
> +  int i;
> +  edge entry_edge;
> +  basic_block entry_bb;
> +  rtx iter_reg;
> +  rtx_insn *insn, *seq, *entry_after;
> +
> +  if (loop->depth > 1)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not innermost\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (!loop->incoming_dest)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has more than one entry\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->incoming_dest != loop->head)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not entered from head\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->has_call || loop->has_asm)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has invalid insn\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Scan all the blocks to make sure they don't use iter_reg.  */
> +  if (loop->iter_reg_used || loop->iter_reg_used_outside)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d uses iterator\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Check if start_label appears before doloop_end.  */
> +  insn = loop->start_label;
> +  while (insn && insn != loop->loop_end)
> +    insn = NEXT_INSN (insn);
> +
> +  if (!insn)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Get the loop iteration register.  */
> +  iter_reg = loop->iter_reg;
> +
> +  gcc_assert (REG_P (iter_reg));
> +
> +  entry_edge = NULL;
> +
> +  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
> +    if (entry_edge->flags & EDGE_FALLTHRU)
> +      break;
> +
> +  if (entry_edge == NULL)
> +    return false;
> +
> +  /* Place the zero_cost_loop_start instruction before the loop.  */
> +  entry_bb = entry_edge->src;
> +
> +  start_sequence ();
> +
> +  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
> +                                              loop->start_label,
> +                                              loop->iter_reg));
> +
> +  seq = get_insns ();
> +
> +  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
> +    {
> +      basic_block new_bb;
> +      edge e;
> +      edge_iterator ei;
> +
> +      emit_insn_before (seq, BB_HEAD (loop->head));
> +      seq = emit_label_before (gen_label_rtx (), seq);
> +      new_bb = create_basic_block (seq, insn, entry_bb);
> +      FOR_EACH_EDGE (e, ei, loop->incoming)
> +        {
> +          if (!(e->flags & EDGE_FALLTHRU))
> +            redirect_edge_and_branch_force (e, new_bb);
> +          else
> +            redirect_edge_succ (e, new_bb);
> +        }
> +
> +      make_edge (new_bb, loop->head, 0);
> +    }
> +  else
> +    {
> +      entry_after = BB_END (entry_bb);
> +      while (DEBUG_INSN_P (entry_after)
> +             || (NOTE_P (entry_after)
> +                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
> +        entry_after = PREV_INSN (entry_after);
> +
> +      emit_insn_after (seq, entry_after);
> +    }
> +
> +  end_sequence ();
> +
> +  return true;
> +}
> +
> +/* A callback for the hw-doloop pass.  Called when a loop we have discovered
> +   turns out not to be optimizable; we have to split the loop_end pattern into
> +   a subtract and a test.  */
> +
> +static void
> +hwloop_fail (hwloop_info loop)
> +{
> +  rtx test;
> +  rtx_insn *insn = loop->loop_end;
> +
> +  emit_insn_before (gen_addsi3 (loop->iter_reg,
> +                                loop->iter_reg,
> +                                constm1_rtx),
> +                    loop->loop_end);
> +
> +  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
> +  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
> +                                                loop->iter_reg, const0_rtx,
> +                                                loop->start_label),
> +                                loop->loop_end);
> +
> +  JUMP_LABEL (insn) = loop->start_label;
> +  LABEL_NUSES (loop->start_label)++;
> +  delete_insn (loop->loop_end);
> +}
> +
> +/* A callback for the hw-doloop pass.  This function examines INSN; if
> +   it is a doloop_end pattern we recognize, return the reg rtx for the
> +   loop counter.  Otherwise, return NULL_RTX.  */
> +
> +static rtx
> +hwloop_pattern_reg (rtx_insn *insn)
> +{
> +  rtx reg;
> +
> +  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
> +    return NULL_RTX;
> +
> +  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
> +  if (!REG_P (reg))
> +    return NULL_RTX;
> +
> +  return reg;
> +}
> +
> +
> +static struct hw_doloop_hooks xtensa_doloop_hooks =
> +{
> +  hwloop_pattern_reg,
> +  hwloop_optimize,
> +  hwloop_fail
> +};
> +
> +/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
> +   and tries to rewrite the RTL of these loops so that proper Xtensa
> +   hardware loops are generated.  */
> +
> +static void
> +xtensa_reorg_loops (void)
> +{
> +  reorg_loops (false, &xtensa_doloop_hooks);
> +}
> +
> +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
> +
> +static void
> +xtensa_reorg (void)
> +{
> +  /* We are freeing block_for_insn in the toplev to keep compatibility
> +     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
> +  compute_bb_for_insn ();
> +
> +  df_analyze ();
> +
> +  /* Doloop optimization.  */
> +  xtensa_reorg_loops ();
> +}
> +
>  #include "gt-xtensa.h"
> Index: gcc/config/xtensa/xtensa.h
> ===================================================================
> --- gcc/config/xtensa/xtensa.h    (revision 216036)
> +++ gcc/config/xtensa/xtensa.h    (working copy)
> @@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
>  #define TARGET_S32C1I        XCHAL_HAVE_S32C1I
>  #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
>  #define TARGET_THREADPTR    XCHAL_HAVE_THREADPTR
> +#define TARGET_LOOPS            XCHAL_HAVE_LOOPS
>
>  #define TARGET_DEFAULT \
>    ((XCHAL_HAVE_L32R    ? 0 : MASK_CONST16) |                \
> Cheers,
> Felix
>
>
> On Tue, Jan 14, 2014 at 1:23 AM, Sterling Augustine
> <augustine.sterling@gmail.com> wrote:
>> On Thu, Jan 9, 2014 at 7:48 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
>>> And here is the xtensa configuration tested (include/xtensa-config.h):
>>>
>>> #define XCHAL_HAVE_BE           0
>>> #define XCHAL_HAVE_LOOPS                1
>>
>>
>> Hi Felix,
>>
>> I like this patch, and expect I will approve it. However, I would like
>> you to do two more things before I do:
>>
>> 1. Ensure it doesn't generate zero-cost loops (ZCLs) when:
>>
>> #define XCHAL_HAVE_LOOPS 0
>>
>> 2. Ensure it doesn't produce loop bodies that contain ret, retw,
>> ret.n or retw.n as the last instruction. It might be easier to just
>> disallow them in loop bodies entirely though.
>>
>> Thanks!

[-- Attachment #2: xtensa-zcl-v4.diff --]
[-- Type: text/plain, Size: 14492 bytes --]

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 216079)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,20 @@
+2014-10-10  Felix Yang  <felix.yang@huawei.com>
+
+	* config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
+	* config/xtensa/xtensa.c (xtensa_reorg): New.
+	(xtensa_reorg_loops): New.
+	(xtensa_can_use_doloop_p): New.
+	(xtensa_invalid_within_doloop): New.
+	(hwloop_optimize): New.
+	(hwloop_fail): New.
+	(hwloop_pattern_reg): New.
+	(xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+	(xtensa_doloop_hooks): Define.
+	* config/xtensa/xtensa.md (doloop_end): New.
+	(loop_end): New.
+	(zero_cost_loop_start): Rewritten.
+	(zero_cost_loop_end): Rewritten.
+
 2014-10-10  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
 	* configure.ac: Add --enable-fix-cortex-a53-835769 option.
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md	(revision 216079)
+++ gcc/config/xtensa/xtensa.md	(working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL	9)
   (UNSPEC_TP		10)
   (UNSPEC_MEMW		11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)
 
   (UNSPECV_SET_FP	1)
   (UNSPECV_ENTRY	2)
@@ -1289,41 +1291,120 @@
    (set_attr "length"	"3")])
 
 
+;; Zero-overhead looping support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.
 
 (define_insn "zero_cost_loop_start"
   [(set (pc)
-	(if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-			  (const_int 0))
-		      (label_ref (match_operand 1 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (match_dup 0) (const_int -1)))]
-  ""
-  "loopnez\t%0, %l1"
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
+  "TARGET_LOOPS && optimize"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"3")])
 
 (define_insn "zero_cost_loop_end"
   [(set (pc)
-	(if_then_else (ne (reg:SI 19) (const_int 0))
-		      (label_ref (match_operand 0 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (reg:SI 19) (const_int -1)))]
-  ""
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "2,2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "=a,m")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch:SI 3 "=X,&r"))]
+  "TARGET_LOOPS && optimize"
+  "#"
+  [(set_attr "type"	"jump")
+   (set_attr "mode"	"none")
+   (set_attr "length"	"0")])
+
+(define_insn "loop_end"
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
+  "TARGET_LOOPS && optimize"
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
+(define_split
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "")
+        (plus:SI (match_dup 0)
+                 (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch 3))]
+  "TARGET_LOOPS && optimize && reload_completed"
+  [(const_int 0)]
+{
+  if (!REG_P (operands[0]))
+    {
+      rtx test;
+
+      /* Fall back to a normal conditional branch insn.  */
+      emit_move_insn (operands[3], operands[0]);
+      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
+      emit_move_insn (operands[0], operands[3]);
+      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
+      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
+                                      const0_rtx, operands[1]));
+    }
+  else
+    {
+      emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2]));
+    }
+
+  DONE;
+})
+
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+              (clobber (match_dup 2))])] ; match_scratch
+  "TARGET_LOOPS && optimize"
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+  operands[2] = gen_rtx_SCRATCH (SImode);
+})
+
 \f
 ;; Setting a register from a comparison.
 
Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c	(revision 216079)
+++ gcc/config/xtensa/xtensa.c	(working copy)
@@ -61,6 +61,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimplify.h"
 #include "df.h"
 #include "builtins.h"
+#include "dumpfile.h"
+#include "hw-doloop.h"
 
 
 /* Enumeration for all of the relational tests, so that we can build
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
 
 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const rtx_insn *);
 
 static bool xtensa_member_type_forces_blk (const_tree,
 					   enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 \f
@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand
         }
     }
 
-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }
 
 
@@ -3712,4 +3727,236 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CAN_USE_DOLOOP_P.  The widest_int arguments are unused.  */
+
+static bool
+xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                         unsigned int loop_depth, bool entered_at_top)
+{
+  /* Because of hardware limitations, use doloop only for innermost loops
+     (LOOP_DEPTH <= 1) that are entered from the top.  */
+  if (loop_depth > 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* Implement TARGET_INVALID_WITHIN_DOLOOP.  Return NULL if INSN is valid in
+   a low-overhead loop; otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
+    return "Return from a call instruction in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP into a zero-overhead loop; return true on success.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx iter_reg;
+  rtx_insn *insn, *seq, *entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Give up if the iteration register is used in the loop or outside it.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Check that loop_end is reachable by walking forward from start_label.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+  /* Look for the fall-through edge among the incoming edges.  */
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+  /* With several entries or a multi-successor entry_bb, use a new block.  */
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; replace the loop_end insn with a decrement
+   of the iteration register and a branch back while it is nonzero.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test;
+  rtx_insn *insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+  /* Account for the new jump's reference to start_label.  */
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if it
+   is a jump matching the loop_end pattern and the destination of its second
+   element is a register, return that reg.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx_insn *insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+
+  return reg;
+}
+
+/* Callbacks handed to reorg_loops by xtensa_reorg_loops.  */
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from xtensa_reorg, this pass hands xtensa_doloop_hooks to reorg_loops,
+   which looks for doloop_end insns and tries to rewrite the RTL of these
+   loops so that proper Xtensa hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement TARGET_MACHINE_DEPENDENT_REORG.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* block_for_insn has been freed in toplev to keep compatibility with
+     old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization: rewrite eligible loops as hardware loops.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"
Index: gcc/config/xtensa/xtensa.h
===================================================================
--- gcc/config/xtensa/xtensa.h	(revision 216079)
+++ gcc/config/xtensa/xtensa.h	(working copy)
@@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
 #define TARGET_S32C1I		XCHAL_HAVE_S32C1I
 #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 #define TARGET_THREADPTR	XCHAL_HAVE_THREADPTR
+#define TARGET_LOOPS	        XCHAL_HAVE_LOOPS
 
 #define TARGET_DEFAULT \
   ((XCHAL_HAVE_L32R	? 0 : MASK_CONST16) |				\

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PING] [PATCH, xtensa] Add zero-overhead looping for xtensa backend
  2014-10-10 14:01             ` Felix Yang
@ 2014-10-11  9:32               ` Yangfei (Felix)
  2014-10-13 16:09               ` [PATCH] " augustine.sterling
  1 sibling, 0 replies; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-11  9:32 UTC (permalink / raw)
  To: Sterling Augustine, gcc-patches; +Cc: Felix Yang

PING?

> 
> Hi Sterling,
> 
>     I made some improvements to the patch. Two changes:
>     1. TARGET_LOOPS is now used as a condition of the doloop-related
> patterns, which is more elegant.
>     2. As the trip count register of the zero-cost loop may potentially be spilled,
> we need to change the patterns in order to handle this issue. The solution is
> similar to that adopted by the c6x backend:
> just turn the zero-cost loop into a regular loop if that has happened once reload
> is completed.
>     Attached please find version 4 of the patch. Make check regression tested
> with xtensa-elf-gcc/simulator.
>     OK for trunk?
> 
> Index: gcc/ChangeLog
> ================================================================
> ===
> --- gcc/ChangeLog    (revision 216079)
> +++ gcc/ChangeLog    (working copy)
> @@ -1,3 +1,20 @@
> +2014-10-10  Felix Yang  <felix.yang@huawei.com>
> +
> +    * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
> +    * config/xtensa/xtensa.c (xtensa_reorg): New.
> +    (xtensa_reorg_loops): New.
> +    (xtensa_can_use_doloop_p): New.
> +    (xtensa_invalid_within_doloop): New.
> +    (hwloop_optimize): New.
> +    (hwloop_fail): New.
> +    (hwloop_pattern_reg): New.
> +    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end
> label.
> +    (xtensa_doloop_hooks): Define.
> +    * config/xtensa/xtensa.md (doloop_end): New.
> +    (loop_end): New
> +    (zero_cost_loop_start): Rewritten.
> +    (zero_cost_loop_end): Rewritten.
> +
>  2014-10-10  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>      * configure.ac: Add --enable-fix-cortex-a53-835769 option.
> Index: gcc/config/xtensa/xtensa.md
> ================================================================
> ===
> --- gcc/config/xtensa/xtensa.md    (revision 216079)
> +++ gcc/config/xtensa/xtensa.md    (working copy)
> @@ -35,6 +35,8 @@
>    (UNSPEC_TLS_CALL    9)
>    (UNSPEC_TP        10)
>    (UNSPEC_MEMW        11)
> +  (UNSPEC_LSETUP_START  12)
> +  (UNSPEC_LSETUP_END    13)
> 
>    (UNSPECV_SET_FP    1)
>    (UNSPECV_ENTRY    2)
> @@ -1289,41 +1291,120 @@
>     (set_attr "length"    "3")])
> 
> 
> +;; Zero-overhead looping support.
> +
>  ;; Define the loop insns used by bct optimization to represent the -;; start and
> end of a zero-overhead loop (in loop.c).  This start -;; template generates the
> loop insn; the end template doesn't generate -;; any instructions since loop end
> is handled in hardware.
> +;; start and end of a zero-overhead loop.  This start template
> +generates ;; the loop insn; the end template doesn't generate any
> +instructions since ;; loop end is handled in hardware.
> 
>  (define_insn "zero_cost_loop_start"
>    [(set (pc)
> -    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
> -              (const_int 0))
> -              (label_ref (match_operand 1 "" ""))
> -              (pc)))
> -   (set (reg:SI 19)
> -    (plus:SI (match_dup 0) (const_int -1)))]
> -  ""
> -  "loopnez\t%0, %l1"
> +        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "register_operand" "=a")
> +        (plus (match_dup 0)
> +              (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]  "TARGET_LOOPS &&
> + optimize"
> +  "loop\t%0, %l1_LEND"
>    [(set_attr "type"    "jump")
>     (set_attr "mode"    "none")
>     (set_attr "length"    "3")])
> 
>  (define_insn "zero_cost_loop_end"
>    [(set (pc)
> -    (if_then_else (ne (reg:SI 19) (const_int 0))
> -              (label_ref (match_operand 0 "" ""))
> -              (pc)))
> -   (set (reg:SI 19)
> -    (plus:SI (reg:SI 19) (const_int -1)))]
> -  ""
> +        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand"
> "2,2")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "nonimmediate_operand" "=a,m")
> +        (plus (match_dup 0)
> +              (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
> +   (clobber (match_scratch:SI 3 "=X,&r"))]  "TARGET_LOOPS && optimize"
> +  "#"
> +  [(set_attr "type"    "jump")
> +   (set_attr "mode"    "none")
> +   (set_attr "length"    "0")])
> +
> +(define_insn "loop_end"
> +  [(set (pc)
> +        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "register_operand" "=a")
> +        (plus (match_dup 0)
> +              (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
> +  "TARGET_LOOPS && optimize"
>  {
> -    xtensa_emit_loop_end (insn, operands);
> -    return "";
> +  xtensa_emit_loop_end (insn, operands);  return "";
>  }
>    [(set_attr "type"    "jump")
>     (set_attr "mode"    "none")
>     (set_attr "length"    "0")])
> 
> +(define_split
> +  [(set (pc)
> +        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "nonimmediate_operand" "")
> +        (plus:SI (match_dup 0)
> +                 (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
> +   (clobber (match_scratch 3))]
> +  "TARGET_LOOPS && optimize && reload_completed"
> +  [(const_int 0)]
> +{
> +  if (!REG_P (operands[0]))
> +    {
> +      rtx test;
> +
> +      /* Fallback into a normal conditional branch insn.  */
> +      emit_move_insn (operands[3], operands[0]);
> +      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
> +      emit_move_insn (operands[0], operands[3]);
> +      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
> +      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
> +                                      const0_rtx, operands[1]));
> +    }
> +  else
> +    {
> +      emit_jump_insn (gen_loop_end (operands[0], operands[1],
> operands[2]));
> +    }
> +
> +  DONE;
> +})
> +
> +; operand 0 is the loop count pseudo register ; operand 1 is the label
> +to jump to at the top of the loop (define_expand "doloop_end"
> +  [(parallel [(set (pc) (if_then_else
> +                          (ne (match_operand:SI 0 "" "")
> +                              (const_int 1))
> +                          (label_ref (match_operand 1 "" ""))
> +                          (pc)))
> +              (set (match_dup 0)
> +                   (plus:SI (match_dup 0)
> +                            (const_int -1)))
> +              (unspec [(const_int 0)] UNSPEC_LSETUP_END)
> +              (clobber (match_dup 2))])] ; match_scratch
> +  "TARGET_LOOPS && optimize"
> +{
> +  /* The loop optimizer doesn't check the predicates... */
> +  if (GET_MODE (operands[0]) != SImode)
> +    FAIL;
> +  operands[2] = gen_rtx_SCRATCH (SImode);
> +})
> +
> 
>  ;; Setting a register from a comparison.
> 
> Index: gcc/config/xtensa/xtensa.c
> ================================================================
> ===
> --- gcc/config/xtensa/xtensa.c    (revision 216079)
> +++ gcc/config/xtensa/xtensa.c    (working copy)
> @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3.  If not see
> #include "gimplify.h"
>  #include "df.h"
>  #include "builtins.h"
> +#include "dumpfile.h"
> +#include "hw-doloop.h"
> 
> 
>  /* Enumeration for all of the relational tests, so that we can build @@ -186,6
> +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
> 
>  static bool constantpool_address_p (const_rtx addr);  static bool
> xtensa_legitimate_constant_p (enum machine_mode, rtx);
> +static void xtensa_reorg (void);
> +static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int
> &,
> +                                     unsigned int, bool); static const
> +char *xtensa_invalid_within_doloop (const rtx_insn *);
> 
>  static bool xtensa_member_type_forces_blk (const_tree,
>                         enum machine_mode mode); @@ -312,6
> +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE  #undef
> TARGET_LEGITIMATE_CONSTANT_P  #define
> TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
> 
> +#undef TARGET_MACHINE_DEPENDENT_REORG
> +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
> +
> +#undef TARGET_CAN_USE_DOLOOP_P
> +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
> +
> +#undef TARGET_INVALID_WITHIN_DOLOOP
> +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
> 
> 
> @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand
>          }
>      }
> 
> -  output_asm_insn ("# loop end for %0", operands);
> +  output_asm_insn ("%1_LEND:", operands);
>  }
> 
> 
> @@ -3712,4 +3727,236 @@ xtensa_legitimate_constant_p (enum
> machine_mode mo
>    return !xtensa_tls_referenced_p (x);
>  }
> 
> +/* Implement TARGET_CAN_USE_DOLOOP_P.  */
> +
> +static bool
> +xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
> +                         unsigned int loop_depth, bool entered_at_top)
> +{
> +  /* Considering limitations in the hardware, only use doloop
> +     for innermost loops which must be entered from the top.  */
> +  if (loop_depth > 1 || !entered_at_top)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* NULL if INSN insn is valid within a low-overhead loop.
> +   Otherwise return why doloop cannot be applied.  */
> +
> +static const char *
> +xtensa_invalid_within_doloop (const rtx_insn *insn) {
> +  if (CALL_P (insn))
> +    return "Function call in the loop.";
> +
> +  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
> +    return "Return from a call instruction in the loop.";
> +
> +  return NULL;
> +}
> +
> +/* Optimize LOOP.  */
> +
> +static bool
> +hwloop_optimize (hwloop_info loop)
> +{
> +  int i;
> +  edge entry_edge;
> +  basic_block entry_bb;
> +  rtx iter_reg;
> +  rtx_insn *insn, *seq, *entry_after;
> +
> +  if (loop->depth > 1)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not innermost\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (!loop->incoming_dest)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has more than one entry\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->incoming_dest != loop->head)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not entered from head\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->has_call || loop->has_asm)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has invalid insn\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Scan all the blocks to make sure they don't use iter_reg.  */  if
> + (loop->iter_reg_used || loop->iter_reg_used_outside)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d uses iterator\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Check if start_label appears before doloop_end.  */  insn =
> + loop->start_label;  while (insn && insn != loop->loop_end)
> +    insn = NEXT_INSN (insn);
> +
> +  if (!insn)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Get the loop iteration register.  */  iter_reg = loop->iter_reg;
> +
> +  gcc_assert (REG_P (iter_reg));
> +
> +  entry_edge = NULL;
> +
> +  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
> +    if (entry_edge->flags & EDGE_FALLTHRU)
> +      break;
> +
> +  if (entry_edge == NULL)
> +    return false;
> +
> +  /* Place the zero_cost_loop_start instruction before the loop.  */
> + entry_bb = entry_edge->src;
> +
> +  start_sequence ();
> +
> +  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
> +                                              loop->start_label,
> +                                              loop->iter_reg));
> +
> +  seq = get_insns ();
> +
> +  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
> +    {
> +      basic_block new_bb;
> +      edge e;
> +      edge_iterator ei;
> +
> +      emit_insn_before (seq, BB_HEAD (loop->head));
> +      seq = emit_label_before (gen_label_rtx (), seq);
> +      new_bb = create_basic_block (seq, insn, entry_bb);
> +      FOR_EACH_EDGE (e, ei, loop->incoming)
> +        {
> +          if (!(e->flags & EDGE_FALLTHRU))
> +            redirect_edge_and_branch_force (e, new_bb);
> +          else
> +            redirect_edge_succ (e, new_bb);
> +        }
> +
> +      make_edge (new_bb, loop->head, 0);
> +    }
> +  else
> +    {
> +      entry_after = BB_END (entry_bb);
> +      while (DEBUG_INSN_P (entry_after)
> +             || (NOTE_P (entry_after)
> +                 && NOTE_KIND (entry_after) !=
> NOTE_INSN_BASIC_BLOCK))
> +        entry_after = PREV_INSN (entry_after);
> +
> +      emit_insn_after (seq, entry_after);
> +    }
> +
> +  end_sequence ();
> +
> +  return true;
> +}
> +
> +/* A callback for the hw-doloop pass.  Called when a loop we have discovered
> +   turns out not to be optimizable; we have to split the loop_end pattern into
> +   a subtract and a test.  */
> +
> +static void
> +hwloop_fail (hwloop_info loop)
> +{
> +  rtx test;
> +  rtx_insn *insn = loop->loop_end;
> +
> +  emit_insn_before (gen_addsi3 (loop->iter_reg,
> +                                loop->iter_reg,
> +                                constm1_rtx),
> +                    loop->loop_end);
> +
> +  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);  insn =
> + emit_jump_insn_before (gen_cbranchsi4 (test,
> +                                                loop->iter_reg,
> const0_rtx,
> +                                                loop->start_label),
> +                                loop->loop_end);
> +
> +  JUMP_LABEL (insn) = loop->start_label;
> +  LABEL_NUSES (loop->start_label)++;
> +  delete_insn (loop->loop_end);
> +}
> +
> +/* A callback for the hw-doloop pass.  This function examines INSN; if
> +   it is a doloop_end pattern we recognize, return the reg rtx for the
> +   loop counter.  Otherwise, return NULL_RTX.  */
> +
> +static rtx
> +hwloop_pattern_reg (rtx_insn *insn)
> +{
> +  rtx reg;
> +
> +  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end)
> +    return NULL_RTX;
> +
> +  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));  if (!REG_P (reg))
> +    return NULL_RTX;
> +
> +  return reg;
> +}
> +
> +
> +static struct hw_doloop_hooks xtensa_doloop_hooks = {
> +  hwloop_pattern_reg,
> +  hwloop_optimize,
> +  hwloop_fail
> +};
> +
> +/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
> +   and tries to rewrite the RTL of these loops so that proper Xtensa
> +   hardware loops are generated.  */
> +
> +static void
> +xtensa_reorg_loops (void)
> +{
> +  reorg_loops (false, &xtensa_doloop_hooks); }
> +
> +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
> +
> +static void
> +xtensa_reorg (void)
> +{
> +  /* We are freeing block_for_insn in the toplev to keep compatibility
> +     with old MDEP_REORGS that are not CFG based.  Recompute it now.
> +*/
> +  compute_bb_for_insn ();
> +
> +  df_analyze ();
> +
> +  /* Doloop optimization.  */
> +  xtensa_reorg_loops ();
> +}
> +
>  #include "gt-xtensa.h"
> Index: gcc/config/xtensa/xtensa.h
> ================================================================
> ===
> --- gcc/config/xtensa/xtensa.h    (revision 216079)
> +++ gcc/config/xtensa/xtensa.h    (working copy)
> @@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
>  #define TARGET_S32C1I        XCHAL_HAVE_S32C1I
>  #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
>  #define TARGET_THREADPTR    XCHAL_HAVE_THREADPTR
> +#define TARGET_LOOPS            XCHAL_HAVE_LOOPS
> 
>  #define TARGET_DEFAULT \
>    ((XCHAL_HAVE_L32R    ? 0 : MASK_CONST16) |                \
> 
> Cheers,
> Felix
> 
> 
> On Thu, Oct 9, 2014 at 6:52 PM, Felix Yang <fei.yang0953@gmail.com> wrote:
> > Hello Sterling,
> >
> >      My paperwork with the FSF is finished and we can now move
> > forward with this patch :-)
> >      I rebased the patch on the latest trunk. Attached please find
> > version 3 of the patch.
> >      The enclosed patch also addresses the two points you raised;
> > does it look good to you?
> >      Make check regression tested with xtensa-elf-gcc built from trunk
> > with this patch.
> >      OK to apply?
> >
> > Index: gcc/ChangeLog
> >
> ================================================================
> ===
> > --- gcc/ChangeLog    (revision 216036)
> > +++ gcc/ChangeLog    (working copy)
> > @@ -1,3 +1,19 @@
> > +2014-10-09  Felix Yang  <felix.yang@huawei.com>
> > +
> > +    * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
> > +    * config/xtensa/xtensa.c (xtensa_reorg): New.
> > +    (xtensa_reorg_loops): New.
> > +    (xtensa_can_use_doloop_p): New.
> > +    (xtensa_invalid_within_doloop): New.
> > +    (hwloop_optimize): New.
> > +    (hwloop_fail): New.
> > +    (hwloop_pattern_reg): New.
> > +    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end
> label.
> > +    (xtensa_doloop_hooks): Define.
> > +    * config/xtensa/xtensa.md (doloop_end): New.
> > +    (zero_cost_loop_start): Rewritten.
> > +    (zero_cost_loop_end): Rewritten.
> > +
> >  2014-10-09  Joern Rennecke  <joern.rennecke@embecosm.com>
> >
> >      * config/avr/avr.opt (mmcu=): Change to have a string value.
> > Index: gcc/config/xtensa/xtensa.md
> >
> ================================================================
> ===
> > --- gcc/config/xtensa/xtensa.md    (revision 216036)
> > +++ gcc/config/xtensa/xtensa.md    (working copy)
> > @@ -35,6 +35,8 @@
> >    (UNSPEC_TLS_CALL    9)
> >    (UNSPEC_TP        10)
> >    (UNSPEC_MEMW        11)
> > +  (UNSPEC_LSETUP_START  12)
> > +  (UNSPEC_LSETUP_END    13)
> >
> >    (UNSPECV_SET_FP    1)
> >    (UNSPECV_ENTRY    2)
> > @@ -1289,41 +1291,67 @@
> >     (set_attr "length"    "3")])
> >
> >
> > +;; Zero-overhead looping support.
> > +
> >  ;; Define the loop insns used by bct optimization to represent the
> > -;; start and end of a zero-overhead loop (in loop.c).  This start -;;
> > template generates the loop insn; the end template doesn't generate
> > -;; any instructions since loop end is handled in hardware.
> > +;; start and end of a zero-overhead loop.  This start template
> > +generates ;; the loop insn; the end template doesn't generate any
> > +instructions since ;; loop end is handled in hardware.
> >
> >  (define_insn "zero_cost_loop_start"
> >    [(set (pc)
> > -    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
> > -              (const_int 0))
> > -              (label_ref (match_operand 1 "" ""))
> > -              (pc)))
> > -   (set (reg:SI 19)
> > -    (plus:SI (match_dup 0) (const_int -1)))]
> > +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> > +                          (const_int 1))
> > +                      (label_ref (match_operand 1 "" ""))
> > +                      (pc)))
> > +   (set (match_operand:SI 2 "register_operand" "+a0")
> > +        (plus (match_dup 2)
> > +              (const_int -1)))
> > +   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
> >    ""
> > -  "loopnez\t%0, %l1"
> > +  "loop\t%0, %l1_LEND"
> >    [(set_attr "type"    "jump")
> >     (set_attr "mode"    "none")
> >     (set_attr "length"    "3")])
> >
> >  (define_insn "zero_cost_loop_end"
> >    [(set (pc)
> > -    (if_then_else (ne (reg:SI 19) (const_int 0))
> > -              (label_ref (match_operand 0 "" ""))
> > -              (pc)))
> > -   (set (reg:SI 19)
> > -    (plus:SI (reg:SI 19) (const_int -1)))]
> > +        (if_then_else (ne (match_operand:SI 0 "register_operand" "a")
> > +                          (const_int 1))
> > +                      (label_ref (match_operand 1 "" ""))
> > +                      (pc)))
> > +   (set (match_operand:SI 2 "register_operand" "+a0")
> > +        (plus (match_dup 2)
> > +              (const_int -1)))
> > +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
> >    ""
> >  {
> > -    xtensa_emit_loop_end (insn, operands);
> > -    return "";
> > +  xtensa_emit_loop_end (insn, operands);  return "";
> >  }
> >    [(set_attr "type"    "jump")
> >     (set_attr "mode"    "none")
> >     (set_attr "length"    "0")])
> >
> > +; operand 0 is the loop count pseudo register ; operand 1 is the
> > +label to jump to at the top of the loop (define_expand "doloop_end"
> > +  [(parallel [(set (pc) (if_then_else
> > +                          (ne (match_operand:SI 0 "" "")
> > +                              (const_int 1))
> > +                          (label_ref (match_operand 1 "" ""))
> > +                          (pc)))
> > +              (set (match_dup 0)
> > +                   (plus:SI (match_dup 0)
> > +                            (const_int -1)))
> > +              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
> > +  ""
> > +{
> > +  /* The loop optimizer doesn't check the predicates... */
> > +  if (GET_MODE (operands[0]) != SImode)
> > +    FAIL;
> > +})
> > +
> >
> >  ;; Setting a register from a comparison.
> >
> > Index: gcc/config/xtensa/xtensa.c
> >
> ================================================================
> ===
> > --- gcc/config/xtensa/xtensa.c    (revision 216036)
> > +++ gcc/config/xtensa/xtensa.c    (working copy)
> > @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "gimplify.h"
> >  #include "df.h"
> >  #include "builtins.h"
> > +#include "dumpfile.h"
> > +#include "hw-doloop.h"
> >
> >
> >  /* Enumeration for all of the relational tests, so that we can build
> > @@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
> >
> >  static bool constantpool_address_p (const_rtx addr);
> >  static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
> > +static void xtensa_reorg (void);
> > +static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int
> &,
> > +                                     unsigned int, bool);
> > +static const char *xtensa_invalid_within_doloop (const rtx_insn *);
> >
> >  static bool xtensa_member_type_forces_blk (const_tree,
> >                         enum machine_mode mode);
> > @@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
> >  #undef TARGET_LEGITIMATE_CONSTANT_P
> >  #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
> >
> > +#undef TARGET_MACHINE_DEPENDENT_REORG
> > +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
> > +
> > +#undef TARGET_CAN_USE_DOLOOP_P
> > +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
> > +
> > +#undef TARGET_INVALID_WITHIN_DOLOOP
> > +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
> > +
> >  struct gcc_target targetm = TARGET_INITIALIZER;
> >
> >
> > @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx
> *operand
> >          }
> >      }
> >
> > -  output_asm_insn ("# loop end for %0", operands);
> > +  output_asm_insn ("%1_LEND:", operands);
> >  }
> >
> >
> > @@ -3712,4 +3727,239 @@ xtensa_legitimate_constant_p (enum
> machine_mode mo
> >    return !xtensa_tls_referenced_p (x);
> >  }
> >
> > +/* Implement TARGET_CAN_USE_DOLOOP_P.  */
> > +
> > +static bool
> > +xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
> > +                         unsigned int loop_depth, bool
> entered_at_top)
> > +{
> > +  if (!TARGET_LOOPS)
> > +    return false;
> > +
> > +  /* Considering limitations in the hardware, only use doloop
> > +     for innermost loops which must be entered from the top.  */
> > +  if (loop_depth > 1 || !entered_at_top)
> > +    return false;
> > +
> > +  return true;
> > +}
> > +
> > +/* NULL if INSN insn is valid within a low-overhead loop.
> > +   Otherwise return why doloop cannot be applied.  */
> > +
> > +static const char *
> > +xtensa_invalid_within_doloop (const rtx_insn *insn)
> > +{
> > +  if (CALL_P (insn))
> > +    return "Function call in the loop.";
> > +
> > +  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
> > +    return "Return from a call instruction in the loop.";
> > +
> > +  return NULL;
> > +}
> > +
> > +/* Optimize LOOP.  */
> > +
> > +static bool
> > +hwloop_optimize (hwloop_info loop)
> > +{
> > +  int i;
> > +  edge entry_edge;
> > +  basic_block entry_bb;
> > +  rtx iter_reg;
> > +  rtx_insn *insn, *seq, *entry_after;
> > +
> > +  if (loop->depth > 1)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d is not innermost\n",
> > +                 loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  if (!loop->incoming_dest)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d has more than one entry\n",
> > +                 loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  if (loop->incoming_dest != loop->head)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d is not entered from head\n",
> > +                 loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  if (loop->has_call || loop->has_asm)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d has invalid insn\n",
> > +                 loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  /* Scan all the blocks to make sure they don't use iter_reg.  */
> > +  if (loop->iter_reg_used || loop->iter_reg_used_outside)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d uses iterator\n",
> > +                 loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  /* Check if start_label appears before doloop_end.  */
> > +  insn = loop->start_label;
> > +  while (insn && insn != loop->loop_end)
> > +    insn = NEXT_INSN (insn);
> > +
> > +  if (!insn)
> > +    {
> > +      if (dump_file)
> > +        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
> > +                 loop->loop_no);
> > +      return false;
> > +    }
> > +
> > +  /* Get the loop iteration register.  */
> > +  iter_reg = loop->iter_reg;
> > +
> > +  gcc_assert (REG_P (iter_reg));
> > +
> > +  entry_edge = NULL;
> > +
> > +  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
> > +    if (entry_edge->flags & EDGE_FALLTHRU)
> > +      break;
> > +
> > +  if (entry_edge == NULL)
> > +    return false;
> > +
> > +  /* Place the zero_cost_loop_start instruction before the loop.  */
> > +  entry_bb = entry_edge->src;
> > +
> > +  start_sequence ();
> > +
> > +  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
> > +                                              loop->start_label,
> > +                                              loop->iter_reg));
> > +
> > +  seq = get_insns ();
> > +
> > +  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
> > +    {
> > +      basic_block new_bb;
> > +      edge e;
> > +      edge_iterator ei;
> > +
> > +      emit_insn_before (seq, BB_HEAD (loop->head));
> > +      seq = emit_label_before (gen_label_rtx (), seq);
> > +      new_bb = create_basic_block (seq, insn, entry_bb);
> > +      FOR_EACH_EDGE (e, ei, loop->incoming)
> > +        {
> > +          if (!(e->flags & EDGE_FALLTHRU))
> > +            redirect_edge_and_branch_force (e, new_bb);
> > +          else
> > +            redirect_edge_succ (e, new_bb);
> > +        }
> > +
> > +      make_edge (new_bb, loop->head, 0);
> > +    }
> > +  else
> > +    {
> > +      entry_after = BB_END (entry_bb);
> > +      while (DEBUG_INSN_P (entry_after)
> > +             || (NOTE_P (entry_after)
> > +                 && NOTE_KIND (entry_after) !=
> NOTE_INSN_BASIC_BLOCK))
> > +        entry_after = PREV_INSN (entry_after);
> > +
> > +      emit_insn_after (seq, entry_after);
> > +    }
> > +
> > +  end_sequence ();
> > +
> > +  return true;
> > +}
> > +
> > +/* A callback for the hw-doloop pass.  Called when a loop we have discovered
> > +   turns out not to be optimizable; we have to split the loop_end pattern into
> > +   a subtract and a test.  */
> > +
> > +static void
> > +hwloop_fail (hwloop_info loop)
> > +{
> > +  rtx test;
> > +  rtx_insn *insn = loop->loop_end;
> > +
> > +  emit_insn_before (gen_addsi3 (loop->iter_reg,
> > +                                loop->iter_reg,
> > +                                constm1_rtx),
> > +                    loop->loop_end);
> > +
> > +  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
> > +  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
> > +                                                loop->iter_reg,
> const0_rtx,
> > +
> loop->start_label),
> > +                                loop->loop_end);
> > +
> > +  JUMP_LABEL (insn) = loop->start_label;
> > +  LABEL_NUSES (loop->start_label)++;
> > +  delete_insn (loop->loop_end);
> > +}
> > +
> > +/* A callback for the hw-doloop pass.  This function examines INSN; if
> > +   it is a doloop_end pattern we recognize, return the reg rtx for the
> > +   loop counter.  Otherwise, return NULL_RTX.  */
> > +
> > +static rtx
> > +hwloop_pattern_reg (rtx_insn *insn)
> > +{
> > +  rtx reg;
> > +
> > +  if (!JUMP_P (insn) || recog_memoized (insn) !=
> CODE_FOR_zero_cost_loop_end)
> > +    return NULL_RTX;
> > +
> > +  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
> > +  if (!REG_P (reg))
> > +    return NULL_RTX;
> > +
> > +  return reg;
> > +}
> > +
> > +
> > +static struct hw_doloop_hooks xtensa_doloop_hooks =
> > +{
> > +  hwloop_pattern_reg,
> > +  hwloop_optimize,
> > +  hwloop_fail
> > +};
> > +
> > +/* Run from machine_dependent_reorg, this pass looks for doloop_end
> insns
> > +   and tries to rewrite the RTL of these loops so that proper Xtensa
> > +   hardware loops are generated.  */
> > +
> > +static void
> > +xtensa_reorg_loops (void)
> > +{
> > +  reorg_loops (false, &xtensa_doloop_hooks);
> > +}
> > +
> > +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
> > +
> > +static void
> > +xtensa_reorg (void)
> > +{
> > +  /* We are freeing block_for_insn in the toplev to keep compatibility
> > +     with old MDEP_REORGS that are not CFG based.  Recompute it now.
> */
> > +  compute_bb_for_insn ();
> > +
> > +  df_analyze ();
> > +
> > +  /* Doloop optimization.  */
> > +  xtensa_reorg_loops ();
> > +}
> > +
> >  #include "gt-xtensa.h"
> > Index: gcc/config/xtensa/xtensa.h
> > ===================================================================
> > --- gcc/config/xtensa/xtensa.h    (revision 216036)
> > +++ gcc/config/xtensa/xtensa.h    (working copy)
> > @@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size;
> >  #define TARGET_S32C1I        XCHAL_HAVE_S32C1I
> >  #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
> >  #define TARGET_THREADPTR    XCHAL_HAVE_THREADPTR
> > +#define TARGET_LOOPS            XCHAL_HAVE_LOOPS
> >
> >  #define TARGET_DEFAULT \
> >    ((XCHAL_HAVE_L32R    ? 0 : MASK_CONST16) |                \
> > Cheers,
> > Felix
> >
> >
> > On Tue, Jan 14, 2014 at 1:23 AM, Sterling Augustine
> > <augustine.sterling@gmail.com> wrote:
> >> On Thu, Jan 9, 2014 at 7:48 PM, Yangfei (Felix) <felix.yang@huawei.com>
> wrote:
> >>> And here is the xtensa configuration tested (include/xtensa-config.h):
> >>>
> >>> #define XCHAL_HAVE_BE           0
> >>> #define XCHAL_HAVE_LOOPS                1
> >>
> >>
> >> Hi Felix,
> >>
> >> I like this patch, and expect I will approve it. However, I would like
> >> you to do two more things before I do:
> >>
> >> 1. Ensure it doesn't generate zcl's when:
> >>
> >> #define XCHAL_HAVE_LOOPS 0
> >>
> >> 2. Ensure it doesn't produce loops bodies that contain ret, retw,
> >> ret.n or retw.n as the last instruction. It might be easier to just
> >> disallow them in loop bodies entirely though.
> >>
> >> Thanks!

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-10 14:01             ` Felix Yang
  2014-10-11  9:32               ` [PING] [PATCH, xtensa] " Yangfei (Felix)
@ 2014-10-13 16:09               ` augustine.sterling
  2014-10-13 16:30                 ` Felix Yang
  1 sibling, 1 reply; 29+ messages in thread
From: augustine.sterling @ 2014-10-13 16:09 UTC (permalink / raw)
  To: Felix Yang; +Cc: Yangfei (Felix), gcc-patches

On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
> Hi Sterling,
>
>     I made some improvement to the patch. Two changes:
>     1. TARGET_LOOPS is now used as a condition of the doloop related
> patterns, which is more elegant.

Fine.

>     2. As the trip count register of the zero-cost loop maybe
> potentially spilled, we need to change the patterns in order to handle
> this issue.

Actually, for xtensa you don't. The trip count is copied into LCOUNT
at the execution of the loop instruction, and therefore a spill or
whatever doesn't matter--it won't affect the result. So as long as you
have the trip count at the start of the loop, you are fine.

This does bring up an issue of whether or not the trip count can be
modified during the loop. (note that this is different than early
exit.) If it can, you can't use a zero-overhead loop. Does your patch
address this case?

> The solution is similar to that adopted by the c6x backend.
> Just turn the zero-cost loop into a regular loop when that happens
> when reload is completed.
>     Attached please find version 4 of the patch. Make check regression
> tested with xtensa-elf-gcc/simulator.
>     OK for trunk?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-13 16:09               ` [PATCH] " augustine.sterling
@ 2014-10-13 16:30                 ` Felix Yang
  2014-10-14 15:43                   ` Felix Yang
  0 siblings, 1 reply; 29+ messages in thread
From: Felix Yang @ 2014-10-13 16:30 UTC (permalink / raw)
  To: augustine.sterling; +Cc: Yangfei (Felix), gcc-patches

Thanks for the comments.

The patch checks the usage of the trip count register, making sure
that it is neither used in the loop body other than by the doloop_end
nor live past the doloop_end instruction, as the following code snippet
shows:

+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n",
+                 loop->loop_no);
+      return false;
+    }

    For the spill issue, I think we need to handle it. The reason is
that currently we are not telling GCC about the existence of the
LCOUNT register. Instead, we keep the trip count in a general register
and it's possible that this register can be spilled when register
pressure is high.
    It's a good idea to post another patch to describe the LCOUNT
register in GCC in order to free this general register. But I want
this patch applied as a first step, OK?

Cheers,
Felix


On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com
<augustine.sterling@gmail.com> wrote:
> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
>> Hi Sterling,
>>
>>     I made some improvement to the patch. Two changes:
>>     1. TARGET_LOOPS is now used as a condition of the doloop related
>> patterns, which is more elegant.
>
> Fine.
>
>>     2. As the trip count register of the zero-cost loop maybe
>> potentially spilled, we need to change the patterns in order to handle
>> this issue.
>
> Actually, for xtensa you don't. The trip count is copied into LCOUNT
> at the execution of the loop instruction, and therefore a spill or
> whatever doesn't matter--it won't affect the result. So as long as you
> have the trip count at the start of the loop, you are fine.
>
> This does bring up an issue of whether or not the trip count can be
> modified during the loop. (note that this is different than early
> exit.) If it can, you can't use a zero-overhead loop. Does your patch
> address this case.
>
> The solution is similar to that adapted by c6x backend.
>> Just turn the zero-cost loop into a regular loop when that happens
>> when reload is completed.
>>     Attached please find version 4 of the patch. Make check regression
>> tested with xtensa-elf-gcc/simulator.
>>     OK for trunk?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-13 16:30                 ` Felix Yang
@ 2014-10-14 15:43                   ` Felix Yang
  2014-10-15 19:51                     ` augustine.sterling
  0 siblings, 1 reply; 29+ messages in thread
From: Felix Yang @ 2014-10-14 15:43 UTC (permalink / raw)
  To: augustine.sterling; +Cc: Yangfei (Felix), gcc-patches

PING?
Cheers,
Felix


On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
> Thanks for the comments.
>
> The patch checked the usage of teh trip count register, making sure
> that it is not used in the loop body other than the doloop_end or
> lives past the doloop_end instruction, as the following code snippet
> shows:
>
> +  /* Scan all the blocks to make sure they don't use iter_reg.  */
> +  if (loop->iter_reg_used || loop->iter_reg_used_outside)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d uses iterator\n",
> +                 loop->loop_no);
> +      return false;
> +    }
>
>     For the spill issue, I think we need to handle it. The reason is
> that currently we are not telling GCC about the existence of the
> LCOUNT register. Instead, we keep the trip count in a general register
> and it's possible that this register can be spilled when register
> pressure is high.
>     It's a good idea to post another patch to describe the LCOUNT
> register in GCC in order to free this general register. But I want
> this patch applied as a first step, OK?
>
> Cheers,
> Felix
>
>
> On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com
> <augustine.sterling@gmail.com> wrote:
>> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
>>> Hi Sterling,
>>>
>>>     I made some improvement to the patch. Two changes:
>>>     1. TARGET_LOOPS is now used as a condition of the doloop related
>>> patterns, which is more elegant.
>>
>> Fine.
>>
>>>     2. As the trip count register of the zero-cost loop maybe
>>> potentially spilled, we need to change the patterns in order to handle
>>> this issue.
>>
>> Actually, for xtensa you don't. The trip count is copied into LCOUNT
>> at the execution of the loop instruction, and therefore a spill or
>> whatever doesn't matter--it won't affect the result. So as long as you
>> have the trip count at the start of the loop, you are fine.
>>
>> This does bring up an issue of whether or not the trip count can be
>> modified during the loop. (note that this is different than early
>> exit.) If it can, you can't use a zero-overhead loop. Does your patch
>> address this case.
>>
>> The solution is similar to that adapted by c6x backend.
>>> Just turn the zero-cost loop into a regular loop when that happens
>>> when reload is completed.
>>>     Attached please find version 4 of the patch. Make check regression
>>> tested with xtensa-elf-gcc/simulator.
>>>     OK for trunk?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-14 15:43                   ` Felix Yang
@ 2014-10-15 19:51                     ` augustine.sterling
  2014-10-16  4:52                       ` Yangfei (Felix)
  0 siblings, 1 reply; 29+ messages in thread
From: augustine.sterling @ 2014-10-15 19:51 UTC (permalink / raw)
  To: Felix Yang; +Cc: Yangfei (Felix), gcc-patches

On Tue, Oct 14, 2014 at 8:39 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
> PING?
> Cheers,
> Felix

Felix,

This isn't my day job; 24-hour pings are unproductive.

You shouldn't need to worry about the trip count register getting
spilled. It makes no difference whatsoever to how the loop
operates--the trip count is dead with regards to the loop once the
instruction executes. You don't need to describe LCOUNT to gcc in
order for this not to matter. It should be enough to describe the zcl
as consuming the value in the same way a branch instruction consumes a
value.

If you have a case where spilling it is causing a problem, then there
is a bug in your code, papered over by dropping the case when it is
spilled. Similarly with iter_reg_used_outside--it shouldn't affect
whether or not a zcl is valid here. If you have a case where it does,
there is likely a bug in your code.

If the code is easier to write by keeping trip_count updated, then fine
(for now); you give up some performance (in fact, a lot of
performance), but that doesn't matter as to the correctness.


>
>
> On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
>> Thanks for the comments.
>>
>> The patch checked the usage of teh trip count register, making sure
>> that it is not used in the loop body other than the doloop_end or
>> lives past the doloop_end instruction, as the following code snippet
>> shows:
>>
>> +  /* Scan all the blocks to make sure they don't use iter_reg.  */
>> +  if (loop->iter_reg_used || loop->iter_reg_used_outside)
>> +    {
>> +      if (dump_file)
>> +        fprintf (dump_file, ";; loop %d uses iterator\n",
>> +                 loop->loop_no);
>> +      return false;
>> +    }
>>
>>     For the spill issue, I think we need to handle it. The reason is
>> that currently we are not telling GCC about the existence of the
>> LCOUNT register. Instead, we keep the trip count in a general register
>> and it's possible that this register can be spilled when register
>> pressure is high.
>>     It's a good idea to post another patch to describe the LCOUNT
>> register in GCC in order to free this general register. But I want
>> this patch applied as a first step, OK?
>>
>> Cheers,
>> Felix
>>
>>
>> On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com
>> <augustine.sterling@gmail.com> wrote:
>>> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
>>>> Hi Sterling,
>>>>
>>>>     I made some improvement to the patch. Two changes:
>>>>     1. TARGET_LOOPS is now used as a condition of the doloop related
>>>> patterns, which is more elegant.
>>>
>>> Fine.
>>>
>>>>     2. As the trip count register of the zero-cost loop maybe
>>>> potentially spilled, we need to change the patterns in order to handle
>>>> this issue.
>>>
>>> Actually, for xtensa you don't. The trip count is copied into LCOUNT
>>> at the execution of the loop instruction, and therefore a spill or
>>> whatever doesn't matter--it won't affect the result. So as long as you
>>> have the trip count at the start of the loop, you are fine.
>>>
>>> This does bring up an issue of whether or not the trip count can be
>>> modified during the loop. (note that this is different than early
>>> exit.) If it can, you can't use a zero-overhead loop. Does your patch
>>> address this case.
>>>
>>> The solution is similar to that adapted by c6x backend.
>>>> Just turn the zero-cost loop into a regular loop when that happens
>>>> when reload is completed.
>>>>     Attached please find version 4 of the patch. Make check regression
>>>> tested with xtensa-elf-gcc/simulator.
>>>>     OK for trunk?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-15 19:51                     ` augustine.sterling
@ 2014-10-16  4:52                       ` Yangfei (Felix)
  2014-10-21 14:57                         ` augustine.sterling
  0 siblings, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-16  4:52 UTC (permalink / raw)
  To: gcc-patches, augustine.sterling, Felix Yang

Hi Sterling,

    Since the patch has been delayed for a long time, I'm kind of pushing it. Sorry for that.
    Yeah, you are right. We have a performance issue here, as GCC may use one more general register in some cases with this patch.
    Take the following arraysum testcase for example. In doloop optimization, GCC figures out that the number of iterations is 1024 and creates a new pseudo 79 as the new trip count register.
    The pseudo 79 is live throughout the loop, which makes the register pressure in the loop higher. And it's possible that this new pseudo is spilled by reload when the register pressure is very high.
    I know that the xtensa loop instruction copies the trip count register into the LCOUNT special register. And we need to describe this hardware feature in GCC in order to free the trip count register.
    But I find it difficult to do. Do you have any good suggestions on this?

arraysum.c:
int g[1024];
int g_sum;

void test_entry ()
{
        int i, Sum = 0;

        for (i = 0; i < 1024; i++)
          Sum = Sum + g[i];

        g_sum = Sum;
}


1. RTL before the doloop optimization pass(arraysum.c.193r.loop2_invariant):
(note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
(note 32 34 36 2 NOTE_INSN_FUNCTION_BEG)
(insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ])
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
     (expr_list:REG_EQUAL (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
        (nil)))
(insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ])
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
     (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
                (const_int 4096 [0x1000])))
        (nil)))
(insn 33 37 42 2 (set (reg/v:SI 74 [ Sum ])
        (const_int 0 [0])) arraysum.c:6 29 {movsi_internal}
     (nil))
(code_label 42 33 38 3 2 "" [0 uses])
(note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
(insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
        (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal}
     (nil))
(insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ])
        (plus:SI (reg/v:SI 74 [ Sum ])
            (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3}
     (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
        (nil)))
(insn 41 40 43 3 (set (reg:SI 72 [ ivtmp$8 ])
        (plus:SI (reg:SI 72 [ ivtmp$8 ])
            (const_int 4 [0x4]))) 1 {addsi3}
     (nil))
(jump_insn 43 41 52 3 (set (pc)
        (if_then_else (ne (reg:SI 72 [ ivtmp$8 ])
                (reg/f:SI 76 [ D.1393 ]))
            (label_ref:SI 52)
            (pc))) arraysum.c:8 39 {*btrue}
     (int_list:REG_BR_PROB 9899 (nil))
 -> 52)
(code_label 52 43 51 5 3 "" [1 uses])
(note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK)
(note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(insn 45 44 46 4 (set (reg/f:SI 78)
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC4") [flags 0x2]) [2  S4 A32])) arraysum.c:11 29 {movsi_internal}
     (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum")  <var_decl 0x7f6eef5d6360 g_sum>)
        (nil)))
(insn 46 45 0 4 (set (mem/c:SI (reg/f:SI 78) [2 g_sum+0 S4 A32])
        (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal}
     (expr_list:REG_DEAD (reg/f:SI 78)
        (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ])
            (nil))))


2. RTL after the doloop optimization pass(arraysum.c.195r.loop2_doloop):
(note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
(note 32 34 36 2 NOTE_INSN_FUNCTION_BEG)
(insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ])
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
     (expr_list:REG_EQUAL (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
        (nil)))
(insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ])
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
     (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
                (const_int 4096 [0x1000])))
        (nil)))
(insn 33 37 54 2 (set (reg/v:SI 74 [ Sum ])
        (const_int 0 [0])) arraysum.c:6 29 {movsi_internal}
     (nil))
(insn 54 33 42 2 (set (reg:SI 79)
        (const_int 1024 [0x400])) arraysum.c:6 -1
     (nil))
(code_label 42 54 38 3 2 "" [0 uses])
(note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
(insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
        (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal}
     (nil))
(insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ])
        (plus:SI (reg/v:SI 74 [ Sum ])
            (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3}
     (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
        (nil)))
(insn 41 40 53 3 (set (reg:SI 72 [ ivtmp$8 ])
        (plus:SI (reg:SI 72 [ ivtmp$8 ])
            (const_int 4 [0x4]))) 1 {addsi3}
     (nil))
(jump_insn 53 41 52 3 (parallel [
            (set (pc)
                (if_then_else (ne (reg:SI 79)
                        (const_int 1 [0x1]))
                    (label_ref 52)
                    (pc)))
            (set (reg:SI 79)
                (plus:SI (reg:SI 79)
                    (const_int -1 [0xffffffffffffffff])))
            (unspec [
                    (const_int 0 [0])
                ] 13)
            (clobber (scratch:SI))
        ]) -1
     (int_list:REG_BR_PROB 9899 (nil))
 -> 52)
(code_label 52 53 51 5 3 "" [1 uses])
(note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK)
(note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(insn 45 44 46 4 (set (reg/f:SI 78)
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC4") [flags 0x2]) [2  S4 A32])) arraysum.c:11 29 {movsi_internal}
     (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum")  <var_decl 0x7f6eef5d6360 g_sum>)
        (nil)))
(insn 46 45 0 4 (set (mem/c:SI (reg/f:SI 78) [2 g_sum+0 S4 A32])
        (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal}
     (expr_list:REG_DEAD (reg/f:SI 78)
        (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ])
            (nil))))


> 
> On Tue, Oct 14, 2014 at 8:39 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
> > PING?
> > Cheers,
> > Felix
> 
> Felix,
> 
> This isn't my day job, 24-hour pings are unproductive.
> 
> You shouldn't need to worry about the trip count register getting spilled. It
> makes no difference whatsoever to how the loop operates--the trip count is
> dead with regards to the loop once the instruction executes. You don't need to
> describe LCOUNT to gcc in order for this not to matter. It should be enough to
> describe the zcl as consuming the value in the same way a branch instruction
> consumes a value.
> 
> If you have a case where spilling it is causing a problem, then there is a bug in
> your code, papered over by dropping case when it is spilled. Similarly with
> iter_reg_used_outside--it shouldn't affect whether or not a zcl is valid here. If
> you have a case where it does, there is likely a bug in your code.
> 
> If the code is easier to write by maintaining trip_count up, then fine (for now);
> you give up some performance (in fact, a lot of performance), but that doesn't
> matter as to the correctness.
> 
> 
> >
> >
> > On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com>
> wrote:
> >> Thanks for the comments.
> >>
> >> The patch checked the usage of teh trip count register, making sure
> >> that it is not used in the loop body other than the doloop_end or
> >> lives past the doloop_end instruction, as the following code snippet
> >> shows:
> >>
> >> +  /* Scan all the blocks to make sure they don't use iter_reg.  */
> >> + if (loop->iter_reg_used || loop->iter_reg_used_outside)
> >> +    {
> >> +      if (dump_file)
> >> +        fprintf (dump_file, ";; loop %d uses iterator\n",
> >> +                 loop->loop_no);
> >> +      return false;
> >> +    }
> >>
> >>     For the spill issue, I think we need to handle it. The reason is
> >> that currently we are not telling GCC about the existence of the
> >> LCOUNT register. Instead, we keep the trip count in a general
> >> register and it's possible that this register can be spilled when
> >> register pressure is high.
> >>     It's a good idea to post another patch to describe the LCOUNT
> >> register in GCC in order to free this general register. But I want
> >> this patch applied as a first step, OK?
> >>
> >> Cheers,
> >> Felix
> >>
> >>
> >> On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com
> >> <augustine.sterling@gmail.com> wrote:
> >>> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com>
> wrote:
> >>>> Hi Sterling,
> >>>>
> >>>>     I made some improvement to the patch. Two changes:
> >>>>     1. TARGET_LOOPS is now used as a condition of the doloop
> >>>> related patterns, which is more elegant.
> >>>
> >>> Fine.
> >>>
> >>>>     2. As the trip count register of the zero-cost loop maybe
> >>>> potentially spilled, we need to change the patterns in order to
> >>>> handle this issue.
> >>>
> >>> Actually, for xtensa you don't. The trip count is copied into LCOUNT
> >>> at the execution of the loop instruction, and therefore a spill or
> >>> whatever doesn't matter--it won't affect the result. So as long as
> >>> you have the trip count at the start of the loop, you are fine.
> >>>
> >>> This does bring up an issue of whether or not the trip count can be
> >>> modified during the loop. (note that this is different than early
> >>> exit.) If it can, you can't use a zero-overhead loop. Does your
> >>> patch address this case.
> >>>
> >>> The solution is similar to that adapted by c6x backend.
> >>>> Just turn the zero-cost loop into a regular loop when that happens
> >>>> when reload is completed.
> >>>>     Attached please find version 4 of the patch. Make check
> >>>> regression tested with xtensa-elf-gcc/simulator.
> >>>>     OK for trunk?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-16  4:52                       ` Yangfei (Felix)
@ 2014-10-21 14:57                         ` augustine.sterling
  2014-10-22  2:20                           ` Yangfei (Felix)
  2014-10-22  5:20                           ` Yangfei (Felix)
  0 siblings, 2 replies; 29+ messages in thread
From: augustine.sterling @ 2014-10-21 14:57 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: gcc-patches, Felix Yang

On Wed, Oct 15, 2014 at 7:10 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> Hi Sterling,
>
>     Since the patch is delayed for a long time, I'm kind of pushing it. Sorry for that.
>     Yeah, you are right. We have some performance issue here as GCC may use one more general register in some cases with this patch.
>     Take the following arraysum testcase for example. In doloop optimization, GCC figures out that the number of iterations is 1024 and creates a new pseudo 79 as the new trip count register.
>     The pseudo 79 is live throughout the loop, this makes the register pressure in the loop higher. And it's possible that this new pseudo is spilled by reload when the register pressure is very high.
>     I know that the xtensa loop instruction copies the trip count register into the LCOUNT special register. And we need to describe this hardware feature in GCC in order to free the trip count register.
>     But I find it difficult to do. Do you have any good suggestions on this?

There are two issues related to the trip count, one I would like you
to solve now, one later.

1. Later: The trip count doesn't need to be updated at all inside
these loops, once the loop instruction executes. The code below
relates to this case.

2. Now: You should be able to use a loop instruction regardless of
whether the trip count is spilled. If you have an example where that
wouldn't work, I would love to see it.

>
> arraysum.c:
> int g[1024];
> int g_sum;
>
> void test_entry ()
> {
>         int i, Sum = 0;
>
>         for (i = 0; i < 1024; i++)
>           Sum = Sum + g[i];
>
>         g_sum = Sum;
> }
>
>
> 1. RTL before the doloop optimization pass(arraysum.c.193r.loop2_invariant):
> (note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
> (note 32 34 36 2 NOTE_INSN_FUNCTION_BEG)
> (insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ])
>         (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
>      (expr_list:REG_EQUAL (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
>         (nil)))
> (insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ])
>         (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
>      (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
>                 (const_int 4096 [0x1000])))
>         (nil)))
> (insn 33 37 42 2 (set (reg/v:SI 74 [ Sum ])
>         (const_int 0 [0])) arraysum.c:6 29 {movsi_internal}
>      (nil))
> (code_label 42 33 38 3 2 "" [0 uses])
> (note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
> (insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
>         (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal}
>      (nil))
> (insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ])
>         (plus:SI (reg/v:SI 74 [ Sum ])
>             (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3}
>      (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
>         (nil)))
> (insn 41 40 43 3 (set (reg:SI 72 [ ivtmp$8 ])
>         (plus:SI (reg:SI 72 [ ivtmp$8 ])
>             (const_int 4 [0x4]))) 1 {addsi3}
>      (nil))
> (jump_insn 43 41 52 3 (set (pc)
>         (if_then_else (ne (reg:SI 72 [ ivtmp$8 ])
>                 (reg/f:SI 76 [ D.1393 ]))
>             (label_ref:SI 52)
>             (pc))) arraysum.c:8 39 {*btrue}
>      (int_list:REG_BR_PROB 9899 (nil))
>  -> 52)
> (code_label 52 43 51 5 3 "" [1 uses])
> (note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK)
> (note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
> (insn 45 44 46 4 (set (reg/f:SI 78)
>         (mem/u/c:SI (symbol_ref/u:SI ("*.LC4") [flags 0x2]) [2  S4 A32])) arraysum.c:11 29 {movsi_internal}
>      (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum")  <var_decl 0x7f6eef5d6360 g_sum>)
>         (nil)))
> (insn 46 45 0 4 (set (mem/c:SI (reg/f:SI 78) [2 g_sum+0 S4 A32])
>         (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal}
>      (expr_list:REG_DEAD (reg/f:SI 78)
>         (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ])
>             (nil))))
>
>
> 2. RTL after the doloop optimization pass(arraysum.c.195r.loop2_doloop):
> (note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
> (note 32 34 36 2 NOTE_INSN_FUNCTION_BEG)
> (insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ])
>         (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
>      (expr_list:REG_EQUAL (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
>         (nil)))
> (insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ])
>         (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2  S4 A32])) 29 {movsi_internal}
>      (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g")  <var_decl 0x7f6eef5d62d0 g>)
>                 (const_int 4096 [0x1000])))
>         (nil)))
> (insn 33 37 54 2 (set (reg/v:SI 74 [ Sum ])
>         (const_int 0 [0])) arraysum.c:6 29 {movsi_internal}
>      (nil))
> (insn 54 33 42 2 (set (reg:SI 79)
>         (const_int 1024 [0x400])) arraysum.c:6 -1
>      (nil))
> (code_label 42 54 38 3 2 "" [0 uses])
> (note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
> (insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
>         (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal}
>      (nil))
> (insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ])
>         (plus:SI (reg/v:SI 74 [ Sum ])
>             (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3}
>      (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ])
>         (nil)))
> (insn 41 40 53 3 (set (reg:SI 72 [ ivtmp$8 ])
>         (plus:SI (reg:SI 72 [ ivtmp$8 ])
>             (const_int 4 [0x4]))) 1 {addsi3}
>      (nil))
> (jump_insn 53 41 52 3 (parallel [
>             (set (pc)
>                 (if_then_else (ne (reg:SI 79)
>                         (const_int 1 [0x1]))
>                     (label_ref 52)
>                     (pc)))
>             (set (reg:SI 79)
>                 (plus:SI (reg:SI 79)
>                     (const_int -1 [0xffffffffffffffff])))
>             (unspec [
>                     (const_int 0 [0])
>                 ] 13)
>             (clobber (scratch:SI))
>         ]) -1
>      (int_list:REG_BR_PROB 9899 (nil))
>  -> 52)
> (code_label 52 53 51 5 3 "" [1 uses])
> (note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK)
> (note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
> (insn 45 44 46 4 (set (reg/f:SI 78)
>         (mem/u/c:SI (symbol_ref/u:SI ("*.LC4") [flags 0x2]) [2  S4 A32])) arraysum.c:11 29 {movsi_internal}
>      (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum")  <var_decl 0x7f6eef5d6360 g_sum>)
>         (nil)))
> (insn 46 45 0 4 (set (mem/c:SI (reg/f:SI 78) [2 g_sum+0 S4 A32])
>         (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal}
>      (expr_list:REG_DEAD (reg/f:SI 78)
>         (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ])
>             (nil))))
>
>
>>
>> On Tue, Oct 14, 2014 at 8:39 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
>> > PING?
>> > Cheers,
>> > Felix
>>
>> Felix,
>>
>> This isn't my day job, 24-hour pings are unproductive.
>>
>> You shouldn't need to worry about the trip count register getting spilled. It
>> makes no difference whatsoever to how the loop operates--the trip count is
>> dead with regards to the loop once the instruction executes. You don't need to
>> describe LCOUNT to gcc in order for this not to matter. It should be enough to
>> describe the zcl as consuming the value in the same way a branch instruction
>> consumes a value.
>>
>> If you have a case where spilling it is causing a problem, then there is a bug in
>> your code, papered over by dropping case when it is spilled. Similarly with
>> iter_reg_used_outside--it shouldn't affect whether or not a zcl is valid here. If
>> you have a case where it does, there is likely a bug in your code.
>>
>> If the code is easier to write by maintaining trip_count up, then fine (for now);
>> you give up some performance (in fact, a lot of performance), but that doesn't
>> matter as to the correctness.
>>
>>
>> >
>> >
>> > On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com>
>> wrote:
>> >> Thanks for the comments.
>> >>
>> >> The patch checked the usage of the trip count register, making sure
>> >> that it is not used in the loop body other than the doloop_end or
>> >> lives past the doloop_end instruction, as the following code snippet
>> >> shows:
>> >>
>> >> +  /* Scan all the blocks to make sure they don't use iter_reg.  */
>> >> + if (loop->iter_reg_used || loop->iter_reg_used_outside)
>> >> +    {
>> >> +      if (dump_file)
>> >> +        fprintf (dump_file, ";; loop %d uses iterator\n",
>> >> +                 loop->loop_no);
>> >> +      return false;
>> >> +    }
>> >>
>> >>     For the spill issue, I think we need to handle it. The reason is
>> >> that currently we are not telling GCC about the existence of the
>> >> LCOUNT register. Instead, we keep the trip count in a general
>> >> register and it's possible that this register can be spilled when
>> >> register pressure is high.
>> >>     It's a good idea to post another patch to describe the LCOUNT
>> >> register in GCC in order to free this general register. But I want
>> >> this patch applied as a first step, OK?
>> >>
>> >> Cheers,
>> >> Felix
>> >>
>> >>
>> >> On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com
>> >> <augustine.sterling@gmail.com> wrote:
>> >>> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com>
>> wrote:
>> >>>> Hi Sterling,
>> >>>>
>> >>>>     I made some improvement to the patch. Two changes:
>> >>>>     1. TARGET_LOOPS is now used as a condition of the doloop
>> >>>> related patterns, which is more elegant.
>> >>>
>> >>> Fine.
>> >>>
>> >>>>     2. As the trip count register of the zero-cost loop maybe
>> >>>> potentially spilled, we need to change the patterns in order to
>> >>>> handle this issue.
>> >>>
>> >>> Actually, for xtensa you don't. The trip count is copied into LCOUNT
>> >>> at the execution of the loop instruction, and therefore a spill or
>> >>> whatever doesn't matter--it won't affect the result. So as long as
>> >>> you have the trip count at the start of the loop, you are fine.
>> >>>
>> >>> This does bring up an issue of whether or not the trip count can be
>> >>> modified during the loop. (note that this is different than early
>> >>> exit.) If it can, you can't use a zero-overhead loop. Does your
>> >>> patch address this case.
>> >>>
>> >>> The solution is similar to that adapted by c6x backend.
>> >>>> Just turn the zero-cost loop into a regular loop when that happens
>> >>>> when reload is completed.
>> >>>>     Attached please find version 4 of the patch. Make check
>> >>>> regression tested with xtensa-elf-gcc/simulator.
>> >>>>     OK for trunk?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-21 14:57                         ` augustine.sterling
@ 2014-10-22  2:20                           ` Yangfei (Felix)
  2014-10-22  5:20                           ` Yangfei (Felix)
  1 sibling, 0 replies; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-22  2:20 UTC (permalink / raw)
  To: augustine.sterling, gcc-patches; +Cc: Felix Yang

[-- Attachment #1: Type: text/plain, Size: 1627 bytes --]

Hi Sterling,

    Attached please find the testcase for the spill issue. Try it out with the patch :-)


> 
> On Wed, Oct 15, 2014 at 7:10 PM, Yangfei (Felix) <felix.yang@huawei.com>
> wrote:
> > Hi Sterling,
> >
> >     Since the patch is delayed for a long time, I'm kind of pushing it. Sorry for
> that.
> >     Yeah, you are right. We have some performance issue here as GCC may
> use one more general register in some cases with this patch.
> >     Take the following arraysum testcase for example. In doloop optimization,
> GCC figures out that the number of iterations is 1024 and creates a new pseudo
> 79 as the new trip count register.
> >     The pseudo 79 is live throughout the loop, this makes the register
> pressure in the loop higher. And it's possible that this new pseudo is spilled by
> reload when the register pressure is very high.
> >     I know that the xtensa loop instruction copies the trip count register into
> the LCOUNT special register. And we need describe this hardware feature in GCC
> in order to free the trip count register.
> >     But I find it difficult to do. Do you have any good suggestions on this?
> 
> There are two issues related to the trip count, one I would like you to solve now,
> one later.
> 
> 1. Later: The trip count doesn't need to be updated at all inside these loops, once
> the loop instruction executes. The code below relates to this case.
> 
> 2. Now: You should be able to use a loop instruction regardless of whether the
> trip count is spilled. If you have an example where that wouldn't work, I would
> love to see it.
> 

[-- Attachment #2: tripcount-spill.c --]
[-- Type: text/plain, Size: 1626 bytes --]

/* Test case attached to this thread for the trip-count spill issue
   (tripcount-spill.c).

   The loop runs while F < H, where H is V / 4.  Each iteration mixes the
   scalar arguments A, B, C, D and E with elements of the local array X
   using shifts, ors, xors and adds; the only store visible to the caller
   is the update of *W on the last line of the loop body.

   NOTE(review): X is never initialised before it is read, so the values
   computed here are indeterminate.  Presumably this file is only meant
   to be compiled (to create enough register pressure that the doloop
   trip-count pseudo gets spilled), not executed -- confirm before
   reusing it as a run-time test.  */
void
foo (unsigned f, long v, unsigned *w, unsigned a, unsigned b, unsigned e, unsigned c, unsigned d)
{
  unsigned h = v / 4, x[16];
  /* F is only incremented inside the loop and H is not modified by it.  */
  while (f < h)
    {
      unsigned i;
      f++;
      a |= (a >> 30);
      d = (d << 30) | ((unsigned) d >> 30);
      c = (c << 30) | ((unsigned) c >> 30);
      b = 30 | ((unsigned) b >> 30);
      d += a = (a << 30) | ((unsigned) a >> 2);
      c += ((d << 5) | (d >> 27)) + ((e & (a ^ b))) + 0x5a827999 + x[12];
      a += (c & e);
      c = 30 | ((unsigned) c);
      i = x[5] ^ x[7] ^ x[8] ^ x[3];
      x[5] = (i << 1) | ((unsigned) i >> 31);
      i = x[6] ^ x[2] ^ x[14] ^ x[13];
      x[6] = (i << 1) | (i >> 31);
      b += (c | (c >> 5)) + (d ^ e) + 0x6ed9eba1 + (x[7] = (i << 1) | ((unsigned) i >> 31));
      x[8] = i | 1;
      e += (a | 5) + b + (i = x[9] ^ x[6], x[10] = (i << (unsigned) i));
      e = 30 | ((unsigned) e >> 30);
      i = x[12] ^ x[14] ^ x[12] ^ x[12], (x[12] = 1 | ((unsigned) i));
      i = x[13] ^ x[5] ^ x[10], (x[13] = (i << (unsigned) 1));
      i = x[2] ^ x[7] ^ x[12], (x[15] = i | ((unsigned) i >> 1));
      i = x[2] ^ x[0] ^ x[13], (x[0] = (i << 1) | 31);
      e = (e << 30) | 2;
      i = x[14] ^ x[2] ^ x[15], (x[2] = i | 1);
      x[3] = i | ((unsigned) i);
      i = x[14] ^ x[12] ^ x[4], (x[4] = 1 | ((unsigned) i >> 1));
      x[5] = i | 1;
      e = (e << 30) | 30;
      b += (5 | ((unsigned) e >> 5)) + 0x8f1bbcdc + (x[9] = (i | ((unsigned) i >> 1)));
      i = x[2] ^ (x [10] = ((i << 1) | (i >> 1)));
      x[13] = (i | ((unsigned) i >> 1));
      (i = x[14] ^ x[0] ^ x[14], (x[14] = ((i << 1) | 31)));
      /* Accumulate A into *W and carry the sum into the next iteration.  */
      a = *w += a;
    }
}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-21 14:57                         ` augustine.sterling
  2014-10-22  2:20                           ` Yangfei (Felix)
@ 2014-10-22  5:20                           ` Yangfei (Felix)
  2014-10-23 17:51                             ` augustine.sterling
  1 sibling, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-22  5:20 UTC (permalink / raw)
  To: augustine.sterling, gcc-patches; +Cc: Felix Yang

If the trip count spill issue is not handled in the pattern, an ICE may occur.
Here reload is trying to spill pseudo 173, but a memory operand is not allowed in the zero_cost_loop_end pattern.
This is what I am trying to solve.

pr44023.c:48:1: error: unable to generate reloads for:
 }
 ^
(jump_insn 136 113 116 4 (parallel [
            (set (pc)
                (if_then_else (ne (reg:SI 173)
                        (const_int 1 [0x1]))
                    (label_ref:SI 114)
                    (pc)))
            (set (reg:SI 173)
                (plus:SI (reg:SI 173)
                    (const_int -1 [0xffffffffffffffff])))
            (unspec [
                    (const_int 0 [0])
                ] 13)
        ]) pr44023.c:46 48 {zero_cost_loop_end}
     (int_list:REG_BR_PROB 9100 (nil))
 -> 114)
pr44023.c:48:1: internal compiler error: in find_reloads, at reload.c:3833
0x989383 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
        ../../trunk/gcc/rtl-error.c:110
0x9777cb find_reloads(rtx_insn*, int, int, int, short*)
        ../../trunk/gcc/reload.c:3833
0x9875f1 calculate_needs_all_insns
        ../../trunk/gcc/reload1.c:1515
0x9875f1 reload(rtx_insn*, int)
        ../../trunk/gcc/reload1.c:1003
0x86a8a7 do_reload
        ../../trunk/gcc/ira.c:5323
0x86acd8 execute
        ../../trunk/gcc/ira.c:5470


> 
> Hi Sterling,
> 
>     Attached please find the testcase for the spill issue. Try it out with the
> patch :-)
> 
> 
> >
> > On Wed, Oct 15, 2014 at 7:10 PM, Yangfei (Felix)
> > <felix.yang@huawei.com>
> > wrote:
> > > Hi Sterling,
> > >
> > >     Since the patch is delayed for a long time, I'm kind of pushing
> > > it. Sorry for
> > that.
> > >     Yeah, you are right. We have some performance issue here as GCC
> > > may
> > use one more general register in some cases with this patch.
> > >     Take the following arraysum testcase for example. In doloop
> > > optimization,
> > GCC figures out that the number of iterations is 1024 and creates a
> > new pseudo
> > 79 as the new trip count register.
> > >     The pseudo 79 is live throughout the loop, this makes the
> > > register
> > pressure in the loop higher. And it's possible that this new pseudo is
> > spilled by reload when the register pressure is very high.
> > >     I know that the xtensa loop instruction copies the trip count
> > > register into
> > the LCOUNT special register. And we need describe this hardware
> > feature in GCC in order to free the trip count register.
> > >     But I find it difficult to do. Do you have any good suggestions on this?
> >
> > There are two issues related to the trip count, one I would like you
> > to solve now, one later.
> >
> > 1. Later: The trip count doesn't need to be updated at all inside
> > these loops, once the loop instruction executes. The code below relates to this
> case.
> >
> > 2. Now: You should be able to use a loop instruction regardless of
> > whether the trip count is spilled. If you have an example where that
> > wouldn't work, I would love to see it.
> >

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-22  5:20                           ` Yangfei (Felix)
@ 2014-10-23 17:51                             ` augustine.sterling
  2014-10-24  1:49                               ` Yangfei (Felix)
  2014-10-24  4:32                               ` Yangfei (Felix)
  0 siblings, 2 replies; 29+ messages in thread
From: augustine.sterling @ 2014-10-23 17:51 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: gcc-patches, Felix Yang

On Tue, Oct 21, 2014 at 7:20 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> If the tripcount spill issue is not handled in the pattern, ICE may happen then.
> Here reload is trying to spill pseudo 173, but a memory operand is not allowed in zero_cost_loop_end pattern.
> And this is what I am trying to solve.

We have full control of the zero_cost_loop_end pattern. Plus, it
doesn't actually generate any real code. Edit it so it can take a
memory operand.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-23 17:51                             ` augustine.sterling
@ 2014-10-24  1:49                               ` Yangfei (Felix)
  2014-10-24  4:32                               ` Yangfei (Felix)
  1 sibling, 0 replies; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-24  1:49 UTC (permalink / raw)
  To: augustine.sterling; +Cc: gcc-patches, Felix Yang

> On Tue, Oct 21, 2014 at 7:20 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> > If the tripcount spill issue is not handled in the pattern, ICE may happen then.
> > Here reload is trying to spill pseudo 173, but a memory operand is not allowed
> in zero_cost_loop_end pattern.
> > And this is what I am trying to solve.
> 
> We have full control of the zero_cost_loop_end pattern. Plus, it doesn't actually
> generate any real code. Edit it so it can take a memory operand.

Here the key point is that we need a general purpose register for the "loop" instruction.
If the trip count register is spilled, we no longer have a general purpose register, so we cannot use zero-cost looping in this situation.
That's why I split the zero_cost_loop_end into a normal test and branch.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-23 17:51                             ` augustine.sterling
  2014-10-24  1:49                               ` Yangfei (Felix)
@ 2014-10-24  4:32                               ` Yangfei (Felix)
  2014-10-24  6:28                                 ` augustine.sterling
  1 sibling, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-24  4:32 UTC (permalink / raw)
  To: gcc-patches, augustine.sterling; +Cc: Felix Yang

> > On Tue, Oct 21, 2014 at 7:20 PM, Yangfei (Felix) <felix.yang@huawei.com>
> wrote:
> > > If the tripcount spill issue is not handled in the pattern, ICE may happen then.
> > > Here reload is trying to spill pseudo 173, but a memory operand is
> > > not allowed
> > in zero_cost_loop_end pattern.
> > > And this is what I am trying to solve.
> >
> > We have full control of the zero_cost_loop_end pattern. Plus, it
> > doesn't actually generate any real code. Edit it so it can take a memory
> operand.
> 
> Here the key point is we need a general purpose register for the "loop"
> instruction.
> If the trip count register is spilled, we don't have a general purpose register then.
> And we cannot use zero-cost looping in this situation.
> And that's why I spilt the zero_cost_loop_end into a normal test and branch.

Also note that the hwloop_pattern_reg interface expects a general purpose register in the doloop_end pattern.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  4:32                               ` Yangfei (Felix)
@ 2014-10-24  6:28                                 ` augustine.sterling
  2014-10-24  6:33                                   ` Yangfei (Felix)
  0 siblings, 1 reply; 29+ messages in thread
From: augustine.sterling @ 2014-10-24  6:28 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: gcc-patches, Felix Yang

On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
>> Here the key point is we need a general purpose register for the "loop"
>> instruction.

So the question to ask here is, "How does this work today, without
loop instructions?" Somehow--even when it has been spilled--a branch
instruction can test the trip count. There should be no difference.

>> And we cannot use zero-cost looping in this situation.
>> And that's why I spilt the zero_cost_loop_end into a normal test and branch.

> Also note that the hwloop_pattern_reg interface also expects a general purpose register in the doloop_end pattern.

If there were no loop instruction, how would this work?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  6:28                                 ` augustine.sterling
@ 2014-10-24  6:33                                   ` Yangfei (Felix)
  2014-10-24  6:40                                     ` augustine.sterling
  0 siblings, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-24  6:33 UTC (permalink / raw)
  To: augustine.sterling; +Cc: gcc-patches, Felix Yang

> 
> On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> >> Here the key point is we need a general purpose register for the "loop"
> >> instruction.
> 
> So the question to ask here is, "How does this work today, without loop
> instructions?" Somehow--even when it has been spilled--a branch instruction can
> test the trip count. There should be no difference.
> 
> >> And we cannot use zero-cost looping in this situation.
> >> And that's why I spilt the zero_cost_loop_end into a normal test and branch.
> 
> > Also note that the hwloop_pattern_reg interface also expects a general
> purpose register in the doloop_end pattern.
> 
> If there were no loop instruction, how would this work?


Just take a look at my patch. I handle this in the new define_split:

+(define_split
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "")
+        (plus:SI (match_dup 0)
+                 (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch 3))]
+  "TARGET_LOOPS && optimize && reload_completed"
+  [(const_int 0)]
+{
+  if (!REG_P (operands[0]))
+    {
+      rtx test;
+
+      /* Fallback into a normal conditional branch insn.  */
+      emit_move_insn (operands[3], operands[0]);
+      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
+      emit_move_insn (operands[0], operands[3]);
+      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
+      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
+                                      const0_rtx, operands[1]));
+    }
+  else
+    {
+      emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2]));
+    }
+
+  DONE;
+})

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  6:33                                   ` Yangfei (Felix)
@ 2014-10-24  6:40                                     ` augustine.sterling
  2014-10-24  6:43                                       ` Yangfei (Felix)
  0 siblings, 1 reply; 29+ messages in thread
From: augustine.sterling @ 2014-10-24  6:40 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: gcc-patches, Felix Yang

I mean without your patch at all.

On Thu, Oct 23, 2014 at 11:30 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
>>
>> On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
>> >> Here the key point is we need a general purpose register for the "loop"
>> >> instruction.
>>
>> So the question to ask here is, "How does this work today, without loop
>> instructions?" Somehow--even when it has been spilled--a branch instruction can
>> test the trip count. There should be no difference.
>>
>> >> And we cannot use zero-cost looping in this situation.
>> >> And that's why I spilt the zero_cost_loop_end into a normal test and branch.
>>
>> > Also note that the hwloop_pattern_reg interface also expects a general
>> purpose register in the doloop_end pattern.
>>
>> If there were no loop instruction, how would this work?
>
>
> Just take a look at my patch. I handle this in the new define_split:
>
> +(define_split
> +  [(set (pc)
> +        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "")
> +                          (const_int 1))
> +                      (label_ref (match_operand 1 "" ""))
> +                      (pc)))
> +   (set (match_operand:SI 2 "nonimmediate_operand" "")
> +        (plus:SI (match_dup 0)
> +                 (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
> +   (clobber (match_scratch 3))]
> +  "TARGET_LOOPS && optimize && reload_completed"
> +  [(const_int 0)]
> +{
> +  if (!REG_P (operands[0]))
> +    {
> +      rtx test;
> +
> +      /* Fallback into a normal conditional branch insn.  */
> +      emit_move_insn (operands[3], operands[0]);
> +      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
> +      emit_move_insn (operands[0], operands[3]);
> +      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
> +      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
> +                                      const0_rtx, operands[1]));
> +    }
> +  else
> +    {
> +      emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2]));
> +    }
> +
> +  DONE;
> +})

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  6:40                                     ` augustine.sterling
@ 2014-10-24  6:43                                       ` Yangfei (Felix)
  2014-10-24  6:49                                         ` augustine.sterling
  0 siblings, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-24  6:43 UTC (permalink / raw)
  To: augustine.sterling; +Cc: gcc-patches, Felix Yang

1. The original xtensa port never generates the "loop" instruction at all.
2. A port doesn't need to implement the hwloop_pattern_reg hook if it has no zero-cost loop instruction.

Is that clear?

> 
> I mean without your patch at all.
> 
> On Thu, Oct 23, 2014 at 11:30 PM, Yangfei (Felix) <felix.yang@huawei.com>
> wrote:
> >>
> >> On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com>
> wrote:
> >> >> Here the key point is we need a general purpose register for the "loop"
> >> >> instruction.
> >>
> >> So the question to ask here is, "How does this work today, without
> >> loop instructions?" Somehow--even when it has been spilled--a branch
> >> instruction can test the trip count. There should be no difference.
> >>
> >> >> And we cannot use zero-cost looping in this situation.
> >> >> And that's why I spilt the zero_cost_loop_end into a normal test and
> branch.
> >>
> >> > Also note that the hwloop_pattern_reg interface also expects a
> >> > general
> >> purpose register in the doloop_end pattern.
> >>
> >> If there were no loop instruction, how would this work?
> >
> >
> > Just take a look at my patch. I handle this in the new define_split:
> >
> > +(define_split
> > +  [(set (pc)
> > +        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand"
> "")
> > +                          (const_int 1))
> > +                      (label_ref (match_operand 1 "" ""))
> > +                      (pc)))
> > +   (set (match_operand:SI 2 "nonimmediate_operand" "")
> > +        (plus:SI (match_dup 0)
> > +                 (const_int -1)))
> > +   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
> > +   (clobber (match_scratch 3))]
> > +  "TARGET_LOOPS && optimize && reload_completed"
> > +  [(const_int 0)]
> > +{
> > +  if (!REG_P (operands[0]))
> > +    {
> > +      rtx test;
> > +
> > +      /* Fallback into a normal conditional branch insn.  */
> > +      emit_move_insn (operands[3], operands[0]);
> > +      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
> > +      emit_move_insn (operands[0], operands[3]);
> > +      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
> > +      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
> > +                                      const0_rtx, operands[1]));
> > +    }
> > +  else
> > +    {
> > +      emit_jump_insn (gen_loop_end (operands[0], operands[1],
> operands[2]));
> > +    }
> > +
> > +  DONE;
> > +})

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  6:43                                       ` Yangfei (Felix)
@ 2014-10-24  6:49                                         ` augustine.sterling
  2014-10-24  6:53                                           ` Yangfei (Felix)
  0 siblings, 1 reply; 29+ messages in thread
From: augustine.sterling @ 2014-10-24  6:49 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: gcc-patches, Felix Yang

On Thu, Oct 23, 2014 at 11:40 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> 1. The original xtensa port never generates "loop" instruction at all.
> 2. A port doesn't need to implement hwloop_pattern_reg hook if it has no zero-cost loop instruction.
>
> Is that clear?

We are talking in circles. I understand very well what goes on here.

My point is:

1. Right now, today, GCC generates loops with branch instructions even
when the trip count is spilled.
2. Branch instructions and loop instructions have identical register
requirements.

Therefore:

3. loop instructions should be generatable when the trip count is spilled.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  6:49                                         ` augustine.sterling
@ 2014-10-24  6:53                                           ` Yangfei (Felix)
  2014-10-24  7:15                                             ` Andrew Pinski
  0 siblings, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-24  6:53 UTC (permalink / raw)
  To: augustine.sterling; +Cc: gcc-patches, Felix Yang

Thanks for the explanation. I think I am clear about what you are thinking now. 
That's an interesting question. I am not sure about the reason why GCC's reload cannot handle a doloop_end insn.
I guess maybe the doloop_end pattern is special? I mean it's a branch insn in a parallel form.



> 
> On Thu, Oct 23, 2014 at 11:40 PM, Yangfei (Felix) <felix.yang@huawei.com>
> wrote:
> > 1. The original xtensa port never generates "loop" instruction at all.
> > 2. A port doesn't need to implement hwloop_pattern_reg hook if it has no
> zero-cost loop instruction.
> >
> > Is that clear?
> 
> We are talking in circles. I understand very well what goes on here.
> 
> My point is:
> 
> 1. Right now, today, GCC generates loops with branch instructions even when the
> trip count is spilled.
> 2. Branch instructions and loop instructions have identical register requirements.
> 
> Therefore:
> 
> 3. loop instructions should be generatable when the trip count is spilled.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  6:53                                           ` Yangfei (Felix)
@ 2014-10-24  7:15                                             ` Andrew Pinski
  2014-10-28 12:24                                               ` Yangfei (Felix)
  0 siblings, 1 reply; 29+ messages in thread
From: Andrew Pinski @ 2014-10-24  7:15 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: augustine.sterling, gcc-patches, Felix Yang

On Thu, Oct 23, 2014 at 11:51 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> Thanks for the explanation. I think I am clear about what you are thinking now.
> That's an interesting question. I am not sure about reason why GCC's reload cannot handle a doloop_end insn.
> I guess maybe the doloop_end pattern is special? I mean it's a branch insn in a parallel form.


No, it is not special.  Jumps are just never handled by reload.  I
thought this was documented somewhere also.  Basically, the main issue
with jumps is where the reload value goes: on which side of the jump?

Thanks,
Andrew

>
>
>
>>
>> On Thu, Oct 23, 2014 at 11:40 PM, Yangfei (Felix) <felix.yang@huawei.com>
>> wrote:
>> > 1. The original xtensa port never generates "loop" instruction at all.
>> > 2. A port doesn't need to implement hwloop_pattern_reg hook if it has no
>> zero-cost loop instruction.
>> >
>> > Is that clear?
>>
>> We are talking in circles. I understand very well what goes on here.
>>
>> My point is:
>>
>> 1. Right now, today, GCC generates loops with branch instructions even when the
>> trip count is spilled.
>> 2. Branch instructions and loop instructions have identical register requirements.
>>
>> Therefore:
>>
>> 3. loop instructions should be generatable when the trip count is spilled.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-24  7:15                                             ` Andrew Pinski
@ 2014-10-28 12:24                                               ` Yangfei (Felix)
  2014-10-30 22:02                                                 ` augustine.sterling
  0 siblings, 1 reply; 29+ messages in thread
From: Yangfei (Felix) @ 2014-10-28 12:24 UTC (permalink / raw)
  To: gcc-patches, augustine.sterling, Andrew Pinski; +Cc: Felix Yang

> On Thu, Oct 23, 2014 at 11:51 PM, Yangfei (Felix) <felix.yang@huawei.com>
> wrote:
> > Thanks for the explanation. I think I am clear about what you are thinking now.
> > That's an interesting question. I am not sure about reason why GCC's reload
> cannot handle a doloop_end insn.
> > I guess maybe the doloop_end pattern is special? I mean it's a branch insn in a
> parallel form.
> 
> 
> No it is not special.  Just jump are never handled by reload.  I thought this was
> documented somewhere also.  Basically the main issue with jumps is where
> does the reload value go which side of the jump?
> 
> Thanks,
> Andrew
> 


Hi Sterling,
  What do you think about this issue?
  As the c6x and bfin ports handle this the same way, is it OK for the patch to be applied?
  Thanks. 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] Add zero-overhead looping for xtensa backend
  2014-10-28 12:24                                               ` Yangfei (Felix)
@ 2014-10-30 22:02                                                 ` augustine.sterling
  0 siblings, 0 replies; 29+ messages in thread
From: augustine.sterling @ 2014-10-30 22:02 UTC (permalink / raw)
  To: Yangfei (Felix); +Cc: gcc-patches, Andrew Pinski, Felix Yang

[-- Attachment #1: Type: text/plain, Size: 1395 bytes --]

On Tue, Oct 28, 2014 at 5:22 AM, Yangfei (Felix) <felix.yang@huawei.com> wrote:
> Hi Sterling,
>   How do you think about this issue?
>   As c6x/bfin port handles this the same way, is it OK for the patch to be applied?
>   Thanks.

I have committed this patch as attached. I made a couple of minor
cleanups, plus some small fixes to the ChangeLog entry.

The new code generated is better than the old code, but I'm not
particularly happy with the result. In particular, it is way too
conservative around spilling the trip count, and it still maintains
the trip count inside the loop, in spite of the trip count being dead
at that point. It won't always be dead, but when it is, it should be
eliminated.

So there is quite a bit of performance that could still be gained here.

Thanks for the patch Felix.

Sterling

2014-10-10  Felix Yang  <felix.yang@huawei.com>

    * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro.
    * config/xtensa/xtensa.c: Include dumpfile.h and hw-doloop.h.
    (xtensa_reorg, xtensa_reorg_loops): New.
    (xtensa_can_use_doloop_p, xtensa_invalid_within_doloop): New.
    (hwloop_optimize, hwloop_fail, hwloop_pattern_reg): New.
    (xtensa_emit_loop_end): Emit the zero-overhead loop end label.
    (xtensa_doloop_hooks): Define.
    * config/xtensa/xtensa.md (doloop_end, loop_end): New.
    (zero_cost_loop_start): Rewritten.
    (zero_cost_loop_end): Likewise.

[-- Attachment #2: loop-patch.diff --]
[-- Type: text/plain, Size: 13288 bytes --]

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c	(revision 216943)
+++ gcc/config/xtensa/xtensa.c	(working copy)
@@ -74,6 +74,8 @@
 #include "gimplify.h"
 #include "df.h"
 #include "builtins.h"
+#include "dumpfile.h"
+#include "hw-doloop.h"
 #include "rtl-iter.h"
 
 
@@ -200,6 +202,10 @@
 
 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const rtx_insn *);
 
 static bool xtensa_member_type_forces_blk (const_tree,
 					   machine_mode mode);
@@ -326,6 +332,15 @@
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 \f
@@ -1690,7 +1705,7 @@
         }
     }
 
-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }
 
 
@@ -3720,4 +3735,236 @@
   return !xtensa_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (const widest_int &, const widest_int &,
+                         unsigned int loop_depth, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop
+     for innermost loops which must be entered from the top.  */
+  if (loop_depth > 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* Return NULL if INSN is valid within a low-overhead loop.
+   Otherwise return a string explaining why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return)
+    return "Return from a call instruction in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx iter_reg;
+  rtx_insn *insn, *seq, *entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test;
+  rtx_insn *insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx_insn *insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (false, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"
Index: gcc/config/xtensa/xtensa.h
===================================================================
--- gcc/config/xtensa/xtensa.h	(revision 216943)
+++ gcc/config/xtensa/xtensa.h	(working copy)
@@ -65,6 +65,7 @@
 #define TARGET_S32C1I		XCHAL_HAVE_S32C1I
 #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 #define TARGET_THREADPTR	XCHAL_HAVE_THREADPTR
+#define TARGET_LOOPS	        XCHAL_HAVE_LOOPS
 
 #define TARGET_DEFAULT \
   ((XCHAL_HAVE_L32R	? 0 : MASK_CONST16) |				\
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md	(revision 216943)
+++ gcc/config/xtensa/xtensa.md	(working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL	9)
   (UNSPEC_TP		10)
   (UNSPEC_MEMW		11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)
 
   (UNSPECV_SET_FP	1)
   (UNSPECV_ENTRY	2)
@@ -1279,21 +1281,25 @@
    (set_attr "length"	"3")])
 
 
+;; Zero-overhead looping support.
+
 ;; Define the loop insns used by bct optimization to represent the
-;; start and end of a zero-overhead loop (in loop.c).  This start
-;; template generates the loop insn; the end template doesn't generate
-;; any instructions since loop end is handled in hardware.
+;; start and end of a zero-overhead loop.  This start template generates
+;; the loop insn; the end template doesn't generate any instructions since
+;; loop end is handled in hardware.
 
 (define_insn "zero_cost_loop_start"
   [(set (pc)
-	(if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-			  (const_int 0))
-		      (label_ref (match_operand 1 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (match_dup 0) (const_int -1)))]
-  ""
-  "loopnez\t%0, %l1"
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
+  "TARGET_LOOPS && optimize"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"3")])
@@ -1300,20 +1306,95 @@
 
 (define_insn "zero_cost_loop_end"
   [(set (pc)
-	(if_then_else (ne (reg:SI 19) (const_int 0))
-		      (label_ref (match_operand 0 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (reg:SI 19) (const_int -1)))]
-  ""
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "2,2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "=a,m")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch:SI 3 "=X,&r"))]
+  "TARGET_LOOPS && optimize"
+  "#"
+  [(set_attr "type"	"jump")
+   (set_attr "mode"	"none")
+   (set_attr "length"	"0")])
+
+(define_insn "loop_end"
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "register_operand" "2")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "register_operand" "=a")
+        (plus (match_dup 0)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
+  "TARGET_LOOPS && optimize"
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
+(define_split
+  [(set (pc)
+        (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 2 "nonimmediate_operand" "")
+        (plus:SI (match_dup 0)
+                 (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+   (clobber (match_scratch 3))]
+  "TARGET_LOOPS && optimize && reload_completed"
+  [(const_int 0)]
+{
+  if (!REG_P (operands[0]))
+    {
+      rtx test;
+
+      /* Fallback into a normal conditional branch insn.  */
+      emit_move_insn (operands[3], operands[0]);
+      emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx));
+      emit_move_insn (operands[0], operands[3]);
+      test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx);
+      emit_jump_insn (gen_cbranchsi4 (test, operands[3],
+                                      const0_rtx, operands[1]));
+    }
+  else
+    {
+      emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2]));
+    }
+
+  DONE;
+})
+
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)
+              (clobber (match_dup 2))])] ; match_scratch
+  "TARGET_LOOPS && optimize"
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+  operands[2] = gen_rtx_SCRATCH (SImode);
+})
+
 \f
 ;; Setting a register from a comparison.
 

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2014-10-30 21:41 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-08 16:27 [PATCH] Add zero-overhead looping for xtensa backend Felix Yang
2014-01-08 16:49 ` Sterling Augustine
2014-01-09 15:08   ` Felix Yang
2014-01-09 23:51     ` Felix Yang
2014-01-10  3:49       ` Yangfei (Felix)
2014-01-13 17:24         ` Sterling Augustine
2014-10-09 11:04           ` Felix Yang
2014-10-10 14:01             ` Felix Yang
2014-10-11  9:32               ` [PING] [PATCH, xtensa] " Yangfei (Felix)
2014-10-13 16:09               ` [PATCH] " augustine.sterling
2014-10-13 16:30                 ` Felix Yang
2014-10-14 15:43                   ` Felix Yang
2014-10-15 19:51                     ` augustine.sterling
2014-10-16  4:52                       ` Yangfei (Felix)
2014-10-21 14:57                         ` augustine.sterling
2014-10-22  2:20                           ` Yangfei (Felix)
2014-10-22  5:20                           ` Yangfei (Felix)
2014-10-23 17:51                             ` augustine.sterling
2014-10-24  1:49                               ` Yangfei (Felix)
2014-10-24  4:32                               ` Yangfei (Felix)
2014-10-24  6:28                                 ` augustine.sterling
2014-10-24  6:33                                   ` Yangfei (Felix)
2014-10-24  6:40                                     ` augustine.sterling
2014-10-24  6:43                                       ` Yangfei (Felix)
2014-10-24  6:49                                         ` augustine.sterling
2014-10-24  6:53                                           ` Yangfei (Felix)
2014-10-24  7:15                                             ` Andrew Pinski
2014-10-28 12:24                                               ` Yangfei (Felix)
2014-10-30 22:02                                                 ` augustine.sterling

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).