public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/6] [ARC] Remove non-standard function calls.
  2018-10-10  8:01 [PATCH 0/6] ARC updates Claudiu Zissulescu
  2018-10-10  8:01 ` [PATCH 6/6] [ARC] Handle store cacheline hazard Claudiu Zissulescu
@ 2018-10-10  8:01 ` Claudiu Zissulescu
  2018-10-11 10:14   ` Andrew Burgess
  2018-10-10  8:01 ` [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads Claudiu Zissulescu
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-10  8:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Replace all custom "library" calls with compiler-known patterns.

gcc/
xxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/arc.md (mulsi3): Remove call to mulsi_600_lib.
	(mulsi3_600_lib): Remove pattern.
	(umulsi3_highpart_600_lib_le): Likewise.
	(umulsi3_highpart): Remove call to umulsi3_highpart_600_lib_le.
	(umulsidi3): Remove call to umulsidi3_600_lib.
	(umulsidi3_600_lib): Remove pattern.
	(peephole2): Remove peephole using the above deprecated patterns.

testsuite/
xxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>

	* gcc.target/arc/mulsi3_highpart-2.c: Update test.

libgcc/
xxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/lib1funcs.S (_muldi3): New function.
	* config/arc/t-arc (LIB1ASMFUNCS): Add _muldi3.
---
 gcc/config/arc/arc.md                         | 158 ++----------------
 .../gcc.target/arc/mulsi3_highpart-2.c        |   5 +-
 libgcc/config/arc/lib1funcs.S                 |  54 ++++++
 libgcc/config/arc/t-arc                       |   2 +-
 4 files changed, 67 insertions(+), 152 deletions(-)

diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index 42ca820b91d..d73289a20c4 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -2076,44 +2076,21 @@ archs4x, archs4xd, archs4xd_slow"
 ;; SI <- SI * SI
 
 (define_expand "mulsi3"
- [(set (match_operand:SI 0 "nonimmediate_operand"            "")
+ [(set (match_operand:SI 0 "register_operand"            "")
 	(mult:SI (match_operand:SI 1 "register_operand"  "")
 		 (match_operand:SI 2 "nonmemory_operand" "")))]
-  ""
+  "TARGET_ANY_MPY"
 {
-  if (TARGET_MPY)
-    {
-      if (!register_operand (operands[0], SImode))
-	{
-	  rtx result = gen_reg_rtx (SImode);
-
-	  emit_insn (gen_mulsi3 (result, operands[1], operands[2]));
-	  emit_move_insn (operands[0], result);
-	  DONE;
-	}
-    }
-  else if (TARGET_MUL64_SET)
+  if (TARGET_MUL64_SET)
     {
-     rtx tmp = gen_reg_rtx (SImode);
-     emit_insn (gen_mulsi64 (tmp, operands[1], operands[2]));
-     emit_move_insn (operands[0], tmp);
+     emit_insn (gen_mulsi64 (operands[0], operands[1], operands[2]));
      DONE;
     }
   else if (TARGET_MULMAC_32BY16_SET)
     {
-     rtx tmp = gen_reg_rtx (SImode);
-     emit_insn (gen_mulsi32x16 (tmp, operands[1], operands[2]));
-     emit_move_insn (operands[0], tmp);
+     emit_insn (gen_mulsi32x16 (operands[0], operands[1], operands[2]));
      DONE;
     }
-  else
-    {
-      emit_move_insn (gen_rtx_REG (SImode, R0_REG), operands[1]);
-      emit_move_insn (gen_rtx_REG (SImode, R1_REG), operands[2]);
-      emit_insn (gen_mulsi3_600_lib ());
-      emit_move_insn (operands[0], gen_rtx_REG (SImode, R0_REG));
-      DONE;
-    }
 })
 
 (define_insn_and_split "mulsi32x16"
@@ -2229,27 +2206,6 @@ archs4x, archs4xd, archs4xd_slow"
    (set_attr "predicable" "yes,yes,no,yes")
    (set_attr "cond" "canuse,canuse,canuse_limm,canuse")])
 
-; If we compile without an mul option enabled, but link with libraries
-; for a mul option, we'll see clobbers of multiplier output registers.
-; There is also an implementation using norm that clobbers the loop registers.
-(define_insn "mulsi3_600_lib"
-  [(set (reg:SI R0_REG)
-	(mult:SI (reg:SI R0_REG) (reg:SI R1_REG)))
-   (clobber (reg:SI RETURN_ADDR_REGNUM))
-   (clobber (reg:SI R1_REG))
-   (clobber (reg:SI R2_REG))
-   (clobber (reg:SI R3_REG))
-   (clobber (reg:DI MUL64_OUT_REG))
-   (clobber (reg:SI LP_COUNT))
-   (clobber (reg:SI LP_START))
-   (clobber (reg:SI LP_END))
-   (clobber (reg:CC CC_REG))]
-  "!TARGET_ANY_MPY
-   && SFUNC_CHECK_PREDICABLE"
-  "*return arc_output_libcall (\"__mulsi3\");"
-  [(set_attr "is_sfunc" "yes")
-   (set_attr "predicable" "yes")])
-
 (define_insn_and_split "mulsidi_600"
   [(set (match_operand:DI 0 "register_operand"                               "=c, c,c,  c")
 	(mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand"  "%Rcq#q, c,c,  c"))
@@ -2504,48 +2460,6 @@ archs4x, archs4xd, archs4xd_slow"
    (set_attr "predicable" "yes,no,yes,no")
    (set_attr "cond" "canuse,nocond,canuse,nocond")])
 
-; Implementations include additional labels for umulsidi3, so we got all
-; the same clobbers - plus one for the result low part.  */
-(define_insn "umulsi3_highpart_600_lib_le"
-  [(set (reg:SI R1_REG)
-	(truncate:SI
-	 (lshiftrt:DI
-	  (mult:DI (zero_extend:DI (reg:SI R0_REG))
-		   (zero_extend:DI (reg:SI R1_REG)))
-	  (const_int 32))))
-   (clobber (reg:SI RETURN_ADDR_REGNUM))
-   (clobber (reg:SI R0_REG))
-   (clobber (reg:DI R2_REG))
-   (clobber (reg:SI R12_REG))
-   (clobber (reg:DI MUL64_OUT_REG))
-   (clobber (reg:CC CC_REG))]
-  "!TARGET_BIG_ENDIAN
-   && !TARGET_ANY_MPY
-   && SFUNC_CHECK_PREDICABLE"
-  "*return arc_output_libcall (\"__umulsi3_highpart\");"
-  [(set_attr "is_sfunc" "yes")
-   (set_attr "predicable" "yes")])
-
-(define_insn "umulsi3_highpart_600_lib_be"
-  [(set (reg:SI R0_REG)
-	(truncate:SI
-	 (lshiftrt:DI
-	  (mult:DI (zero_extend:DI (reg:SI R0_REG))
-		   (zero_extend:DI (reg:SI R1_REG)))
-	  (const_int 32))))
-   (clobber (reg:SI RETURN_ADDR_REGNUM))
-   (clobber (reg:SI R1_REG))
-   (clobber (reg:DI R2_REG))
-   (clobber (reg:SI R12_REG))
-   (clobber (reg:DI MUL64_OUT_REG))
-   (clobber (reg:CC CC_REG))]
-  "TARGET_BIG_ENDIAN
-   && !TARGET_ANY_MPY
-   && SFUNC_CHECK_PREDICABLE"
-  "*return arc_output_libcall (\"__umulsi3_highpart\");"
-  [(set_attr "is_sfunc" "yes")
-   (set_attr "predicable" "yes")])
-
 ;; (zero_extend:DI (const_int)) leads to internal errors in combine, so we
 ;; need a separate pattern for immediates
 ;; ??? This is fine for combine, but not for reload.
@@ -2572,23 +2486,11 @@ archs4x, archs4xd, archs4xd_slow"
 	   (zero_extend:DI (match_operand:SI 1 "register_operand" ""))
 	   (zero_extend:DI (match_operand:SI 2 "nonmemory_operand" "")))
 	  (const_int 32))))]
-  "!TARGET_MUL64_SET && !TARGET_MULMAC_32BY16_SET"
+  "TARGET_MPY"
   "
 {
   rtx target = operands[0];
 
-  if (!TARGET_MPY)
-    {
-      emit_move_insn (gen_rtx_REG (SImode, 0), operands[1]);
-      emit_move_insn (gen_rtx_REG (SImode, 1), operands[2]);
-      if (TARGET_BIG_ENDIAN)
-	emit_insn (gen_umulsi3_highpart_600_lib_be ());
-      else
-	emit_insn (gen_umulsi3_highpart_600_lib_le ());
-      emit_move_insn (target, gen_rtx_REG (SImode, 0));
-      DONE;
-    }
-
   if (!register_operand (target, SImode))
     target = gen_reg_rtx (SImode);
 
@@ -2607,7 +2509,7 @@ archs4x, archs4xd, archs4xd_slow"
   [(set (match_operand:DI 0 "register_operand" "")
 	(mult:DI (zero_extend:DI (match_operand:SI 1 "register_operand" ""))
 		 (zero_extend:DI (match_operand:SI 2 "nonmemory_operand" ""))))]
-  ""
+  "TARGET_ANY_MPY"
 {
   if (TARGET_PLUS_MACD)
     {
@@ -2646,12 +2548,8 @@ archs4x, archs4xd, archs4xd_slow"
       DONE;
     }
   else
-    {
-      emit_move_insn (gen_rtx_REG (SImode, R0_REG), operands[1]);
-      emit_move_insn (gen_rtx_REG (SImode, R1_REG), operands[2]);
-      emit_insn (gen_umulsidi3_600_lib ());
-      emit_move_insn (operands[0], gen_rtx_REG (DImode, R0_REG));
-      DONE;
+  {
+   gcc_unreachable ();
     }
 })
 
@@ -2729,7 +2627,7 @@ archs4x, archs4xd, archs4xd_slow"
 		 (zero_extend:DI (match_operand:SI 2 "extend_operand" "cL"))))]
   "TARGET_MPY && !TARGET_PLUS_MACD"
   "#"
-  "reload_completed"
+  "TARGET_MPY && !TARGET_PLUS_MACD && reload_completed"
   [(const_int 0)]
 {
   int hi = !TARGET_BIG_ENDIAN;
@@ -2743,42 +2641,6 @@ archs4x, archs4xd, archs4xd_slow"
   [(set_attr "type" "umulti")
   (set_attr "length" "8")])
 
-(define_insn "umulsidi3_600_lib"
-  [(set (reg:DI R0_REG)
-	(mult:DI (zero_extend:DI (reg:SI R0_REG))
-		 (zero_extend:DI (reg:SI R1_REG))))
-   (clobber (reg:SI RETURN_ADDR_REGNUM))
-   (clobber (reg:DI R2_REG))
-   (clobber (reg:SI R12_REG))
-   (clobber (reg:DI MUL64_OUT_REG))
-   (clobber (reg:CC CC_REG))]
-   "!TARGET_ANY_MPY
-   && SFUNC_CHECK_PREDICABLE"
-  "*return arc_output_libcall (\"__umulsidi3\");"
-  [(set_attr "is_sfunc" "yes")
-   (set_attr "predicable" "yes")])
-
-(define_peephole2
-  [(parallel
-     [(set (reg:DI R0_REG)
-	   (mult:DI (zero_extend:DI (reg:SI R0_REG))
-		    (zero_extend:DI (reg:SI R1_REG))))
-      (clobber (reg:SI RETURN_ADDR_REGNUM))
-      (clobber (reg:DI R2_REG))
-      (clobber (reg:SI R12_REG))
-      (clobber (reg:DI MUL64_OUT_REG))
-      (clobber (reg:CC CC_REG))])]
-  "!TARGET_ANY_MPY
-   && peep2_regno_dead_p (1, TARGET_BIG_ENDIAN ? R1_REG : R0_REG)"
-  [(pc)]
-{
-  if (TARGET_BIG_ENDIAN)
-    emit_insn (gen_umulsi3_highpart_600_lib_be ());
-  else
-    emit_insn (gen_umulsi3_highpart_600_lib_le ());
-  DONE;
-})
-
 (define_expand "addsi3"
   [(set (match_operand:SI 0 "dest_reg_operand" "")
 	(plus:SI (match_operand:SI 1 "register_operand" "")
diff --git a/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c b/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c
index 4b54cbf6a52..22b28cf2507 100644
--- a/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c
+++ b/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c
@@ -1,7 +1,7 @@
 /* { dg-do run } */
 /* { dg-skip-if "ARC700 always has mpy option on" { arc700 } } */
 /* { dg-skip-if "ARC600 doesn't have mpy instruction" { arc6xx } } */
-/* { dg-options "-O2 -mmpy-option=0 -w" } */
+/* { dg-options "-O2 -mmpy-option=0 -w -save-temps" } */
 
 #include <stdlib.h>
 
@@ -28,5 +28,4 @@ main (void)
 }
 
 /* { dg-final { scan-assembler-not "mpyhu\[ \t\]" } } */
-/* { dg-final { scan-assembler-not "@__muldi3" } } */
-/* { dg-final { scan-assembler "@__umulsi3_highpart" } } */
+/* { dg-final { scan-assembler "@__muldi3" } } */
diff --git a/libgcc/config/arc/lib1funcs.S b/libgcc/config/arc/lib1funcs.S
index 9a626022612..249dd7a7ff7 100644
--- a/libgcc/config/arc/lib1funcs.S
+++ b/libgcc/config/arc/lib1funcs.S
@@ -232,6 +232,60 @@ SYM(__umulsi3_highpart):
 #endif
 #endif /* L_umulsidi3 */
 
+#ifdef L_muldi3
+	.section .text
+	.align 4
+	.global SYM(__muldi3)
+SYM(__muldi3):
+#ifdef __LITTLE_ENDIAN__
+        push_s blink
+        mov_s r4,r3     ;4
+        mov_s r5,r2     ;4
+        mov_s r9,r0     ;4
+        mov_s r8,r1     ;4
+        bl.d @__umulsidi3
+        mov_s r1,r2     ;4
+        mov_s r6,r0     ;4
+        mov_s r7,r1     ;4
+        mov_s r0,r9     ;4
+        bl.d @__mulsi3
+        mov_s r1,r4     ;4
+        mov_s r4,r0     ;4
+        mov_s r1,r8     ;4
+        bl.d @__mulsi3
+        mov_s r0,r5     ;4
+        pop_s blink
+        add_s r0,r0,r4 ;2
+        add r1,r0,r7
+        j_s.d [blink]
+        mov_s r0,r6     ;4
+#else
+	push_s  blink
+	mov_s   r5,r3
+	mov_s   r9,r2
+	mov_s   r4,r1
+	mov_s   r8,r0
+	mov_s   r0,r1
+	bl.d 	@__umulsidi3
+	mov_s   r1,r3
+	mov_s   r7,r0
+	mov_s   r6,r1
+	mov_s   r0,r4
+	bl.d    @__mulsi3
+	mov_s   r1,r9
+	mov_s   r4,r0
+	mov_s   r1,r8
+	bl.d    @__mulsi3
+	mov_s   r0,r5
+	pop_s   blink
+	add_s   r0,r0,r4
+	add_s   r0,r0,r7
+	j_s.d   [blink]
+	mov_s   r1,r6
+#endif /* __LITTLE_ENDIAN__ */
+ENDFUNC(__muldi3)
+#endif /* L_muldi3 */
+
 #ifdef  L_umulsi3_highpart
 #include "ieee-754/arc-ieee-754.h"
 /* For use without a barrel shifter, and for ARC700 / ARC_MUL64, the
diff --git a/libgcc/config/arc/t-arc b/libgcc/config/arc/t-arc
index ad30fdb1db7..c79bc1cfdc4 100644
--- a/libgcc/config/arc/t-arc
+++ b/libgcc/config/arc/t-arc
@@ -21,7 +21,7 @@
 
 CROSS_LIBGCC1 = libgcc1-asm.a
 LIB1ASMSRC = arc/lib1funcs.S
-LIB1ASMFUNCS = _mulsi3 _umulsidi3  _umulsi3_highpart \
+LIB1ASMFUNCS = _mulsi3 _umulsidi3  _umulsi3_highpart _muldi3 \
   _udivsi3 _divsi3 _umodsi3 _modsi3 \
   _divmod_tools _clzsi2 \
   _millicodethunk_st _millicodethunk_ld _millicodethunk_ret \
-- 
2.17.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 2/6] [ARC] Cleanup TLS implementation.
  2018-10-10  8:01 [PATCH 0/6] ARC updates Claudiu Zissulescu
                   ` (2 preceding siblings ...)
  2018-10-10  8:01 ` [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads Claudiu Zissulescu
@ 2018-10-10  8:01 ` Claudiu Zissulescu
  2018-10-11 10:13   ` Andrew Burgess
  2018-10-10  8:49 ` [PATCH 3/6] [ARC] Add BI/BIH instruction support Claudiu Zissulescu
  2018-10-10  9:05 ` [PATCH 5/6] [ARC] Refurbish and improve prologue/epilogue functions Claudiu Zissulescu
  5 siblings, 1 reply; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-10  8:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Cleanup TLS implementation and add a number of tests.

gcc/
2018-07-25  Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/arc.c (arc_get_tp): Remove function.
	(arc_emit_call_tls_get_addr): Likewise.
	(arc_call_tls_get_addr): New function.
	(arc_legitimize_tls_address): Make use of arc_call_tls_get_addr.
	* config/arc/arc.md (tls_load_tp_soft): Remove.
	(tls_gd_get_addr): Likewise.

testsuite/
2018-07-25  Claudiu Zissulescu  <claziss@synopsys.com>

	* gcc.target/arc/tls-gd.c: New file.
	* gcc.target/arc/tls-ie.c: Likewise.
	* gcc.target/arc/tls-ld.c: Likewise.
	* gcc.target/arc/tls-le.c: Likewise.
---
 gcc/config/arc/arc.c                  | 95 +++++++++++----------------
 gcc/config/arc/arc.md                 | 21 ------
 gcc/testsuite/gcc.target/arc/tls-1.c  |  2 +-
 gcc/testsuite/gcc.target/arc/tls-gd.c | 17 +++++
 gcc/testsuite/gcc.target/arc/tls-ie.c | 17 +++++
 gcc/testsuite/gcc.target/arc/tls-ld.c | 18 +++++
 gcc/testsuite/gcc.target/arc/tls-le.c | 16 +++++
 7 files changed, 106 insertions(+), 80 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-gd.c
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-ie.c
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-ld.c
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-le.c

diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index de4c7433c1b..56f566795ff 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -5559,51 +5559,30 @@ arc_raw_symbolic_reference_mentioned_p (rtx op, bool skip_local)
   return false;
 }
 
-/* Get the thread pointer.  */
+/* Emit a call to __tls_get_addr.  TI is the argument to this function.
+   RET is an RTX for the return value location.  The entire insn sequence
+   is returned.  */
+static GTY(()) rtx arc_tls_symbol;
 
 static rtx
-arc_get_tp (void)
+arc_call_tls_get_addr (rtx ti)
 {
-   /* If arc_tp_regno has been set, we can use that hard register
-      directly as a base register.  */
-  if (arc_tp_regno != -1)
-    return gen_rtx_REG (Pmode, arc_tp_regno);
-
-  /* Otherwise, call __read_tp.  Copy the result to a pseudo to avoid
-     conflicts with function arguments / results.  */
-  rtx reg = gen_reg_rtx (Pmode);
-  emit_insn (gen_tls_load_tp_soft ());
-  emit_move_insn (reg, gen_rtx_REG (Pmode, R0_REG));
-  return reg;
-}
-
-/* Helper to be used by TLS Global dynamic model.  */
-
-static rtx
-arc_emit_call_tls_get_addr (rtx sym, int reloc, rtx eqv)
-{
-  rtx r0 = gen_rtx_REG (Pmode, R0_REG);
-  rtx call_fusage = NULL_RTX;
-
-  start_sequence ();
-
-  rtx x = arc_unspec_offset (sym, reloc);
-  emit_move_insn (r0, x);
-  use_reg (&call_fusage, r0);
+  rtx arg = gen_rtx_REG (Pmode, R0_REG);
+  rtx ret = gen_rtx_REG (Pmode, R0_REG);
+  rtx fn;
+  rtx_insn *insn;
 
-  gcc_assert (reloc == UNSPEC_TLS_GD);
-  rtx call_insn = emit_call_insn (gen_tls_gd_get_addr (sym));
-  /* Should we set RTL_CONST_CALL_P?  We read memory, but not in a
-     way that the application should care.  */
-  RTL_PURE_CALL_P (call_insn) = 1;
-  add_function_usage_to (call_insn, call_fusage);
+  if (!arc_tls_symbol)
+    arc_tls_symbol = init_one_libfunc ("__tls_get_addr");
 
-  rtx_insn *insns = get_insns ();
-  end_sequence ();
+  emit_move_insn (arg, ti);
+  fn = gen_rtx_MEM (SImode, arc_tls_symbol);
+  insn = emit_call_insn (gen_call_value (ret, fn, const0_rtx));
+  RTL_CONST_CALL_P (insn) = 1;
+  use_reg (&CALL_INSN_FUNCTION_USAGE (insn), ret);
+  use_reg (&CALL_INSN_FUNCTION_USAGE (insn), arg);
 
-  rtx dest = gen_reg_rtx (Pmode);
-  emit_libcall_block (insns, dest, r0, eqv);
-  return dest;
+  return ret;
 }
 
 #define DTPOFF_ZERO_SYM ".tdata"
@@ -5614,16 +5593,26 @@ arc_emit_call_tls_get_addr (rtx sym, int reloc, rtx eqv)
 static rtx
 arc_legitimize_tls_address (rtx addr, enum tls_model model)
 {
+  rtx tmp;
+
   if (!flag_pic && model == TLS_MODEL_LOCAL_DYNAMIC)
     model = TLS_MODEL_LOCAL_EXEC;
 
+
+  /* The TP pointer needs to be set.  */
+  gcc_assert (arc_tp_regno != -1);
+
   switch (model)
     {
+    case TLS_MODEL_GLOBAL_DYNAMIC:
+      tmp = gen_reg_rtx (Pmode);
+      emit_move_insn (tmp, arc_unspec_offset (addr, UNSPEC_TLS_GD));
+      return arc_call_tls_get_addr (tmp);
+
     case TLS_MODEL_LOCAL_DYNAMIC:
       rtx base;
       tree decl;
       const char *base_name;
-      rtvec v;
 
       decl = SYMBOL_REF_DECL (addr);
       base_name = DTPOFF_ZERO_SYM;
@@ -5631,31 +5620,21 @@ arc_legitimize_tls_address (rtx addr, enum tls_model model)
 	base_name = ".tbss";
 
       base = gen_rtx_SYMBOL_REF (Pmode, base_name);
-      if (strcmp (base_name, DTPOFF_ZERO_SYM) == 0)
-	{
-	  if (!flag_pic)
-	    goto local_exec;
-	  v = gen_rtvec (1, addr);
-	}
-      else
-	v = gen_rtvec (2, addr, base);
-      addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_TLS_OFF);
-      addr = gen_rtx_CONST (Pmode, addr);
-      base = arc_legitimize_tls_address (base, TLS_MODEL_GLOBAL_DYNAMIC);
-      return gen_rtx_PLUS (Pmode, force_reg (Pmode, base), addr);
-
-    case TLS_MODEL_GLOBAL_DYNAMIC:
-      return arc_emit_call_tls_get_addr (addr, UNSPEC_TLS_GD, addr);
+      tmp = gen_reg_rtx (Pmode);
+      emit_move_insn (tmp, arc_unspec_offset (base, UNSPEC_TLS_GD));
+      base = arc_call_tls_get_addr (tmp);
+      return gen_rtx_PLUS (Pmode, force_reg (Pmode, base),
+			   arc_unspec_offset (addr, UNSPEC_TLS_OFF));
 
     case TLS_MODEL_INITIAL_EXEC:
       addr = arc_unspec_offset (addr, UNSPEC_TLS_IE);
       addr = copy_to_mode_reg (Pmode, gen_const_mem (Pmode, addr));
-      return gen_rtx_PLUS (Pmode, arc_get_tp (), addr);
+      return gen_rtx_PLUS (Pmode, gen_rtx_REG (Pmode, arc_tp_regno), addr);
 
     case TLS_MODEL_LOCAL_EXEC:
-    local_exec:
       addr = arc_unspec_offset (addr, UNSPEC_TLS_OFF);
-      return gen_rtx_PLUS (Pmode, arc_get_tp (), addr);
+      return gen_rtx_PLUS (Pmode, gen_rtx_REG (Pmode, arc_tp_regno), addr);
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index d73289a20c4..6ea67791627 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -5310,27 +5310,6 @@ archs4x, archs4xd, archs4xd_slow"
   [(set_attr "type" "call")
    (set_attr "is_SIBCALL" "yes")])
 
-(define_insn "tls_load_tp_soft"
-  [(set (reg:SI R0_REG) (unspec:SI [(const_int 0)] UNSPEC_TLS_OFF))
-   (clobber (reg:SI RETURN_ADDR_REGNUM))]
-  ""
-  "*return arc_output_libcall (\"__read_tp\");"
-  [(set_attr "is_sfunc" "yes")
-   (set_attr "predicable" "yes")])
-
-(define_insn "tls_gd_get_addr"
-  [(set (reg:SI R0_REG)
-	(call:SI (mem:SI (unspec:SI [(match_operand:SI 0
-				      "symbolic_operand" "X,X")]
-			  UNSPEC_TLS_GD))
-		 (const_int 0)))
-   (clobber (reg:SI RETURN_ADDR_REGNUM))]
-  ""
-  ".tls_gd_ld %0`bl%* __tls_get_addr@plt"
-  [(set_attr "type" "call")
-   ; With TARGET_MEDIUM_CALLS, plt calls are not predicable.
-   (set_attr "predicable" "no")])
-
 ;; For thread pointer builtins
 (define_expand "get_thread_pointersi"
   [(set (match_operand:SI 0 "register_operand") (match_dup 1))]
diff --git a/gcc/testsuite/gcc.target/arc/tls-1.c b/gcc/testsuite/gcc.target/arc/tls-1.c
index 6521b641549..da21a5ba032 100644
--- a/gcc/testsuite/gcc.target/arc/tls-1.c
+++ b/gcc/testsuite/gcc.target/arc/tls-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target tls } */
-/* { dg-skip-if "" { arc-*-elf* } } */
+/* { dg-skip-if "" { arc*-*-elf* } } */
 /* { dg-options "-O3 -std=gnu99" } */
 
 /* Check if addressing the `pos` member of struct is done via tls
diff --git a/gcc/testsuite/gcc.target/arc/tls-gd.c b/gcc/testsuite/gcc.target/arc/tls-gd.c
new file mode 100644
index 00000000000..aa1b5429b08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/tls-gd.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fpic } */
+/* { dg-options "-O2 -fpic -ftls-model=global-dynamic" } */
+/* { dg-require-effective-target tls } */
+/* { dg-skip-if "" { arc*-*-elf* } } */
+
+/* Check if tls global dynamic is correctly generated.  */
+
+extern __thread int e2;
+
+int *ae2 (void)
+{
+  return &e2;
+}
+
+/* { dg-final { scan-assembler "add r0,pcl,@e2@tlsgd" } } */
+/* { dg-final { scan-assembler "bl @__tls_get_addr@plt" } } */
diff --git a/gcc/testsuite/gcc.target/arc/tls-ie.c b/gcc/testsuite/gcc.target/arc/tls-ie.c
new file mode 100644
index 00000000000..0c981cfbf67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/tls-ie.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fpic } */
+/* { dg-options "-O2 -fpic -ftls-model=initial-exec" } */
+/* { dg-require-effective-target tls } */
+/* { dg-skip-if "" { arc*-*-elf* } } */
+
+/* Check if tls initial execution is correctly generated.  */
+
+extern __thread int e2;
+
+int *ae2 (void)
+{
+  return &e2;
+}
+
+/* { dg-final { scan-assembler "ld r0,\\\[pcl,@e2@tlsie\\\]" } } */
+/* { dg-final { scan-assembler "add_s r0,r0,r25" } } */
diff --git a/gcc/testsuite/gcc.target/arc/tls-ld.c b/gcc/testsuite/gcc.target/arc/tls-ld.c
new file mode 100644
index 00000000000..351c3f02abd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/tls-ld.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fpic } */
+/* { dg-options "-O2 -fpic -ftls-model=local-dynamic" } */
+/* { dg-require-effective-target tls } */
+/* { dg-skip-if "" { arc*-*-elf* } } */
+
+/* Check if tls local dynamic is correctly generated.  */
+
+extern __thread int e2;
+
+int *ae2 (void)
+{
+  return &e2;
+}
+
+/* { dg-final { scan-assembler "add r0,pcl,@.tbss@tlsgd" } } */
+/* { dg-final { scan-assembler "bl @__tls_get_addr@plt" } } */
+/* { dg-final { scan-assembler "add_s r0,r0,@e2@dtpoff" } } */
diff --git a/gcc/testsuite/gcc.target/arc/tls-le.c b/gcc/testsuite/gcc.target/arc/tls-le.c
new file mode 100644
index 00000000000..ae3089b5070
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/tls-le.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fpic } */
+/* { dg-options "-O2 -fpic -ftls-model=local-exec" } */
+/* { dg-require-effective-target tls } */
+/* { dg-skip-if "" { arc*-*-elf* } } */
+
+/* Check if tls local execution is correctly generated.  */
+
+extern __thread int e2;
+
+int *ae2 (void)
+{
+  return &e2;
+}
+
+/* { dg-final { scan-assembler "add r0,r25,@e2@tpoff" } } */
-- 
2.17.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads
  2018-10-10  8:01 [PATCH 0/6] ARC updates Claudiu Zissulescu
  2018-10-10  8:01 ` [PATCH 6/6] [ARC] Handle store cacheline hazard Claudiu Zissulescu
  2018-10-10  8:01 ` [PATCH 1/6] [ARC] Remove non-standard function calls Claudiu Zissulescu
@ 2018-10-10  8:01 ` Claudiu Zissulescu
  2018-10-22 18:15   ` Andrew Burgess
  2018-10-10  8:01 ` [PATCH 2/6] [ARC] Cleanup TLS implementation Claudiu Zissulescu
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-10  8:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Simple peephole rules that combine multiple ld/st instructions into
64-bit load/store instructions. This only works for architectures
that have the double load/store option enabled.

gcc/
	Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/arc-protos.h (gen_operands_ldd_std): Add.
	* config/arc/arc.c (operands_ok_ldd_std): New function.
	(mem_ok_for_ldd_std): Likewise.
	(gen_operands_ldd_std): Likewise.
	* config/arc/arc.md: Add peephole2 rules for std/ldd.
---
 gcc/config/arc/arc-protos.h |   1 +
 gcc/config/arc/arc.c        | 163 ++++++++++++++++++++++++++++++++++++
 gcc/config/arc/arc.md       |  67 +++++++++++++++
 3 files changed, 231 insertions(+)

diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h
index 24bea6e1efb..55f8ed4c643 100644
--- a/gcc/config/arc/arc-protos.h
+++ b/gcc/config/arc/arc-protos.h
@@ -46,6 +46,7 @@ extern int arc_return_address_register (unsigned int);
 extern unsigned int arc_compute_function_type (struct function *);
 extern bool arc_is_uncached_mem_p (rtx);
 extern bool arc_lra_p (void);
+extern bool gen_operands_ldd_std (rtx *operands, bool load, bool commute);
 #endif /* RTX_CODE */
 
 extern unsigned int arc_compute_frame_size (int);
diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index 18dd0de6af7..9bc69e9fbc9 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -10803,6 +10803,169 @@ arc_cannot_substitute_mem_equiv_p (rtx)
   return true;
 }
 
+/* Checks whether the operands are valid for use in an LDD/STD
+   instruction.	 Assumes that RT, RT2, and RN are REG.	This is
+   guaranteed by the patterns.	Assumes that the address in the base
+   register RN is word aligned.	 Pattern guarantees that both memory
+   accesses use the same base register, the offsets are constants
+   within the range, and the gap between the offsets is 4.  If preload
+   complete then check that registers are legal.  WBACK indicates
+   whether address is updated.	*/
+
+static bool
+operands_ok_ldd_std (rtx rt, rtx rt2, rtx rn ATTRIBUTE_UNUSED,
+		    HOST_WIDE_INT offset)
+{
+  unsigned int t, t2;
+
+  if (!reload_completed)
+    return true;
+
+  if (!(SMALL_INT_RANGE (offset, (GET_MODE_SIZE (DImode) - 1) & -4,
+			 (offset & (GET_MODE_SIZE (DImode) - 1) & 3
+			  ? 0 : -(-GET_MODE_SIZE (DImode) | -4) >> 1))))
+    return false;
+
+  t = REGNO (rt);
+  t2 = REGNO (rt2);
+
+  if ((t2 == 63)
+      || (t % 2 != 0)	/* First destination register is not even.  */
+      || (t2 != t + 1))
+      return false;
+
+  return true;
+}
+
+/* Helper for gen_operands_ldd_std.  Returns true iff the memory
+   operand MEM's address contains an immediate offset from the base
+   register and has no side effects, in which case it sets BASE and
+   OFFSET accordingly.	*/
+
+static bool
+mem_ok_for_ldd_std (rtx mem, rtx *base, rtx *offset)
+{
+  rtx addr;
+
+  gcc_assert (base != NULL && offset != NULL);
+
+  /* TODO: Handle more general memory operand patterns, such as
+     PRE_DEC and PRE_INC.  */
+
+  if (side_effects_p (mem))
+    return false;
+
+  /* Can't deal with subregs.  */
+  if (GET_CODE (mem) == SUBREG)
+    return false;
+
+  gcc_assert (MEM_P (mem));
+
+  *offset = const0_rtx;
+
+  addr = XEXP (mem, 0);
+
+  /* If addr isn't valid for DImode, then we can't handle it.  */
+  if (!arc_legitimate_address_p (DImode, addr,
+				reload_in_progress || reload_completed))
+    return false;
+
+  if (REG_P (addr))
+    {
+      *base = addr;
+      return true;
+    }
+  else if (GET_CODE (addr) == PLUS || GET_CODE (addr) == MINUS)
+    {
+      *base = XEXP (addr, 0);
+      *offset = XEXP (addr, 1);
+      return (REG_P (*base) && CONST_INT_P (*offset));
+    }
+
+  return false;
+}
+
+/* Called from peephole2 to replace two word-size accesses with a
+   single LDD/STD instruction.	Returns true iff we can generate a new
+   instruction sequence.  That is, both accesses use the same base
+   register and the gap between constant offsets is 4.	OPERANDS are
+   the operands found by the peephole matcher; OPERANDS[0,1] are
+   register operands, and OPERANDS[2,3] are the corresponding memory
+   operands.  LOAD indicates whether the access is load or store.  */
+
+bool
+gen_operands_ldd_std (rtx *operands, bool load, bool commute)
+{
+  int i, gap;
+  HOST_WIDE_INT offsets[2], offset;
+  int nops = 2;
+  rtx cur_base, cur_offset, tmp;
+  rtx base = NULL_RTX;
+
+  /* Check that the memory references are immediate offsets from the
+     same base register.  Extract the base register, the destination
+     registers, and the corresponding memory offsets.  */
+  for (i = 0; i < nops; i++)
+    {
+      if (!mem_ok_for_ldd_std (operands[nops+i], &cur_base, &cur_offset))
+	return false;
+
+      if (i == 0)
+	base = cur_base;
+      else if (REGNO (base) != REGNO (cur_base))
+	return false;
+
+      offsets[i] = INTVAL (cur_offset);
+      if (GET_CODE (operands[i]) == SUBREG)
+	{
+	  tmp = SUBREG_REG (operands[i]);
+	  gcc_assert (GET_MODE (operands[i]) == GET_MODE (tmp));
+	  operands[i] = tmp;
+	}
+    }
+
+  /* Make sure there is no dependency between the individual loads.  */
+  if (load && REGNO (operands[0]) == REGNO (base))
+    return false; /* RAW */
+
+  if (load && REGNO (operands[0]) == REGNO (operands[1]))
+    return false; /* WAW */
+
+  /* Make sure the instructions are ordered with lower memory access first.  */
+  if (offsets[0] > offsets[1])
+    {
+      gap = offsets[0] - offsets[1];
+      offset = offsets[1];
+
+      /* Swap the instructions such that lower memory is accessed first.  */
+      std::swap (operands[0], operands[1]);
+      std::swap (operands[2], operands[3]);
+    }
+  else
+    {
+      gap = offsets[1] - offsets[0];
+      offset = offsets[0];
+    }
+
+  /* Make sure accesses are to consecutive memory locations.  */
+  if (gap != 4)
+    return false;
+
+  /* Make sure we generate legal instructions.	*/
+  if (operands_ok_ldd_std (operands[0], operands[1], base, offset))
+    return true;
+
+  if (load && commute)
+    {
+      /* Try reordering registers.  */
+      std::swap (operands [0], operands[1]);
+      if (operands_ok_ldd_std (operands[0], operands[1], base, offset))
+	return true;
+    }
+
+  return false;
+}
+
 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P arc_use_anchors_for_symbol_p
 
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index 1ed230fa5f0..b968022e64a 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -6363,6 +6363,73 @@ archs4x, archs4xd, archs4xd_slow"
   [(set (reg:CC CC_REG) (compare:CC (match_dup 3)
 				    (ashift:SI (match_dup 1) (match_dup 2))))])
 
+(define_peephole2 ; std
+[(set (match_operand:SI 2 "memory_operand" "")
+      (match_operand:SI 0 "register_operand" ""))
+ (set (match_operand:SI 3 "memory_operand" "")
+      (match_operand:SI 1 "register_operand" ""))]
+ "TARGET_LL64"
+ [(const_int 0)]
+{
+ if (!gen_operands_ldd_std (operands, false, false))
+   FAIL;
+ operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
+ operands[2] = adjust_address (operands[2], DImode, 0);
+ emit_insn (gen_rtx_SET (operands[2], operands[0]));
+ DONE;
+ })
+
+(define_peephole2 ; ldd
+  [(set (match_operand:SI 0 "register_operand" "")
+        (match_operand:SI 2 "memory_operand" ""))
+   (set (match_operand:SI 1 "register_operand" "")
+        (match_operand:SI 3 "memory_operand" ""))]
+  "TARGET_LL64"
+  [(const_int 0)]
+{
+  if (!gen_operands_ldd_std (operands, true, false))
+    FAIL;
+  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
+  operands[2] = adjust_address (operands[2], DImode, 0);
+  emit_insn (gen_rtx_SET (operands[0], operands[2]));
+  DONE;
+})
+
+;; We require consecutive registers for LDD instruction.  Check if we
+;; can reorder them and use an LDD.
+
+(define_peephole2 ; swap the destination registers of two loads
+		  ; before a commutative operation.
+  [(set (match_operand:SI 0 "register_operand" "")
+        (match_operand:SI 2 "memory_operand" ""))
+   (set (match_operand:SI 1 "register_operand" "")
+        (match_operand:SI 3 "memory_operand" ""))
+   (set (match_operand:SI 4 "register_operand" "")
+        (match_operator:SI 5 "commutative_operator"
+			   [(match_operand 6 "register_operand" "")
+			    (match_operand 7 "register_operand" "") ]))]
+  "TARGET_LL64
+   && (((rtx_equal_p(operands[0], operands[6]))
+         && (rtx_equal_p(operands[1], operands[7])))
+        || ((rtx_equal_p(operands[0], operands[7]))
+             && (rtx_equal_p(operands[1], operands[6]))))
+   && (peep2_reg_dead_p (3, operands[0]) || rtx_equal_p (operands[0], operands[4]))
+   && (peep2_reg_dead_p (3, operands[1]) || rtx_equal_p (operands[1], operands[4]))"
+  [(set (match_dup 0) (match_dup 2))
+   (set (match_dup 4) (match_op_dup 5 [(match_dup 6) (match_dup 7)]))]
+  {
+    if (!gen_operands_ldd_std (operands, true, true))
+     {
+        FAIL;
+     }
+    else
+     {
+        operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
+        operands[2] = adjust_address (operands[2], DImode, 0);
+     }
+   }
+)
+
 ;; include the arc-FPX instructions
 (include "fpx.md")
 
-- 
2.17.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 0/6] ARC updates
@ 2018-10-10  8:01 Claudiu Zissulescu
  2018-10-10  8:01 ` [PATCH 6/6] [ARC] Handle store cacheline hazard Claudiu Zissulescu
                   ` (5 more replies)
  0 siblings, 6 replies; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-10  8:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Hi Andrew,

Please find a number of patches that add more features to the ARC backend (BI/BIH instructions, peephole rules, enter/leave instructions) or fix existing issues (store hazards, TLS implementation, library calls).

Please let me know if you have any questions,
Claudiu


Claudiu Zissulescu (6):
  [ARC] Remove non standard funcions calls.
  [ARC] Cleanup TLS implementation.
  [ARC] Add BI/BIH instruction support.
  [ARC] Add peephole rules to combine store/loads into double
    store/loads
  [ARC] Refurbish and improve prologue/epilogue functions.
  [ARC] Handle store cacheline hazard.

 gcc/common/config/arc/arc-common.c            |    1 +
 gcc/config/arc/arc-arch.h                     |    1 +
 gcc/config/arc/arc-protos.h                   |    2 +
 gcc/config/arc/arc.c                          | 1701 +++++++++++------
 gcc/config/arc/arc.h                          |  106 +-
 gcc/config/arc/arc.md                         |  644 +++----
 gcc/config/arc/arc.opt                        |   19 +-
 gcc/config/arc/arc700.md                      |   18 +-
 gcc/config/arc/predicates.md                  |   12 +
 gcc/doc/invoke.texi                           |   27 +-
 gcc/testsuite/gcc.target/arc/firq-1.c         |    8 +-
 gcc/testsuite/gcc.target/arc/firq-3.c         |   14 +-
 gcc/testsuite/gcc.target/arc/firq-4.c         |   12 +-
 gcc/testsuite/gcc.target/arc/interrupt-6.c    |    2 +-
 gcc/testsuite/gcc.target/arc/jumptable.c      |   34 +
 .../gcc.target/arc/mulsi3_highpart-2.c        |    5 +-
 gcc/testsuite/gcc.target/arc/tls-1.c          |    2 +-
 gcc/testsuite/gcc.target/arc/tls-gd.c         |   17 +
 gcc/testsuite/gcc.target/arc/tls-ie.c         |   17 +
 gcc/testsuite/gcc.target/arc/tls-ld.c         |   18 +
 gcc/testsuite/gcc.target/arc/tls-le.c         |   16 +
 libgcc/config/arc/lib1funcs.S                 |   54 +
 libgcc/config/arc/t-arc                       |    2 +-
 23 files changed, 1769 insertions(+), 963 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arc/jumptable.c
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-gd.c
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-ie.c
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-ld.c
 create mode 100644 gcc/testsuite/gcc.target/arc/tls-le.c

-- 
2.17.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 6/6] [ARC] Handle store cacheline hazard.
  2018-10-10  8:01 [PATCH 0/6] ARC updates Claudiu Zissulescu
@ 2018-10-10  8:01 ` Claudiu Zissulescu
  2018-10-30 10:13   ` Andrew Burgess
  2018-10-10  8:01 ` [PATCH 1/6] [ARC] Remove non standard funcions calls Claudiu Zissulescu
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-10  8:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Handle the store cacheline hazard for A700 CPUs by inserting two NOP_S
instructions between ST ST LD sequences or their logical equivalent
(such as ST ST NOP_S NOP_S J_L.D LD).

gcc/
2016-08-01  Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/arc-arch.h (ARC_TUNE_ARC7XX): New tune value.
	* config/arc/arc.c (arc_active_insn): New function.
	(check_store_cacheline_hazard): Likewise.
	(workaround_arc_anomaly): Use check_store_cacheline_hazard.
	(arc_override_options): Disable delay slot scheduler for older
	A7.
	(arc_store_addr_hazard_p): New implementation, old one renamed to
	...
	(arc_store_addr_hazard_internal_p): Renamed.
	(arc_reorg): Don't combine into brcc instructions which are part
	of hardware hazard solution.
	* config/arc/arc.md (attr tune): Consider new arc7xx tune value.
	(tune_arc700): Likewise.
	* config/arc/arc.opt (arc7xx): New tune value.
	* config/arc/arc700.md: Improve A7 scheduler.
---
 gcc/config/arc/arc-arch.h |   1 +
 gcc/config/arc/arc.c      | 142 ++++++++++++++++++++++++++++++++------
 gcc/config/arc/arc.md     |   8 ++-
 gcc/config/arc/arc.opt    |   3 +
 gcc/config/arc/arc700.md  |  18 +----
 5 files changed, 132 insertions(+), 40 deletions(-)

diff --git a/gcc/config/arc/arc-arch.h b/gcc/config/arc/arc-arch.h
index 859af0684b8..ad540607e55 100644
--- a/gcc/config/arc/arc-arch.h
+++ b/gcc/config/arc/arc-arch.h
@@ -71,6 +71,7 @@ enum arc_tune_attr
   {
     ARC_TUNE_NONE,
     ARC_TUNE_ARC600,
+    ARC_TUNE_ARC7XX,
     ARC_TUNE_ARC700_4_2_STD,
     ARC_TUNE_ARC700_4_2_XMAC,
     ARC_TUNE_CORE_3,
diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index ab7735d6b38..90454928379 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -1308,6 +1308,10 @@ arc_override_options (void)
   if (TARGET_LONG_CALLS_SET)
     target_flags &= ~MASK_MILLICODE_THUNK_SET;
 
+  /* A7 has an issue with delay slots.  */
+  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
+    flag_delayed_branch = 0;
+
   /* These need to be done at start up.  It's convenient to do them here.  */
   arc_init ();
 }
@@ -7529,11 +7533,91 @@ arc_invalid_within_doloop (const rtx_insn *insn)
   return NULL;
 }
 
+static rtx_insn *
+arc_active_insn (rtx_insn *insn)
+{
+  rtx_insn *nxt = next_active_insn (insn);
+
+  if (nxt && GET_CODE (PATTERN (nxt)) == ASM_INPUT)
+    nxt = next_active_insn (nxt);
+  return nxt;
+}
+
+/* Search for a sequence made out of two stores and a given number of
+   loads, insert a nop if required.  */
+
+static void
+check_store_cacheline_hazard (void)
+{
+  rtx_insn *insn, *succ0, *insn1;
+  bool found = false;
+
+  for (insn = get_insns (); insn; insn = arc_active_insn (insn))
+    {
+      succ0 = arc_active_insn (insn);
+
+      if (!succ0)
+	return;
+
+      if (!single_set (insn) || !single_set (succ0))
+	continue;
+
+      if ((get_attr_type (insn) != TYPE_STORE)
+	  || (get_attr_type (succ0) != TYPE_STORE))
+	continue;
+
+      /* Found at least two consecutive stores.  Goto the end of the
+	 store sequence.  */
+      for (insn1 = succ0; insn1; insn1 = arc_active_insn (insn1))
+	if (!single_set (insn1) || get_attr_type (insn1) != TYPE_STORE)
+	  break;
+
+      /* Now, check the next two instructions for the following cases:
+         1. next instruction is a LD => insert 2 nops between store
+	    sequence and load.
+	 2. next-next instruction is a LD => inset 1 nop after the store
+	    sequence.  */
+      if (insn1 && single_set (insn1)
+	  && (get_attr_type (insn1) == TYPE_LOAD))
+	{
+	  found = true;
+	  emit_insn_before (gen_nopv (), insn1);
+	  emit_insn_before (gen_nopv (), insn1);
+	}
+      else
+	{
+	  if (insn1 && (get_attr_type (insn1) == TYPE_COMPARE))
+	    {
+	      /* REG_SAVE_NOTE is used by Haifa scheduler, we are in
+		 reorg, so it is safe to reuse it for avoiding the
+		 current compare insn to be part of a BRcc
+		 optimization.  */
+	      add_reg_note (insn1, REG_SAVE_NOTE, GEN_INT (3));
+	    }
+	  insn1 = arc_active_insn (insn1);
+	  if (insn1 && single_set (insn1)
+	      && (get_attr_type (insn1) == TYPE_LOAD))
+	    {
+	      found = true;
+	      emit_insn_before (gen_nopv (), insn1);
+	    }
+	}
+
+      insn = insn1;
+      if (found)
+	{
+	  /* warning (0, "Potential lockup sequence found, patching"); */
+	  found = false;
+	}
+    }
+}
+
 /* Return true if a load instruction (CONSUMER) uses the same address as a
    store instruction (PRODUCER).  This function is used to avoid st/ld
    address hazard in ARC700 cores.  */
-bool
-arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
+
+static bool
+arc_store_addr_hazard_internal_p (rtx_insn* producer, rtx_insn* consumer)
 {
   rtx in_set, out_set;
   rtx out_addr, in_addr;
@@ -7581,6 +7665,14 @@ arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
   return false;
 }
 
+bool
+arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
+{
+  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
+    return true;
+  return arc_store_addr_hazard_internal_p (producer, consumer);
+}
+
 /* The same functionality as arc_hazard.  It is called in machine
    reorg before any other optimization.  Hence, the NOP size is taken
    into account when doing branch shortening.  */
@@ -7589,6 +7681,7 @@ static void
 workaround_arc_anomaly (void)
 {
   rtx_insn *insn, *succ0;
+  rtx_insn *succ1;
 
   /* For any architecture: call arc_hazard here.  */
   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
@@ -7600,27 +7693,30 @@ workaround_arc_anomaly (void)
 	}
     }
 
-  if (TARGET_ARC700)
-    {
-      rtx_insn *succ1;
+  if (!TARGET_ARC700)
+    return;
 
-      for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
-	{
-	  succ0 = next_real_insn (insn);
-	  if (arc_store_addr_hazard_p (insn, succ0))
-	    {
-	      emit_insn_after (gen_nopv (), insn);
-	      emit_insn_after (gen_nopv (), insn);
-	      continue;
-	    }
+  /* Old A7 are suffering of a cache hazard, and we need to insert two
+     nops between any sequence of stores and a load.  */
+  if (arc_tune != ARC_TUNE_ARC7XX)
+    check_store_cacheline_hazard ();
 
-	  /* Avoid adding nops if the instruction between the ST and LD is
-	     a call or jump.  */
-	  succ1 = next_real_insn (succ0);
-	  if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
-	      && arc_store_addr_hazard_p (insn, succ1))
-	    emit_insn_after (gen_nopv (), insn);
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      succ0 = next_real_insn (insn);
+      if (arc_store_addr_hazard_internal_p (insn, succ0))
+	{
+	  emit_insn_after (gen_nopv (), insn);
+	  emit_insn_after (gen_nopv (), insn);
+	  continue;
 	}
+
+      /* Avoid adding nops if the instruction between the ST and LD is
+	 a call or jump.  */
+      succ1 = next_real_insn (succ0);
+      if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
+	  && arc_store_addr_hazard_internal_p (insn, succ1))
+	emit_insn_after (gen_nopv (), insn);
     }
 }
 
@@ -8291,11 +8387,15 @@ arc_reorg (void)
 	      if (!link_insn)
 		continue;
 	      else
-		/* Check if this is a data dependency.  */
 		{
+		  /* Check if this is a data dependency.  */
 		  rtx op, cc_clob_rtx, op0, op1, brcc_insn, note;
 		  rtx cmp0, cmp1;
 
+		  /* Make sure we can use it for brcc insns.  */
+		  if (find_reg_note (link_insn, REG_SAVE_NOTE, GEN_INT (3)))
+		    continue;
+
 		  /* Ok this is the set cc. copy args here.  */
 		  op = XEXP (pc_target, 0);
 
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index fb8a1c9ee09..caf7deda505 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -600,11 +600,13 @@
 ;;   somehow modify them to become inelegible for delay slots if a decision
 ;;   is made that makes conditional execution required.
 
-(define_attr "tune" "none,arc600,arc700_4_2_std,arc700_4_2_xmac, core_3, \
-archs4x, archs4xd, archs4xd_slow"
+(define_attr "tune" "none,arc600,arc7xx,arc700_4_2_std,arc700_4_2_xmac, \
+core_3, archs4x, archs4xd, archs4xd_slow"
   (const
    (cond [(symbol_ref "arc_tune == TUNE_ARC600")
 	  (const_string "arc600")
+	  (symbol_ref "arc_tune == ARC_TUNE_ARC7XX")
+	  (const_string "arc7xx")
 	  (symbol_ref "arc_tune == TUNE_ARC700_4_2_STD")
 	  (const_string "arc700_4_2_std")
 	  (symbol_ref "arc_tune == TUNE_ARC700_4_2_XMAC")
@@ -619,7 +621,7 @@ archs4x, archs4xd, archs4xd_slow"
 	 (const_string "none"))))
 
 (define_attr "tune_arc700" "false,true"
-  (if_then_else (eq_attr "tune" "arc700_4_2_std, arc700_4_2_xmac")
+  (if_then_else (eq_attr "tune" "arc7xx, arc700_4_2_std, arc700_4_2_xmac")
 		(const_string "true")
 		(const_string "false")))
 
diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
index 93e18af1d27..bcffb2720ba 100644
--- a/gcc/config/arc/arc.opt
+++ b/gcc/config/arc/arc.opt
@@ -262,6 +262,9 @@ Enum(arc_tune_attr) String(arc600) Value(ARC_TUNE_ARC600)
 EnumValue
 Enum(arc_tune_attr) String(arc601) Value(ARC_TUNE_ARC600)
 
+EnumValue
+Enum(arc_tune_attr) String(arc7xx) Value(ARC_TUNE_ARC7XX)
+
 EnumValue
 Enum(arc_tune_attr) String(arc700) Value(ARC_TUNE_ARC700_4_2_STD)
 
diff --git a/gcc/config/arc/arc700.md b/gcc/config/arc/arc700.md
index a0f9f74a9f2..cbb868d8dcd 100644
--- a/gcc/config/arc/arc700.md
+++ b/gcc/config/arc/arc700.md
@@ -145,28 +145,14 @@
 ; no functional unit runs when blockage is reserved
 (exclusion_set "blockage" "core, multiplier")
 
-(define_insn_reservation "data_load_DI" 4
-  (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "load")
-       (match_operand:DI 0 "" ""))
-  "issue+dmp, issue+dmp, dmp_write_port, dmp_write_port")
-
 (define_insn_reservation "data_load" 3
   (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "load")
-       (not (match_operand:DI 0 "" "")))
+       (eq_attr "type" "load"))
   "issue+dmp, nothing, dmp_write_port")
 
-(define_insn_reservation "data_store_DI" 2
-  (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "store")
-       (match_operand:DI 0 "" ""))
-  "issue+dmp_write_port, issue+dmp_write_port")
-
 (define_insn_reservation "data_store" 1
   (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "store")
-       (not (match_operand:DI 0 "" "")))
+       (eq_attr "type" "store"))
   "issue+dmp_write_port")
 
 (define_bypass 3 "data_store" "data_load" "arc_store_addr_hazard_p")
-- 
2.17.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 3/6] [ARC] Add BI/BIH instruction support.
  2018-10-10  8:01 [PATCH 0/6] ARC updates Claudiu Zissulescu
                   ` (3 preceding siblings ...)
  2018-10-10  8:01 ` [PATCH 2/6] [ARC] Cleanup TLS implementation Claudiu Zissulescu
@ 2018-10-10  8:49 ` Claudiu Zissulescu
  2018-10-16 23:19   ` Andrew Burgess
  2018-10-17  7:19   ` Sandra Loosemore
  2018-10-10  9:05 ` [PATCH 5/6] [ARC] Refurbish and improve prologue/epilogue functions Claudiu Zissulescu
  5 siblings, 2 replies; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-10  8:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Use BI/BIH instruction to implement casesi pattern. Only ARC V2.

gcc/
2018-03-21  Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/arc.c (arc_override_options): Remove
	TARGET_COMPACT_CASESI.
	* config/arc/arc.h (ASM_OUTPUT_ADDR_DIFF_ELT): Update.
	(CASE_VECTOR_MODE): Likewise.
	(CASE_VECTOR_PC_RELATIVE): Likewise.
	(CASE_VECTOR_SHORTEN_MODE): Likewise.
	(CASE_VECTOR_SHORTEN_MODE1): Delete.
	(ADDR_VEC_ALIGN): Update.
	(ASM_OUTPUT_CASE_LABEL): Undefine.
	(ASM_OUTPUT_BEFORE_CASE_LABEL): Undefine.
	(TARGET_BI_BIH): Define.
	(DEFAULT_BRANCH_INDEX): Likewise.
	* config/arc/arc.md (casesi): Rework to accept BI/BIH
	instructions, remove compact_casesi use case.
	(casesi_compact_jump): Remove.
	(casesi_dispatch): New pattern.
	* config/arc/arc.opt: Add mbranch-index option. Deprecate
	compact_casesi option.
	* doc/invoke.texi: Document mbranch-index option.

gcc/testsuite
Claudiu Zissulescu  <claziss@synopsys.com>

	* gcc.target/arc/jumptable.c: New test.
---
 gcc/config/arc/arc.c                     |  19 --
 gcc/config/arc/arc.h                     | 106 ++++++-----
 gcc/config/arc/arc.md                    | 218 +++++++----------------
 gcc/config/arc/arc.opt                   |   6 +-
 gcc/doc/invoke.texi                      |   9 +-
 gcc/testsuite/gcc.target/arc/jumptable.c |  34 ++++
 6 files changed, 171 insertions(+), 221 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arc/jumptable.c

diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index 56f566795ff..18dd0de6af7 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -1291,33 +1291,14 @@ arc_override_options (void)
   if (arc_size_opt_level == 3)
     optimize_size = 1;
 
-  /* Compact casesi is not a valid option for ARCv2 family.  */
-  if (TARGET_V2)
-    {
-      if (TARGET_COMPACT_CASESI)
-	{
-	  warning (OPT_mcompact_casesi,
-		   "compact-casesi is not applicable to ARCv2");
-	  TARGET_COMPACT_CASESI = 0;
-	}
-    }
-  else if (optimize_size == 1
-	   && !global_options_set.x_TARGET_COMPACT_CASESI)
-    TARGET_COMPACT_CASESI = 1;
-
   if (flag_pic)
     target_flags |= MASK_NO_SDATA_SET;
 
   if (flag_no_common == 255)
     flag_no_common = !TARGET_NO_SDATA_SET;
 
-  /* TARGET_COMPACT_CASESI needs the "q" register class.  */
   if (TARGET_MIXED_CODE)
     TARGET_Q_CLASS = 1;
-  if (!TARGET_Q_CLASS)
-    TARGET_COMPACT_CASESI = 0;
-  if (TARGET_COMPACT_CASESI)
-    TARGET_CASE_VECTOR_PC_RELATIVE = 1;
 
   /* Check for small data option */
   if (!global_options_set.x_g_switch_value && !TARGET_NO_SDATA_SET)
diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
index dd78a6bbbd1..cb48b85d6e7 100644
--- a/gcc/config/arc/arc.h
+++ b/gcc/config/arc/arc.h
@@ -1264,25 +1264,39 @@ do {							\
 } while (0)
 
 /* This is how to output an element of a case-vector that is relative.  */
-#define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL) \
-do {							\
-  char label[30];					\
-  ASM_GENERATE_INTERNAL_LABEL (label, "L", VALUE);	\
-  switch (GET_MODE (BODY))				\
-    {							\
-    case E_QImode: fprintf (FILE, "\t.byte "); break;	\
-    case E_HImode: fprintf (FILE, "\t.hword "); break;	\
-    case E_SImode: fprintf (FILE, "\t.word "); break;	\
-    default: gcc_unreachable ();			\
-    }							\
-  assemble_name (FILE, label);				\
-  fprintf (FILE, "-");					\
-  ASM_GENERATE_INTERNAL_LABEL (label, "L", REL);	\
-  assemble_name (FILE, label);				\
-  if (TARGET_COMPACT_CASESI)				\
-    fprintf (FILE, " + %d", 4 + arc_get_unalign ());	\
-  fprintf(FILE, "\n");                                  \
-} while (0)
+#define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL)	\
+  do {								\
+    char label[30];						\
+    ASM_GENERATE_INTERNAL_LABEL (label, "L", VALUE);		\
+    if (!TARGET_BI_BIH)						\
+      {								\
+	switch (GET_MODE (BODY))				\
+	  {							\
+	  case E_QImode: fprintf (FILE, "\t.byte "); break;	\
+	  case E_HImode: fprintf (FILE, "\t.hword "); break;	\
+	  case E_SImode: fprintf (FILE, "\t.word "); break;	\
+	  default: gcc_unreachable ();				\
+	  }							\
+	assemble_name (FILE, label);				\
+	fprintf (FILE, "-");					\
+	ASM_GENERATE_INTERNAL_LABEL (label, "L", REL);		\
+	assemble_name (FILE, label);				\
+	fprintf(FILE, "\n");					\
+      } else {							\
+      switch (GET_MODE (BODY))					\
+	{							\
+	case E_SImode: fprintf (FILE, "\tb\t@"); break;		\
+	case E_HImode:						\
+	case E_QImode: fprintf (FILE, "\tb_s\t@"); break;	\
+	default: gcc_unreachable ();				\
+	}							\
+      assemble_name (FILE, label);				\
+      fprintf(FILE, "\n");					\
+    }								\
+  } while (0)
+
+/* Defined to also emit an .align in elfos.h.  We don't want that.  */
+#undef ASM_OUTPUT_CASE_LABEL
 
 /* ADDR_DIFF_VECs are in the text section and thus can affect the
    current alignment.  */
@@ -1380,36 +1394,34 @@ do { \
    for the index in the tablejump instruction.
    If we have pc relative case vectors, we start the case vector shortening
    with QImode.  */
-#define CASE_VECTOR_MODE \
-  ((optimize && (CASE_VECTOR_PC_RELATIVE || flag_pic)) ? QImode : Pmode)
+#define CASE_VECTOR_MODE						\
+  (TARGET_BI_BIH ? SImode						\
+   : (optimize && (CASE_VECTOR_PC_RELATIVE || flag_pic)) ? QImode : Pmode)
 
 /* Define as C expression which evaluates to nonzero if the tablejump
    instruction expects the table to contain offsets from the address of the
    table.
    Do not define this if the table should contain absolute addresses.  */
-#define CASE_VECTOR_PC_RELATIVE TARGET_CASE_VECTOR_PC_RELATIVE
-
-#define CASE_VECTOR_SHORTEN_MODE(MIN_OFFSET, MAX_OFFSET, BODY) \
-  CASE_VECTOR_SHORTEN_MODE_1 \
-    (MIN_OFFSET, TARGET_COMPACT_CASESI ? MAX_OFFSET + 6 : MAX_OFFSET, BODY)
-
-#define CASE_VECTOR_SHORTEN_MODE_1(MIN_OFFSET, MAX_OFFSET, BODY) \
-((MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 255 \
- ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, QImode) \
- : (MIN_OFFSET) >= -128 && (MAX_OFFSET) <= 127 \
- ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, QImode) \
- : (MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 65535 \
- ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, HImode) \
- : (MIN_OFFSET) >= -32768 && (MAX_OFFSET) <= 32767 \
- ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, HImode) \
- : SImode)
-
-#define ADDR_VEC_ALIGN(VEC_INSN) \
-  (exact_log2 (GET_MODE_SIZE (as_a <scalar_int_mode> \
-			      (GET_MODE (PATTERN (VEC_INSN))))))
-#undef ASM_OUTPUT_BEFORE_CASE_LABEL
-#define ASM_OUTPUT_BEFORE_CASE_LABEL(FILE, PREFIX, NUM, TABLE) \
-  ASM_OUTPUT_ALIGN ((FILE), ADDR_VEC_ALIGN (TABLE))
+#define CASE_VECTOR_PC_RELATIVE					\
+  (TARGET_CASE_VECTOR_PC_RELATIVE || TARGET_BI_BIH)
+
+#define CASE_VECTOR_SHORTEN_MODE(MIN_OFFSET, MAX_OFFSET, BODY)		\
+  (TARGET_BI_BIH ?						\
+   ((MIN_OFFSET) >= -512 && (MAX_OFFSET) <= 508 ? HImode : SImode)	\
+   : ((MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 255				\
+      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, QImode)	\
+      : (MIN_OFFSET) >= -128 && (MAX_OFFSET) <= 127			\
+      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, QImode)	\
+      : (MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 65535			\
+      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, HImode)	\
+      : (MIN_OFFSET) >= -32768 && (MAX_OFFSET) <= 32767			\
+      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, HImode)	\
+      : SImode))
+
+#define ADDR_VEC_ALIGN(VEC_INSN)					\
+  (TARGET_BI_BIH ? 0							\
+   : exact_log2 (GET_MODE_SIZE (as_a <scalar_int_mode>			\
+				(GET_MODE (PATTERN (VEC_INSN))))))
 
 #define INSN_LENGTH_ALIGNMENT(INSN)		  \
   ((JUMP_TABLE_DATA_P (INSN)			  \
@@ -1636,4 +1648,10 @@ enum
 #define TARGET_LRA arc_lra_p()
 #endif
 
+/* BI/BIH feature macro.  */
+#define TARGET_BI_BIH (TARGET_BRANCH_INDEX && TARGET_CODE_DENSITY)
+
+/* The default option for BI/BIH instructions.  */
+#define DEFAULT_BRANCH_INDEX 0
+
 #endif /* GCC_ARC_H */
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index 6ea67791627..1ed230fa5f0 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -3968,60 +3968,72 @@ archs4x, archs4xd, archs4xd_slow"
    (set_attr "cond" "canuse,canuse_limm,canuse,canuse,canuse")])
 
 ;; Implement a switch statement.
-
 (define_expand "casesi"
-  [(set (match_dup 5)
-	(minus:SI (match_operand:SI 0 "register_operand" "")
-		  (match_operand:SI 1 "nonmemory_operand" "")))
-   (set (reg:CC CC_REG)
-	(compare:CC (match_dup 5)
-		    (match_operand:SI 2 "nonmemory_operand" "")))
-   (set (pc)
-	(if_then_else (gtu (reg:CC CC_REG)
-			   (const_int 0))
-		      (label_ref (match_operand 4 "" ""))
-		      (pc)))
-   (set (match_dup 6)
-	(unspec:SI [(match_operand 3 "" "")
-		    (match_dup 5) (match_dup 7)] UNSPEC_ARC_CASESI))
-   (parallel [(set (pc) (match_dup 6)) (use (match_dup 7))])]
+  [(match_operand:SI 0 "register_operand" "")	; Index
+   (match_operand:SI 1 "const_int_operand" "")	; Lower bound
+   (match_operand:SI 2 "const_int_operand" "")	; Total range
+   (match_operand:SI 3 "" "")		; Table label
+   (match_operand:SI 4 "" "")]		; Out of range label
   ""
-  "
-{
-  rtx x;
-
-  operands[5] = gen_reg_rtx (SImode);
-  operands[6] = gen_reg_rtx (SImode);
-  operands[7] = operands[3];
-  emit_insn (gen_subsi3 (operands[5], operands[0], operands[1]));
-  emit_insn (gen_cmpsi_cc_insn_mixed (operands[5], operands[2]));
-  x = gen_rtx_GTU (VOIDmode, gen_rtx_REG (CCmode, CC_REG), const0_rtx);
-  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-			    gen_rtx_LABEL_REF (VOIDmode, operands[4]), pc_rtx);
-  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
-  if (TARGET_COMPACT_CASESI)
-    {
-      emit_jump_insn (gen_casesi_compact_jump (operands[5], operands[7]));
-    }
-  else
-    {
+  {
+   if (operands[1] != const0_rtx)
+     {
+       rtx reg = gen_reg_rtx (SImode);
+       emit_insn (gen_subsi3 (reg, operands[0], operands[1]));
+       operands[0] = reg;
+      }
+   emit_jump_insn (gen_cbranchsi4 (gen_rtx_GTU (SImode, operands[0],
+							operands[2]),
+				   operands[0], operands[2], operands[4]));
+   if (TARGET_BI_BIH)
+     {
+      emit_jump_insn (gen_casesi_dispatch (operands[0], operands[3]));
+     }
+   else
+   {
+      rtx reg = gen_reg_rtx (SImode);
+      rtx lbl = operands[3];
       operands[3] = gen_rtx_LABEL_REF (VOIDmode, operands[3]);
-      if (flag_pic || !cse_not_expected)
+      if (flag_pic)
 	operands[3] = force_reg (Pmode, operands[3]);
-      emit_insn (gen_casesi_load (operands[6],
-				  operands[3], operands[5], operands[7]));
+      emit_insn (gen_casesi_load (reg,
+				  operands[3], operands[0], lbl));
       if (CASE_VECTOR_PC_RELATIVE || flag_pic)
-	emit_insn (gen_addsi3 (operands[6], operands[6], operands[3]));
-      emit_jump_insn (gen_casesi_jump (operands[6], operands[7]));
+	emit_insn (gen_addsi3 (reg, reg, operands[3]));
+      emit_jump_insn (gen_casesi_jump (reg, lbl));
+     }
+   DONE;
+  })
+
+(define_insn "casesi_dispatch"
+  [(set (pc)
+	(unspec:SI [(match_operand:SI 0 "register_operand" "r")
+		    (label_ref (match_operand 1 "" ""))]
+		   UNSPEC_ARC_CASESI))]
+  "TARGET_BI_BIH"
+  {
+   rtx diff_vec = PATTERN (next_nonnote_insn (as_a<rtx_insn *> (operands[1])));
+   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
+   switch (GET_MODE (diff_vec))
+   {
+    case E_SImode:
+     return \"bi\\t[%0]\";
+    case E_HImode:
+    case E_QImode:
+    return \"bih\\t[%0]\";
+    default: gcc_unreachable ();
     }
-  DONE;
-}")
+   }
+  [(set_attr "type" "brcc_no_delay_slot")
+   (set_attr "iscompact" "false")
+   (set_attr "length" "4")])
 
 (define_insn "casesi_load"
-  [(set (match_operand:SI 0 "register_operand"             "=Rcq,r,r")
-	(unspec:SI [(match_operand:SI 1 "nonmemory_operand" "Rcq,c,Cal")
-		    (match_operand:SI 2 "register_operand"  "Rcq,c,c")
-		    (label_ref (match_operand 3 "" ""))] UNSPEC_ARC_CASESI))]
+  [(set (match_operand:SI 0 "register_operand"             "=q,r,r")
+	(mem:SI (unspec:SI [(match_operand:SI 1 "nonmemory_operand" "q,r,Cal")
+			    (match_operand:SI 2 "register_operand"  "q,r,r")]
+			   UNSPEC_ARC_CASESI)))
+   (use (label_ref (match_operand 3 "" "")))]
   ""
   "*
 {
@@ -4037,15 +4049,15 @@ archs4x, archs4xd, archs4xd_slow"
   switch (GET_MODE (diff_vec))
     {
     case E_SImode:
-      return \"ld.as %0,[%1,%2]%&\";
+      return \"ld.as\\t%0,[%1,%2]%&\";
     case E_HImode:
       if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
-	return \"ld%_.as %0,[%1,%2]\";
-      return \"ld%_.x.as %0,[%1,%2]\";
+	return \"ld%_.as\\t%0,[%1,%2]\";
+      return \"ld%_.x.as\\t%0,[%1,%2]\";
     case E_QImode:
       if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
-	return \"ldb%? %0,[%1,%2]%&\";
-      return \"ldb.x %0,[%1,%2]\";
+	return \"ldb%?\\t%0,[%1,%2]%&\";
+      return \"ldb.x\\t%0,[%1,%2]\";
     default:
       gcc_unreachable ();
     }
@@ -4085,110 +4097,6 @@ archs4x, archs4xd, archs4xd_slow"
    (set_attr "iscompact" "false,maybe,false")
    (set_attr "cond" "canuse")])
 
-(define_insn "casesi_compact_jump"
-  [(set (pc)
-	(unspec:SI [(match_operand:SI 0 "register_operand" "c,q")]
-		   UNSPEC_ARC_CASESI))
-   (use (label_ref (match_operand 1 "" "")))
-   (clobber (match_scratch:SI 2 "=q,0"))]
-  "TARGET_COMPACT_CASESI"
-  "*
-{
-  rtx diff_vec = PATTERN (next_nonnote_insn (as_a<rtx_insn *> (operands[1])));
-  int unalign = arc_get_unalign ();
-  rtx xop[3];
-  const char *s;
-
-  xop[0] = operands[0];
-  xop[2] = operands[2];
-  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
-
-  switch (GET_MODE (diff_vec))
-    {
-    case E_SImode:
-      /* Max length can be 12 in this case, but this is OK because
-	 2 of these are for alignment, and are anticipated in the length
-	 of the ADDR_DIFF_VEC.  */
-      if (unalign && !satisfies_constraint_Rcq (xop[0]))
-	s = \"add2 %2,pcl,%0\n\tld_s %2,[%2,12]\";
-      else if (unalign)
-	s = \"add_s %2,%0,2\n\tld.as %2,[pcl,%2]\";
-      else
-	s = \"add %2,%0,2\n\tld.as %2,[pcl,%2]\";
-      arc_clear_unalign ();
-      break;
-    case E_HImode:
-      if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
-	{
-	  if (satisfies_constraint_Rcq (xop[0]))
-	    {
-	      s = \"add_s %2,%0,%1\n\tld%_.as %2,[pcl,%2]\";
-	      xop[1] = GEN_INT ((10 - unalign) / 2U);
-	    }
-	  else
-	    {
-	      s = \"add1 %2,pcl,%0\n\tld%__s %2,[%2,%1]\";
-	      xop[1] = GEN_INT (10 + unalign);
-	    }
-	}
-      else
-	{
-	  if (satisfies_constraint_Rcq (xop[0]))
-	    {
-	      s = \"add_s %2,%0,%1\n\tld%_.x.as %2,[pcl,%2]\";
-	      xop[1] = GEN_INT ((10 - unalign) / 2U);
-	    }
-	  else
-	    {
-	      s = \"add1 %2,pcl,%0\n\tld%__s.x %2,[%2,%1]\";
-	      xop[1] = GEN_INT (10 + unalign);
-	    }
-	}
-      arc_toggle_unalign ();
-      break;
-    case E_QImode:
-      if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
-	{
-	  if ((rtx_equal_p (xop[2], xop[0])
-	       || find_reg_note (insn, REG_DEAD, xop[0]))
-	      && satisfies_constraint_Rcq (xop[0]))
-	    {
-	      s = \"add_s %0,%0,pcl\n\tldb_s %2,[%0,%1]\";
-	      xop[1] = GEN_INT (8 + unalign);
-	    }
-	  else
-	    {
-	      s = \"add %2,%0,pcl\n\tldb_s %2,[%2,%1]\";
-	      xop[1] = GEN_INT (10 + unalign);
-	      arc_toggle_unalign ();
-	    }
-	}
-      else if ((rtx_equal_p (xop[0], xop[2])
-		|| find_reg_note (insn, REG_DEAD, xop[0]))
-	       && satisfies_constraint_Rcq (xop[0]))
-	{
-	  s = \"add_s %0,%0,%1\n\tldb.x %2,[pcl,%0]\";
-	  xop[1] = GEN_INT (10 - unalign);
-	  arc_toggle_unalign ();
-	}
-      else
-	{
-	  /* ??? Length is 12.  */
-	  s = \"add %2,%0,%1\n\tldb.x %2,[pcl,%2]\";
-	  xop[1] = GEN_INT (8 + unalign);
-	}
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  output_asm_insn (s, xop);
-  return \"add_s %2,%2,pcl\n\tj_s%* [%2]\";
-}"
-  [(set_attr "length" "10")
-   (set_attr "type" "jump")
-   (set_attr "iscompact" "true")
-   (set_attr "cond" "nocond")])
-
 (define_expand "call"
   ;; operands[1] is stack_size_rtx
   ;; operands[2] is next_arg_register
diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
index ee06c063837..3e96b58375d 100644
--- a/gcc/config/arc/arc.opt
+++ b/gcc/config/arc/arc.opt
@@ -328,7 +328,7 @@ Target Var(TARGET_CASE_VECTOR_PC_RELATIVE)
 Use pc-relative switch case tables - this enables case table shortening.
 
 mcompact-casesi
-Target Var(TARGET_COMPACT_CASESI)
+Target Warn(%qs is deprecated)
 Enable compact casesi pattern.
 
 mq-class
@@ -528,3 +528,7 @@ Enum(arc_lpc) String(32) Value(32)
 mrf16
 Target Report Mask(RF16)
 Enable 16-entry register file.
+
+mbranch-index
+Target Report Var(TARGET_BRANCH_INDEX) Init(DEFAULT_BRANCH_INDEX)
+Enable use of BI/BIH instructions when available.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 802cc642453..454587310c8 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -650,7 +650,7 @@ Objective-C and Objective-C++ Dialects}.
 -mmixed-code  -mq-class  -mRcq  -mRcw  -msize-level=@var{level} @gol
 -mtune=@var{cpu}  -mmultcost=@var{num} @gol
 -munalign-prob-threshold=@var{probability}  -mmpy-option=@var{multo} @gol
--mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16}
+-mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16 -mbranch-index}
 
 @emph{ARM Options}
 @gccoptlist{-mapcs-frame  -mno-apcs-frame @gol
@@ -15814,6 +15814,11 @@ This option instructs the compiler to generate code for a 16-entry
 register file.  This option defines the @code{__ARC_RF16__}
 preprocessor macro.
 
+@item -mbranch-index
+@opindex mbranch-index
+Enable use of @code{bi} or @code{bih} instructions to implement jump
+tables.
+
 @end table
 
 The following options are passed through to the assembler, and also
@@ -15985,7 +15990,7 @@ This is the default for @option{-Os}.
 @item -mcompact-casesi
 @opindex mcompact-casesi
 Enable compact @code{casesi} pattern.  This is the default for @option{-Os},
-and only available for ARCv1 cores.
+and only available for ARCv1 cores.  This option is deprecated.
 
 @item -mno-cond-exec
 @opindex mno-cond-exec
diff --git a/gcc/testsuite/gcc.target/arc/jumptable.c b/gcc/testsuite/gcc.target/arc/jumptable.c
new file mode 100644
index 00000000000..fbc58e33149
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/jumptable.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { arc700 || arc6xx } } */
+/* { dg-options "-O2 -mbranch-index -mcode-density" { target { arcem || archs } } } */
+
+extern void max( int,int);
+
+int switchCase(int value, int b)
+{
+  switch(value){
+  case 100:
+    value = b * value;
+    break;
+  case 101:
+    value = b << value;
+    break;
+  case 102:
+    value = b / value;
+    break;
+  case 103:
+    value = b >> value;
+    break;
+  case 104:
+    value = b + value;
+    break;
+  case 105:
+    value = b - value;
+    break;
+  }
+  max(value, b);
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "bih" 1 } } */
+/* { dg-final { scan-assembler-times "b_s" 8 } } */
-- 
2.17.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 5/6] [ARC] Refurbish and improve prologue/epilogue functions.
  2018-10-10  8:01 [PATCH 0/6] ARC updates Claudiu Zissulescu
                   ` (4 preceding siblings ...)
  2018-10-10  8:49 ` [PATCH 3/6] [ARC] Add BI/BIH instruction support Claudiu Zissulescu
@ 2018-10-10  9:05 ` Claudiu Zissulescu
  2018-10-22 18:26   ` Andrew Burgess
  5 siblings, 1 reply; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-10  9:05 UTC (permalink / raw)
  To: gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Reimplement how the prologue and epilogue are emitted to accommodate
enter/leave instructions, as well as to improve the size of the
existing techniques.

The following modifications are added:

- millicode thunk calls can now be selected regardless of the
  optimization level.  However, they are enabled by default for size
  optimizations.  Also, the millicode optimization is turned off when
  we compile for long jumps.

- the compiler is able to use enter/leave instructions for prologue
  and epilogue.  As these instructions are not ABI compatible, we guard
  them under a switch (i.e., -mcode-density-frame).  When this option
  is on, the compiler will try emitting enter/leave instructions; if
  not, then millicode thunk calls (if enabled), and later the regular
  push/pop instructions.

- The prologue/epilogue is now optimized to use pointer walks, hence
  improving the chance to have push_s/pop_s instructions emitted. It
  also tries to combine the stack adjustments with load/store
  operations.

gcc/
xxxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>

	* common/config/arc/arc-common.c (arc_option_optimization_table):
	Millicode optimization is default on for size optimizations.
	* config/arc/arc-protos.h (arc_check_multi): New function.
	* config/arc/arc.c (RTX_OK_FOR_OFFSET_P): Rearrange.
	(arc_override_options): Disable millicode when long calls option
	is on.
	(arc_frame_info): Change it from int to bool.
	(arc_compute_frame_size): Clean up.
	(arc_save_restore): Remove.
	(frame_save_reg): New function.
	(frame_restore_reg): Likewise.
	(arc_enter_leave_p): Likewise.
	(arc_save_callee_saves): Likewise.
	(arc_restore_callee_saves): Likewise.
	(arc_save_callee_enter): Likewise.
	(arc_restore_callee_leave): Likewise.
	(arc_save_callee_milli): Likewise.
	(arc_restore_callee_milli): Likewise.
	(arc_expand_prologue): Reimplement to emit enter/leave
	instructions.
	(arc_expand_epilogue): Likewise.
	(arc_check_multi): New function.
	* config/arc/arc.md (push_multi_fp): New pattern.
	(push_multi_fp_blink): Likewise.
	(pop_multi_fp): Likewise.
	(pop_multi_fp_blink): Likewise.
	(pop_multi_fp_ret): Likewise.
	(pop_multi_fp_blink_ret): Likewise.
	* config/arc/arc.opt (mmillicode): Update option.
	(mcode-density-frame): New option.
	* config/arc/predicates.md (push_multi_operand): New predicate.
	(pop_multi_operand): Likewise.
	* doc/invoke.texi (ARC): Update ARC options information.

gcc/testsuite
xxxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>

	* gcc.target/arc/firq-1.c: Update test.
	* gcc.target/arc/firq-3.c: Likewise.
	* gcc.target/arc/firq-4.c: Likewise.
	* gcc.target/arc/interrupt-6.c: Likewise.
---
 gcc/common/config/arc/arc-common.c         |    1 +
 gcc/config/arc/arc-protos.h                |    1 +
 gcc/config/arc/arc.c                       | 1266 +++++++++++++-------
 gcc/config/arc/arc.md                      |  172 +++
 gcc/config/arc/arc.opt                     |   10 +-
 gcc/config/arc/predicates.md               |   12 +
 gcc/doc/invoke.texi                        |   18 +-
 gcc/testsuite/gcc.target/arc/firq-1.c      |    8 +-
 gcc/testsuite/gcc.target/arc/firq-3.c      |   14 +-
 gcc/testsuite/gcc.target/arc/firq-4.c      |   12 +-
 gcc/testsuite/gcc.target/arc/interrupt-6.c |    2 +-
 11 files changed, 1054 insertions(+), 462 deletions(-)

diff --git a/gcc/common/config/arc/arc-common.c b/gcc/common/config/arc/arc-common.c
index 578431a279d..2872388de2c 100644
--- a/gcc/common/config/arc/arc-common.c
+++ b/gcc/common/config/arc/arc-common.c
@@ -59,6 +59,7 @@ static const struct default_options arc_option_optimization_table[] =
     { OPT_LEVELS_SIZE, OPT_mq_class, NULL, 1 },
     { OPT_LEVELS_SIZE, OPT_mcase_vector_pcrel, NULL, 1 },
     { OPT_LEVELS_SIZE, OPT_msize_level_, NULL, 3 },
+    { OPT_LEVELS_SIZE, OPT_mmillicode, NULL, 1 },
     { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
     { OPT_LEVELS_3_PLUS_SPEED_ONLY, OPT_msize_level_, NULL, 0 },
     { OPT_LEVELS_3_PLUS_SPEED_ONLY, OPT_malign_call, NULL, 1 },
diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h
index 55f8ed4c643..6450b6a014e 100644
--- a/gcc/config/arc/arc-protos.h
+++ b/gcc/config/arc/arc-protos.h
@@ -47,6 +47,7 @@ extern unsigned int arc_compute_function_type (struct function *);
 extern bool arc_is_uncached_mem_p (rtx);
 extern bool arc_lra_p (void);
 extern bool gen_operands_ldd_std (rtx *operands, bool load, bool commute);
+extern bool arc_check_multi (rtx, bool);
 #endif /* RTX_CODE */
 
 extern unsigned int arc_compute_frame_size (int);
diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index 9bc69e9fbc9..ab7735d6b38 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -89,12 +89,12 @@ HARD_REG_SET overrideregs;
 
 /* ??? Loads can handle any constant, stores can only handle small ones.  */
 /* OTOH, LIMMs cost extra, so their usefulness is limited.  */
-#define RTX_OK_FOR_OFFSET_P(MODE, X) \
-(GET_CODE (X) == CONST_INT \
- && SMALL_INT_RANGE (INTVAL (X), (GET_MODE_SIZE (MODE) - 1) & -4, \
-		     (INTVAL (X) & (GET_MODE_SIZE (MODE) - 1) & 3 \
-		      ? 0 \
-		      : -(-GET_MODE_SIZE (MODE) | -4) >> 1)))
+#define RTX_OK_FOR_OFFSET_P(MODE, X)					\
+  (GET_CODE (X) == CONST_INT						\
+   && SMALL_INT_RANGE (INTVAL (X), (GET_MODE_SIZE (MODE) - 1) & -4,	\
+		       (INTVAL (X) & (GET_MODE_SIZE (MODE) - 1) & 3	\
+			? 0						\
+			: -(-GET_MODE_SIZE (MODE) | -4) >> 1)))
 
 /* Array of valid operand punctuation characters.  */
 char arc_punct_chars[256];
@@ -1304,6 +1304,10 @@ arc_override_options (void)
   if (!global_options_set.x_g_switch_value && !TARGET_NO_SDATA_SET)
     g_switch_value = TARGET_LL64 ? 8 : 4;
 
+  /* Millicode thunks doesn't work with long calls.  */
+  if (TARGET_LONG_CALLS_SET)
+    target_flags &= ~MASK_MILLICODE_THUNK_SET;
+
   /* These need to be done at start up.  It's convenient to do them here.  */
   arc_init ();
 }
@@ -2611,9 +2615,8 @@ struct GTY (()) arc_frame_info
   unsigned int args_size;	/* # bytes that outgoing arguments take up.  */
   unsigned int reg_size;	/* # bytes needed to store regs.  */
   unsigned int var_size;	/* # bytes that variables take up.  */
-  unsigned int reg_offset;	/* Offset from new sp to store regs.  */
   unsigned int gmask;		/* Mask of saved gp registers.  */
-  int          initialized;	/* Nonzero if frame size already calculated.  */
+  bool         initialized;	/* Nonzero if frame size already calculated.  */
   short millicode_start_reg;
   short millicode_end_reg;
   bool save_return_addr;
@@ -2829,10 +2832,12 @@ arc_compute_frame_size (void)
 {
   int regno;
   unsigned int total_size, var_size, args_size, pretend_size, extra_size;
-  unsigned int reg_size, reg_offset;
+  unsigned int reg_size;
   unsigned int gmask;
   struct arc_frame_info *frame_info;
   int size;
+  unsigned int extra_plus_reg_size;
+  unsigned int extra_plus_reg_size_aligned;
 
   /* The answer might already be known.  */
   if (cfun->machine->frame_info.initialized)
@@ -2876,23 +2881,23 @@ arc_compute_frame_size (void)
     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
       {
 	reg_size += UNITS_PER_WORD;
-	gmask |= 1 << regno;
+	gmask |= 1L << regno;
       }
 
-  /* 4) Space for back trace data structure.
-	<return addr reg size> (if required) + <fp size> (if required).  */
-  frame_info->save_return_addr
-    = (!crtl->is_leaf || df_regs_ever_live_p (RETURN_ADDR_REGNUM)
-       || crtl->calls_eh_return);
-  /* Saving blink reg in case of leaf function for millicode thunk calls.  */
-  if (optimize_size
-      && !TARGET_NO_MILLICODE_THUNK_SET
+  /* Check if we need to save the return address.  */
+  frame_info->save_return_addr = (!crtl->is_leaf
+				  || df_regs_ever_live_p (RETURN_ADDR_REGNUM)
+				  || crtl->calls_eh_return);
+
+  /* Saving blink reg for millicode thunk calls.  */
+  if (TARGET_MILLICODE_THUNK_SET
       && !crtl->calls_eh_return)
     {
       if (arc_compute_millicode_save_restore_regs (gmask, frame_info))
 	frame_info->save_return_addr = true;
     }
 
+  /* 4) Calculate extra size made up of the blink + fp size.  */
   extra_size = 0;
   if (arc_must_save_return_addr (cfun))
     extra_size = 4;
@@ -2903,14 +2908,9 @@ arc_compute_frame_size (void)
   pretend_size	= crtl->args.pretend_args_size;
 
   /* Ensure everything before the locals is aligned appropriately.  */
-    {
-       unsigned int extra_plus_reg_size;
-       unsigned int extra_plus_reg_size_aligned;
-
-       extra_plus_reg_size = extra_size + reg_size;
-       extra_plus_reg_size_aligned = ARC_STACK_ALIGN(extra_plus_reg_size);
-       reg_size = extra_plus_reg_size_aligned - extra_size;
-    }
+  extra_plus_reg_size = extra_size + reg_size;
+  extra_plus_reg_size_aligned = ARC_STACK_ALIGN(extra_plus_reg_size);
+  reg_size = extra_plus_reg_size_aligned - extra_size;
 
   /* Compute total frame size.  */
   total_size = var_size + args_size + extra_size + pretend_size + reg_size;
@@ -2921,12 +2921,6 @@ arc_compute_frame_size (void)
      as an issue I've changed this to an assert for now.  */
   gcc_assert (total_size == ARC_STACK_ALIGN (total_size));
 
-  /* Compute offset of register save area from stack pointer:
-     Frame: pretend_size <blink> reg_size <fp> var_size args_size <--sp
-  */
-  reg_offset = (total_size - (pretend_size + reg_size + extra_size)
-		+ (arc_frame_pointer_needed () ? 4 : 0));
-
   /* Save computed information.  */
   frame_info->total_size   = total_size;
   frame_info->extra_size   = extra_size;
@@ -2934,7 +2928,6 @@ arc_compute_frame_size (void)
   frame_info->var_size     = var_size;
   frame_info->args_size    = args_size;
   frame_info->reg_size     = reg_size;
-  frame_info->reg_offset   = reg_offset;
   frame_info->gmask        = gmask;
   frame_info->initialized  = reload_completed;
 
@@ -2942,187 +2935,6 @@ arc_compute_frame_size (void)
   return total_size;
 }
 
-/* Common code to save/restore registers.  */
-/* BASE_REG is the base register to use for addressing and to adjust.
-   GMASK is a bitmask of general purpose registers to save/restore.
-   epilogue_p 0: prologue 1:epilogue 2:epilogue, sibling thunk
-   If *FIRST_OFFSET is non-zero, add it first to BASE_REG - preferably
-   using a pre-modify for the first memory access.  *FIRST_OFFSET is then
-   zeroed.  */
-
-static void
-arc_save_restore (rtx base_reg,
-		  unsigned int gmask, int epilogue_p, int *first_offset)
-{
-  unsigned int offset = 0;
-  int regno;
-  struct arc_frame_info *frame = &cfun->machine->frame_info;
-  rtx sibthunk_insn = NULL_RTX;
-
-  if (gmask)
-    {
-      /* Millicode thunks implementation:
-	 Generates calls to millicodes for registers starting from r13 to r25
-	 Present Limitations:
-	 - Only one range supported. The remaining regs will have the ordinary
-	   st and ld instructions for store and loads. Hence a gmask asking
-	   to store r13-14, r16-r25 will only generate calls to store and
-	   load r13 to r14 while store and load insns will be generated for
-	   r16 to r25 in the prologue and epilogue respectively.
-
-	 - Presently library only supports register ranges starting from r13.
-      */
-      if (epilogue_p == 2 || frame->millicode_end_reg > 14)
-	{
-	  int start_call = frame->millicode_start_reg;
-	  int end_call = frame->millicode_end_reg;
-	  int n_regs = end_call - start_call + 1;
-	  int i = 0, r, off = 0;
-	  rtx insn;
-	  rtx ret_addr = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
-
-	  if (*first_offset)
-	    {
-	      /* "reg_size" won't be more than 127 .  */
-	      gcc_assert (epilogue_p || abs (*first_offset) <= 127);
-	      frame_add (base_reg, *first_offset);
-	      *first_offset = 0;
-	    }
-	  insn = gen_rtx_PARALLEL
-		  (VOIDmode, rtvec_alloc ((epilogue_p == 2) + n_regs + 1));
-	  if (epilogue_p == 2)
-	    i += 2;
-	  else
-	    XVECEXP (insn, 0, n_regs) = gen_rtx_CLOBBER (VOIDmode, ret_addr);
-	  for (r = start_call; r <= end_call; r++, off += UNITS_PER_WORD, i++)
-	    {
-	      rtx reg = gen_rtx_REG (SImode, r);
-	      rtx mem
-		= gen_frame_mem (SImode, plus_constant (Pmode, base_reg, off));
-
-	      if (epilogue_p)
-		XVECEXP (insn, 0, i) = gen_rtx_SET (reg, mem);
-	      else
-		XVECEXP (insn, 0, i) = gen_rtx_SET (mem, reg);
-	      gmask = gmask & ~(1L << r);
-	    }
-	  if (epilogue_p == 2)
-	    sibthunk_insn = insn;
-	  else
-	    {
-	      insn = frame_insn (insn);
-	      for (r = start_call, off = 0;
-		   r <= end_call;
-		   r++, off += UNITS_PER_WORD)
-		{
-		  rtx reg = gen_rtx_REG (SImode, r);
-		  if (epilogue_p)
-		      add_reg_note (insn, REG_CFA_RESTORE, reg);
-		  else
-		    {
-		      rtx mem = gen_rtx_MEM (SImode, plus_constant (Pmode,
-								    base_reg,
-								    off));
-
-		      add_reg_note (insn, REG_CFA_OFFSET,
-				    gen_rtx_SET (mem, reg));
-		    }
-		}
-	    }
-	  offset += off;
-	}
-
-      for (regno = 0; regno <= 31; regno++)
-	{
-	  machine_mode mode = SImode;
-	  bool found = false;
-
-	  if (TARGET_LL64
-	      && (regno % 2 == 0)
-	      && ((gmask & (1L << regno)) != 0)
-	      && ((gmask & (1L << (regno+1))) != 0))
-	    {
-	      found = true;
-	      mode  = DImode;
-	    }
-	  else if ((gmask & (1L << regno)) != 0)
-	    {
-	      found = true;
-	      mode  = SImode;
-	    }
-
-	  if (found)
-	    {
-	      rtx reg = gen_rtx_REG (mode, regno);
-	      rtx addr, mem;
-	      int cfa_adjust = *first_offset;
-
-	      if (*first_offset)
-		{
-		  gcc_assert (!offset);
-		  addr = plus_constant (Pmode, base_reg, *first_offset);
-		  addr = gen_rtx_PRE_MODIFY (Pmode, base_reg, addr);
-		  *first_offset = 0;
-		}
-	      else
-		{
-		  gcc_assert (SMALL_INT (offset));
-		  addr = plus_constant (Pmode, base_reg, offset);
-		}
-	      mem = gen_frame_mem (mode, addr);
-	      if (epilogue_p)
-		{
-		  rtx insn =
-		    frame_move_inc (reg, mem, base_reg, addr);
-		  add_reg_note (insn, REG_CFA_RESTORE, reg);
-		  if (cfa_adjust)
-		    {
-		      enum reg_note note = REG_CFA_ADJUST_CFA;
-		      add_reg_note (insn, note,
-				    gen_rtx_SET (stack_pointer_rtx,
-						 plus_constant (Pmode,
-								stack_pointer_rtx,
-								cfa_adjust)));
-		    }
-		}
-	      else
-		frame_move_inc (mem, reg, base_reg, addr);
-	      offset += UNITS_PER_WORD;
-	      if (mode == DImode)
-		{
-		  offset += UNITS_PER_WORD;
-		  ++regno;
-		}
-	    } /* if */
-	} /* for */
-    }/* if */
-  if (sibthunk_insn)
-    {
-      int start_call = frame->millicode_start_reg;
-      int end_call = frame->millicode_end_reg;
-      int r;
-
-      rtx r12 = gen_rtx_REG (Pmode, 12);
-
-      frame_insn (gen_rtx_SET (r12, GEN_INT (offset)));
-      XVECEXP (sibthunk_insn, 0, 0) = ret_rtx;
-      XVECEXP (sibthunk_insn, 0, 1)
-	= gen_rtx_SET (stack_pointer_rtx,
-		       gen_rtx_PLUS (Pmode, stack_pointer_rtx, r12));
-      sibthunk_insn = emit_jump_insn (sibthunk_insn);
-      RTX_FRAME_RELATED_P (sibthunk_insn) = 1;
-
-      /* Would be nice if we could do this earlier, when the PARALLEL
-	 is populated, but these need to be attached after the
-	 emit.  */
-      for (r = start_call; r <= end_call; r++)
-	{
-	  rtx reg = gen_rtx_REG (SImode, r);
-	  add_reg_note (sibthunk_insn, REG_CFA_RESTORE, reg);
-	}
-    }
-} /* arc_save_restore */
-
 /* Build dwarf information when the context is saved via AUX_IRQ_CTRL
    mechanism.  */
 
@@ -3193,6 +3005,680 @@ arc_dwarf_emit_irq_save_regs (void)
   RTX_FRAME_RELATED_P (insn) = 1;
 }
 
+static int
+frame_save_reg (rtx reg, HOST_WIDE_INT offset)
+{
+  rtx addr;
+
+  if (offset)
+    {
+      rtx tmp = plus_constant (Pmode, stack_pointer_rtx,
+			       offset - GET_MODE_SIZE (GET_MODE (reg)));
+      addr = gen_frame_mem (GET_MODE (reg),
+			    gen_rtx_PRE_MODIFY (Pmode,
+						stack_pointer_rtx,
+						tmp));
+    }
+  else
+    addr = gen_frame_mem (GET_MODE (reg), gen_rtx_PRE_DEC (Pmode,
+							   stack_pointer_rtx));
+  frame_move_inc (addr, reg, stack_pointer_rtx, 0);
+
+  return GET_MODE_SIZE (GET_MODE (reg)) - offset;
+}
+
+static int
+frame_restore_reg (rtx reg, HOST_WIDE_INT offset)
+{
+  rtx addr, insn;
+
+  if (offset)
+    {
+      rtx tmp = plus_constant (Pmode, stack_pointer_rtx,
+			       offset + GET_MODE_SIZE (GET_MODE (reg)));
+      addr = gen_frame_mem (GET_MODE (reg),
+			    gen_rtx_POST_MODIFY (Pmode,
+						 stack_pointer_rtx,
+						 tmp));
+    }
+  else
+    addr = gen_frame_mem (GET_MODE (reg), gen_rtx_POST_INC (Pmode,
+							    stack_pointer_rtx));
+  insn = frame_move_inc (reg, addr, stack_pointer_rtx, 0);
+  add_reg_note (insn, REG_CFA_RESTORE, reg);
+
+  if (reg == frame_pointer_rtx)
+    add_reg_note (insn, REG_CFA_DEF_CFA,
+		  plus_constant (Pmode, stack_pointer_rtx,
+				 GET_MODE_SIZE (GET_MODE (reg)) + offset));
+  else
+    add_reg_note (insn, REG_CFA_ADJUST_CFA,
+		  gen_rtx_SET (stack_pointer_rtx,
+			       plus_constant (Pmode, stack_pointer_rtx,
+					      GET_MODE_SIZE (GET_MODE (reg))
+					      + offset)));
+
+  return GET_MODE_SIZE (GET_MODE (reg)) + offset;
+}
+
+/* Check if we have a continous range to be save/restored with the
+   help of enter/leave instructions.  */
+
+static bool
+arc_enter_leave_p (unsigned int gmask)
+{
+  int start_reg = 13;
+  int end_reg = 26;
+  int regno;
+  unsigned int rmask = 0;
+
+  if (!gmask)
+    return false;
+
+  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno)); regno++)
+    rmask |= 1L << regno;
+
+  if (rmask ^ gmask)
+    return false;
+
+  return true;
+}
+
+/* ARC's prologue, save any needed call-saved regs (and call-used if
+   this is an interrupt handler) for ARCompact ISA, using ST/STD
+   instructions. */
+
+static int
+arc_save_callee_saves (unsigned int gmask,
+		       bool save_blink,
+		       bool save_fp,
+		       HOST_WIDE_INT offset)
+{
+  rtx reg;
+  int frame_allocated = 0;
+
+  /* The home-grown ABI says link register is saved first.  */
+  if (save_blink)
+    {
+      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+      frame_allocated += frame_save_reg (reg, offset);
+      offset = 0;
+    }
+
+  /* N.B. FRAME_POINTER_MASK and RETURN_ADDR_MASK are cleared in gmask.  */
+  if (gmask)
+    for (int i = 31; i >= 0; i--)
+      {
+	machine_mode save_mode = SImode;
+
+	if (TARGET_LL64
+	    && ((i - 1) % 2 == 0)
+	    && ((gmask & (1L << i)) != 0)
+	    && ((gmask & (1L << (i - 1))) != 0))
+	  {
+	    save_mode = DImode;
+	    --i;
+	  }
+	else if ((gmask & (1L << i)) == 0)
+	  continue;
+
+	reg = gen_rtx_REG (save_mode, i);
+	frame_allocated += frame_save_reg (reg, offset);
+	offset = 0;
+      }
+
+  /* Save frame pointer if needed.  First save the FP on stack, if not
+     autosaved.  Unfortunately, I cannot add it to gmask and use the
+     above loop to save fp because our ABI states fp goes aftert all
+     registers are saved.  */
+  if (save_fp)
+    {
+      frame_allocated += frame_save_reg (frame_pointer_rtx, offset);
+      offset = 0;
+    }
+
+  /* Emit mov fp,sp.  */
+  if (arc_frame_pointer_needed ())
+    frame_move (frame_pointer_rtx, stack_pointer_rtx);
+
+  return frame_allocated;
+}
+
+static int
+arc_restore_callee_saves (unsigned int gmask,
+			  bool restore_blink,
+			  bool restore_fp,
+			  HOST_WIDE_INT offset,
+			  HOST_WIDE_INT allocated)
+{
+  rtx reg;
+  int frame_deallocated = 0;
+
+  /* Emit mov fp,sp.  */
+  if (arc_frame_pointer_needed () && offset)
+    {
+      frame_move (stack_pointer_rtx, frame_pointer_rtx);
+      frame_deallocated += offset;
+      offset = 0;
+    }
+
+  if (restore_fp)
+    {
+      /* Any offset is taken care by previous if-statement.  */
+      gcc_assert (offset == 0);
+      frame_deallocated += frame_restore_reg (frame_pointer_rtx, 0);
+    }
+
+  if (offset)
+    {
+      /* No, fp involved, hence, we need to do an add to set the sp to
+	 where first registers are.  */
+      frame_stack_add (offset);
+      frame_deallocated += offset;
+      offset = 0;
+    }
+
+  /* N.B. FRAME_POINTER_MASK and RETURN_ADDR_MASK are cleared in gmask.  */
+  if (gmask)
+    for (int i = 0; i <= 31; i++)
+      {
+	machine_mode restore_mode = SImode;
+
+	if (TARGET_LL64
+	    && ((i % 2) == 0)
+	    && ((gmask & (1L << i)) != 0)
+	    && ((gmask & (1L << (i + 1))) != 0))
+	  {
+	    restore_mode = DImode;
+	  }
+	else if ((gmask & (1L << i)) == 0)
+	  continue;
+
+	reg = gen_rtx_REG (restore_mode, i);
+	frame_deallocated += frame_restore_reg (reg, 0);
+	offset = 0;
+
+	if (restore_mode == DImode)
+	  i++;
+      }
+
+  if (restore_blink)
+    {
+      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+      frame_deallocated += frame_restore_reg (reg, allocated
+					      - frame_deallocated
+					      /* Consider as well the
+						 current restored
+						 register size.*/
+					      - UNITS_PER_WORD);
+    }
+
+  return frame_deallocated;
+}
+
+/* ARC prologue, save the registers using enter instruction.  */
+
+static int
+arc_save_callee_enter (unsigned int gmask,
+		       bool save_blink,
+		       bool save_fp,
+		       HOST_WIDE_INT offset)
+{
+  int start_reg = 13;
+  int end_reg = 26;
+  int regno, indx, off, nregs;
+  rtx insn, reg, mem;
+  int frame_allocated = 0;
+
+  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
+    regno++;
+
+  end_reg = regno - 1;
+  nregs = end_reg - start_reg + 1;
+  nregs += save_blink ? 1 : 0;
+  nregs += save_fp ? 1 : 0;
+
+  if (offset)
+    frame_stack_add (offset);
+
+  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + (save_fp ? 1 : 0)
+						  + 1));
+  indx = 0;
+
+  reg = gen_rtx_SET (stack_pointer_rtx,
+		     plus_constant (Pmode,
+				    stack_pointer_rtx,
+				    nregs * UNITS_PER_WORD));
+  RTX_FRAME_RELATED_P (reg) = 1;
+  XVECEXP (insn, 0, indx++) = reg;
+  off = nregs * UNITS_PER_WORD;
+
+  if (save_blink)
+    {
+      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
+						 stack_pointer_rtx,
+						 off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, reg);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
+      off -= UNITS_PER_WORD;
+      save_blink = false;
+    }
+
+  for (regno = start_reg;
+       regno <= end_reg;
+       regno++, indx++, off -= UNITS_PER_WORD)
+    {
+      reg = gen_rtx_REG (SImode, regno);
+      mem = gen_frame_mem (SImode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, reg);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
+      gmask = gmask & ~(1L << regno);
+    }
+
+  if (save_fp)
+    {
+      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
+						 stack_pointer_rtx,
+						 off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, frame_pointer_rtx);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
+      off -= UNITS_PER_WORD;
+
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (frame_pointer_rtx,
+					     stack_pointer_rtx);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
+      save_fp = false;
+    }
+
+  gcc_assert (off == 0);
+  insn = frame_insn (insn);
+
+  add_reg_note (insn, REG_INC, stack_pointer_rtx);
+
+  frame_allocated = nregs * UNITS_PER_WORD;
+
+  /* offset is a negative number, make sure we add it.  */
+  return frame_allocated - offset;
+}
+
+static int
+arc_restore_callee_leave (unsigned int gmask,
+			  bool restore_blink,
+			  bool restore_fp,
+			  bool return_p,
+			  HOST_WIDE_INT offset)
+{
+  int start_reg = 13;
+  int end_reg = 26;
+  int regno, indx, off, nregs;
+  rtx insn, reg, mem;
+  int frame_allocated = 0;
+
+  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
+    regno++;
+
+  end_reg = regno - 1;
+  nregs = end_reg - start_reg + 1;
+  nregs += restore_blink ? 1 : 0;
+  nregs += restore_fp ? 1 : 0;
+
+  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1
+						  + (return_p ? 1 : 0)));
+  indx = 0;
+
+  if (return_p)
+    XVECEXP (insn, 0, indx++) = ret_rtx;
+
+  if (restore_fp)
+    {
+      /* I cannot emit set (sp, fp) here as cselib expects a single sp
+	 set and not two.  Thus, use the offset, and change sp adjust
+	 value.  */
+      frame_allocated += offset;
+    }
+
+  if (offset && !restore_fp)
+    {
+      /* This add is only emmited when we do not restore fp with leave
+	 instruction.  */
+      frame_stack_add (offset);
+      frame_allocated += offset;
+      offset = 0;
+    }
+
+  reg = gen_rtx_SET (stack_pointer_rtx,
+		     plus_constant (Pmode,
+				    stack_pointer_rtx,
+				    offset + nregs * UNITS_PER_WORD));
+  RTX_FRAME_RELATED_P (reg) = 1;
+  XVECEXP (insn, 0, indx++) = reg;
+  off = nregs * UNITS_PER_WORD;
+
+  if (restore_blink)
+    {
+      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
+						 stack_pointer_rtx,
+						 off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (reg, mem);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
+      off -= UNITS_PER_WORD;
+    }
+
+  for (regno = start_reg;
+       regno <= end_reg;
+       regno++, indx++, off -= UNITS_PER_WORD)
+    {
+      reg = gen_rtx_REG (SImode, regno);
+      mem = gen_frame_mem (SImode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (reg, mem);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
+      gmask = gmask & ~(1L << regno);
+    }
+
+  if (restore_fp)
+    {
+      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
+						 stack_pointer_rtx,
+						 off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (frame_pointer_rtx, mem);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
+      off -= UNITS_PER_WORD;
+    }
+
+  gcc_assert (off == 0);
+  if (return_p)
+    {
+      insn = emit_jump_insn (insn);
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+  else
+    insn = frame_insn (insn);
+
+  add_reg_note (insn, REG_INC, stack_pointer_rtx);
+
+  /* Dwarf related info.  */
+  if (restore_fp)
+    {
+      add_reg_note (insn, REG_CFA_RESTORE, frame_pointer_rtx);
+      add_reg_note (insn, REG_CFA_DEF_CFA,
+		    plus_constant (Pmode, stack_pointer_rtx,
+				   offset + nregs * UNITS_PER_WORD));
+    }
+  else
+    {
+      add_reg_note (insn, REG_CFA_ADJUST_CFA,
+		    gen_rtx_SET (stack_pointer_rtx,
+				 plus_constant (Pmode, stack_pointer_rtx,
+						nregs * UNITS_PER_WORD)));
+    }
+  if (restore_blink)
+    add_reg_note (insn, REG_CFA_RESTORE,
+		  gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM));
+  for (regno = start_reg; regno <= end_reg; regno++)
+    add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (SImode, regno));
+
+  frame_allocated += nregs * UNITS_PER_WORD;
+
+  return frame_allocated;
+}
+
+/* Millicode thunks implementation:
+   Generates calls to millicodes for registers starting from r13 to r25
+   Present Limitations:
+   - Only one range supported. The remaining regs will have the ordinary
+   st and ld instructions for store and loads. Hence a gmask asking
+   to store r13-14, r16-r25 will only generate calls to store and
+   load r13 to r14 while store and load insns will be generated for
+   r16 to r25 in the prologue and epilogue respectively.
+
+   - Presently library only supports register ranges starting from r13.
+*/
+
+static int
+arc_save_callee_milli (unsigned int gmask,
+		       bool save_blink,
+		       bool save_fp,
+		       HOST_WIDE_INT offset,
+		       HOST_WIDE_INT reg_size)
+{
+  int start_reg = 13;
+  int end_reg = 25;
+  int regno, indx, off, nregs;
+  rtx insn, reg, mem;
+  int frame_allocated = 0;
+
+  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
+    regno++;
+
+  end_reg = regno - 1;
+  nregs = end_reg - start_reg + 1;
+  gcc_assert (end_reg > 14);
+
+
+  /* Allocate space on stack for the registers, and take into account
+     also the initial offset.  The registers will be saved using
+     offsets.  N.B. OFFSET is a negative number.  */
+  if (save_blink)
+    {
+      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+      frame_allocated += frame_save_reg (reg, offset);
+      offset = 0;
+    }
+
+  if (reg_size || offset)
+    {
+      frame_stack_add (offset - reg_size);
+      frame_allocated += nregs * UNITS_PER_WORD - offset;
+      offset = 0;
+    }
+
+  /* Start generate millicode call.  */
+  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1));
+  indx = 0;
+
+  /* This is a call, we clobber blink.  */
+  XVECEXP (insn, 0, nregs) =
+    gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM));
+
+  for (regno = start_reg, indx = 0, off = 0;
+       regno <= end_reg;
+       regno++, indx++, off += UNITS_PER_WORD)
+    {
+      reg = gen_rtx_REG (SImode, regno);
+      mem = gen_frame_mem (SImode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, reg);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
+      gmask = gmask & ~(1L << regno);
+    }
+  insn = frame_insn (insn);
+
+  /* Add DWARF info.  */
+  for (regno = start_reg, off = 0;
+       regno <= end_reg;
+       regno++, off += UNITS_PER_WORD)
+    {
+      reg = gen_rtx_REG (SImode, regno);
+      mem = gen_rtx_MEM (SImode, plus_constant (Pmode,
+						stack_pointer_rtx, off));
+      add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
+
+    }
+
+  /* In the case of millicode thunk, we need to restore the
+     clobbered blink register.  */
+  if (arc_must_save_return_addr (cfun))
+    {
+      emit_insn (gen_rtx_SET (gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM),
+			      gen_rtx_MEM (Pmode,
+					   plus_constant (Pmode,
+							  stack_pointer_rtx,
+							  reg_size))));
+    }
+
+  /* Save remaining registers using st instructions.  */
+  for (regno = 0; regno <= 31; regno++)
+    {
+      if ((gmask & (1L << regno)) == 0)
+	continue;
+
+      reg = gen_rtx_REG (SImode, regno);
+      mem = gen_frame_mem (SImode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  off));
+      frame_move_inc (mem, reg, stack_pointer_rtx, 0);
+      frame_allocated += UNITS_PER_WORD;
+      off += UNITS_PER_WORD;
+    }
+
+  /* Save frame pointer if needed.  First save the FP on stack, if not
+     autosaved.  Unfortunately, I cannot add it to gmask and use the
+     above loop to save fp because our ABI states fp goes aftert all
+     registers are saved.  */
+  if (save_fp)
+    frame_allocated += frame_save_reg (frame_pointer_rtx, offset);
+
+  /* Emit mov fp,sp.  */
+  if (arc_frame_pointer_needed ())
+    frame_move (frame_pointer_rtx, stack_pointer_rtx);
+
+  return frame_allocated;
+}
+
+/* Like the previous function but restore.  */
+
+static int
+arc_restore_callee_milli (unsigned int gmask,
+			  bool restore_blink,
+			  bool restore_fp,
+			  bool return_p,
+			  HOST_WIDE_INT offset)
+{
+  int start_reg = 13;
+  int end_reg = 25;
+  int regno, indx, off, nregs;
+  rtx insn, reg, mem;
+  int frame_allocated = 0;
+
+  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
+    regno++;
+
+  end_reg = regno - 1;
+  nregs = end_reg - start_reg + 1;
+  gcc_assert (end_reg > 14);
+
+  /* Emit mov fp,sp.  */
+  if (arc_frame_pointer_needed () && offset)
+    {
+      frame_move (stack_pointer_rtx, frame_pointer_rtx);
+      frame_allocated = offset;
+      offset = 0;
+    }
+
+  if (restore_fp)
+    frame_allocated += frame_restore_reg (frame_pointer_rtx, 0);
+
+  if (offset)
+    {
+      /* No fp involved, hence, we need to adjust the sp via an
+	 add.  */
+      frame_stack_add (offset);
+      frame_allocated += offset;
+      offset = 0;
+    }
+
+  /* Start generate millicode call.  */
+  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc ((return_p ? 1 : 0)
+						  + nregs + 1));
+  indx = 0;
+
+  if (return_p)
+    {
+      /* sibling call, the blink is restored with the help of the
+	 value held into r12.  */
+      reg = gen_rtx_REG (Pmode, 12);
+      XVECEXP (insn, 0, indx++) = ret_rtx;
+      XVECEXP (insn, 0, indx++) =
+	gen_rtx_SET (stack_pointer_rtx,
+		     gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg));
+      frame_allocated += UNITS_PER_WORD;
+    }
+  else
+    {
+      /* This is a call, we clobber blink.  */
+      XVECEXP (insn, 0, nregs) =
+	gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM));
+    }
+
+  for (regno = start_reg, off = 0;
+       regno <= end_reg;
+       regno++, indx++, off += UNITS_PER_WORD)
+    {
+      reg = gen_rtx_REG (SImode, regno);
+      mem = gen_frame_mem (SImode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  off));
+      XVECEXP (insn, 0, indx) = gen_rtx_SET (reg, mem);
+      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
+      gmask = gmask & ~(1L << regno);
+    }
+
+  /* Restore remaining registers using LD instructions.  */
+  for (regno = 0; regno <= 31; regno++)
+    {
+      if ((gmask & (1L << regno)) == 0)
+	continue;
+
+      reg = gen_rtx_REG (SImode, regno);
+      mem = gen_frame_mem (SImode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  off));
+      rtx tmp = frame_move_inc (reg, mem, stack_pointer_rtx, 0);
+      add_reg_note (tmp, REG_CFA_RESTORE, reg);
+      off += UNITS_PER_WORD;
+    }
+
+  /* Emit millicode call.  */
+  if (return_p)
+    {
+      reg = gen_rtx_REG (Pmode, 12);
+      frame_insn (gen_rtx_SET (reg, GEN_INT (off)));
+      frame_allocated += off;
+      insn = emit_jump_insn (insn);
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+  else
+    insn = frame_insn (insn);
+
+  /* Add DWARF info.  */
+  for (regno = start_reg, off = 0;
+       regno <= end_reg;
+       regno++, off += UNITS_PER_WORD)
+    {
+      reg = gen_rtx_REG (SImode, regno);
+      add_reg_note (insn, REG_CFA_RESTORE, reg);
+
+    }
+
+  if (restore_blink && !return_p)
+    {
+      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+      mem = gen_frame_mem (Pmode, plus_constant (Pmode, stack_pointer_rtx,
+						 off));
+      insn = frame_insn (gen_rtx_SET (reg, mem));
+      add_reg_note (insn, REG_CFA_RESTORE, reg);
+    }
+
+  return frame_allocated;
+}
+
 /* Set up the stack and frame pointer (if desired) for the function.  */
 
 void
@@ -3200,13 +3686,12 @@ arc_expand_prologue (void)
 {
   int size;
   unsigned int gmask = cfun->machine->frame_info.gmask;
-  /*  unsigned int frame_pointer_offset;*/
+  struct arc_frame_info *frame = &cfun->machine->frame_info;
   unsigned int frame_size_to_allocate;
-  /* (FIXME: The first store will use a PRE_MODIFY; this will usually be r13.
-     Change the stack layout so that we rather store a high register with the
-     PRE_MODIFY, thus enabling more short insn generation.)  */
   int first_offset = 0;
   unsigned int fn_type = arc_compute_function_type (cfun);
+  bool save_blink = false;
+  bool save_fp = false;
 
   /* Naked functions don't have prologue.  */
   if (ARC_NAKED_P (fn_type))
@@ -3229,87 +3714,42 @@ arc_expand_prologue (void)
   gcc_assert (!(size == 0 && gmask));
 
   /* Allocate space for register arguments if this is a variadic function.  */
-  if (cfun->machine->frame_info.pretend_size != 0)
-    {
-       /* Ensure pretend_size is maximum of 8 * word_size.  */
-      gcc_assert (cfun->machine->frame_info.pretend_size <= 32);
-
-      frame_stack_add (-(HOST_WIDE_INT)cfun->machine->frame_info.pretend_size);
-      frame_size_to_allocate -= cfun->machine->frame_info.pretend_size;
-    }
+  if (frame->pretend_size != 0)
+    first_offset = -frame->pretend_size;
 
   /* IRQ using automatic save mechanism will save the register before
      anything we do.  */
   if (ARC_AUTO_IRQ_P (fn_type)
       && !ARC_FAST_INTERRUPT_P (fn_type))
     {
-      arc_dwarf_emit_irq_save_regs ();
-    }
-
-  /* The home-grown ABI says link register is saved first.  */
-  if (arc_must_save_return_addr (cfun)
-      && !ARC_AUTOBLINK_IRQ_P (fn_type))
-    {
-      rtx ra = gen_rtx_REG (SImode, RETURN_ADDR_REGNUM);
-      rtx mem = gen_frame_mem (Pmode,
-			       gen_rtx_PRE_DEC (Pmode,
-						stack_pointer_rtx));
-
-      frame_move_inc (mem, ra, stack_pointer_rtx, 0);
-      frame_size_to_allocate -= UNITS_PER_WORD;
-    }
-
-  /* Save any needed call-saved regs (and call-used if this is an
-     interrupt handler) for ARCompact ISA.  */
-  if (cfun->machine->frame_info.reg_size)
-    {
-      first_offset = -cfun->machine->frame_info.reg_size;
-      /* N.B. FRAME_POINTER_MASK and RETURN_ADDR_MASK are cleared in gmask.  */
-      arc_save_restore (stack_pointer_rtx, gmask, 0, &first_offset);
-      frame_size_to_allocate -= cfun->machine->frame_info.reg_size;
-    }
-
-  /* In the case of millicode thunk, we need to restore the clobbered
-     blink register.  */
-  if (cfun->machine->frame_info.millicode_end_reg > 0
-      && arc_must_save_return_addr (cfun))
-    {
-      HOST_WIDE_INT tmp = cfun->machine->frame_info.reg_size;
-      emit_insn (gen_rtx_SET (gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM),
-			      gen_rtx_MEM (Pmode,
-					   plus_constant (Pmode,
-							  stack_pointer_rtx,
-							  tmp))));
-    }
-
-  /* Save frame pointer if needed.  First save the FP on stack, if not
-     autosaved.  */
-  if (arc_frame_pointer_needed ()
-      && !ARC_AUTOFP_IRQ_P (fn_type))
-    {
-      rtx addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
-			       GEN_INT (-UNITS_PER_WORD + first_offset));
-      rtx mem = gen_frame_mem (Pmode, gen_rtx_PRE_MODIFY (Pmode,
-							  stack_pointer_rtx,
-							  addr));
-      frame_move_inc (mem, frame_pointer_rtx, stack_pointer_rtx, 0);
-      frame_size_to_allocate -= UNITS_PER_WORD;
+      frame_stack_add (first_offset);
       first_offset = 0;
+      arc_dwarf_emit_irq_save_regs ();
     }
 
-  /* Emit mov fp,sp.  */
-  if (arc_frame_pointer_needed ())
-    {
-      frame_move (frame_pointer_rtx, stack_pointer_rtx);
-    }
-
-  /* ??? We don't handle the case where the saved regs are more than 252
-     bytes away from sp.  This can be handled by decrementing sp once, saving
-     the regs, and then decrementing it again.  The epilogue doesn't have this
-     problem as the `ld' insn takes reg+limm values (though it would be more
-     efficient to avoid reg+limm).  */
+  save_blink = arc_must_save_return_addr (cfun)
+    && !ARC_AUTOBLINK_IRQ_P (fn_type);
+  save_fp = arc_frame_pointer_needed () && !ARC_AUTOFP_IRQ_P (fn_type);
+
+  /* Use enter/leave only for non-interrupt functions.  */
+  if (TARGET_CODE_DENSITY
+      && TARGET_CODE_DENSITY_FRAME
+      && !ARC_AUTOFP_IRQ_P (fn_type)
+      && !ARC_AUTOBLINK_IRQ_P (fn_type)
+      && !ARC_INTERRUPT_P (fn_type)
+      && arc_enter_leave_p (gmask))
+      frame_size_to_allocate -= arc_save_callee_enter (gmask, save_blink,
+						       save_fp,
+						       first_offset);
+  else if (frame->millicode_end_reg > 14)
+    frame_size_to_allocate -= arc_save_callee_milli (gmask, save_blink,
+						     save_fp,
+						     first_offset,
+						     frame->reg_size);
+  else
+    frame_size_to_allocate -= arc_save_callee_saves (gmask, save_blink, save_fp,
+						     first_offset);
 
-  frame_size_to_allocate -= first_offset;
   /* Allocate the stack frame.  */
   if (frame_size_to_allocate > 0)
     {
@@ -3318,8 +3758,7 @@ arc_expand_prologue (void)
 	 will prevent the scheduler from moving stores to the frame
 	 before the stack adjustment.  */
       if (arc_frame_pointer_needed ())
-	emit_insn (gen_stack_tie (stack_pointer_rtx,
-				  hard_frame_pointer_rtx));
+	emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
     }
 }
 
@@ -3331,170 +3770,71 @@ arc_expand_epilogue (int sibcall_p)
 {
   int size;
   unsigned int fn_type = arc_compute_function_type (cfun);
-
-  size = arc_compute_frame_size ();
-
-  unsigned int pretend_size = cfun->machine->frame_info.pretend_size;
-  unsigned int frame_size;
   unsigned int size_to_deallocate;
   int restored;
   int can_trust_sp_p = !cfun->calls_alloca;
-  int first_offset = 0;
-  int millicode_p = cfun->machine->frame_info.millicode_end_reg > 0;
-  rtx insn;
+  int first_offset;
+  bool restore_fp = arc_frame_pointer_needed () && !ARC_AUTOFP_IRQ_P (fn_type);
+  bool restore_blink = arc_must_save_return_addr (cfun)
+    && !ARC_AUTOBLINK_IRQ_P (fn_type);
+  unsigned int gmask = cfun->machine->frame_info.gmask;
+  bool return_p = (!sibcall_p && fn_type == ARC_FUNCTION_NORMAL
+		   && !cfun->machine->frame_info.pretend_size);
+  struct arc_frame_info *frame = &cfun->machine->frame_info;
+
 
   /* Naked functions don't have epilogue.  */
   if (ARC_NAKED_P (fn_type))
     return;
 
+  size = arc_compute_frame_size ();
   size_to_deallocate = size;
 
-  frame_size = size - (pretend_size +
-		       cfun->machine->frame_info.reg_size +
-		       cfun->machine->frame_info.extra_size);
-
-  /* ??? There are lots of optimizations that can be done here.
-     EG: Use fp to restore regs if it's closer.
-     Maybe in time we'll do them all.  For now, always restore regs from
-     sp, but don't restore sp if we don't have to.  */
+  first_offset = size - (frame->pretend_size + frame->reg_size
+			 + frame->extra_size);
 
   if (!can_trust_sp_p)
     gcc_assert (arc_frame_pointer_needed ());
 
-  /* Restore stack pointer to the beginning of saved register area for
-     ARCompact ISA.  */
-  if (frame_size)
-    {
-      if (arc_frame_pointer_needed ())
-	frame_move (stack_pointer_rtx, frame_pointer_rtx);
-      else
-	first_offset = frame_size;
-      size_to_deallocate -= frame_size;
-    }
-  else if (!can_trust_sp_p)
-    frame_stack_add (-frame_size);
-
-
-  /* Restore any saved registers.  */
-  if (arc_frame_pointer_needed ()
-      && !ARC_AUTOFP_IRQ_P (fn_type))
-    {
-      rtx addr = gen_rtx_POST_INC (Pmode, stack_pointer_rtx);
-
-      insn = frame_move_inc (frame_pointer_rtx, gen_frame_mem (Pmode, addr),
-			     stack_pointer_rtx, 0);
-      add_reg_note (insn, REG_CFA_RESTORE, frame_pointer_rtx);
-      add_reg_note (insn, REG_CFA_DEF_CFA,
-		    plus_constant (SImode, stack_pointer_rtx,
-				   4));
-      size_to_deallocate -= UNITS_PER_WORD;
-    }
-
-  /* Load blink after the calls to thunk calls in case of optimize size.  */
-  if (millicode_p)
-    {
-	  int sibthunk_p = (!sibcall_p
-			    && fn_type == ARC_FUNCTION_NORMAL
-			    && !cfun->machine->frame_info.pretend_size);
-
-	  gcc_assert (!(cfun->machine->frame_info.gmask
-			& (FRAME_POINTER_MASK | RETURN_ADDR_MASK)));
-	  arc_save_restore (stack_pointer_rtx,
-			    cfun->machine->frame_info.gmask,
-			    1 + sibthunk_p, &first_offset);
-	  if (sibthunk_p)
-	    return;
-    }
-  /* If we are to restore registers, and first_offset would require
-     a limm to be encoded in a PRE_MODIFY, yet we can add it with a
-     fast add to the stack pointer, do this now.  */
-  if ((!SMALL_INT (first_offset)
-       && cfun->machine->frame_info.gmask
-       && ((TARGET_ARC700 && !optimize_size)
-	    ? first_offset <= 0x800
-	    : satisfies_constraint_C2a (GEN_INT (first_offset))))
-       /* Also do this if we have both gprs and return
-	  address to restore, and they both would need a LIMM.  */
-      || (arc_must_save_return_addr (cfun)
-	  && !SMALL_INT ((cfun->machine->frame_info.reg_size + first_offset) >> 2)
-	  && cfun->machine->frame_info.gmask))
-    {
-      frame_stack_add (first_offset);
-      first_offset = 0;
+  if (TARGET_CODE_DENSITY
+      && TARGET_CODE_DENSITY_FRAME
+      && !ARC_AUTOFP_IRQ_P (fn_type)
+      && !ARC_AUTOBLINK_IRQ_P (fn_type)
+      && !ARC_INTERRUPT_P (fn_type)
+      && arc_enter_leave_p (gmask))
+    {
+      /* Using the leave instruction.  */
+      size_to_deallocate -= arc_restore_callee_leave (gmask, restore_blink,
+						      restore_fp,
+						      return_p,
+						      first_offset);
+      if (return_p)
+	{
+	  gcc_assert (size_to_deallocate == 0);
+	  return;
+	}
     }
-  if (arc_must_save_return_addr (cfun)
-      && !ARC_AUTOBLINK_IRQ_P (fn_type))
+  else if (frame->millicode_end_reg > 14)
     {
-      rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
-      int ra_offs = cfun->machine->frame_info.reg_size + first_offset;
-      rtx addr = plus_constant (Pmode, stack_pointer_rtx, ra_offs);
-      HOST_WIDE_INT cfa_adjust = 0;
-
-      /* If the load of blink would need a LIMM, but we can add
-	 the offset quickly to sp, do the latter.  */
-      if (!SMALL_INT (ra_offs >> 2)
-	  && !cfun->machine->frame_info.gmask
-	  && ((TARGET_ARC700 && !optimize_size)
-	       ? ra_offs <= 0x800
-	       : satisfies_constraint_C2a (GEN_INT (ra_offs))))
-	{
-	   size_to_deallocate -= ra_offs - first_offset;
-	   first_offset = 0;
-	   frame_stack_add (ra_offs);
-	   ra_offs = 0;
-	   addr = stack_pointer_rtx;
-	}
-      /* See if we can combine the load of the return address with the
-	 final stack adjustment.
-	 We need a separate load if there are still registers to
-	 restore.  We also want a separate load if the combined insn
-	 would need a limm, but a separate load doesn't.  */
-      if (ra_offs
-	  && !cfun->machine->frame_info.gmask
-	  && (SMALL_INT (ra_offs) || !SMALL_INT (ra_offs >> 2)))
-	{
-	  addr = gen_rtx_PRE_MODIFY (Pmode, stack_pointer_rtx, addr);
-	  cfa_adjust = ra_offs;
-	  first_offset = 0;
-	  size_to_deallocate -= cfun->machine->frame_info.reg_size;
-	}
-      else if (!ra_offs && size_to_deallocate == UNITS_PER_WORD)
+      /* Using millicode calls.  */
+      size_to_deallocate -= arc_restore_callee_milli (gmask, restore_blink,
+						      restore_fp,
+						      return_p,
+						      first_offset);
+      if (return_p)
 	{
-	  addr = gen_rtx_POST_INC (Pmode, addr);
-	  cfa_adjust = GET_MODE_SIZE (Pmode);
-	  size_to_deallocate = 0;
-	}
-
-      insn = frame_move_inc (ra, gen_frame_mem (Pmode, addr),
-			     stack_pointer_rtx, addr);
-      if (cfa_adjust)
-	{
-	  enum reg_note note = REG_CFA_ADJUST_CFA;
-
-	  add_reg_note (insn, note,
-			gen_rtx_SET (stack_pointer_rtx,
-				     plus_constant (SImode, stack_pointer_rtx,
-						    cfa_adjust)));
+	  gcc_assert (size_to_deallocate == 0);
+	  return;
 	}
-      add_reg_note (insn, REG_CFA_RESTORE, ra);
     }
+  else
+    size_to_deallocate -= arc_restore_callee_saves (gmask, restore_blink,
+						    restore_fp,
+						    first_offset,
+						    size_to_deallocate);
 
-  if (!millicode_p)
-    {
-       if (cfun->machine->frame_info.reg_size)
-	 arc_save_restore (stack_pointer_rtx,
-	   /* The zeroing of these two bits is unnecessary, but leave this in for clarity.  */
-			   cfun->machine->frame_info.gmask
-			   & ~(FRAME_POINTER_MASK | RETURN_ADDR_MASK), 1, &first_offset);
-    }
-
-  /* The rest of this function does the following:
-     ARCompact    : handle epilogue_delay, restore sp (phase-2), return
-  */
-
-  /* Keep track of how much of the stack pointer we've restored.
-     It makes the following a lot more readable.  */
-  size_to_deallocate += first_offset;
+  /* Keep track of how much of the stack pointer we've restored.  It
+     makes the following a lot more readable.  */
   restored = size - size_to_deallocate;
 
   if (size > restored)
@@ -3517,6 +3857,62 @@ arc_expand_epilogue (int sibcall_p)
     emit_jump_insn (gen_simple_return ());
 }
 
+
+bool
+arc_check_multi (rtx op, bool push_p)
+{
+  HOST_WIDE_INT len = XVECLEN (op, 0);
+  unsigned int regno, i, start;
+  unsigned int memp = push_p ? 0 : 1;
+  rtx elt;
+
+  if (len <= 1)
+    return false;
+
+  start = 1;
+  elt = XVECEXP (op, 0, 0);
+  if (!push_p && GET_CODE (elt) == RETURN)
+    start = 2;
+
+  for (i = start, regno = 13; i < len; i++, regno++)
+    {
+      rtx elt = XVECEXP (op, 0, i);
+      rtx reg, mem, addr;
+
+      if (GET_CODE (elt) != SET)
+	return false;
+      mem = XEXP (elt, memp);
+      reg = XEXP (elt, 1 - memp);
+
+      if (!REG_P (reg)
+	  || !MEM_P (mem))
+	return false;
+
+      /* Check for blink.  */
+      if (REGNO (reg) == RETURN_ADDR_REGNUM
+	  && i == start)
+	regno = 12;
+      else if (REGNO (reg) == FRAME_POINTER_REGNUM)
+	++i;
+      else if (REGNO (reg) != regno)
+	return false;
+
+      addr = XEXP (mem, 0);
+      if (GET_CODE (addr) == PLUS)
+	{
+	  if (!rtx_equal_p (stack_pointer_rtx, XEXP (addr, 0))
+	      || !CONST_INT_P (XEXP (addr, 1)))
+	    return false;
+	}
+      else
+	{
+	  if (!rtx_equal_p (stack_pointer_rtx, addr))
+	    return false;
+	}
+    }
+  return true;
+}
+
 /* Return rtx for the location of the return address on the stack,
    suitable for use in __builtin_eh_return.  The new return address
    will be written to this location in order to redirect the return to
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index b968022e64a..fb8a1c9ee09 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -6430,6 +6430,178 @@ archs4x, archs4xd, archs4xd_slow"
    }
 )
 
+(define_insn "*push_multi_fp"
+  [(match_parallel 0 "push_multi_operand"
+		   [(set (reg:SI SP_REG)
+			 (plus:SI (reg:SI SP_REG)
+				  (match_operand 1 "immediate_operand" "")))
+		    (set (mem:SI (plus:SI (reg:SI SP_REG)
+					  (match_dup 1)))
+			 (reg:SI 13))])]
+  "TARGET_CODE_DENSITY"
+  {
+   int len = XVECLEN (operands[0], 0);
+   rtx tmp = XVECEXP (operands[0], 0, len - 1);
+   if (MEM_P (XEXP (tmp, 0)))
+     {
+      operands[2] = XEXP (tmp, 1);
+      return "enter_s\\t{r13-%2} ; sp=sp-%1";
+     }
+   else
+     {
+      tmp = XVECEXP (operands[0], 0, len - 3);
+      operands[2] = XEXP (tmp, 1);
+      return "enter_s\\t{r13-%2, fp} ; sp=sp-%1";
+     }
+  }
+  [(set_attr "type" "call_no_delay_slot")
+   (set_attr "length" "2")])
+
+(define_insn "*push_multi_fp_blink"
+  [(match_parallel 0 "push_multi_operand"
+		   [(set (reg:SI SP_REG)
+			 (plus:SI (reg:SI SP_REG)
+				  (match_operand 1 "immediate_operand" "")))
+		    (set (mem:SI (plus:SI (reg:SI SP_REG)
+					  (match_dup 1)))
+			 (reg:SI RETURN_ADDR_REGNUM))])]
+  "TARGET_CODE_DENSITY"
+  {
+   int len = XVECLEN (operands[0], 0);
+   rtx tmp = XVECEXP (operands[0], 0, len - 1);
+   if (MEM_P (XEXP (tmp, 0)))
+     {
+      operands[2] = XEXP (tmp, 1);
+      return "enter_s\\t{r13-%2, blink} ; sp=sp-%1";
+     }
+   else
+     {
+      tmp = XVECEXP (operands[0], 0, len - 3);
+      operands[2] = XEXP (tmp, 1);
+      return "enter_s\\t{r13-%2, fp, blink} ; sp=sp-%1";
+     }
+  }
+  [(set_attr "type" "call_no_delay_slot")
+   (set_attr "length" "2")])
+
+(define_insn "*pop_multi_fp"
+  [(match_parallel 0 "pop_multi_operand"
+		   [(set (reg:SI SP_REG)
+			 (plus:SI (reg:SI SP_REG)
+				  (match_operand 1 "immediate_operand" "")))
+		    (set (reg:SI 13)
+			 (mem:SI
+			  (plus:SI (reg:SI SP_REG)
+				   (match_operand 2 "immediate_operand" ""))))])]
+  "TARGET_CODE_DENSITY"
+  {
+   int len = XVECLEN (operands[0], 0);
+   rtx tmp = XVECEXP (operands[0], 0, len - 1);
+   if (XEXP (tmp, 0) != frame_pointer_rtx)
+     {
+      operands[3] = XEXP (tmp, 0);
+      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
+      return "leave_s\\t{r13-%3} ; sp=sp+%1";
+     }
+   else
+     {
+      tmp = XVECEXP (operands[0], 0, len - 2);
+      operands[3] = XEXP (tmp, 0);
+      return "leave_s\\t{r13-%3, fp} ; sp=sp+%1";
+     }
+  }
+  [(set_attr "type" "call_no_delay_slot")
+   (set_attr "length" "2")])
+
+(define_insn "*pop_multi_fp_blink"
+  [(match_parallel 0 "pop_multi_operand"
+		   [(set (reg:SI SP_REG)
+			 (plus:SI (reg:SI SP_REG)
+				  (match_operand 1 "immediate_operand" "")))
+		    (set (reg:SI RETURN_ADDR_REGNUM)
+			 (mem:SI
+			  (plus:SI (reg:SI SP_REG)
+				   (match_operand 2 "immediate_operand" ""))))])]
+  "TARGET_CODE_DENSITY"
+  {
+   int len = XVECLEN (operands[0], 0);
+   rtx tmp = XVECEXP (operands[0], 0, len - 1);
+   if (XEXP (tmp, 0) != frame_pointer_rtx)
+     {
+      operands[3] = XEXP (tmp, 0);
+      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
+      return "leave_s\\t{r13-%3, blink} ; sp=sp+%1";
+     }
+   else
+     {
+      tmp = XVECEXP (operands[0], 0, len - 2);
+      operands[3] = XEXP (tmp, 0);
+      return "leave_s\\t{r13-%3, fp, blink} ; sp=sp+%1";
+     }
+  }
+  [(set_attr "type" "call_no_delay_slot")
+   (set_attr "length" "2")])
+
+(define_insn "*pop_multi_fp_ret"
+  [(match_parallel 0 "pop_multi_operand"
+		   [(return)
+		    (set (reg:SI SP_REG)
+			 (plus:SI (reg:SI SP_REG)
+				  (match_operand 1 "immediate_operand" "")))
+		    (set (reg:SI 13)
+			 (mem:SI
+			  (plus:SI (reg:SI SP_REG)
+				   (match_operand 2 "immediate_operand" ""))))])]
+  "TARGET_CODE_DENSITY"
+  {
+   int len = XVECLEN (operands[0], 0);
+   rtx tmp = XVECEXP (operands[0], 0, len - 1);
+   if (XEXP (tmp, 0) != frame_pointer_rtx)
+     {
+      operands[3] = XEXP (tmp, 0);
+      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
+      return "leave_s\\t{r13-%3, pcl} ; sp=sp+%1";
+     }
+   else
+     {
+      tmp = XVECEXP (operands[0], 0, len - 2);
+      operands[3] = XEXP (tmp, 0);
+      return "leave_s\\t{r13-%3, fp, pcl} ; sp=sp+%1";
+     }
+  }
+  [(set_attr "type" "call_no_delay_slot")
+   (set_attr "length" "2")])
+
+(define_insn "*pop_multi_fp_blink_ret"
+  [(match_parallel 0 "pop_multi_operand"
+		   [(return)
+		    (set (reg:SI SP_REG)
+			 (plus:SI (reg:SI SP_REG)
+				  (match_operand 1 "immediate_operand" "")))
+		    (set (reg:SI RETURN_ADDR_REGNUM)
+			 (mem:SI
+			  (plus:SI (reg:SI SP_REG)
+				   (match_operand 2 "immediate_operand" ""))))])]
+  "TARGET_CODE_DENSITY"
+  {
+   int len = XVECLEN (operands[0], 0);
+   rtx tmp = XVECEXP (operands[0], 0, len - 1);
+   if (XEXP (tmp, 0) != frame_pointer_rtx)
+     {
+      operands[3] = XEXP (tmp, 0);
+      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
+      return "leave_s\\t{r13-%3, blink, pcl} ; sp=sp+%1";
+     }
+   else
+     {
+      tmp = XVECEXP (operands[0], 0, len - 2);
+      operands[3] = XEXP (tmp, 0);
+      return "leave_s\\t{r13-%3, fp, blink, pcl} ; sp=sp+%1";
+     }
+  }
+  [(set_attr "type" "call_no_delay_slot")
+   (set_attr "length" "2")])
+
 ;; include the arc-FPX instructions
 (include "fpx.md")
 
diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
index 3e96b58375d..93e18af1d27 100644
--- a/gcc/config/arc/arc.opt
+++ b/gcc/config/arc/arc.opt
@@ -193,9 +193,9 @@ msdata
 Target Report InverseMask(NO_SDATA_SET)
 Generate sdata references.  This is the default, unless you compile for PIC.
 
-mno-millicode
-Target Report Mask(NO_MILLICODE_THUNK_SET)
-Do not generate millicode thunks (needed only with -Os).
+mmillicode
+Target Report Mask(MILLICODE_THUNK_SET)
+Generate millicode thunks.
 
 mspfp
 Target Report Mask(SPFP_COMPACT_SET)
@@ -532,3 +532,7 @@ Enable 16-entry register file.
 mbranch-index
 Target Report Var(TARGET_BRANCH_INDEX) Init(DEFAULT_BRANCH_INDEX)
 Enable use of BI/BIH instructions when available.
+
+mcode-density-frame
+Target Report Var(TARGET_CODE_DENSITY_FRAME)
+Enable ENTER_S and LEAVE_S opcodes for ARCv2.
diff --git a/gcc/config/arc/predicates.md b/gcc/config/arc/predicates.md
index 0abfc839b07..efa3650e1fa 100644
--- a/gcc/config/arc/predicates.md
+++ b/gcc/config/arc/predicates.md
@@ -800,3 +800,15 @@
 (define_predicate "arc_short_operand"
   (ior (match_test "register_operand (op, mode)")
        (match_test "short_unsigned_const_operand (op, mode)")))
+
+(define_special_predicate "push_multi_operand"
+  (match_code "parallel")
+  {
+   return arc_check_multi (op, true);
+})
+
+(define_special_predicate "pop_multi_operand"
+  (match_code "parallel")
+  {
+   return arc_check_multi (op, false);
+})
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 454587310c8..5cdd8ba23e9 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -646,9 +646,9 @@ Objective-C and Objective-C++ Dialects}.
 -malign-call  -mauto-modify-reg  -mbbit-peephole  -mno-brcc @gol
 -mcase-vector-pcrel  -mcompact-casesi  -mno-cond-exec  -mearly-cbranchsi @gol
 -mexpand-adddi  -mindexed-loads  -mlra  -mlra-priority-none @gol
--mlra-priority-compact mlra-priority-noncompact  -mno-millicode @gol
+-mlra-priority-compact  -mlra-priority-noncompact  -mmillicode @gol
 -mmixed-code  -mq-class  -mRcq  -mRcw  -msize-level=@var{level} @gol
--mtune=@var{cpu}  -mmultcost=@var{num} @gol
+-mtune=@var{cpu}  -mmultcost=@var{num}  -mcode-density-frame @gol
 -munalign-prob-threshold=@var{probability}  -mmpy-option=@var{multo} @gol
 -mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16 -mbranch-index}
 
@@ -16042,15 +16042,21 @@ Indicate target register priority for r0..r3 / r12..r15.
 @opindex mlra-priority-noncompact
 Reduce target register priority for r0..r3 / r12..r15.
 
-@item -mno-millicode
-@opindex mno-millicode
+@item -mmillicode
+@opindex mmillicode
 When optimizing for size (using @option{-Os}), prologues and epilogues
 that have to save or restore a large number of registers are often
 shortened by using call to a special function in libgcc; this is
 referred to as a @emph{millicode} call.  As these calls can pose
 performance issues, and/or cause linking issues when linking in a
-nonstandard way, this option is provided to turn off millicode call
-generation.
+nonstandard way, this option is provided to turn on or off millicode
+call generation.
+
+@item -mcode-density-frame
+@opindex mcode-density-frame
+This option enables the compiler to emit @code{enter} and @code{leave}
+instructions.  These instructions are only valid for CPUs with the
+code-density feature.
 
 @item -mmixed-code
 @opindex mmixed-code
diff --git a/gcc/testsuite/gcc.target/arc/firq-1.c b/gcc/testsuite/gcc.target/arc/firq-1.c
index 87f408793dc..34d2fedcb45 100644
--- a/gcc/testsuite/gcc.target/arc/firq-1.c
+++ b/gcc/testsuite/gcc.target/arc/firq-1.c
@@ -20,8 +20,8 @@ handler1 (void)
        : "r0", "r1", "r2", "r3", "r4",
 	 "r5", "r6", "r7", "r8", "r9");
 }
-/* { dg-final { scan-assembler-times "r2,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r4,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r6,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r8,\\\[sp,\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "r2,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r4,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r6,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r8,\\\[sp" 2 } } */
 /* { dg-final { scan-assembler "rtie" } } */
diff --git a/gcc/testsuite/gcc.target/arc/firq-3.c b/gcc/testsuite/gcc.target/arc/firq-3.c
index cfd37949780..2d45f3c0af5 100644
--- a/gcc/testsuite/gcc.target/arc/firq-3.c
+++ b/gcc/testsuite/gcc.target/arc/firq-3.c
@@ -28,13 +28,13 @@ handler1 (void)
 /* { dg-final { scan-assembler-not "r15,\\\[sp" } } */
 
 /* { dg-final { scan-assembler-times "r4,\\\[sp" 2 } } */
-/* { dg-final { scan-assembler-times "r6,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r8,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r10,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r16,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r18,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r20,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r24,\\\[sp,\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "r6,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r8,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r10,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r16,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r18,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r20,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r24,\\\[sp" 2 } } */
 /* { dg-final { scan-assembler-times "fp,\\\[sp," 2 } } */
 
 /* { dg-final { scan-assembler "rtie" } } */
diff --git a/gcc/testsuite/gcc.target/arc/firq-4.c b/gcc/testsuite/gcc.target/arc/firq-4.c
index 2531c001bef..828facddf08 100644
--- a/gcc/testsuite/gcc.target/arc/firq-4.c
+++ b/gcc/testsuite/gcc.target/arc/firq-4.c
@@ -18,13 +18,13 @@ handler1 (void)
 		  "r25");
 }
 /* { dg-final { scan-assembler-times "r4,\\\[sp" 2 } } */
-/* { dg-final { scan-assembler-times "r6,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r8,\\\[sp,\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "r6,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r8,\\\[sp" 2 } } */
 
-/* { dg-final { scan-assembler-times "r16,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r18,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r20,\\\[sp,\[0-9\]+\\\]" 2 } } */
-/* { dg-final { scan-assembler-times "r24,\\\[sp,\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "r16,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r18,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r20,\\\[sp" 2 } } */
+/* { dg-final { scan-assembler-times "r24,\\\[sp" 2 } } */
 
 /* { dg-final { scan-assembler-not "fp,\\\[sp" } } */
 /* { dg-final { scan-assembler-not "push.*fp" } } */
diff --git a/gcc/testsuite/gcc.target/arc/interrupt-6.c b/gcc/testsuite/gcc.target/arc/interrupt-6.c
index 509ff302124..d82bd67edd8 100644
--- a/gcc/testsuite/gcc.target/arc/interrupt-6.c
+++ b/gcc/testsuite/gcc.target/arc/interrupt-6.c
@@ -18,5 +18,5 @@ foo(void)
   bar (p);
 }
 /* { dg-final { scan-assembler-not ".*fp,\\\[sp" } } */
-/* { dg-final { scan-assembler "ld.*blink,\\\[sp\\\]" } } */
+/* { dg-final { scan-assembler "ld.*blink,\\\[sp" } } */
 /* { dg-final { scan-assembler "push_s.*blink" } } */
-- 
2.17.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/6] [ARC] Cleanup TLS implementation.
  2018-10-10  8:01 ` [PATCH 2/6] [ARC] Cleanup TLS implementation Claudiu Zissulescu
@ 2018-10-11 10:13   ` Andrew Burgess
  2018-10-31 13:11     ` claziss
  0 siblings, 1 reply; 25+ messages in thread
From: Andrew Burgess @ 2018-10-11 10:13 UTC (permalink / raw)
  To: Claudiu Zissulescu; +Cc: gcc-patches, fbedard, claziss

* Claudiu Zissulescu <claziss@gmail.com> [2018-10-10 11:00:12 +0300]:

> Cleanup TLS implementation and add a number of tests.
> 
> gcc/
> 2018-07-25  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* config/arc/arc.c (arc_get_tp): Remove function.
> 	(arc_emit_call_tls_get_addr): Likewise.
> 	(arc_call_tls_get_addr): New function.
> 	(arc_legitimize_tls_address): Make use of arc_call_tls_get_addr.
> 	* config/arc/arc.md (tls_load_tp_soft): Remove.
> 	(tls_gd_get_addr): Likewise.
> 
> testsuite/
> 2018-07-25  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* gcc.target/arc/tls-gd.c: New file.
> 	* gcc.target/arc/tls-ie.c: Likewise.
> 	* gcc.target/arc/tls-ld.c: Likewise.
> 	* gcc.target/arc/tls-le.c: Likewise.
> ---
>  gcc/config/arc/arc.c                  | 95 +++++++++++----------------
>  gcc/config/arc/arc.md                 | 21 ------
>  gcc/testsuite/gcc.target/arc/tls-1.c  |  2 +-
>  gcc/testsuite/gcc.target/arc/tls-gd.c | 17 +++++
>  gcc/testsuite/gcc.target/arc/tls-ie.c | 17 +++++
>  gcc/testsuite/gcc.target/arc/tls-ld.c | 18 +++++
>  gcc/testsuite/gcc.target/arc/tls-le.c | 16 +++++
>  7 files changed, 106 insertions(+), 80 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arc/tls-gd.c
>  create mode 100644 gcc/testsuite/gcc.target/arc/tls-ie.c
>  create mode 100644 gcc/testsuite/gcc.target/arc/tls-ld.c
>  create mode 100644 gcc/testsuite/gcc.target/arc/tls-le.c
> 
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index de4c7433c1b..56f566795ff 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -5559,51 +5559,30 @@ arc_raw_symbolic_reference_mentioned_p (rtx op, bool skip_local)
>    return false;
>  }
>  
> -/* Get the thread pointer.  */
> +/* Emit a call to __tls_get_addr.  TI is the argument to this function.
> +   RET is an RTX for the return value location.  The entire insn sequence
> +   is returned.  */

This comment should be moved down to arc_call_tls_get_addr, and a new
comment should be added explaining what arc_tls_symbol is used for.

Otherwise, this seems fine.

Thanks,
Andrew

> +static GTY(()) rtx arc_tls_symbol;
>  
>  static rtx
> -arc_get_tp (void)
> +arc_call_tls_get_addr (rtx ti)
>  {
> -   /* If arc_tp_regno has been set, we can use that hard register
> -      directly as a base register.  */
> -  if (arc_tp_regno != -1)
> -    return gen_rtx_REG (Pmode, arc_tp_regno);
> -
> -  /* Otherwise, call __read_tp.  Copy the result to a pseudo to avoid
> -     conflicts with function arguments / results.  */
> -  rtx reg = gen_reg_rtx (Pmode);
> -  emit_insn (gen_tls_load_tp_soft ());
> -  emit_move_insn (reg, gen_rtx_REG (Pmode, R0_REG));
> -  return reg;
> -}
> -
> -/* Helper to be used by TLS Global dynamic model.  */
> -
> -static rtx
> -arc_emit_call_tls_get_addr (rtx sym, int reloc, rtx eqv)
> -{
> -  rtx r0 = gen_rtx_REG (Pmode, R0_REG);
> -  rtx call_fusage = NULL_RTX;
> -
> -  start_sequence ();
> -
> -  rtx x = arc_unspec_offset (sym, reloc);
> -  emit_move_insn (r0, x);
> -  use_reg (&call_fusage, r0);
> +  rtx arg = gen_rtx_REG (Pmode, R0_REG);
> +  rtx ret = gen_rtx_REG (Pmode, R0_REG);
> +  rtx fn;
> +  rtx_insn *insn;
>  
> -  gcc_assert (reloc == UNSPEC_TLS_GD);
> -  rtx call_insn = emit_call_insn (gen_tls_gd_get_addr (sym));
> -  /* Should we set RTL_CONST_CALL_P?  We read memory, but not in a
> -     way that the application should care.  */
> -  RTL_PURE_CALL_P (call_insn) = 1;
> -  add_function_usage_to (call_insn, call_fusage);
> +  if (!arc_tls_symbol)
> +    arc_tls_symbol = init_one_libfunc ("__tls_get_addr");
>  
> -  rtx_insn *insns = get_insns ();
> -  end_sequence ();
> +  emit_move_insn (arg, ti);
> +  fn = gen_rtx_MEM (SImode, arc_tls_symbol);
> +  insn = emit_call_insn (gen_call_value (ret, fn, const0_rtx));
> +  RTL_CONST_CALL_P (insn) = 1;
> +  use_reg (&CALL_INSN_FUNCTION_USAGE (insn), ret);
> +  use_reg (&CALL_INSN_FUNCTION_USAGE (insn), arg);
>  
> -  rtx dest = gen_reg_rtx (Pmode);
> -  emit_libcall_block (insns, dest, r0, eqv);
> -  return dest;
> +  return ret;
>  }
>  
>  #define DTPOFF_ZERO_SYM ".tdata"
> @@ -5614,16 +5593,26 @@ arc_emit_call_tls_get_addr (rtx sym, int reloc, rtx eqv)
>  static rtx
>  arc_legitimize_tls_address (rtx addr, enum tls_model model)
>  {
> +  rtx tmp;
> +
>    if (!flag_pic && model == TLS_MODEL_LOCAL_DYNAMIC)
>      model = TLS_MODEL_LOCAL_EXEC;
>  
> +
> +  /* The TP pointer needs to be set.  */
> +  gcc_assert (arc_tp_regno != -1);
> +
>    switch (model)
>      {
> +    case TLS_MODEL_GLOBAL_DYNAMIC:
> +      tmp = gen_reg_rtx (Pmode);
> +      emit_move_insn (tmp, arc_unspec_offset (addr, UNSPEC_TLS_GD));
> +      return arc_call_tls_get_addr (tmp);
> +
>      case TLS_MODEL_LOCAL_DYNAMIC:
>        rtx base;
>        tree decl;
>        const char *base_name;
> -      rtvec v;
>  
>        decl = SYMBOL_REF_DECL (addr);
>        base_name = DTPOFF_ZERO_SYM;
> @@ -5631,31 +5620,21 @@ arc_legitimize_tls_address (rtx addr, enum tls_model model)
>  	base_name = ".tbss";
>  
>        base = gen_rtx_SYMBOL_REF (Pmode, base_name);
> -      if (strcmp (base_name, DTPOFF_ZERO_SYM) == 0)
> -	{
> -	  if (!flag_pic)
> -	    goto local_exec;
> -	  v = gen_rtvec (1, addr);
> -	}
> -      else
> -	v = gen_rtvec (2, addr, base);
> -      addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_TLS_OFF);
> -      addr = gen_rtx_CONST (Pmode, addr);
> -      base = arc_legitimize_tls_address (base, TLS_MODEL_GLOBAL_DYNAMIC);
> -      return gen_rtx_PLUS (Pmode, force_reg (Pmode, base), addr);
> -
> -    case TLS_MODEL_GLOBAL_DYNAMIC:
> -      return arc_emit_call_tls_get_addr (addr, UNSPEC_TLS_GD, addr);
> +      tmp = gen_reg_rtx (Pmode);
> +      emit_move_insn (tmp, arc_unspec_offset (base, UNSPEC_TLS_GD));
> +      base = arc_call_tls_get_addr (tmp);
> +      return gen_rtx_PLUS (Pmode, force_reg (Pmode, base),
> +			   arc_unspec_offset (addr, UNSPEC_TLS_OFF));
>  
>      case TLS_MODEL_INITIAL_EXEC:
>        addr = arc_unspec_offset (addr, UNSPEC_TLS_IE);
>        addr = copy_to_mode_reg (Pmode, gen_const_mem (Pmode, addr));
> -      return gen_rtx_PLUS (Pmode, arc_get_tp (), addr);
> +      return gen_rtx_PLUS (Pmode, gen_rtx_REG (Pmode, arc_tp_regno), addr);
>  
>      case TLS_MODEL_LOCAL_EXEC:
> -    local_exec:
>        addr = arc_unspec_offset (addr, UNSPEC_TLS_OFF);
> -      return gen_rtx_PLUS (Pmode, arc_get_tp (), addr);
> +      return gen_rtx_PLUS (Pmode, gen_rtx_REG (Pmode, arc_tp_regno), addr);
> +
>      default:
>        gcc_unreachable ();
>      }
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index d73289a20c4..6ea67791627 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -5310,27 +5310,6 @@ archs4x, archs4xd, archs4xd_slow"
>    [(set_attr "type" "call")
>     (set_attr "is_SIBCALL" "yes")])
>  
> -(define_insn "tls_load_tp_soft"
> -  [(set (reg:SI R0_REG) (unspec:SI [(const_int 0)] UNSPEC_TLS_OFF))
> -   (clobber (reg:SI RETURN_ADDR_REGNUM))]
> -  ""
> -  "*return arc_output_libcall (\"__read_tp\");"
> -  [(set_attr "is_sfunc" "yes")
> -   (set_attr "predicable" "yes")])
> -
> -(define_insn "tls_gd_get_addr"
> -  [(set (reg:SI R0_REG)
> -	(call:SI (mem:SI (unspec:SI [(match_operand:SI 0
> -				      "symbolic_operand" "X,X")]
> -			  UNSPEC_TLS_GD))
> -		 (const_int 0)))
> -   (clobber (reg:SI RETURN_ADDR_REGNUM))]
> -  ""
> -  ".tls_gd_ld %0`bl%* __tls_get_addr@plt"
> -  [(set_attr "type" "call")
> -   ; With TARGET_MEDIUM_CALLS, plt calls are not predicable.
> -   (set_attr "predicable" "no")])
> -
>  ;; For thread pointer builtins
>  (define_expand "get_thread_pointersi"
>    [(set (match_operand:SI 0 "register_operand") (match_dup 1))]
> diff --git a/gcc/testsuite/gcc.target/arc/tls-1.c b/gcc/testsuite/gcc.target/arc/tls-1.c
> index 6521b641549..da21a5ba032 100644
> --- a/gcc/testsuite/gcc.target/arc/tls-1.c
> +++ b/gcc/testsuite/gcc.target/arc/tls-1.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-require-effective-target tls } */
> -/* { dg-skip-if "" { arc-*-elf* } } */
> +/* { dg-skip-if "" { arc*-*-elf* } } */
>  /* { dg-options "-O3 -std=gnu99" } */
>  
>  /* Check if addressing the `pos` member of struct is done via tls
> diff --git a/gcc/testsuite/gcc.target/arc/tls-gd.c b/gcc/testsuite/gcc.target/arc/tls-gd.c
> new file mode 100644
> index 00000000000..aa1b5429b08
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arc/tls-gd.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target fpic } */
> +/* { dg-options "-O2 -fpic -ftls-model=global-dynamic" } */
> +/* { dg-require-effective-target tls } */
> +/* { dg-skip-if "" { arc*-*-elf* } } */
> +
> +/* Check if tls global dynamic is correctly generated.  */
> +
> +extern __thread int e2;
> +
> +int *ae2 (void)
> +{
> +  return &e2;
> +}
> +
> +/* { dg-final { scan-assembler "add r0,pcl,@e2@tlsgd" } } */
> +/* { dg-final { scan-assembler "bl @__tls_get_addr@plt" } } */
> diff --git a/gcc/testsuite/gcc.target/arc/tls-ie.c b/gcc/testsuite/gcc.target/arc/tls-ie.c
> new file mode 100644
> index 00000000000..0c981cfbf67
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arc/tls-ie.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target fpic } */
> +/* { dg-options "-O2 -fpic -ftls-model=initial-exec" } */
> +/* { dg-require-effective-target tls } */
> +/* { dg-skip-if "" { arc*-*-elf* } } */
> +
> +/* Check if tls initial execution is correctly generated.  */
> +
> +extern __thread int e2;
> +
> +int *ae2 (void)
> +{
> +  return &e2;
> +}
> +
> +/* { dg-final { scan-assembler "ld r0,\\\[pcl,@e2@tlsie\\\]" } } */
> +/* { dg-final { scan-assembler "add_s r0,r0,r25" } } */
> diff --git a/gcc/testsuite/gcc.target/arc/tls-ld.c b/gcc/testsuite/gcc.target/arc/tls-ld.c
> new file mode 100644
> index 00000000000..351c3f02abd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arc/tls-ld.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target fpic } */
> +/* { dg-options "-O2 -fpic -ftls-model=local-dynamic" } */
> +/* { dg-require-effective-target tls } */
> +/* { dg-skip-if "" { arc*-*-elf* } } */
> +
> +/* Check if tls local dynamic is correctly generated.  */
> +
> +extern __thread int e2;
> +
> +int *ae2 (void)
> +{
> +  return &e2;
> +}
> +
> +/* { dg-final { scan-assembler "add r0,pcl,@.tbss@tlsgd" } } */
> +/* { dg-final { scan-assembler "bl @__tls_get_addr@plt" } } */
> +/* { dg-final { scan-assembler "add_s r0,r0,@e2@dtpoff" } } */
> diff --git a/gcc/testsuite/gcc.target/arc/tls-le.c b/gcc/testsuite/gcc.target/arc/tls-le.c
> new file mode 100644
> index 00000000000..ae3089b5070
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arc/tls-le.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target fpic } */
> +/* { dg-options "-O2 -fpic -ftls-model=local-exec" } */
> +/* { dg-require-effective-target tls } */
> +/* { dg-skip-if "" { arc*-*-elf* } } */
> +
> +/* Check if tls local execution is correctly generated.  */
> +
> +extern __thread int e2;
> +
> +int *ae2 (void)
> +{
> +  return &e2;
> +}
> +
> +/* { dg-final { scan-assembler "add r0,r25,@e2@tpoff" } } */
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/6] [ARC] Remove non-standard function calls.
  2018-10-10  8:01 ` [PATCH 1/6] [ARC] Remove non-standard function calls Claudiu Zissulescu
@ 2018-10-11 10:14   ` Andrew Burgess
  2018-10-31 12:40     ` claziss
  0 siblings, 1 reply; 25+ messages in thread
From: Andrew Burgess @ 2018-10-11 10:14 UTC (permalink / raw)
  To: Claudiu Zissulescu; +Cc: gcc-patches, fbedard, claziss

* Claudiu Zissulescu <claziss@gmail.com> [2018-10-10 11:00:11 +0300]:

> Replace all custom "library" calls with compiler known patterns.
> 
> gcc/
> xxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* config/arc/arc.md (mulsi3): Remove call to mulsi_600_lib.
> 	(mulsi3_600_lib): Remove pattern.
> 	(umulsi3_highpart_600_lib_le): Likewise.
> 	(umulsi3_highpart): Remove call to umulsi3_highpart_600_lib_le.
> 	(umulsidi3): Remove call to umulsidi3_600_lib.
> 	(umulsidi3_600_lib): Remove pattern.
> 	(peephole2): Remove peephole using the above deprecated patterns.
> 
> testsuite/
> xxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* gcc.target/arc/mulsi3_highpart-2.c: Update test.
> 
> libgcc/
> xxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* config/arc/lib1funcs.S (_muldi3): New function.
> 	* config/arc/t-arc (LIB1ASMFUNCS): Add _muldi3.

This seems fine.

Thanks,
Andrew

> ---
>  gcc/config/arc/arc.md                         | 158 ++----------------
>  .../gcc.target/arc/mulsi3_highpart-2.c        |   5 +-
>  libgcc/config/arc/lib1funcs.S                 |  54 ++++++
>  libgcc/config/arc/t-arc                       |   2 +-
>  4 files changed, 67 insertions(+), 152 deletions(-)
> 
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index 42ca820b91d..d73289a20c4 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -2076,44 +2076,21 @@ archs4x, archs4xd, archs4xd_slow"
>  ;; SI <- SI * SI
>  
>  (define_expand "mulsi3"
> - [(set (match_operand:SI 0 "nonimmediate_operand"            "")
> + [(set (match_operand:SI 0 "register_operand"            "")
>  	(mult:SI (match_operand:SI 1 "register_operand"  "")
>  		 (match_operand:SI 2 "nonmemory_operand" "")))]
> -  ""
> +  "TARGET_ANY_MPY"
>  {
> -  if (TARGET_MPY)
> -    {
> -      if (!register_operand (operands[0], SImode))
> -	{
> -	  rtx result = gen_reg_rtx (SImode);
> -
> -	  emit_insn (gen_mulsi3 (result, operands[1], operands[2]));
> -	  emit_move_insn (operands[0], result);
> -	  DONE;
> -	}
> -    }
> -  else if (TARGET_MUL64_SET)
> +  if (TARGET_MUL64_SET)
>      {
> -     rtx tmp = gen_reg_rtx (SImode);
> -     emit_insn (gen_mulsi64 (tmp, operands[1], operands[2]));
> -     emit_move_insn (operands[0], tmp);
> +     emit_insn (gen_mulsi64 (operands[0], operands[1], operands[2]));
>       DONE;
>      }
>    else if (TARGET_MULMAC_32BY16_SET)
>      {
> -     rtx tmp = gen_reg_rtx (SImode);
> -     emit_insn (gen_mulsi32x16 (tmp, operands[1], operands[2]));
> -     emit_move_insn (operands[0], tmp);
> +     emit_insn (gen_mulsi32x16 (operands[0], operands[1], operands[2]));
>       DONE;
>      }
> -  else
> -    {
> -      emit_move_insn (gen_rtx_REG (SImode, R0_REG), operands[1]);
> -      emit_move_insn (gen_rtx_REG (SImode, R1_REG), operands[2]);
> -      emit_insn (gen_mulsi3_600_lib ());
> -      emit_move_insn (operands[0], gen_rtx_REG (SImode, R0_REG));
> -      DONE;
> -    }
>  })
>  
>  (define_insn_and_split "mulsi32x16"
> @@ -2229,27 +2206,6 @@ archs4x, archs4xd, archs4xd_slow"
>     (set_attr "predicable" "yes,yes,no,yes")
>     (set_attr "cond" "canuse,canuse,canuse_limm,canuse")])
>  
> -; If we compile without an mul option enabled, but link with libraries
> -; for a mul option, we'll see clobbers of multiplier output registers.
> -; There is also an implementation using norm that clobbers the loop registers.
> -(define_insn "mulsi3_600_lib"
> -  [(set (reg:SI R0_REG)
> -	(mult:SI (reg:SI R0_REG) (reg:SI R1_REG)))
> -   (clobber (reg:SI RETURN_ADDR_REGNUM))
> -   (clobber (reg:SI R1_REG))
> -   (clobber (reg:SI R2_REG))
> -   (clobber (reg:SI R3_REG))
> -   (clobber (reg:DI MUL64_OUT_REG))
> -   (clobber (reg:SI LP_COUNT))
> -   (clobber (reg:SI LP_START))
> -   (clobber (reg:SI LP_END))
> -   (clobber (reg:CC CC_REG))]
> -  "!TARGET_ANY_MPY
> -   && SFUNC_CHECK_PREDICABLE"
> -  "*return arc_output_libcall (\"__mulsi3\");"
> -  [(set_attr "is_sfunc" "yes")
> -   (set_attr "predicable" "yes")])
> -
>  (define_insn_and_split "mulsidi_600"
>    [(set (match_operand:DI 0 "register_operand"                               "=c, c,c,  c")
>  	(mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand"  "%Rcq#q, c,c,  c"))
> @@ -2504,48 +2460,6 @@ archs4x, archs4xd, archs4xd_slow"
>     (set_attr "predicable" "yes,no,yes,no")
>     (set_attr "cond" "canuse,nocond,canuse,nocond")])
>  
> -; Implementations include additional labels for umulsidi3, so we got all
> -; the same clobbers - plus one for the result low part.  */
> -(define_insn "umulsi3_highpart_600_lib_le"
> -  [(set (reg:SI R1_REG)
> -	(truncate:SI
> -	 (lshiftrt:DI
> -	  (mult:DI (zero_extend:DI (reg:SI R0_REG))
> -		   (zero_extend:DI (reg:SI R1_REG)))
> -	  (const_int 32))))
> -   (clobber (reg:SI RETURN_ADDR_REGNUM))
> -   (clobber (reg:SI R0_REG))
> -   (clobber (reg:DI R2_REG))
> -   (clobber (reg:SI R12_REG))
> -   (clobber (reg:DI MUL64_OUT_REG))
> -   (clobber (reg:CC CC_REG))]
> -  "!TARGET_BIG_ENDIAN
> -   && !TARGET_ANY_MPY
> -   && SFUNC_CHECK_PREDICABLE"
> -  "*return arc_output_libcall (\"__umulsi3_highpart\");"
> -  [(set_attr "is_sfunc" "yes")
> -   (set_attr "predicable" "yes")])
> -
> -(define_insn "umulsi3_highpart_600_lib_be"
> -  [(set (reg:SI R0_REG)
> -	(truncate:SI
> -	 (lshiftrt:DI
> -	  (mult:DI (zero_extend:DI (reg:SI R0_REG))
> -		   (zero_extend:DI (reg:SI R1_REG)))
> -	  (const_int 32))))
> -   (clobber (reg:SI RETURN_ADDR_REGNUM))
> -   (clobber (reg:SI R1_REG))
> -   (clobber (reg:DI R2_REG))
> -   (clobber (reg:SI R12_REG))
> -   (clobber (reg:DI MUL64_OUT_REG))
> -   (clobber (reg:CC CC_REG))]
> -  "TARGET_BIG_ENDIAN
> -   && !TARGET_ANY_MPY
> -   && SFUNC_CHECK_PREDICABLE"
> -  "*return arc_output_libcall (\"__umulsi3_highpart\");"
> -  [(set_attr "is_sfunc" "yes")
> -   (set_attr "predicable" "yes")])
> -
>  ;; (zero_extend:DI (const_int)) leads to internal errors in combine, so we
>  ;; need a separate pattern for immediates
>  ;; ??? This is fine for combine, but not for reload.
> @@ -2572,23 +2486,11 @@ archs4x, archs4xd, archs4xd_slow"
>  	   (zero_extend:DI (match_operand:SI 1 "register_operand" ""))
>  	   (zero_extend:DI (match_operand:SI 2 "nonmemory_operand" "")))
>  	  (const_int 32))))]
> -  "!TARGET_MUL64_SET && !TARGET_MULMAC_32BY16_SET"
> +  "TARGET_MPY"
>    "
>  {
>    rtx target = operands[0];
>  
> -  if (!TARGET_MPY)
> -    {
> -      emit_move_insn (gen_rtx_REG (SImode, 0), operands[1]);
> -      emit_move_insn (gen_rtx_REG (SImode, 1), operands[2]);
> -      if (TARGET_BIG_ENDIAN)
> -	emit_insn (gen_umulsi3_highpart_600_lib_be ());
> -      else
> -	emit_insn (gen_umulsi3_highpart_600_lib_le ());
> -      emit_move_insn (target, gen_rtx_REG (SImode, 0));
> -      DONE;
> -    }
> -
>    if (!register_operand (target, SImode))
>      target = gen_reg_rtx (SImode);
>  
> @@ -2607,7 +2509,7 @@ archs4x, archs4xd, archs4xd_slow"
>    [(set (match_operand:DI 0 "register_operand" "")
>  	(mult:DI (zero_extend:DI (match_operand:SI 1 "register_operand" ""))
>  		 (zero_extend:DI (match_operand:SI 2 "nonmemory_operand" ""))))]
> -  ""
> +  "TARGET_ANY_MPY"
>  {
>    if (TARGET_PLUS_MACD)
>      {
> @@ -2646,12 +2548,8 @@ archs4x, archs4xd, archs4xd_slow"
>        DONE;
>      }
>    else
> -    {
> -      emit_move_insn (gen_rtx_REG (SImode, R0_REG), operands[1]);
> -      emit_move_insn (gen_rtx_REG (SImode, R1_REG), operands[2]);
> -      emit_insn (gen_umulsidi3_600_lib ());
> -      emit_move_insn (operands[0], gen_rtx_REG (DImode, R0_REG));
> -      DONE;
> +  {
> +   gcc_unreachable ();
>      }
>  })
>  
> @@ -2729,7 +2627,7 @@ archs4x, archs4xd, archs4xd_slow"
>  		 (zero_extend:DI (match_operand:SI 2 "extend_operand" "cL"))))]
>    "TARGET_MPY && !TARGET_PLUS_MACD"
>    "#"
> -  "reload_completed"
> +  "TARGET_MPY && !TARGET_PLUS_MACD && reload_completed"
>    [(const_int 0)]
>  {
>    int hi = !TARGET_BIG_ENDIAN;
> @@ -2743,42 +2641,6 @@ archs4x, archs4xd, archs4xd_slow"
>    [(set_attr "type" "umulti")
>    (set_attr "length" "8")])
>  
> -(define_insn "umulsidi3_600_lib"
> -  [(set (reg:DI R0_REG)
> -	(mult:DI (zero_extend:DI (reg:SI R0_REG))
> -		 (zero_extend:DI (reg:SI R1_REG))))
> -   (clobber (reg:SI RETURN_ADDR_REGNUM))
> -   (clobber (reg:DI R2_REG))
> -   (clobber (reg:SI R12_REG))
> -   (clobber (reg:DI MUL64_OUT_REG))
> -   (clobber (reg:CC CC_REG))]
> -   "!TARGET_ANY_MPY
> -   && SFUNC_CHECK_PREDICABLE"
> -  "*return arc_output_libcall (\"__umulsidi3\");"
> -  [(set_attr "is_sfunc" "yes")
> -   (set_attr "predicable" "yes")])
> -
> -(define_peephole2
> -  [(parallel
> -     [(set (reg:DI R0_REG)
> -	   (mult:DI (zero_extend:DI (reg:SI R0_REG))
> -		    (zero_extend:DI (reg:SI R1_REG))))
> -      (clobber (reg:SI RETURN_ADDR_REGNUM))
> -      (clobber (reg:DI R2_REG))
> -      (clobber (reg:SI R12_REG))
> -      (clobber (reg:DI MUL64_OUT_REG))
> -      (clobber (reg:CC CC_REG))])]
> -  "!TARGET_ANY_MPY
> -   && peep2_regno_dead_p (1, TARGET_BIG_ENDIAN ? R1_REG : R0_REG)"
> -  [(pc)]
> -{
> -  if (TARGET_BIG_ENDIAN)
> -    emit_insn (gen_umulsi3_highpart_600_lib_be ());
> -  else
> -    emit_insn (gen_umulsi3_highpart_600_lib_le ());
> -  DONE;
> -})
> -
>  (define_expand "addsi3"
>    [(set (match_operand:SI 0 "dest_reg_operand" "")
>  	(plus:SI (match_operand:SI 1 "register_operand" "")
> diff --git a/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c b/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c
> index 4b54cbf6a52..22b28cf2507 100644
> --- a/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c
> +++ b/gcc/testsuite/gcc.target/arc/mulsi3_highpart-2.c
> @@ -1,7 +1,7 @@
>  /* { dg-do run } */
>  /* { dg-skip-if "ARC700 always has mpy option on" { arc700 } } */
>  /* { dg-skip-if "ARC600 doesn't have mpy instruction" { arc6xx } } */
> -/* { dg-options "-O2 -mmpy-option=0 -w" } */
> +/* { dg-options "-O2 -mmpy-option=0 -w -save-temps" } */
>  
>  #include <stdlib.h>
>  
> @@ -28,5 +28,4 @@ main (void)
>  }
>  
>  /* { dg-final { scan-assembler-not "mpyhu\[ \t\]" } } */
> -/* { dg-final { scan-assembler-not "@__muldi3" } } */
> -/* { dg-final { scan-assembler "@__umulsi3_highpart" } } */
> +/* { dg-final { scan-assembler "@__muldi3" } } */
> diff --git a/libgcc/config/arc/lib1funcs.S b/libgcc/config/arc/lib1funcs.S
> index 9a626022612..249dd7a7ff7 100644
> --- a/libgcc/config/arc/lib1funcs.S
> +++ b/libgcc/config/arc/lib1funcs.S
> @@ -232,6 +232,60 @@ SYM(__umulsi3_highpart):
>  #endif
>  #endif /* L_umulsidi3 */
>  
> +#ifdef L_muldi3
> +	.section .text
> +	.align 4
> +	.global SYM(__muldi3)
> +SYM(__muldi3):
> +#ifdef __LITTLE_ENDIAN__
> +        push_s blink
> +        mov_s r4,r3     ;4
> +        mov_s r5,r2     ;4
> +        mov_s r9,r0     ;4
> +        mov_s r8,r1     ;4
> +        bl.d @__umulsidi3
> +        mov_s r1,r2     ;4
> +        mov_s r6,r0     ;4
> +        mov_s r7,r1     ;4
> +        mov_s r0,r9     ;4
> +        bl.d @__mulsi3
> +        mov_s r1,r4     ;4
> +        mov_s r4,r0     ;4
> +        mov_s r1,r8     ;4
> +        bl.d @__mulsi3
> +        mov_s r0,r5     ;4
> +        pop_s blink
> +        add_s r0,r0,r4 ;2
> +        add r1,r0,r7
> +        j_s.d [blink]
> +        mov_s r0,r6     ;4
> +#else
> +	push_s  blink
> +	mov_s   r5,r3
> +	mov_s   r9,r2
> +	mov_s   r4,r1
> +	mov_s   r8,r0
> +	mov_s   r0,r1
> +	bl.d 	@__umulsidi3
> +	mov_s   r1,r3
> +	mov_s   r7,r0
> +	mov_s   r6,r1
> +	mov_s   r0,r4
> +	bl.d    @__mulsi3
> +	mov_s   r1,r9
> +	mov_s   r4,r0
> +	mov_s   r1,r8
> +	bl.d    @__mulsi3
> +	mov_s   r0,r5
> +	pop_s   blink
> +	add_s   r0,r0,r4
> +	add_s   r0,r0,r7
> +	j_s.d   [blink]
> +	mov_s   r1,r6
> +#endif /* __LITTLE_ENDIAN__ */
> +ENDFUNC(__muldi3)
> +#endif /* L_muldi3 */
> +
>  #ifdef  L_umulsi3_highpart
>  #include "ieee-754/arc-ieee-754.h"
>  /* For use without a barrel shifter, and for ARC700 / ARC_MUL64, the
> diff --git a/libgcc/config/arc/t-arc b/libgcc/config/arc/t-arc
> index ad30fdb1db7..c79bc1cfdc4 100644
> --- a/libgcc/config/arc/t-arc
> +++ b/libgcc/config/arc/t-arc
> @@ -21,7 +21,7 @@
>  
>  CROSS_LIBGCC1 = libgcc1-asm.a
>  LIB1ASMSRC = arc/lib1funcs.S
> -LIB1ASMFUNCS = _mulsi3 _umulsidi3  _umulsi3_highpart \
> +LIB1ASMFUNCS = _mulsi3 _umulsidi3  _umulsi3_highpart _muldi3 \
>    _udivsi3 _divsi3 _umodsi3 _modsi3 \
>    _divmod_tools _clzsi2 \
>    _millicodethunk_st _millicodethunk_ld _millicodethunk_ret \
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/6] [ARC] Add BI/BIH instruction support.
  2018-10-10  8:49 ` [PATCH 3/6] [ARC] Add BI/BIH instruction support Claudiu Zissulescu
@ 2018-10-16 23:19   ` Andrew Burgess
  2018-10-17 17:21     ` Claudiu Zissulescu
  2018-10-31 12:59     ` claziss
  2018-10-17  7:19   ` Sandra Loosemore
  1 sibling, 2 replies; 25+ messages in thread
From: Andrew Burgess @ 2018-10-16 23:19 UTC (permalink / raw)
  To: Claudiu Zissulescu; +Cc: gcc-patches, fbedard, claziss

* Claudiu Zissulescu <claziss@gmail.com> [2018-10-10 11:00:13 +0300]:

> Use BI/BIH instruction to implement casesi pattern. Only ARC V2.

This removes the compact-casesi as an option for earlier ARC, right?
Was there a reason why that had to be done?

> 
> gcc/
> 2018-03-21  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* config/arc/arc.c (arc_override_options): Remove
> 	TARGET_COMPACT_CASESI.
> 	* config/arc/arc.h (ASM_OUTPUT_ADDR_DIFF_ELT): Update.
> 	(CASE_VECTOR_MODE): Likewise.
> 	(CASE_VECTOR_PC_RELATIVE): Likewise.
> 	(CASE_VECTOR_SHORTEN_MODE): Likewise.
> 	(CASE_VECTOR_SHORTEN_MODE1): Delete.
> 	(ADDR_VEC_ALIGN): Update.
> 	(ASM_OUTPUT_CASE_LABEL): Undefine.
> 	(ASM_OUTPUT_BEFORE_CASE_LABEL): Undefine.
> 	(TARGET_BI_BIH): Define.
> 	(DEFAULT_BRANCH_INDEX): Likewise.
> 	* config/arc/arc.md (casesi): Rework to accept BI/BIH
> 	instructions, remove compact_casesi use case.
> 	(casesi_compact_jump): Remove.
> 	(casesi_dispatch): New pattern.
> 	* config/arc/arc.opt: Add mbranch-index option. Deprecate
> 	compact_casesi option.
> 	* doc/invoke.texi: Document mbranch-index option.

I guess if you feel that dropping compact-casesi support for earlier
targets is appropriate, then that's fine.  There's some formatting
issues I point out below.  But otherwise seems reasonable.

Thanks,
Andrew

> 
> gcc/testsuite
> Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* gcc.target/arc/jumptable.c: New test.
> ---
>  gcc/config/arc/arc.c                     |  19 --
>  gcc/config/arc/arc.h                     | 106 ++++++-----
>  gcc/config/arc/arc.md                    | 218 +++++++----------------
>  gcc/config/arc/arc.opt                   |   6 +-
>  gcc/doc/invoke.texi                      |   9 +-
>  gcc/testsuite/gcc.target/arc/jumptable.c |  34 ++++
>  6 files changed, 171 insertions(+), 221 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arc/jumptable.c
> 
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index 56f566795ff..18dd0de6af7 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -1291,33 +1291,14 @@ arc_override_options (void)
>    if (arc_size_opt_level == 3)
>      optimize_size = 1;
>  
> -  /* Compact casesi is not a valid option for ARCv2 family.  */
> -  if (TARGET_V2)
> -    {
> -      if (TARGET_COMPACT_CASESI)
> -	{
> -	  warning (OPT_mcompact_casesi,
> -		   "compact-casesi is not applicable to ARCv2");
> -	  TARGET_COMPACT_CASESI = 0;
> -	}
> -    }
> -  else if (optimize_size == 1
> -	   && !global_options_set.x_TARGET_COMPACT_CASESI)
> -    TARGET_COMPACT_CASESI = 1;
> -
>    if (flag_pic)
>      target_flags |= MASK_NO_SDATA_SET;
>  
>    if (flag_no_common == 255)
>      flag_no_common = !TARGET_NO_SDATA_SET;
>  
> -  /* TARGET_COMPACT_CASESI needs the "q" register class.  */
>    if (TARGET_MIXED_CODE)
>      TARGET_Q_CLASS = 1;
> -  if (!TARGET_Q_CLASS)
> -    TARGET_COMPACT_CASESI = 0;
> -  if (TARGET_COMPACT_CASESI)
> -    TARGET_CASE_VECTOR_PC_RELATIVE = 1;
>  
>    /* Check for small data option */
>    if (!global_options_set.x_g_switch_value && !TARGET_NO_SDATA_SET)
> diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
> index dd78a6bbbd1..cb48b85d6e7 100644
> --- a/gcc/config/arc/arc.h
> +++ b/gcc/config/arc/arc.h
> @@ -1264,25 +1264,39 @@ do {							\
>  } while (0)
>  
>  /* This is how to output an element of a case-vector that is relative.  */
> -#define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL) \
> -do {							\
> -  char label[30];					\
> -  ASM_GENERATE_INTERNAL_LABEL (label, "L", VALUE);	\
> -  switch (GET_MODE (BODY))				\
> -    {							\
> -    case E_QImode: fprintf (FILE, "\t.byte "); break;	\
> -    case E_HImode: fprintf (FILE, "\t.hword "); break;	\
> -    case E_SImode: fprintf (FILE, "\t.word "); break;	\
> -    default: gcc_unreachable ();			\
> -    }							\
> -  assemble_name (FILE, label);				\
> -  fprintf (FILE, "-");					\
> -  ASM_GENERATE_INTERNAL_LABEL (label, "L", REL);	\
> -  assemble_name (FILE, label);				\
> -  if (TARGET_COMPACT_CASESI)				\
> -    fprintf (FILE, " + %d", 4 + arc_get_unalign ());	\
> -  fprintf(FILE, "\n");                                  \
> -} while (0)
> +#define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL)	\
> +  do {								\
> +    char label[30];						\
> +    ASM_GENERATE_INTERNAL_LABEL (label, "L", VALUE);		\
> +    if (!TARGET_BI_BIH)						\
> +      {								\
> +	switch (GET_MODE (BODY))				\
> +	  {							\
> +	  case E_QImode: fprintf (FILE, "\t.byte "); break;	\
> +	  case E_HImode: fprintf (FILE, "\t.hword "); break;	\
> +	  case E_SImode: fprintf (FILE, "\t.word "); break;	\
> +	  default: gcc_unreachable ();				\
> +	  }							\
> +	assemble_name (FILE, label);				\
> +	fprintf (FILE, "-");					\
> +	ASM_GENERATE_INTERNAL_LABEL (label, "L", REL);		\
> +	assemble_name (FILE, label);				\
> +	fprintf(FILE, "\n");					\

Missing whitespace before (.

> +      } else {							\

Split the '} else {' over separate lines.

> +      switch (GET_MODE (BODY))					\
> +	{							\
> +	case E_SImode: fprintf (FILE, "\tb\t@"); break;		\
> +	case E_HImode:						\
> +	case E_QImode: fprintf (FILE, "\tb_s\t@"); break;	\
> +	default: gcc_unreachable ();				\
> +	}							\
> +      assemble_name (FILE, label);				\
> +      fprintf(FILE, "\n");					\

Missing whitespace before (.

> +    }								\
> +  } while (0)
> +
> +/* Defined to also emit an .align in elfos.h.  We don't want that.  */
> +#undef ASM_OUTPUT_CASE_LABEL
>  
>  /* ADDR_DIFF_VECs are in the text section and thus can affect the
>     current alignment.  */
> @@ -1380,36 +1394,34 @@ do { \
>     for the index in the tablejump instruction.
>     If we have pc relative case vectors, we start the case vector shortening
>     with QImode.  */
> -#define CASE_VECTOR_MODE \
> -  ((optimize && (CASE_VECTOR_PC_RELATIVE || flag_pic)) ? QImode : Pmode)
> +#define CASE_VECTOR_MODE						\
> +  (TARGET_BI_BIH ? SImode						\
> +   : (optimize && (CASE_VECTOR_PC_RELATIVE || flag_pic)) ? QImode : Pmode)
>  
>  /* Define as C expression which evaluates to nonzero if the tablejump
>     instruction expects the table to contain offsets from the address of the
>     table.
>     Do not define this if the table should contain absolute addresses.  */
> -#define CASE_VECTOR_PC_RELATIVE TARGET_CASE_VECTOR_PC_RELATIVE
> -
> -#define CASE_VECTOR_SHORTEN_MODE(MIN_OFFSET, MAX_OFFSET, BODY) \
> -  CASE_VECTOR_SHORTEN_MODE_1 \
> -    (MIN_OFFSET, TARGET_COMPACT_CASESI ? MAX_OFFSET + 6 : MAX_OFFSET, BODY)
> -
> -#define CASE_VECTOR_SHORTEN_MODE_1(MIN_OFFSET, MAX_OFFSET, BODY) \
> -((MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 255 \
> - ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, QImode) \
> - : (MIN_OFFSET) >= -128 && (MAX_OFFSET) <= 127 \
> - ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, QImode) \
> - : (MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 65535 \
> - ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, HImode) \
> - : (MIN_OFFSET) >= -32768 && (MAX_OFFSET) <= 32767 \
> - ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, HImode) \
> - : SImode)
> -
> -#define ADDR_VEC_ALIGN(VEC_INSN) \
> -  (exact_log2 (GET_MODE_SIZE (as_a <scalar_int_mode> \
> -			      (GET_MODE (PATTERN (VEC_INSN))))))
> -#undef ASM_OUTPUT_BEFORE_CASE_LABEL
> -#define ASM_OUTPUT_BEFORE_CASE_LABEL(FILE, PREFIX, NUM, TABLE) \
> -  ASM_OUTPUT_ALIGN ((FILE), ADDR_VEC_ALIGN (TABLE))
> +#define CASE_VECTOR_PC_RELATIVE					\
> +  (TARGET_CASE_VECTOR_PC_RELATIVE || TARGET_BI_BIH)
> +
> +#define CASE_VECTOR_SHORTEN_MODE(MIN_OFFSET, MAX_OFFSET, BODY)		\
> +  (TARGET_BI_BIH ?						\
> +   ((MIN_OFFSET) >= -512 && (MAX_OFFSET) <= 508 ? HImode : SImode)	\
> +   : ((MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 255				\
> +      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, QImode)	\
> +      : (MIN_OFFSET) >= -128 && (MAX_OFFSET) <= 127			\
> +      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, QImode)	\
> +      : (MIN_OFFSET) >= 0 && (MAX_OFFSET) <= 65535			\
> +      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 1, HImode)	\
> +      : (MIN_OFFSET) >= -32768 && (MAX_OFFSET) <= 32767			\
> +      ? (ADDR_DIFF_VEC_FLAGS (BODY).offset_unsigned = 0, HImode)	\
> +      : SImode))
> +
> +#define ADDR_VEC_ALIGN(VEC_INSN)					\
> +  (TARGET_BI_BIH ? 0							\
> +   : exact_log2 (GET_MODE_SIZE (as_a <scalar_int_mode>			\
> +				(GET_MODE (PATTERN (VEC_INSN))))))
>  
>  #define INSN_LENGTH_ALIGNMENT(INSN)		  \
>    ((JUMP_TABLE_DATA_P (INSN)			  \
> @@ -1636,4 +1648,10 @@ enum
>  #define TARGET_LRA arc_lra_p()
>  #endif
>  
> +/* BI/BIH feature macro.  */
> +#define TARGET_BI_BIH (TARGET_BRANCH_INDEX && TARGET_CODE_DENSITY)
> +
> +/* The default option for BI/BIH instructions.  */
> +#define DEFAULT_BRANCH_INDEX 0
> +
>  #endif /* GCC_ARC_H */
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index 6ea67791627..1ed230fa5f0 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -3968,60 +3968,72 @@ archs4x, archs4xd, archs4xd_slow"
>     (set_attr "cond" "canuse,canuse_limm,canuse,canuse,canuse")])
>  
>  ;; Implement a switch statement.
> -
>  (define_expand "casesi"
> -  [(set (match_dup 5)
> -	(minus:SI (match_operand:SI 0 "register_operand" "")
> -		  (match_operand:SI 1 "nonmemory_operand" "")))
> -   (set (reg:CC CC_REG)
> -	(compare:CC (match_dup 5)
> -		    (match_operand:SI 2 "nonmemory_operand" "")))
> -   (set (pc)
> -	(if_then_else (gtu (reg:CC CC_REG)
> -			   (const_int 0))
> -		      (label_ref (match_operand 4 "" ""))
> -		      (pc)))
> -   (set (match_dup 6)
> -	(unspec:SI [(match_operand 3 "" "")
> -		    (match_dup 5) (match_dup 7)] UNSPEC_ARC_CASESI))
> -   (parallel [(set (pc) (match_dup 6)) (use (match_dup 7))])]
> +  [(match_operand:SI 0 "register_operand" "")	; Index
> +   (match_operand:SI 1 "const_int_operand" "")	; Lower bound
> +   (match_operand:SI 2 "const_int_operand" "")	; Total range
> +   (match_operand:SI 3 "" "")		; Table label
> +   (match_operand:SI 4 "" "")]		; Out of range label
>    ""
> -  "
> -{
> -  rtx x;
> -
> -  operands[5] = gen_reg_rtx (SImode);
> -  operands[6] = gen_reg_rtx (SImode);
> -  operands[7] = operands[3];
> -  emit_insn (gen_subsi3 (operands[5], operands[0], operands[1]));
> -  emit_insn (gen_cmpsi_cc_insn_mixed (operands[5], operands[2]));
> -  x = gen_rtx_GTU (VOIDmode, gen_rtx_REG (CCmode, CC_REG), const0_rtx);
> -  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> -			    gen_rtx_LABEL_REF (VOIDmode, operands[4]), pc_rtx);
> -  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
> -  if (TARGET_COMPACT_CASESI)
> -    {
> -      emit_jump_insn (gen_casesi_compact_jump (operands[5], operands[7]));
> -    }
> -  else
> -    {
> +  {
> +   if (operands[1] != const0_rtx)
> +     {
> +       rtx reg = gen_reg_rtx (SImode);
> +       emit_insn (gen_subsi3 (reg, operands[0], operands[1]));
> +       operands[0] = reg;
> +      }

Indentation seems wonky here.

> +   emit_jump_insn (gen_cbranchsi4 (gen_rtx_GTU (SImode, operands[0],
> +							operands[2]),
> +				   operands[0], operands[2], operands[4]));
> +   if (TARGET_BI_BIH)
> +     {
> +      emit_jump_insn (gen_casesi_dispatch (operands[0], operands[3]));
> +     }

Don't think the {} are needed for a single line.

> +   else
> +   {

Shouldn't the '{' be indented? And the block below accordingly?

> +      rtx reg = gen_reg_rtx (SImode);
> +      rtx lbl = operands[3];
>        operands[3] = gen_rtx_LABEL_REF (VOIDmode, operands[3]);
> -      if (flag_pic || !cse_not_expected)
> +      if (flag_pic)
>  	operands[3] = force_reg (Pmode, operands[3]);
> -      emit_insn (gen_casesi_load (operands[6],
> -				  operands[3], operands[5], operands[7]));
> +      emit_insn (gen_casesi_load (reg,
> +				  operands[3], operands[0], lbl));
>        if (CASE_VECTOR_PC_RELATIVE || flag_pic)
> -	emit_insn (gen_addsi3 (operands[6], operands[6], operands[3]));
> -      emit_jump_insn (gen_casesi_jump (operands[6], operands[7]));
> +	emit_insn (gen_addsi3 (reg, reg, operands[3]));
> +      emit_jump_insn (gen_casesi_jump (reg, lbl));
> +     }
> +   DONE;
> +  })
> +
> +(define_insn "casesi_dispatch"
> +  [(set (pc)
> +	(unspec:SI [(match_operand:SI 0 "register_operand" "r")
> +		    (label_ref (match_operand 1 "" ""))]
> +		   UNSPEC_ARC_CASESI))]
> +  "TARGET_BI_BIH"
> +  {
> +   rtx diff_vec = PATTERN (next_nonnote_insn (as_a<rtx_insn *> (operands[1])));
> +   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
> +   switch (GET_MODE (diff_vec))
> +   {

Indent the { I think.

> +    case E_SImode:
> +     return \"bi\\t[%0]\";
> +    case E_HImode:
> +    case E_QImode:
> +    return \"bih\\t[%0]\";
> +    default: gcc_unreachable ();
>      }
> -  DONE;
> -}")
> +   }
> +  [(set_attr "type" "brcc_no_delay_slot")
> +   (set_attr "iscompact" "false")
> +   (set_attr "length" "4")])
>  
>  (define_insn "casesi_load"
> -  [(set (match_operand:SI 0 "register_operand"             "=Rcq,r,r")
> -	(unspec:SI [(match_operand:SI 1 "nonmemory_operand" "Rcq,c,Cal")
> -		    (match_operand:SI 2 "register_operand"  "Rcq,c,c")
> -		    (label_ref (match_operand 3 "" ""))] UNSPEC_ARC_CASESI))]
> +  [(set (match_operand:SI 0 "register_operand"             "=q,r,r")
> +	(mem:SI (unspec:SI [(match_operand:SI 1 "nonmemory_operand" "q,r,Cal")
> +			    (match_operand:SI 2 "register_operand"  "q,r,r")]
> +			   UNSPEC_ARC_CASESI)))
> +   (use (label_ref (match_operand 3 "" "")))]
>    ""
>    "*
>  {
> @@ -4037,15 +4049,15 @@ archs4x, archs4xd, archs4xd_slow"
>    switch (GET_MODE (diff_vec))
>      {
>      case E_SImode:
> -      return \"ld.as %0,[%1,%2]%&\";
> +      return \"ld.as\\t%0,[%1,%2]%&\";
>      case E_HImode:
>        if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
> -	return \"ld%_.as %0,[%1,%2]\";
> -      return \"ld%_.x.as %0,[%1,%2]\";
> +	return \"ld%_.as\\t%0,[%1,%2]\";
> +      return \"ld%_.x.as\\t%0,[%1,%2]\";
>      case E_QImode:
>        if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
> -	return \"ldb%? %0,[%1,%2]%&\";
> -      return \"ldb.x %0,[%1,%2]\";
> +	return \"ldb%?\\t%0,[%1,%2]%&\";
> +      return \"ldb.x\\t%0,[%1,%2]\";
>      default:
>        gcc_unreachable ();
>      }
> @@ -4085,110 +4097,6 @@ archs4x, archs4xd, archs4xd_slow"
>     (set_attr "iscompact" "false,maybe,false")
>     (set_attr "cond" "canuse")])
>  
> -(define_insn "casesi_compact_jump"
> -  [(set (pc)
> -	(unspec:SI [(match_operand:SI 0 "register_operand" "c,q")]
> -		   UNSPEC_ARC_CASESI))
> -   (use (label_ref (match_operand 1 "" "")))
> -   (clobber (match_scratch:SI 2 "=q,0"))]
> -  "TARGET_COMPACT_CASESI"
> -  "*
> -{
> -  rtx diff_vec = PATTERN (next_nonnote_insn (as_a<rtx_insn *> (operands[1])));
> -  int unalign = arc_get_unalign ();
> -  rtx xop[3];
> -  const char *s;
> -
> -  xop[0] = operands[0];
> -  xop[2] = operands[2];
> -  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
> -
> -  switch (GET_MODE (diff_vec))
> -    {
> -    case E_SImode:
> -      /* Max length can be 12 in this case, but this is OK because
> -	 2 of these are for alignment, and are anticipated in the length
> -	 of the ADDR_DIFF_VEC.  */
> -      if (unalign && !satisfies_constraint_Rcq (xop[0]))
> -	s = \"add2 %2,pcl,%0\n\tld_s %2,[%2,12]\";
> -      else if (unalign)
> -	s = \"add_s %2,%0,2\n\tld.as %2,[pcl,%2]\";
> -      else
> -	s = \"add %2,%0,2\n\tld.as %2,[pcl,%2]\";
> -      arc_clear_unalign ();
> -      break;
> -    case E_HImode:
> -      if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
> -	{
> -	  if (satisfies_constraint_Rcq (xop[0]))
> -	    {
> -	      s = \"add_s %2,%0,%1\n\tld%_.as %2,[pcl,%2]\";
> -	      xop[1] = GEN_INT ((10 - unalign) / 2U);
> -	    }
> -	  else
> -	    {
> -	      s = \"add1 %2,pcl,%0\n\tld%__s %2,[%2,%1]\";
> -	      xop[1] = GEN_INT (10 + unalign);
> -	    }
> -	}
> -      else
> -	{
> -	  if (satisfies_constraint_Rcq (xop[0]))
> -	    {
> -	      s = \"add_s %2,%0,%1\n\tld%_.x.as %2,[pcl,%2]\";
> -	      xop[1] = GEN_INT ((10 - unalign) / 2U);
> -	    }
> -	  else
> -	    {
> -	      s = \"add1 %2,pcl,%0\n\tld%__s.x %2,[%2,%1]\";
> -	      xop[1] = GEN_INT (10 + unalign);
> -	    }
> -	}
> -      arc_toggle_unalign ();
> -      break;
> -    case E_QImode:
> -      if (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned)
> -	{
> -	  if ((rtx_equal_p (xop[2], xop[0])
> -	       || find_reg_note (insn, REG_DEAD, xop[0]))
> -	      && satisfies_constraint_Rcq (xop[0]))
> -	    {
> -	      s = \"add_s %0,%0,pcl\n\tldb_s %2,[%0,%1]\";
> -	      xop[1] = GEN_INT (8 + unalign);
> -	    }
> -	  else
> -	    {
> -	      s = \"add %2,%0,pcl\n\tldb_s %2,[%2,%1]\";
> -	      xop[1] = GEN_INT (10 + unalign);
> -	      arc_toggle_unalign ();
> -	    }
> -	}
> -      else if ((rtx_equal_p (xop[0], xop[2])
> -		|| find_reg_note (insn, REG_DEAD, xop[0]))
> -	       && satisfies_constraint_Rcq (xop[0]))
> -	{
> -	  s = \"add_s %0,%0,%1\n\tldb.x %2,[pcl,%0]\";
> -	  xop[1] = GEN_INT (10 - unalign);
> -	  arc_toggle_unalign ();
> -	}
> -      else
> -	{
> -	  /* ??? Length is 12.  */
> -	  s = \"add %2,%0,%1\n\tldb.x %2,[pcl,%2]\";
> -	  xop[1] = GEN_INT (8 + unalign);
> -	}
> -      break;
> -    default:
> -      gcc_unreachable ();
> -    }
> -  output_asm_insn (s, xop);
> -  return \"add_s %2,%2,pcl\n\tj_s%* [%2]\";
> -}"
> -  [(set_attr "length" "10")
> -   (set_attr "type" "jump")
> -   (set_attr "iscompact" "true")
> -   (set_attr "cond" "nocond")])
> -
>  (define_expand "call"
>    ;; operands[1] is stack_size_rtx
>    ;; operands[2] is next_arg_register
> diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
> index ee06c063837..3e96b58375d 100644
> --- a/gcc/config/arc/arc.opt
> +++ b/gcc/config/arc/arc.opt
> @@ -328,7 +328,7 @@ Target Var(TARGET_CASE_VECTOR_PC_RELATIVE)
>  Use pc-relative switch case tables - this enables case table shortening.
>  
>  mcompact-casesi
> -Target Var(TARGET_COMPACT_CASESI)
> +Target Warn(%qs is deprecated)
>  Enable compact casesi pattern.
>  
>  mq-class
> @@ -528,3 +528,7 @@ Enum(arc_lpc) String(32) Value(32)
>  mrf16
>  Target Report Mask(RF16)
>  Enable 16-entry register file.
> +
> +mbranch-index
> +Target Report Var(TARGET_BRANCH_INDEX) Init(DEFAULT_BRANCH_INDEX)
> +Enable use of BI/BIH instructions when available.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 802cc642453..454587310c8 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -650,7 +650,7 @@ Objective-C and Objective-C++ Dialects}.
>  -mmixed-code  -mq-class  -mRcq  -mRcw  -msize-level=@var{level} @gol
>  -mtune=@var{cpu}  -mmultcost=@var{num} @gol
>  -munalign-prob-threshold=@var{probability}  -mmpy-option=@var{multo} @gol
> --mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16}
> +-mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16 -mbranch-index}
>  
>  @emph{ARM Options}
>  @gccoptlist{-mapcs-frame  -mno-apcs-frame @gol
> @@ -15814,6 +15814,11 @@ This option instructs the compiler to generate code for a 16-entry
>  register file.  This option defines the @code{__ARC_RF16__}
>  preprocessor macro.
>  
> +@item -mbranch-index
> +@opindex mbranch-index
> +Enable use of @code{bi} or @code{bih} instructions to implement jump
> +tables.
> +
>  @end table
>  
>  The following options are passed through to the assembler, and also
> @@ -15985,7 +15990,7 @@ This is the default for @option{-Os}.
>  @item -mcompact-casesi
>  @opindex mcompact-casesi
>  Enable compact @code{casesi} pattern.  This is the default for @option{-Os},
> -and only available for ARCv1 cores.
> +and only available for ARCv1 cores.  This option is deprecated.
>  
>  @item -mno-cond-exec
>  @opindex mno-cond-exec
> diff --git a/gcc/testsuite/gcc.target/arc/jumptable.c b/gcc/testsuite/gcc.target/arc/jumptable.c
> new file mode 100644
> index 00000000000..fbc58e33149
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arc/jumptable.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { arc700 || arc6xx } } */
> +/* { dg-options "-O2 -mbranch-index -mcode-density" { target { arcem || archs } } } */
> +
> +extern void max( int,int);
> +
> +int switchCase(int value, int b)
> +{
> +  switch(value){
> +  case 100:
> +    value = b * value;
> +    break;
> +  case 101:
> +    value = b << value;
> +    break;
> +  case 102:
> +    value = b / value;
> +    break;
> +  case 103:
> +    value = b >> value;
> +    break;
> +  case 104:
> +    value = b + value;
> +    break;
> +  case 105:
> +    value = b - value;
> +    break;
> +  }
> +  max(value, b);
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "bih" 1 } } */
> +/* { dg-final { scan-assembler-times "b_s" 8 } } */
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/6] [ARC] Add BI/BIH instruction support.
  2018-10-10  8:49 ` [PATCH 3/6] [ARC] Add BI/BIH instruction support Claudiu Zissulescu
  2018-10-16 23:19   ` Andrew Burgess
@ 2018-10-17  7:19   ` Sandra Loosemore
  2018-10-31 12:31     ` claziss
  1 sibling, 1 reply; 25+ messages in thread
From: Sandra Loosemore @ 2018-10-17  7:19 UTC (permalink / raw)
  To: Claudiu Zissulescu, gcc-patches; +Cc: andrew.burgess, fbedard, claziss

On 10/10/2018 02:00 AM, Claudiu Zissulescu wrote:
> Use BI/BIH instruction to implement casesi pattern. Only ARC V2.

Very minor nit in the documentation part of this patch:

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 802cc642453..454587310c8 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -650,7 +650,7 @@ Objective-C and Objective-C++ Dialects}.
>   -mmixed-code  -mq-class  -mRcq  -mRcw  -msize-level=@var{level} @gol
>   -mtune=@var{cpu}  -mmultcost=@var{num} @gol
>   -munalign-prob-threshold=@var{probability}  -mmpy-option=@var{multo} @gol
> --mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16}
> +-mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16 -mbranch-index}
>   
>   @emph{ARM Options}
>   @gccoptlist{-mapcs-frame  -mno-apcs-frame @gol

Please consistently use 2 spaces between options listed on the same line 
in these option summary tables.  It makes the output a little easier to 
read.

-Sandra

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH 3/6] [ARC] Add BI/BIH instruction support.
  2018-10-16 23:19   ` Andrew Burgess
@ 2018-10-17 17:21     ` Claudiu Zissulescu
  2018-10-31 12:59     ` claziss
  1 sibling, 0 replies; 25+ messages in thread
From: Claudiu Zissulescu @ 2018-10-17 17:21 UTC (permalink / raw)
  To: Andrew Burgess; +Cc: gcc-patches, francois.bedard

Hi,

> 
> This removes the compact-casesi as an option for earlier ARC, right?
> Was there a reason why that had to be done?
> 

The compact-casesi was only designed for ARCv1 types of CPUs. Unfortunately, it was error prone leading to all kinds of runtime and compile time errors. Fixing it will nullify most of the advantages of using this format. Hence, I've removed it, leaving the standard implementation for all CPUs.
The BI/BIH are two new instructions for ARCv2, and they should be friendly with the cache, but I couldn't prove that, hence they are guarded by an option.

Please let me know if you want to have compact-casesi back in the compiler.

Thanks,
Claudiu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads
  2018-10-10  8:01 ` [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads Claudiu Zissulescu
@ 2018-10-22 18:15   ` Andrew Burgess
  2018-10-22 23:29     ` Bernhard Reutner-Fischer
  0 siblings, 1 reply; 25+ messages in thread
From: Andrew Burgess @ 2018-10-22 18:15 UTC (permalink / raw)
  To: Claudiu Zissulescu; +Cc: gcc-patches, fbedard, claziss

* Claudiu Zissulescu <claziss@gmail.com> [2018-10-10 11:00:14 +0300]:

> Simple peephole rules which combines multiple ld/st instructions into
> 64-bit load/store instructions. It only works for architectures which
> are having double load/store option on.
> 
> gcc/
> 	Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* config/arc/arc-protos.h (gen_operands_ldd_std): Add.
> 	* config/arc/arc.c (operands_ok_ldd_std): New function.
> 	(mem_ok_for_ldd_std): Likewise.
> 	(gen_operands_ldd_std): Likewise.
> 	* config/arc/arc.md: Add peephole2 rules for std/ldd.
> ---
>  gcc/config/arc/arc-protos.h |   1 +
>  gcc/config/arc/arc.c        | 163 ++++++++++++++++++++++++++++++++++++
>  gcc/config/arc/arc.md       |  67 +++++++++++++++
>  3 files changed, 231 insertions(+)
> 
> diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h
> index 24bea6e1efb..55f8ed4c643 100644
> --- a/gcc/config/arc/arc-protos.h
> +++ b/gcc/config/arc/arc-protos.h
> @@ -46,6 +46,7 @@ extern int arc_return_address_register (unsigned int);
>  extern unsigned int arc_compute_function_type (struct function *);
>  extern bool arc_is_uncached_mem_p (rtx);
>  extern bool arc_lra_p (void);
> +extern bool gen_operands_ldd_std (rtx *operands, bool load, bool commute);
>  #endif /* RTX_CODE */
>  
>  extern unsigned int arc_compute_frame_size (int);
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index 18dd0de6af7..9bc69e9fbc9 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -10803,6 +10803,169 @@ arc_cannot_substitute_mem_equiv_p (rtx)
>    return true;
>  }
>  
> +/* Checks whether the operands are valid for use in an LDD/STD
> +   instruction.	 Assumes that RT, RT2, and RN are REG.	This is
> +   guaranteed by the patterns.	Assumes that the address in the base
> +   register RN is word aligned.	 Pattern guarantees that both memory
> +   accesses use the same base register, the offsets are constants
> +   within the range, and the gap between the offsets is 4.  If preload
> +   complete then check that registers are legal.  WBACK indicates
> +   whether address is updated.	*/

You've got tabs instead of whitespace inside both this comment block,
and others within this patch.  It should be a period and two spaces
at the end of each sentence.

> +
> +static bool
> +operands_ok_ldd_std (rtx rt, rtx rt2, rtx rn ATTRIBUTE_UNUSED,
> +		    HOST_WIDE_INT offset)

Why have the RN parameter at all?  I took a quick look through patches
5/6 and don't see any additional changes to this function, we should
probably just drop this at this point.

> +{
> +  unsigned int t, t2;
> +
> +  if (!reload_completed)
> +    return true;
> +
> +  if (!(SMALL_INT_RANGE (offset, (GET_MODE_SIZE (DImode) - 1) & -4,

Couldn't we use (~0x3) instead of -4?  Maybe I'm just feeling slow
today, but the bit patterns for negative numbers don't just pop into my
head like those for positive numbers.

> +			 (offset & (GET_MODE_SIZE (DImode) - 1) & 3
> +			  ? 0 : -(-GET_MODE_SIZE (DImode) | -4) >> 1))))
> +    return false;
> +
> +  t = REGNO (rt);
> +  t2 = REGNO (rt2);
> +
> +  if ((t2 == 63)

Can we use PROGRAM_COUNTER_REGNO here?

> +      || (t % 2 != 0)	/* First destination register is not even.  */
> +      || (t2 != t + 1))
> +      return false;
> +
> +  return true;
> +}
> +
> +/* Helper for gen_operands_ldd_std.  Returns true iff the memory
> +   operand MEM's address contains an immediate offset from the base
> +   register and has no side effects, in which case it sets BASE and
> +   OFFSET accordingly.	*/
> +
> +static bool
> +mem_ok_for_ldd_std (rtx mem, rtx *base, rtx *offset)
> +{
> +  rtx addr;
> +
> +  gcc_assert (base != NULL && offset != NULL);
> +
> +  /* TODO: Handle more general memory operand patterns, such as
> +     PRE_DEC and PRE_INC.  */
> +
> +  if (side_effects_p (mem))
> +    return false;
> +
> +  /* Can't deal with subregs.  */
> +  if (GET_CODE (mem) == SUBREG)
> +    return false;
> +
> +  gcc_assert (MEM_P (mem));
> +
> +  *offset = const0_rtx;
> +
> +  addr = XEXP (mem, 0);
> +
> +  /* If addr isn't valid for DImode, then we can't handle it.  */
> +  if (!arc_legitimate_address_p (DImode, addr,
> +				reload_in_progress || reload_completed))
> +    return false;
> +
> +  if (REG_P (addr))
> +    {
> +      *base = addr;
> +      return true;
> +    }
> +  else if (GET_CODE (addr) == PLUS || GET_CODE (addr) == MINUS)
> +    {
> +      *base = XEXP (addr, 0);
> +      *offset = XEXP (addr, 1);
> +      return (REG_P (*base) && CONST_INT_P (*offset));
> +    }
> +
> +  return false;
> +}
> +
> +/* Called from peephole2 to replace two word-size accesses with a
> +   single LDD/STD instruction.	Returns true iff we can generate a new
> +   instruction sequence.  That is, both accesses use the same base
> +   register and the gap between constant offsets is 4.	OPERANDS are
> +   the operands found by the peephole matcher; OPERANDS[0,1] are
> +   register operands, and OPERANDS[2,3] are the corresponding memory
> +   operands.  LOAD indicates whether the access is load or store.  */
> +
> +bool
> +gen_operands_ldd_std (rtx *operands, bool load, bool commute)
> +{
> +  int i, gap;
> +  HOST_WIDE_INT offsets[2], offset;
> +  int nops = 2;
> +  rtx cur_base, cur_offset, tmp;
> +  rtx base = NULL_RTX;
> +
> +  /* Check that the memory references are immediate offsets from the
> +     same base register.  Extract the base register, the destination
> +     registers, and the corresponding memory offsets.  */
> +  for (i = 0; i < nops; i++)
> +    {
> +      if (!mem_ok_for_ldd_std (operands[nops+i], &cur_base, &cur_offset))
> +	return false;
> +
> +      if (i == 0)
> +	base = cur_base;
> +      else if (REGNO (base) != REGNO (cur_base))
> +	return false;
> +
> +      offsets[i] = INTVAL (cur_offset);
> +      if (GET_CODE (operands[i]) == SUBREG)
> +	{
> +	  tmp = SUBREG_REG (operands[i]);
> +	  gcc_assert (GET_MODE (operands[i]) == GET_MODE (tmp));
> +	  operands[i] = tmp;
> +	}
> +    }
> +
> +  /* Make sure there is no dependency between the individual loads.  */
> +  if (load && REGNO (operands[0]) == REGNO (base))
> +    return false; /* RAW */
> +
> +  if (load && REGNO (operands[0]) == REGNO (operands[1]))
> +    return false; /* WAW */
> +
> +  /* Make sure the instructions are ordered with lower memory access first.  */
> +  if (offsets[0] > offsets[1])
> +    {
> +      gap = offsets[0] - offsets[1];
> +      offset = offsets[1];
> +
> +      /* Swap the instructions such that lower memory is accessed first.  */
> +      std::swap (operands[0], operands[1]);
> +      std::swap (operands[2], operands[3]);
> +    }
> +  else
> +    {
> +      gap = offsets[1] - offsets[0];
> +      offset = offsets[0];
> +    }
> +
> +  /* Make sure accesses are to consecutive memory locations.  */
> +  if (gap != 4)
> +    return false;
> +
> +  /* Make sure we generate legal instructions.	*/
> +  if (operands_ok_ldd_std (operands[0], operands[1], base, offset))
> +    return true;
> +
> +  if (load && commute)
> +    {
> +      /* Try reordering registers.  */
> +      std::swap (operands [0], operands[1]);
> +      if (operands_ok_ldd_std (operands[0], operands[1], base, offset))
> +	return true;
> +    }
> +
> +  return false;
> +}
> +
>  #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
>  #define TARGET_USE_ANCHORS_FOR_SYMBOL_P arc_use_anchors_for_symbol_p
>  
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index 1ed230fa5f0..b968022e64a 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -6363,6 +6363,73 @@ archs4x, archs4xd, archs4xd_slow"
>    [(set (reg:CC CC_REG) (compare:CC (match_dup 3)
>  				    (ashift:SI (match_dup 1) (match_dup 2))))])
>  
> +(define_peephole2 ; std
> +[(set (match_operand:SI 2 "memory_operand" "")
> +      (match_operand:SI 0 "register_operand" ""))
> + (set (match_operand:SI 3 "memory_operand" "")
> +      (match_operand:SI 1 "register_operand" ""))]
> + "TARGET_LL64"
> + [(const_int 0)]
> +{
> + if (!gen_operands_ldd_std (operands, false, false))
> +   FAIL;
> + operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
> + operands[2] = adjust_address (operands[2], DImode, 0);
> + emit_insn (gen_rtx_SET (operands[2], operands[0]));
> + DONE;
> + })
> +
> +(define_peephole2 ; ldd
> +  [(set (match_operand:SI 0 "register_operand" "")
> +        (match_operand:SI 2 "memory_operand" ""))
> +   (set (match_operand:SI 1 "register_operand" "")
> +        (match_operand:SI 3 "memory_operand" ""))]
> +  "TARGET_LL64"
> +  [(const_int 0)]
> +{
> +  if (!gen_operands_ldd_std (operands, true, false))
> +    FAIL;
> +  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
> +  operands[2] = adjust_address (operands[2], DImode, 0);
> +  emit_insn (gen_rtx_SET (operands[0], operands[2]));
> +  DONE;
> +})
> +
> +;; We require consecutive registers for LDD instruction.  Check if we
> +;; can reorder them and use an LDD.
> +
> +(define_peephole2 ; swap the destination registers of two loads
> +		  ; before a commutative operation.
> +  [(set (match_operand:SI 0 "register_operand" "")
> +        (match_operand:SI 2 "memory_operand" ""))
> +   (set (match_operand:SI 1 "register_operand" "")
> +        (match_operand:SI 3 "memory_operand" ""))
> +   (set (match_operand:SI 4 "register_operand" "")
> +        (match_operator:SI 5 "commutative_operator"
> +			   [(match_operand 6 "register_operand" "")
> +			    (match_operand 7 "register_operand" "") ]))]
> +  "TARGET_LL64
> +   && (((rtx_equal_p(operands[0], operands[6]))
> +         && (rtx_equal_p(operands[1], operands[7])))
> +        || ((rtx_equal_p(operands[0], operands[7]))
> +             && (rtx_equal_p(operands[1], operands[6]))))
> +   && (peep2_reg_dead_p (3, operands[0]) || rtx_equal_p (operands[0], operands[4]))
> +   && (peep2_reg_dead_p (3, operands[1]) || rtx_equal_p (operands[1], operands[4]))"
> +  [(set (match_dup 0) (match_dup 2))
> +   (set (match_dup 4) (match_op_dup 5 [(match_dup 6) (match_dup 7)]))]
> +  {
> +    if (!gen_operands_ldd_std (operands, true, true))
> +     {
> +        FAIL;
> +     }
> +    else
> +     {
> +        operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
> +        operands[2] = adjust_address (operands[2], DImode, 0);
> +     }
> +   }
> +)
> +
>  ;; include the arc-FPX instructions
>  (include "fpx.md")
>  
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 5/6] [ARC] Refurbish and improve prologue/epilogue functions.
  2018-10-10  9:05 ` [PATCH 5/6] [ARC] Refurbish and improve prologue/epilogue functions Claudiu Zissulescu
@ 2018-10-22 18:26   ` Andrew Burgess
  0 siblings, 0 replies; 25+ messages in thread
From: Andrew Burgess @ 2018-10-22 18:26 UTC (permalink / raw)
  To: Claudiu Zissulescu; +Cc: gcc-patches, fbedard, claziss

* Claudiu Zissulescu <claziss@gmail.com> [2018-10-10 11:00:15 +0300]:

> Reimplement how the prologue and epilogue are emitted to accommodate
> enter/leave instructions, as well as improving the size of the
> existing techniques.
> 
> The following modifications are added:
> 
> - millicode thunk calls can be now selected regardless of the
>   optimization level. However they are enabled for size optimizations
>   by default.  Also, the millicode optimization is turned off when we
>   compile for long jumps.
> 
> - the compiler is able to use enter/leave instructions for prologue
>   and epilogue. As these instructions are not ABI compatible we guard
>   them under a switch (i.e., -mcode-density-frame). When this option
>   is on, the compiler will try emitting enter/leave instructions, if
>   not, then millicode thunk calls (if enabled), and lastly the regular
>   push/pop instructions.
> 
> - The prologue/epilogue is now optimized to use pointer walks, hence
>   improving the chance to have push_s/pop_s instructions emitted. It
>   also tries to combine the stack adjustments with load/store
>   operations.
> 
> gcc/
> xxxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* common/config/arc/arc-common.c (arc_option_optimization_table):
> 	Millicode optimization is default on for size optimizations.
> 	* config/arc/arc-protos.h (arc_check_multi): New function.
> 	* config/arc/arc.c (RTX_OK_FOR_OFFSET_P): Rearange.
> 	(arc_override_options): Disable millicode when long calls option
> 	is on.
> 	(arc_frame_info): Change it from int to bool.
> 	(arc_compute_frame_size): Clean up.
> 	(arc_save_restore): Remove.
> 	(frame_save_reg): New function.
> 	(frame_restore_reg): Likewise.
> 	(arc_enter_leave_p): Likewise.
> 	(arc_save_callee_saves): Likewise.
> 	(arc_restore_callee_saves): Likewise.
> 	(arc_save_callee_enter): Likewise.
> 	(arc_restore_callee_leave): Likewise.
> 	(arc_save_callee_milli): Likewise.
> 	(arc_restore_callee_milli): Likewise.
> 	(arc_expand_prologue): Reimplement to emit enter/leave
> 	instructions.
> 	(arc_expand_epilogue): Likewise.
> 	(arc_check_multi): New function.
> 	* config/arc/arc.md (push_multi_fp): New pattern.
> 	(push_multi_fp_blink): Likewise.
> 	(pop_multi_fp): Likewise.
> 	(pop_multi_fp_blink): Likewise.
> 	(pop_multi_fp_ret): Likewise.
> 	(pop_multi_fp_blink_ret): Likewise.
> 	* config/arc/arc.opt (mmillicode): Update option.
> 	(mcode-density-frame): New option.
> 	* config/arc/predicates.md (push_multi_operand): New predicate.
> 	(pop_multi_operand): Likewise.
> 	* doc/invoke.texi (ARC): Update ARC options information.
> 
> gcc/testsuite
> xxxxx-xx-xx  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* gcc.target/arc/firq-1.c: Update test.
> 	* gcc.target/arc/firq-3.c: Likewise.
> 	* gcc.target/arc/firq-4.c: Likewise.
> 	* gcc.target/arc/interrupt-6.c: Likewise.

In principle this is fine.  There's missing comments, and magic
constants, and other miscellaneous formatting issues throughout the
patch.

I ran out of steam about half way through arc.c but you'll see the
pattern by then :)

Thanks
Andrew


> ---
>  gcc/common/config/arc/arc-common.c         |    1 +
>  gcc/config/arc/arc-protos.h                |    1 +
>  gcc/config/arc/arc.c                       | 1266 +++++++++++++-------
>  gcc/config/arc/arc.md                      |  172 +++
>  gcc/config/arc/arc.opt                     |   10 +-
>  gcc/config/arc/predicates.md               |   12 +
>  gcc/doc/invoke.texi                        |   18 +-
>  gcc/testsuite/gcc.target/arc/firq-1.c      |    8 +-
>  gcc/testsuite/gcc.target/arc/firq-3.c      |   14 +-
>  gcc/testsuite/gcc.target/arc/firq-4.c      |   12 +-
>  gcc/testsuite/gcc.target/arc/interrupt-6.c |    2 +-
>  11 files changed, 1054 insertions(+), 462 deletions(-)
> 
> diff --git a/gcc/common/config/arc/arc-common.c b/gcc/common/config/arc/arc-common.c
> index 578431a279d..2872388de2c 100644
> --- a/gcc/common/config/arc/arc-common.c
> +++ b/gcc/common/config/arc/arc-common.c
> @@ -59,6 +59,7 @@ static const struct default_options arc_option_optimization_table[] =
>      { OPT_LEVELS_SIZE, OPT_mq_class, NULL, 1 },
>      { OPT_LEVELS_SIZE, OPT_mcase_vector_pcrel, NULL, 1 },
>      { OPT_LEVELS_SIZE, OPT_msize_level_, NULL, 3 },
> +    { OPT_LEVELS_SIZE, OPT_mmillicode, NULL, 1 },
>      { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
>      { OPT_LEVELS_3_PLUS_SPEED_ONLY, OPT_msize_level_, NULL, 0 },
>      { OPT_LEVELS_3_PLUS_SPEED_ONLY, OPT_malign_call, NULL, 1 },
> diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h
> index 55f8ed4c643..6450b6a014e 100644
> --- a/gcc/config/arc/arc-protos.h
> +++ b/gcc/config/arc/arc-protos.h
> @@ -47,6 +47,7 @@ extern unsigned int arc_compute_function_type (struct function *);
>  extern bool arc_is_uncached_mem_p (rtx);
>  extern bool arc_lra_p (void);
>  extern bool gen_operands_ldd_std (rtx *operands, bool load, bool commute);
> +extern bool arc_check_multi (rtx, bool);
>  #endif /* RTX_CODE */
>  
>  extern unsigned int arc_compute_frame_size (int);
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index 9bc69e9fbc9..ab7735d6b38 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -89,12 +89,12 @@ HARD_REG_SET overrideregs;
>  
>  /* ??? Loads can handle any constant, stores can only handle small ones.  */
>  /* OTOH, LIMMs cost extra, so their usefulness is limited.  */
> -#define RTX_OK_FOR_OFFSET_P(MODE, X) \
> -(GET_CODE (X) == CONST_INT \
> - && SMALL_INT_RANGE (INTVAL (X), (GET_MODE_SIZE (MODE) - 1) & -4, \
> -		     (INTVAL (X) & (GET_MODE_SIZE (MODE) - 1) & 3 \
> -		      ? 0 \
> -		      : -(-GET_MODE_SIZE (MODE) | -4) >> 1)))
> +#define RTX_OK_FOR_OFFSET_P(MODE, X)					\
> +  (GET_CODE (X) == CONST_INT						\
> +   && SMALL_INT_RANGE (INTVAL (X), (GET_MODE_SIZE (MODE) - 1) & -4,	\
> +		       (INTVAL (X) & (GET_MODE_SIZE (MODE) - 1) & 3	\
> +			? 0						\
> +			: -(-GET_MODE_SIZE (MODE) | -4) >> 1)))

This looks like a whitespace only change, could you take this
opportunity to rewrite the header comment.  Remove the '???', and just
document what the macro actually does.

>  
>  /* Array of valid operand punctuation characters.  */
>  char arc_punct_chars[256];
> @@ -1304,6 +1304,10 @@ arc_override_options (void)
>    if (!global_options_set.x_g_switch_value && !TARGET_NO_SDATA_SET)
>      g_switch_value = TARGET_LL64 ? 8 : 4;
>  
> +  /* Millicode thunks don't work with long calls.  */
> +  if (TARGET_LONG_CALLS_SET)
> +    target_flags &= ~MASK_MILLICODE_THUNK_SET;
> +
>    /* These need to be done at start up.  It's convenient to do them here.  */
>    arc_init ();
>  }
> @@ -2611,9 +2615,8 @@ struct GTY (()) arc_frame_info
>    unsigned int args_size;	/* # bytes that outgoing arguments take up.  */
>    unsigned int reg_size;	/* # bytes needed to store regs.  */
>    unsigned int var_size;	/* # bytes that variables take up.  */
> -  unsigned int reg_offset;	/* Offset from new sp to store regs.  */
>    unsigned int gmask;		/* Mask of saved gp registers.  */
> -  int          initialized;	/* Nonzero if frame size already calculated.  */
> +  bool         initialized;	/* Nonzero if frame size already calculated.  */

I don't think GNU style is for alignment of variable names.  It
certainly doesn't match the rest of this function, so I don't think
adding one random case is a good idea.  Also, the comment is now
incorrect that this is a boolean.

>    short millicode_start_reg;
>    short millicode_end_reg;
>    bool save_return_addr;
> @@ -2829,10 +2832,12 @@ arc_compute_frame_size (void)
>  {
>    int regno;
>    unsigned int total_size, var_size, args_size, pretend_size, extra_size;
> -  unsigned int reg_size, reg_offset;
> +  unsigned int reg_size;
>    unsigned int gmask;
>    struct arc_frame_info *frame_info;
>    int size;
> +  unsigned int extra_plus_reg_size;
> +  unsigned int extra_plus_reg_size_aligned;
>  
>    /* The answer might already be known.  */
>    if (cfun->machine->frame_info.initialized)
> @@ -2876,23 +2881,23 @@ arc_compute_frame_size (void)
>      for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
>        {
>  	reg_size += UNITS_PER_WORD;
> -	gmask |= 1 << regno;
> +	gmask |= 1L << regno;
>        }
>  
> -  /* 4) Space for back trace data structure.
> -	<return addr reg size> (if required) + <fp size> (if required).  */
> -  frame_info->save_return_addr
> -    = (!crtl->is_leaf || df_regs_ever_live_p (RETURN_ADDR_REGNUM)
> -       || crtl->calls_eh_return);
> -  /* Saving blink reg in case of leaf function for millicode thunk calls.  */
> -  if (optimize_size
> -      && !TARGET_NO_MILLICODE_THUNK_SET
> +  /* Check if we need to save the return address.  */
> +  frame_info->save_return_addr = (!crtl->is_leaf
> +				  || df_regs_ever_live_p (RETURN_ADDR_REGNUM)
> +				  || crtl->calls_eh_return);
> +
> +  /* Saving blink reg for millicode thunk calls.  */
> +  if (TARGET_MILLICODE_THUNK_SET
>        && !crtl->calls_eh_return)
>      {
>        if (arc_compute_millicode_save_restore_regs (gmask, frame_info))
>  	frame_info->save_return_addr = true;
>      }
>  
> +  /* 4) Calculate extra size made up of the blink + fp size.  */
>    extra_size = 0;
>    if (arc_must_save_return_addr (cfun))
>      extra_size = 4;
> @@ -2903,14 +2908,9 @@ arc_compute_frame_size (void)
>    pretend_size	= crtl->args.pretend_args_size;
>  
>    /* Ensure everything before the locals is aligned appropriately.  */
> -    {
> -       unsigned int extra_plus_reg_size;
> -       unsigned int extra_plus_reg_size_aligned;
> -
> -       extra_plus_reg_size = extra_size + reg_size;
> -       extra_plus_reg_size_aligned = ARC_STACK_ALIGN(extra_plus_reg_size);
> -       reg_size = extra_plus_reg_size_aligned - extra_size;
> -    }
> +  extra_plus_reg_size = extra_size + reg_size;
> +  extra_plus_reg_size_aligned = ARC_STACK_ALIGN(extra_plus_reg_size);
> +  reg_size = extra_plus_reg_size_aligned - extra_size;
>  
>    /* Compute total frame size.  */
>    total_size = var_size + args_size + extra_size + pretend_size + reg_size;
> @@ -2921,12 +2921,6 @@ arc_compute_frame_size (void)
>       as an issue I've changed this to an assert for now.  */
>    gcc_assert (total_size == ARC_STACK_ALIGN (total_size));
>  
> -  /* Compute offset of register save area from stack pointer:
> -     Frame: pretend_size <blink> reg_size <fp> var_size args_size <--sp
> -  */
> -  reg_offset = (total_size - (pretend_size + reg_size + extra_size)
> -		+ (arc_frame_pointer_needed () ? 4 : 0));
> -
>    /* Save computed information.  */
>    frame_info->total_size   = total_size;
>    frame_info->extra_size   = extra_size;
> @@ -2934,7 +2928,6 @@ arc_compute_frame_size (void)
>    frame_info->var_size     = var_size;
>    frame_info->args_size    = args_size;
>    frame_info->reg_size     = reg_size;
> -  frame_info->reg_offset   = reg_offset;
>    frame_info->gmask        = gmask;
>    frame_info->initialized  = reload_completed;
>  
> @@ -2942,187 +2935,6 @@ arc_compute_frame_size (void)
>    return total_size;
>  }
>  
> -/* Common code to save/restore registers.  */
> -/* BASE_REG is the base register to use for addressing and to adjust.
> -   GMASK is a bitmask of general purpose registers to save/restore.
> -   epilogue_p 0: prologue 1:epilogue 2:epilogue, sibling thunk
> -   If *FIRST_OFFSET is non-zero, add it first to BASE_REG - preferably
> -   using a pre-modify for the first memory access.  *FIRST_OFFSET is then
> -   zeroed.  */
> -
> -static void
> -arc_save_restore (rtx base_reg,
> -		  unsigned int gmask, int epilogue_p, int *first_offset)
> -{
> -  unsigned int offset = 0;
> -  int regno;
> -  struct arc_frame_info *frame = &cfun->machine->frame_info;
> -  rtx sibthunk_insn = NULL_RTX;
> -
> -  if (gmask)
> -    {
> -      /* Millicode thunks implementation:
> -	 Generates calls to millicodes for registers starting from r13 to r25
> -	 Present Limitations:
> -	 - Only one range supported. The remaining regs will have the ordinary
> -	   st and ld instructions for store and loads. Hence a gmask asking
> -	   to store r13-14, r16-r25 will only generate calls to store and
> -	   load r13 to r14 while store and load insns will be generated for
> -	   r16 to r25 in the prologue and epilogue respectively.
> -
> -	 - Presently library only supports register ranges starting from r13.
> -      */
> -      if (epilogue_p == 2 || frame->millicode_end_reg > 14)
> -	{
> -	  int start_call = frame->millicode_start_reg;
> -	  int end_call = frame->millicode_end_reg;
> -	  int n_regs = end_call - start_call + 1;
> -	  int i = 0, r, off = 0;
> -	  rtx insn;
> -	  rtx ret_addr = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> -
> -	  if (*first_offset)
> -	    {
> -	      /* "reg_size" won't be more than 127 .  */
> -	      gcc_assert (epilogue_p || abs (*first_offset) <= 127);
> -	      frame_add (base_reg, *first_offset);
> -	      *first_offset = 0;
> -	    }
> -	  insn = gen_rtx_PARALLEL
> -		  (VOIDmode, rtvec_alloc ((epilogue_p == 2) + n_regs + 1));
> -	  if (epilogue_p == 2)
> -	    i += 2;
> -	  else
> -	    XVECEXP (insn, 0, n_regs) = gen_rtx_CLOBBER (VOIDmode, ret_addr);
> -	  for (r = start_call; r <= end_call; r++, off += UNITS_PER_WORD, i++)
> -	    {
> -	      rtx reg = gen_rtx_REG (SImode, r);
> -	      rtx mem
> -		= gen_frame_mem (SImode, plus_constant (Pmode, base_reg, off));
> -
> -	      if (epilogue_p)
> -		XVECEXP (insn, 0, i) = gen_rtx_SET (reg, mem);
> -	      else
> -		XVECEXP (insn, 0, i) = gen_rtx_SET (mem, reg);
> -	      gmask = gmask & ~(1L << r);
> -	    }
> -	  if (epilogue_p == 2)
> -	    sibthunk_insn = insn;
> -	  else
> -	    {
> -	      insn = frame_insn (insn);
> -	      for (r = start_call, off = 0;
> -		   r <= end_call;
> -		   r++, off += UNITS_PER_WORD)
> -		{
> -		  rtx reg = gen_rtx_REG (SImode, r);
> -		  if (epilogue_p)
> -		      add_reg_note (insn, REG_CFA_RESTORE, reg);
> -		  else
> -		    {
> -		      rtx mem = gen_rtx_MEM (SImode, plus_constant (Pmode,
> -								    base_reg,
> -								    off));
> -
> -		      add_reg_note (insn, REG_CFA_OFFSET,
> -				    gen_rtx_SET (mem, reg));
> -		    }
> -		}
> -	    }
> -	  offset += off;
> -	}
> -
> -      for (regno = 0; regno <= 31; regno++)
> -	{
> -	  machine_mode mode = SImode;
> -	  bool found = false;
> -
> -	  if (TARGET_LL64
> -	      && (regno % 2 == 0)
> -	      && ((gmask & (1L << regno)) != 0)
> -	      && ((gmask & (1L << (regno+1))) != 0))
> -	    {
> -	      found = true;
> -	      mode  = DImode;
> -	    }
> -	  else if ((gmask & (1L << regno)) != 0)
> -	    {
> -	      found = true;
> -	      mode  = SImode;
> -	    }
> -
> -	  if (found)
> -	    {
> -	      rtx reg = gen_rtx_REG (mode, regno);
> -	      rtx addr, mem;
> -	      int cfa_adjust = *first_offset;
> -
> -	      if (*first_offset)
> -		{
> -		  gcc_assert (!offset);
> -		  addr = plus_constant (Pmode, base_reg, *first_offset);
> -		  addr = gen_rtx_PRE_MODIFY (Pmode, base_reg, addr);
> -		  *first_offset = 0;
> -		}
> -	      else
> -		{
> -		  gcc_assert (SMALL_INT (offset));
> -		  addr = plus_constant (Pmode, base_reg, offset);
> -		}
> -	      mem = gen_frame_mem (mode, addr);
> -	      if (epilogue_p)
> -		{
> -		  rtx insn =
> -		    frame_move_inc (reg, mem, base_reg, addr);
> -		  add_reg_note (insn, REG_CFA_RESTORE, reg);
> -		  if (cfa_adjust)
> -		    {
> -		      enum reg_note note = REG_CFA_ADJUST_CFA;
> -		      add_reg_note (insn, note,
> -				    gen_rtx_SET (stack_pointer_rtx,
> -						 plus_constant (Pmode,
> -								stack_pointer_rtx,
> -								cfa_adjust)));
> -		    }
> -		}
> -	      else
> -		frame_move_inc (mem, reg, base_reg, addr);
> -	      offset += UNITS_PER_WORD;
> -	      if (mode == DImode)
> -		{
> -		  offset += UNITS_PER_WORD;
> -		  ++regno;
> -		}
> -	    } /* if */
> -	} /* for */
> -    }/* if */
> -  if (sibthunk_insn)
> -    {
> -      int start_call = frame->millicode_start_reg;
> -      int end_call = frame->millicode_end_reg;
> -      int r;
> -
> -      rtx r12 = gen_rtx_REG (Pmode, 12);
> -
> -      frame_insn (gen_rtx_SET (r12, GEN_INT (offset)));
> -      XVECEXP (sibthunk_insn, 0, 0) = ret_rtx;
> -      XVECEXP (sibthunk_insn, 0, 1)
> -	= gen_rtx_SET (stack_pointer_rtx,
> -		       gen_rtx_PLUS (Pmode, stack_pointer_rtx, r12));
> -      sibthunk_insn = emit_jump_insn (sibthunk_insn);
> -      RTX_FRAME_RELATED_P (sibthunk_insn) = 1;
> -
> -      /* Would be nice if we could do this earlier, when the PARALLEL
> -	 is populated, but these need to be attached after the
> -	 emit.  */
> -      for (r = start_call; r <= end_call; r++)
> -	{
> -	  rtx reg = gen_rtx_REG (SImode, r);
> -	  add_reg_note (sibthunk_insn, REG_CFA_RESTORE, reg);
> -	}
> -    }
> -} /* arc_save_restore */
> -
>  /* Build dwarf information when the context is saved via AUX_IRQ_CTRL
>     mechanism.  */
>  
> @@ -3193,6 +3005,680 @@ arc_dwarf_emit_irq_save_regs (void)
>    RTX_FRAME_RELATED_P (insn) = 1;
>  }
>  
> +static int
> +frame_save_reg (rtx reg, HOST_WIDE_INT offset)

Shouldn't this have a comment above it.

> +{
> +  rtx addr;
> +
> +  if (offset)
> +    {
> +      rtx tmp = plus_constant (Pmode, stack_pointer_rtx,
> +			       offset - GET_MODE_SIZE (GET_MODE (reg)));
> +      addr = gen_frame_mem (GET_MODE (reg),
> +			    gen_rtx_PRE_MODIFY (Pmode,
> +						stack_pointer_rtx,
> +						tmp));
> +    }
> +  else
> +    addr = gen_frame_mem (GET_MODE (reg), gen_rtx_PRE_DEC (Pmode,
> +							   stack_pointer_rtx));
> +  frame_move_inc (addr, reg, stack_pointer_rtx, 0);
> +
> +  return GET_MODE_SIZE (GET_MODE (reg)) - offset;
> +}
> +
> +static int
> +frame_restore_reg (rtx reg, HOST_WIDE_INT offset)

Missing comment again.

> +{
> +  rtx addr, insn;
> +
> +  if (offset)
> +    {
> +      rtx tmp = plus_constant (Pmode, stack_pointer_rtx,
> +			       offset + GET_MODE_SIZE (GET_MODE (reg)));
> +      addr = gen_frame_mem (GET_MODE (reg),
> +			    gen_rtx_POST_MODIFY (Pmode,
> +						 stack_pointer_rtx,
> +						 tmp));
> +    }
> +  else
> +    addr = gen_frame_mem (GET_MODE (reg), gen_rtx_POST_INC (Pmode,
> +							    stack_pointer_rtx));
> +  insn = frame_move_inc (reg, addr, stack_pointer_rtx, 0);
> +  add_reg_note (insn, REG_CFA_RESTORE, reg);
> +
> +  if (reg == frame_pointer_rtx)
> +    add_reg_note (insn, REG_CFA_DEF_CFA,
> +		  plus_constant (Pmode, stack_pointer_rtx,
> +				 GET_MODE_SIZE (GET_MODE (reg)) + offset));
> +  else
> +    add_reg_note (insn, REG_CFA_ADJUST_CFA,
> +		  gen_rtx_SET (stack_pointer_rtx,
> +			       plus_constant (Pmode, stack_pointer_rtx,
> +					      GET_MODE_SIZE (GET_MODE (reg))
> +					      + offset)));
> +
> +  return GET_MODE_SIZE (GET_MODE (reg)) + offset;
> +}
> +
> +/* Check if we have a continuous range to be saved/restored with the
> +   help of enter/leave instructions.  */
> +
> +static bool
> +arc_enter_leave_p (unsigned int gmask)
> +{
> +  int start_reg = 13;
> +  int end_reg = 26;

Maybe a comment explaining these numbers.  Better yet, some named
constants would be great.

> +  int regno;
> +  unsigned int rmask = 0;
> +
> +  if (!gmask)
> +    return false;
> +
> +  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno)); regno++)
> +    rmask |= 1L << regno;
> +
> +  if (rmask ^ gmask)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* ARC's prologue, save any needed call-saved regs (and call-used if
> +   this is an interrupt handler) for ARCompact ISA, using ST/STD
> +   instructions. */
> +
> +static int
> +arc_save_callee_saves (unsigned int gmask,
> +		       bool save_blink,
> +		       bool save_fp,
> +		       HOST_WIDE_INT offset)
> +{
> +  rtx reg;
> +  int frame_allocated = 0;
> +
> +  /* The home-grown ABI says link register is saved first.  */
> +  if (save_blink)
> +    {
> +      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> +      frame_allocated += frame_save_reg (reg, offset);
> +      offset = 0;
> +    }
> +
> +  /* N.B. FRAME_POINTER_MASK and RETURN_ADDR_MASK are cleared in gmask.  */
> +  if (gmask)
> +    for (int i = 31; i >= 0; i--)

Using a named constant for register 31 would be great here I think.

> +      {
> +	machine_mode save_mode = SImode;
> +
> +	if (TARGET_LL64
> +	    && ((i - 1) % 2 == 0)
> +	    && ((gmask & (1L << i)) != 0)
> +	    && ((gmask & (1L << (i - 1))) != 0))
> +	  {
> +	    save_mode = DImode;
> +	    --i;
> +	  }
> +	else if ((gmask & (1L << i)) == 0)
> +	  continue;
> +
> +	reg = gen_rtx_REG (save_mode, i);
> +	frame_allocated += frame_save_reg (reg, offset);
> +	offset = 0;
> +      }
> +
> +  /* Save frame pointer if needed.  First save the FP on stack, if not
> +     autosaved.  Unfortunately, I cannot add it to gmask and use the
> +     above loop to save fp because our ABI states fp goes after all
> +     registers are saved.  */
> +  if (save_fp)
> +    {
> +      frame_allocated += frame_save_reg (frame_pointer_rtx, offset);
> +      offset = 0;
> +    }
> +
> +  /* Emit mov fp,sp.  */
> +  if (arc_frame_pointer_needed ())
> +    frame_move (frame_pointer_rtx, stack_pointer_rtx);
> +
> +  return frame_allocated;
> +}
> +
> +static int
> +arc_restore_callee_saves (unsigned int gmask,
> +			  bool restore_blink,
> +			  bool restore_fp,
> +			  HOST_WIDE_INT offset,
> +			  HOST_WIDE_INT allocated)

Header comment?

> +{
> +  rtx reg;
> +  int frame_deallocated = 0;
> +
> +  /* Emit mov fp,sp.  */
> +  if (arc_frame_pointer_needed () && offset)
> +    {
> +      frame_move (stack_pointer_rtx, frame_pointer_rtx);
> +      frame_deallocated += offset;
> +      offset = 0;
> +    }
> +
> +  if (restore_fp)
> +    {
> +      /* Any offset is taken care by previous if-statement.  */
> +      gcc_assert (offset == 0);
> +      frame_deallocated += frame_restore_reg (frame_pointer_rtx, 0);
> +    }
> +
> +  if (offset)
> +    {
> +      /* No, fp involved, hence, we need to do an add to set the sp to
> +	 where first registers are.  */

Too many commas in here.  Maybe this instead?

    No $fp involved, so, we need to do an add to set the $sp to the
    location of the first register.

> +      frame_stack_add (offset);
> +      frame_deallocated += offset;
> +      offset = 0;
> +    }
> +
> +  /* N.B. FRAME_POINTER_MASK and RETURN_ADDR_MASK are cleared in gmask.  */
> +  if (gmask)
> +    for (int i = 0; i <= 31; i++)
> +      {
> +	machine_mode restore_mode = SImode;
> +
> +	if (TARGET_LL64
> +	    && ((i % 2) == 0)
> +	    && ((gmask & (1L << i)) != 0)
> +	    && ((gmask & (1L << (i + 1))) != 0))
> +	  {
> +	    restore_mode = DImode;
> +	  }

I thought GNU style required single line blocks to drop the { .. }?

> +	else if ((gmask & (1L << i)) == 0)
> +	  continue;
> +
> +	reg = gen_rtx_REG (restore_mode, i);
> +	frame_deallocated += frame_restore_reg (reg, 0);
> +	offset = 0;
> +
> +	if (restore_mode == DImode)
> +	  i++;
> +      }
> +
> +  if (restore_blink)
> +    {
> +      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> +      frame_deallocated += frame_restore_reg (reg, allocated
> +					      - frame_deallocated
> +					      /* Consider as well the
> +						 current restored
> +						 register size.*/
> +					      - UNITS_PER_WORD);
> +    }
> +
> +  return frame_deallocated;
> +}
> +
> +/* ARC prologue, save the registers using enter instruction.  */

Maybe you could document some of the function parameters?

> +
> +static int
> +arc_save_callee_enter (unsigned int gmask,
> +		       bool save_blink,
> +		       bool save_fp,
> +		       HOST_WIDE_INT offset)
> +{
> +  int start_reg = 13;
> +  int end_reg = 26;

Named constants for register numbers maybe?

> +  int regno, indx, off, nregs;
> +  rtx insn, reg, mem;
> +  int frame_allocated = 0;
> +
> +  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
> +    regno++;
> +
> +  end_reg = regno - 1;
> +  nregs = end_reg - start_reg + 1;
> +  nregs += save_blink ? 1 : 0;
> +  nregs += save_fp ? 1 : 0;
> +
> +  if (offset)
> +    frame_stack_add (offset);
> +
> +  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + (save_fp ? 1 : 0)
> +						  + 1));
> +  indx = 0;
> +
> +  reg = gen_rtx_SET (stack_pointer_rtx,
> +		     plus_constant (Pmode,
> +				    stack_pointer_rtx,
> +				    nregs * UNITS_PER_WORD));
> +  RTX_FRAME_RELATED_P (reg) = 1;
> +  XVECEXP (insn, 0, indx++) = reg;
> +  off = nregs * UNITS_PER_WORD;
> +
> +  if (save_blink)
> +    {
> +      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> +      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
> +						 stack_pointer_rtx,
> +						 off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, reg);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
> +      off -= UNITS_PER_WORD;
> +      save_blink = false;
> +    }
> +
> +  for (regno = start_reg;
> +       regno <= end_reg;
> +       regno++, indx++, off -= UNITS_PER_WORD)
> +    {
> +      reg = gen_rtx_REG (SImode, regno);
> +      mem = gen_frame_mem (SImode, plus_constant (Pmode,
> +						  stack_pointer_rtx,
> +						  off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, reg);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
> +      gmask = gmask & ~(1L << regno);
> +    }
> +
> +  if (save_fp)
> +    {
> +      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
> +						 stack_pointer_rtx,
> +						 off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, frame_pointer_rtx);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
> +      off -= UNITS_PER_WORD;
> +
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (frame_pointer_rtx,
> +					     stack_pointer_rtx);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
> +      save_fp = false;
> +    }
> +
> +  gcc_assert (off == 0);
> +  insn = frame_insn (insn);
> +
> +  add_reg_note (insn, REG_INC, stack_pointer_rtx);
> +
> +  frame_allocated = nregs * UNITS_PER_WORD;
> +
> +  /* offset is a negative number, make sure we add it.  */
> +  return frame_allocated - offset;
> +}
> +
> +static int
> +arc_restore_callee_leave (unsigned int gmask,
> +			  bool restore_blink,
> +			  bool restore_fp,
> +			  bool return_p,
> +			  HOST_WIDE_INT offset)
> +{

Header comment.

> +  int start_reg = 13;
> +  int end_reg = 26;

Named constants.

> +  int regno, indx, off, nregs;
> +  rtx insn, reg, mem;
> +  int frame_allocated = 0;
> +
> +  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
> +    regno++;
> +
> +  end_reg = regno - 1;
> +  nregs = end_reg - start_reg + 1;
> +  nregs += restore_blink ? 1 : 0;
> +  nregs += restore_fp ? 1 : 0;
> +
> +  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1
> +						  + (return_p ? 1 : 0)));
> +  indx = 0;
> +
> +  if (return_p)
> +    XVECEXP (insn, 0, indx++) = ret_rtx;
> +
> +  if (restore_fp)
> +    {
> +      /* I cannot emit set (sp, fp) here as cselib expects a single sp
> +	 set and not two.  Thus, use the offset, and change sp adjust
> +	 value.  */
> +      frame_allocated += offset;
> +    }
> +
> +  if (offset && !restore_fp)
> +    {
> +      /* This add is only emitted when we do not restore fp with leave
> +	 instruction.  */
> +      frame_stack_add (offset);
> +      frame_allocated += offset;
> +      offset = 0;
> +    }
> +
> +  reg = gen_rtx_SET (stack_pointer_rtx,
> +		     plus_constant (Pmode,
> +				    stack_pointer_rtx,
> +				    offset + nregs * UNITS_PER_WORD));
> +  RTX_FRAME_RELATED_P (reg) = 1;
> +  XVECEXP (insn, 0, indx++) = reg;
> +  off = nregs * UNITS_PER_WORD;
> +
> +  if (restore_blink)
> +    {
> +      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> +      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
> +						 stack_pointer_rtx,
> +						 off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (reg, mem);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
> +      off -= UNITS_PER_WORD;
> +    }
> +
> +  for (regno = start_reg;
> +       regno <= end_reg;
> +       regno++, indx++, off -= UNITS_PER_WORD)
> +    {
> +      reg = gen_rtx_REG (SImode, regno);
> +      mem = gen_frame_mem (SImode, plus_constant (Pmode,
> +						  stack_pointer_rtx,
> +						  off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (reg, mem);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
> +      gmask = gmask & ~(1L << regno);
> +    }
> +
> +  if (restore_fp)
> +    {
> +      mem = gen_frame_mem (Pmode, plus_constant (Pmode,
> +						 stack_pointer_rtx,
> +						 off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (frame_pointer_rtx, mem);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx++)) = 1;
> +      off -= UNITS_PER_WORD;
> +    }
> +
> +  gcc_assert (off == 0);
> +  if (return_p)
> +    {
> +      insn = emit_jump_insn (insn);
> +      RTX_FRAME_RELATED_P (insn) = 1;
> +    }
> +  else
> +    insn = frame_insn (insn);
> +
> +  add_reg_note (insn, REG_INC, stack_pointer_rtx);
> +
> +  /* Dwarf related info.  */
> +  if (restore_fp)
> +    {
> +      add_reg_note (insn, REG_CFA_RESTORE, frame_pointer_rtx);
> +      add_reg_note (insn, REG_CFA_DEF_CFA,
> +		    plus_constant (Pmode, stack_pointer_rtx,
> +				   offset + nregs * UNITS_PER_WORD));
> +    }
> +  else
> +    {
> +      add_reg_note (insn, REG_CFA_ADJUST_CFA,
> +		    gen_rtx_SET (stack_pointer_rtx,
> +				 plus_constant (Pmode, stack_pointer_rtx,
> +						nregs * UNITS_PER_WORD)));
> +    }
> +  if (restore_blink)
> +    add_reg_note (insn, REG_CFA_RESTORE,
> +		  gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM));
> +  for (regno = start_reg; regno <= end_reg; regno++)
> +    add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (SImode, regno));
> +
> +  frame_allocated += nregs * UNITS_PER_WORD;
> +
> +  return frame_allocated;
> +}
> +
> +/* Millicode thunks implementation:
> +   Generates calls to millicodes for registers starting from r13 to r25
> +   Present Limitations:
> +   - Only one range supported. The remaining regs will have the ordinary
> +   st and ld instructions for store and loads. Hence a gmask asking
> +   to store r13-14, r16-r25 will only generate calls to store and
> +   load r13 to r14 while store and load insns will be generated for
> +   r16 to r25 in the prologue and epilogue respectively.
> +
> +   - Presently library only supports register ranges starting from r13.
> +*/
> +
> +static int
> +arc_save_callee_milli (unsigned int gmask,
> +		       bool save_blink,
> +		       bool save_fp,
> +		       HOST_WIDE_INT offset,
> +		       HOST_WIDE_INT reg_size)
> +{
> +  int start_reg = 13;
> +  int end_reg = 25;
> +  int regno, indx, off, nregs;
> +  rtx insn, reg, mem;
> +  int frame_allocated = 0;
> +
> +  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
> +    regno++;
> +
> +  end_reg = regno - 1;
> +  nregs = end_reg - start_reg + 1;
> +  gcc_assert (end_reg > 14);
> +
> +
> +  /* Allocate space on stack for the registers, and take into account
> +     also the initial offset.  The registers will be saved using
> +     offsets.  N.B. OFFSET is a negative number.  */
> +  if (save_blink)
> +    {
> +      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> +      frame_allocated += frame_save_reg (reg, offset);
> +      offset = 0;
> +    }
> +
> +  if (reg_size || offset)
> +    {
> +      frame_stack_add (offset - reg_size);
> +      frame_allocated += nregs * UNITS_PER_WORD - offset;
> +      offset = 0;
> +    }
> +
> +  /* Start generate millicode call.  */
> +  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1));
> +  indx = 0;
> +
> +  /* This is a call, we clobber blink.  */
> +  XVECEXP (insn, 0, nregs) =
> +    gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM));
> +
> +  for (regno = start_reg, indx = 0, off = 0;
> +       regno <= end_reg;
> +       regno++, indx++, off += UNITS_PER_WORD)
> +    {
> +      reg = gen_rtx_REG (SImode, regno);
> +      mem = gen_frame_mem (SImode, plus_constant (Pmode,
> +						  stack_pointer_rtx,
> +						  off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (mem, reg);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
> +      gmask = gmask & ~(1L << regno);
> +    }
> +  insn = frame_insn (insn);
> +
> +  /* Add DWARF info.  */
> +  for (regno = start_reg, off = 0;
> +       regno <= end_reg;
> +       regno++, off += UNITS_PER_WORD)
> +    {
> +      reg = gen_rtx_REG (SImode, regno);
> +      mem = gen_rtx_MEM (SImode, plus_constant (Pmode,
> +						stack_pointer_rtx, off));
> +      add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
> +
> +    }
> +
> +  /* In the case of millicode thunk, we need to restore the
> +     clobbered blink register.  */
> +  if (arc_must_save_return_addr (cfun))
> +    {
> +      emit_insn (gen_rtx_SET (gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM),
> +			      gen_rtx_MEM (Pmode,
> +					   plus_constant (Pmode,
> +							  stack_pointer_rtx,
> +							  reg_size))));
> +    }
> +
> +  /* Save remaining registers using st instructions.  */
> +  for (regno = 0; regno <= 31; regno++)
> +    {
> +      if ((gmask & (1L << regno)) == 0)
> +	continue;
> +
> +      reg = gen_rtx_REG (SImode, regno);
> +      mem = gen_frame_mem (SImode, plus_constant (Pmode,
> +						  stack_pointer_rtx,
> +						  off));
> +      frame_move_inc (mem, reg, stack_pointer_rtx, 0);
> +      frame_allocated += UNITS_PER_WORD;
> +      off += UNITS_PER_WORD;
> +    }
> +
> +  /* Save frame pointer if needed.  First save the FP on stack, if not
> +     autosaved.  Unfortunately, I cannot add it to gmask and use the
> +     above loop to save fp because our ABI states fp goes aftert all
> +     registers are saved.  */
> +  if (save_fp)
> +    frame_allocated += frame_save_reg (frame_pointer_rtx, offset);
> +
> +  /* Emit mov fp,sp.  */
> +  if (arc_frame_pointer_needed ())
> +    frame_move (frame_pointer_rtx, stack_pointer_rtx);
> +
> +  return frame_allocated;
> +}
> +
> +/* Like the previous function but restore.  */
> +
> +static int
> +arc_restore_callee_milli (unsigned int gmask,
> +			  bool restore_blink,
> +			  bool restore_fp,
> +			  bool return_p,
> +			  HOST_WIDE_INT offset)
> +{
> +  int start_reg = 13;
> +  int end_reg = 25;
> +  int regno, indx, off, nregs;
> +  rtx insn, reg, mem;
> +  int frame_allocated = 0;
> +
> +  for (regno = start_reg; regno <= end_reg && (gmask & (1L << regno));)
> +    regno++;
> +
> +  end_reg = regno - 1;
> +  nregs = end_reg - start_reg + 1;
> +  gcc_assert (end_reg > 14);
> +
> +  /* Emit mov fp,sp.  */
> +  if (arc_frame_pointer_needed () && offset)
> +    {
> +      frame_move (stack_pointer_rtx, frame_pointer_rtx);
> +      frame_allocated = offset;
> +      offset = 0;
> +    }
> +
> +  if (restore_fp)
> +    frame_allocated += frame_restore_reg (frame_pointer_rtx, 0);
> +
> +  if (offset)
> +    {
> +      /* No fp involved, hence, we need to adjust the sp via an
> +	 add.  */
> +      frame_stack_add (offset);
> +      frame_allocated += offset;
> +      offset = 0;
> +    }
> +
> +  /* Start generate millicode call.  */
> +  insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc ((return_p ? 1 : 0)
> +						  + nregs + 1));
> +  indx = 0;
> +
> +  if (return_p)
> +    {
> +      /* sibling call, the blink is restored with the help of the
> +	 value held into r12.  */
> +      reg = gen_rtx_REG (Pmode, 12);
> +      XVECEXP (insn, 0, indx++) = ret_rtx;
> +      XVECEXP (insn, 0, indx++) =
> +	gen_rtx_SET (stack_pointer_rtx,
> +		     gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg));
> +      frame_allocated += UNITS_PER_WORD;
> +    }
> +  else
> +    {
> +      /* This is a call, we clobber blink.  */
> +      XVECEXP (insn, 0, nregs) =
> +	gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM));
> +    }
> +
> +  for (regno = start_reg, off = 0;
> +       regno <= end_reg;
> +       regno++, indx++, off += UNITS_PER_WORD)
> +    {
> +      reg = gen_rtx_REG (SImode, regno);
> +      mem = gen_frame_mem (SImode, plus_constant (Pmode,
> +						  stack_pointer_rtx,
> +						  off));
> +      XVECEXP (insn, 0, indx) = gen_rtx_SET (reg, mem);
> +      RTX_FRAME_RELATED_P (XVECEXP (insn, 0, indx)) = 1;
> +      gmask = gmask & ~(1L << regno);
> +    }
> +
> +  /* Restore remaining registers using LD instructions.  */
> +  for (regno = 0; regno <= 31; regno++)
> +    {
> +      if ((gmask & (1L << regno)) == 0)
> +	continue;
> +
> +      reg = gen_rtx_REG (SImode, regno);
> +      mem = gen_frame_mem (SImode, plus_constant (Pmode,
> +						  stack_pointer_rtx,
> +						  off));
> +      rtx tmp = frame_move_inc (reg, mem, stack_pointer_rtx, 0);
> +      add_reg_note (tmp, REG_CFA_RESTORE, reg);
> +      off += UNITS_PER_WORD;
> +    }
> +
> +  /* Emit millicode call.  */
> +  if (return_p)
> +    {
> +      reg = gen_rtx_REG (Pmode, 12);
> +      frame_insn (gen_rtx_SET (reg, GEN_INT (off)));
> +      frame_allocated += off;
> +      insn = emit_jump_insn (insn);
> +      RTX_FRAME_RELATED_P (insn) = 1;
> +    }
> +  else
> +    insn = frame_insn (insn);
> +
> +  /* Add DWARF info.  */
> +  for (regno = start_reg, off = 0;
> +       regno <= end_reg;
> +       regno++, off += UNITS_PER_WORD)
> +    {
> +      reg = gen_rtx_REG (SImode, regno);
> +      add_reg_note (insn, REG_CFA_RESTORE, reg);
> +
> +    }
> +
> +  if (restore_blink && !return_p)
> +    {
> +      reg = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> +      mem = gen_frame_mem (Pmode, plus_constant (Pmode, stack_pointer_rtx,
> +						 off));
> +      insn = frame_insn (gen_rtx_SET (reg, mem));
> +      add_reg_note (insn, REG_CFA_RESTORE, reg);
> +    }
> +
> +  return frame_allocated;
> +}
> +
>  /* Set up the stack and frame pointer (if desired) for the function.  */
>  
>  void
> @@ -3200,13 +3686,12 @@ arc_expand_prologue (void)
>  {
>    int size;
>    unsigned int gmask = cfun->machine->frame_info.gmask;
> -  /*  unsigned int frame_pointer_offset;*/
> +  struct arc_frame_info *frame = &cfun->machine->frame_info;
>    unsigned int frame_size_to_allocate;
> -  /* (FIXME: The first store will use a PRE_MODIFY; this will usually be r13.
> -     Change the stack layout so that we rather store a high register with the
> -     PRE_MODIFY, thus enabling more short insn generation.)  */
>    int first_offset = 0;
>    unsigned int fn_type = arc_compute_function_type (cfun);
> +  bool save_blink = false;
> +  bool save_fp = false;
>  
>    /* Naked functions don't have prologue.  */
>    if (ARC_NAKED_P (fn_type))
> @@ -3229,87 +3714,42 @@ arc_expand_prologue (void)
>    gcc_assert (!(size == 0 && gmask));
>  
>    /* Allocate space for register arguments if this is a variadic function.  */
> -  if (cfun->machine->frame_info.pretend_size != 0)
> -    {
> -       /* Ensure pretend_size is maximum of 8 * word_size.  */
> -      gcc_assert (cfun->machine->frame_info.pretend_size <= 32);
> -
> -      frame_stack_add (-(HOST_WIDE_INT)cfun->machine->frame_info.pretend_size);
> -      frame_size_to_allocate -= cfun->machine->frame_info.pretend_size;
> -    }
> +  if (frame->pretend_size != 0)
> +    first_offset = -frame->pretend_size;
>  
>    /* IRQ using automatic save mechanism will save the register before
>       anything we do.  */
>    if (ARC_AUTO_IRQ_P (fn_type)
>        && !ARC_FAST_INTERRUPT_P (fn_type))
>      {
> -      arc_dwarf_emit_irq_save_regs ();
> -    }
> -
> -  /* The home-grown ABI says link register is saved first.  */
> -  if (arc_must_save_return_addr (cfun)
> -      && !ARC_AUTOBLINK_IRQ_P (fn_type))
> -    {
> -      rtx ra = gen_rtx_REG (SImode, RETURN_ADDR_REGNUM);
> -      rtx mem = gen_frame_mem (Pmode,
> -			       gen_rtx_PRE_DEC (Pmode,
> -						stack_pointer_rtx));
> -
> -      frame_move_inc (mem, ra, stack_pointer_rtx, 0);
> -      frame_size_to_allocate -= UNITS_PER_WORD;
> -    }
> -
> -  /* Save any needed call-saved regs (and call-used if this is an
> -     interrupt handler) for ARCompact ISA.  */
> -  if (cfun->machine->frame_info.reg_size)
> -    {
> -      first_offset = -cfun->machine->frame_info.reg_size;
> -      /* N.B. FRAME_POINTER_MASK and RETURN_ADDR_MASK are cleared in gmask.  */
> -      arc_save_restore (stack_pointer_rtx, gmask, 0, &first_offset);
> -      frame_size_to_allocate -= cfun->machine->frame_info.reg_size;
> -    }
> -
> -  /* In the case of millicode thunk, we need to restore the clobbered
> -     blink register.  */
> -  if (cfun->machine->frame_info.millicode_end_reg > 0
> -      && arc_must_save_return_addr (cfun))
> -    {
> -      HOST_WIDE_INT tmp = cfun->machine->frame_info.reg_size;
> -      emit_insn (gen_rtx_SET (gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM),
> -			      gen_rtx_MEM (Pmode,
> -					   plus_constant (Pmode,
> -							  stack_pointer_rtx,
> -							  tmp))));
> -    }
> -
> -  /* Save frame pointer if needed.  First save the FP on stack, if not
> -     autosaved.  */
> -  if (arc_frame_pointer_needed ()
> -      && !ARC_AUTOFP_IRQ_P (fn_type))
> -    {
> -      rtx addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
> -			       GEN_INT (-UNITS_PER_WORD + first_offset));
> -      rtx mem = gen_frame_mem (Pmode, gen_rtx_PRE_MODIFY (Pmode,
> -							  stack_pointer_rtx,
> -							  addr));
> -      frame_move_inc (mem, frame_pointer_rtx, stack_pointer_rtx, 0);
> -      frame_size_to_allocate -= UNITS_PER_WORD;
> +      frame_stack_add (first_offset);
>        first_offset = 0;
> +      arc_dwarf_emit_irq_save_regs ();
>      }
>  
> -  /* Emit mov fp,sp.  */
> -  if (arc_frame_pointer_needed ())
> -    {
> -      frame_move (frame_pointer_rtx, stack_pointer_rtx);
> -    }
> -
> -  /* ??? We don't handle the case where the saved regs are more than 252
> -     bytes away from sp.  This can be handled by decrementing sp once, saving
> -     the regs, and then decrementing it again.  The epilogue doesn't have this
> -     problem as the `ld' insn takes reg+limm values (though it would be more
> -     efficient to avoid reg+limm).  */
> +  save_blink = arc_must_save_return_addr (cfun)
> +    && !ARC_AUTOBLINK_IRQ_P (fn_type);
> +  save_fp = arc_frame_pointer_needed () && !ARC_AUTOFP_IRQ_P (fn_type);
> +
> +  /* Use enter/leave only for non-interrupt functions.  */
> +  if (TARGET_CODE_DENSITY
> +      && TARGET_CODE_DENSITY_FRAME
> +      && !ARC_AUTOFP_IRQ_P (fn_type)
> +      && !ARC_AUTOBLINK_IRQ_P (fn_type)
> +      && !ARC_INTERRUPT_P (fn_type)
> +      && arc_enter_leave_p (gmask))
> +      frame_size_to_allocate -= arc_save_callee_enter (gmask, save_blink,
> +						       save_fp,
> +						       first_offset);
> +  else if (frame->millicode_end_reg > 14)
> +    frame_size_to_allocate -= arc_save_callee_milli (gmask, save_blink,
> +						     save_fp,
> +						     first_offset,
> +						     frame->reg_size);
> +  else
> +    frame_size_to_allocate -= arc_save_callee_saves (gmask, save_blink, save_fp,
> +						     first_offset);
>  
> -  frame_size_to_allocate -= first_offset;
>    /* Allocate the stack frame.  */
>    if (frame_size_to_allocate > 0)
>      {
> @@ -3318,8 +3758,7 @@ arc_expand_prologue (void)
>  	 will prevent the scheduler from moving stores to the frame
>  	 before the stack adjustment.  */
>        if (arc_frame_pointer_needed ())
> -	emit_insn (gen_stack_tie (stack_pointer_rtx,
> -				  hard_frame_pointer_rtx));
> +	emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
>      }
>  }
>  
> @@ -3331,170 +3770,71 @@ arc_expand_epilogue (int sibcall_p)
>  {
>    int size;
>    unsigned int fn_type = arc_compute_function_type (cfun);
> -
> -  size = arc_compute_frame_size ();
> -
> -  unsigned int pretend_size = cfun->machine->frame_info.pretend_size;
> -  unsigned int frame_size;
>    unsigned int size_to_deallocate;
>    int restored;
>    int can_trust_sp_p = !cfun->calls_alloca;
> -  int first_offset = 0;
> -  int millicode_p = cfun->machine->frame_info.millicode_end_reg > 0;
> -  rtx insn;
> +  int first_offset;
> +  bool restore_fp = arc_frame_pointer_needed () && !ARC_AUTOFP_IRQ_P (fn_type);
> +  bool restore_blink = arc_must_save_return_addr (cfun)
> +    && !ARC_AUTOBLINK_IRQ_P (fn_type);
> +  unsigned int gmask = cfun->machine->frame_info.gmask;
> +  bool return_p = !sibcall_p && fn_type == ARC_FUNCTION_NORMAL
> +		   && !cfun->machine->frame_info.pretend_size;
> +  struct arc_frame_info *frame = &cfun->machine->frame_info;
> +
>  
>    /* Naked functions don't have epilogue.  */
>    if (ARC_NAKED_P (fn_type))
>      return;
>  
> +  size = arc_compute_frame_size ();
>    size_to_deallocate = size;
>  
> -  frame_size = size - (pretend_size +
> -		       cfun->machine->frame_info.reg_size +
> -		       cfun->machine->frame_info.extra_size);
> -
> -  /* ??? There are lots of optimizations that can be done here.
> -     EG: Use fp to restore regs if it's closer.
> -     Maybe in time we'll do them all.  For now, always restore regs from
> -     sp, but don't restore sp if we don't have to.  */
> +  first_offset = size - (frame->pretend_size + frame->reg_size
> +			 + frame->extra_size);
>  
>    if (!can_trust_sp_p)
>      gcc_assert (arc_frame_pointer_needed ());
>  
> -  /* Restore stack pointer to the beginning of saved register area for
> -     ARCompact ISA.  */
> -  if (frame_size)
> -    {
> -      if (arc_frame_pointer_needed ())
> -	frame_move (stack_pointer_rtx, frame_pointer_rtx);
> -      else
> -	first_offset = frame_size;
> -      size_to_deallocate -= frame_size;
> -    }
> -  else if (!can_trust_sp_p)
> -    frame_stack_add (-frame_size);
> -
> -
> -  /* Restore any saved registers.  */
> -  if (arc_frame_pointer_needed ()
> -      && !ARC_AUTOFP_IRQ_P (fn_type))
> -    {
> -      rtx addr = gen_rtx_POST_INC (Pmode, stack_pointer_rtx);
> -
> -      insn = frame_move_inc (frame_pointer_rtx, gen_frame_mem (Pmode, addr),
> -			     stack_pointer_rtx, 0);
> -      add_reg_note (insn, REG_CFA_RESTORE, frame_pointer_rtx);
> -      add_reg_note (insn, REG_CFA_DEF_CFA,
> -		    plus_constant (SImode, stack_pointer_rtx,
> -				   4));
> -      size_to_deallocate -= UNITS_PER_WORD;
> -    }
> -
> -  /* Load blink after the calls to thunk calls in case of optimize size.  */
> -  if (millicode_p)
> -    {
> -	  int sibthunk_p = (!sibcall_p
> -			    && fn_type == ARC_FUNCTION_NORMAL
> -			    && !cfun->machine->frame_info.pretend_size);
> -
> -	  gcc_assert (!(cfun->machine->frame_info.gmask
> -			& (FRAME_POINTER_MASK | RETURN_ADDR_MASK)));
> -	  arc_save_restore (stack_pointer_rtx,
> -			    cfun->machine->frame_info.gmask,
> -			    1 + sibthunk_p, &first_offset);
> -	  if (sibthunk_p)
> -	    return;
> -    }
> -  /* If we are to restore registers, and first_offset would require
> -     a limm to be encoded in a PRE_MODIFY, yet we can add it with a
> -     fast add to the stack pointer, do this now.  */
> -  if ((!SMALL_INT (first_offset)
> -       && cfun->machine->frame_info.gmask
> -       && ((TARGET_ARC700 && !optimize_size)
> -	    ? first_offset <= 0x800
> -	    : satisfies_constraint_C2a (GEN_INT (first_offset))))
> -       /* Also do this if we have both gprs and return
> -	  address to restore, and they both would need a LIMM.  */
> -      || (arc_must_save_return_addr (cfun)
> -	  && !SMALL_INT ((cfun->machine->frame_info.reg_size + first_offset) >> 2)
> -	  && cfun->machine->frame_info.gmask))
> -    {
> -      frame_stack_add (first_offset);
> -      first_offset = 0;
> +  if (TARGET_CODE_DENSITY
> +      && TARGET_CODE_DENSITY_FRAME
> +      && !ARC_AUTOFP_IRQ_P (fn_type)
> +      && !ARC_AUTOBLINK_IRQ_P (fn_type)
> +      && !ARC_INTERRUPT_P(fn_type)
> +      && arc_enter_leave_p (gmask))
> +    {
> +      /* using leave instruction.  */
> +      size_to_deallocate -= arc_restore_callee_leave (gmask, restore_blink,
> +						      restore_fp,
> +						      return_p,
> +						      first_offset);
> +      if (return_p)
> +	{
> +	  gcc_assert (size_to_deallocate == 0);
> +	  return;
> +	}
>      }
> -  if (arc_must_save_return_addr (cfun)
> -      && !ARC_AUTOBLINK_IRQ_P (fn_type))
> +  else if (frame->millicode_end_reg > 14)
>      {
> -      rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
> -      int ra_offs = cfun->machine->frame_info.reg_size + first_offset;
> -      rtx addr = plus_constant (Pmode, stack_pointer_rtx, ra_offs);
> -      HOST_WIDE_INT cfa_adjust = 0;
> -
> -      /* If the load of blink would need a LIMM, but we can add
> -	 the offset quickly to sp, do the latter.  */
> -      if (!SMALL_INT (ra_offs >> 2)
> -	  && !cfun->machine->frame_info.gmask
> -	  && ((TARGET_ARC700 && !optimize_size)
> -	       ? ra_offs <= 0x800
> -	       : satisfies_constraint_C2a (GEN_INT (ra_offs))))
> -	{
> -	   size_to_deallocate -= ra_offs - first_offset;
> -	   first_offset = 0;
> -	   frame_stack_add (ra_offs);
> -	   ra_offs = 0;
> -	   addr = stack_pointer_rtx;
> -	}
> -      /* See if we can combine the load of the return address with the
> -	 final stack adjustment.
> -	 We need a separate load if there are still registers to
> -	 restore.  We also want a separate load if the combined insn
> -	 would need a limm, but a separate load doesn't.  */
> -      if (ra_offs
> -	  && !cfun->machine->frame_info.gmask
> -	  && (SMALL_INT (ra_offs) || !SMALL_INT (ra_offs >> 2)))
> -	{
> -	  addr = gen_rtx_PRE_MODIFY (Pmode, stack_pointer_rtx, addr);
> -	  cfa_adjust = ra_offs;
> -	  first_offset = 0;
> -	  size_to_deallocate -= cfun->machine->frame_info.reg_size;
> -	}
> -      else if (!ra_offs && size_to_deallocate == UNITS_PER_WORD)
> +      /* using millicode calls.  */
> +      size_to_deallocate -= arc_restore_callee_milli (gmask, restore_blink,
> +						      restore_fp,
> +						      return_p,
> +						      first_offset);
> +      if (return_p)
>  	{
> -	  addr = gen_rtx_POST_INC (Pmode, addr);
> -	  cfa_adjust = GET_MODE_SIZE (Pmode);
> -	  size_to_deallocate = 0;
> -	}
> -
> -      insn = frame_move_inc (ra, gen_frame_mem (Pmode, addr),
> -			     stack_pointer_rtx, addr);
> -      if (cfa_adjust)
> -	{
> -	  enum reg_note note = REG_CFA_ADJUST_CFA;
> -
> -	  add_reg_note (insn, note,
> -			gen_rtx_SET (stack_pointer_rtx,
> -				     plus_constant (SImode, stack_pointer_rtx,
> -						    cfa_adjust)));
> +	  gcc_assert (size_to_deallocate == 0);
> +	  return;
>  	}
> -      add_reg_note (insn, REG_CFA_RESTORE, ra);
>      }
> +  else
> +    size_to_deallocate -= arc_restore_callee_saves (gmask, restore_blink,
> +						    restore_fp,
> +						    first_offset,
> +						    size_to_deallocate);
>  
> -  if (!millicode_p)
> -    {
> -       if (cfun->machine->frame_info.reg_size)
> -	 arc_save_restore (stack_pointer_rtx,
> -	   /* The zeroing of these two bits is unnecessary, but leave this in for clarity.  */
> -			   cfun->machine->frame_info.gmask
> -			   & ~(FRAME_POINTER_MASK | RETURN_ADDR_MASK), 1, &first_offset);
> -    }
> -
> -  /* The rest of this function does the following:
> -     ARCompact    : handle epilogue_delay, restore sp (phase-2), return
> -  */
> -
> -  /* Keep track of how much of the stack pointer we've restored.
> -     It makes the following a lot more readable.  */
> -  size_to_deallocate += first_offset;
> +  /* Keep track of how much of the stack pointer we've restored.  It
> +     makes the following a lot more readable.  */
>    restored = size - size_to_deallocate;
>  
>    if (size > restored)
> @@ -3517,6 +3857,62 @@ arc_expand_epilogue (int sibcall_p)
>      emit_jump_insn (gen_simple_return ());
>  }
>  
> +
> +bool
> +arc_check_multi (rtx op, bool push_p)
> +{

Missing header comment.  Please add a comment block before the function
describing its purpose, its OP and PUSH_P arguments, and its return value,
as required by the GNU Coding Standards.

> +  HOST_WIDE_INT len = XVECLEN (op, 0);
> +  unsigned int regno, i, start;
> +  unsigned int memp = push_p ? 0 : 1;
> +  rtx elt;
> +
> +  if (len <= 1)
> +    return false;
> +
> +  start = 1;
> +  elt = XVECEXP (op, 0, 0);
> +  if (!push_p && GET_CODE (elt) == RETURN)
> +    start = 2;
> +
> +  for (i = start, regno = 13; i < len; i++, regno++)
> +    {
> +      rtx elt = XVECEXP (op, 0, i);
> +      rtx reg, mem, addr;
> +
> +      if (GET_CODE (elt) != SET)
> +	return false;
> +      mem = XEXP (elt, memp);
> +      reg = XEXP (elt, 1 - memp);
> +
> +      if (!REG_P (reg)
> +	  || !MEM_P (mem))
> +	return false;
> +
> +      /* Check for blink.  */
> +      if (REGNO (reg) == RETURN_ADDR_REGNUM
> +	  && i == start)
> +	regno = 12;
> +      else if (REGNO (reg) == FRAME_POINTER_REGNUM)
> +	++i;
> +      else if (REGNO (reg) != regno)
> +	return false;
> +
> +      addr = XEXP (mem, 0);
> +      if (GET_CODE (addr) == PLUS)
> +	{
> +	  if (!rtx_equal_p (stack_pointer_rtx, XEXP (addr, 0))
> +	      || !CONST_INT_P (XEXP (addr, 1)))
> +	    return false;
> +	}
> +      else
> +	{
> +	  if (!rtx_equal_p (stack_pointer_rtx, addr))
> +	    return false;
> +	}
> +    }
> +return true;

Indentation.  The `return true;` statement should be indented to match the
function body (two spaces).

> +}
> +
>  /* Return rtx for the location of the return address on the stack,
>     suitable for use in __builtin_eh_return.  The new return address
>     will be written to this location in order to redirect the return to
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index b968022e64a..fb8a1c9ee09 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -6430,6 +6430,178 @@ archs4x, archs4xd, archs4xd_slow"
>     }
>  )
>  
> +(define_insn "*push_multi_fp"
> +  [(match_parallel 0 "push_multi_operand"
> +		   [(set (reg:SI SP_REG)
> +			 (plus:SI (reg:SI SP_REG)
> +				  (match_operand 1 "immediate_operand" "")))
> +		    (set (mem:SI (plus:SI (reg:SI SP_REG)
> +					  (match_dup 1)))
> +			 (reg:SI 13))])]
> +  "TARGET_CODE_DENSITY"
> +  {
> +   int len = XVECLEN (operands[0], 0);
> +   rtx tmp = XVECEXP (operands[0], 0, len - 1);
> +   if (MEM_P (XEXP (tmp, 0)))
> +     {
> +      operands[2] = XEXP (tmp, 1);
> +      return "enter_s\\t{r13-%2} ; sp=sp-%1";
> +     }
> +   else
> +     {
> +      tmp = XVECEXP (operands[0], 0, len - 3);
> +      operands[2] = XEXP (tmp, 1);
> +      return "enter_s\\t{r13-%2, fp} ; sp=sp-%1";
> +     }
> +  }
> +  [(set_attr "type" "call_no_delay_slot")
> +   (set_attr "length" "2")])
> +
> +(define_insn "*push_multi_fp_blink"
> +  [(match_parallel 0 "push_multi_operand"
> +		   [(set (reg:SI SP_REG)
> +			 (plus:SI (reg:SI SP_REG)
> +				  (match_operand 1 "immediate_operand" "")))
> +		    (set (mem:SI (plus:SI (reg:SI SP_REG)
> +					  (match_dup 1)))
> +			 (reg:SI RETURN_ADDR_REGNUM))])]
> +  "TARGET_CODE_DENSITY"
> +  {
> +   int len = XVECLEN (operands[0], 0);
> +   rtx tmp = XVECEXP (operands[0], 0, len - 1);
> +   if (MEM_P (XEXP (tmp, 0)))
> +     {
> +      operands[2] = XEXP (tmp, 1);
> +      return "enter_s\\t{r13-%2, blink} ; sp=sp-%1";
> +     }
> +   else
> +     {
> +      tmp = XVECEXP (operands[0], 0, len - 3);
> +      operands[2] = XEXP (tmp, 1);
> +      return "enter_s\\t{r13-%2, fp, blink} ; sp=sp-%1";
> +     }
> +  }
> +  [(set_attr "type" "call_no_delay_slot")
> +   (set_attr "length" "2")])
> +
> +(define_insn "*pop_multi_fp"
> +  [(match_parallel 0 "pop_multi_operand"
> +		   [(set (reg:SI SP_REG)
> +			 (plus:SI (reg:SI SP_REG)
> +				  (match_operand 1 "immediate_operand" "")))
> +		    (set (reg:SI 13)
> +			 (mem:SI
> +			  (plus:SI (reg:SI SP_REG)
> +				   (match_operand 2 "immediate_operand" ""))))])]
> +  "TARGET_CODE_DENSITY"
> +  {
> +   int len = XVECLEN (operands[0], 0);
> +   rtx tmp = XVECEXP (operands[0], 0, len - 1);
> +   if (XEXP (tmp, 0) != frame_pointer_rtx)
> +     {
> +      operands[3] = XEXP (tmp, 0);
> +      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
> +      return "leave_s\\t{r13-%3} ; sp=sp+%1";
> +     }
> +   else
> +     {
> +      tmp = XVECEXP (operands[0], 0, len - 2);
> +      operands[3] = XEXP (tmp, 0);
> +      return "leave_s\\t{r13-%3, fp} ; sp=sp+%1";
> +     }
> +  }
> +  [(set_attr "type" "call_no_delay_slot")
> +   (set_attr "length" "2")])
> +
> +(define_insn "*pop_multi_fp_blink"
> +  [(match_parallel 0 "pop_multi_operand"
> +		   [(set (reg:SI SP_REG)
> +			 (plus:SI (reg:SI SP_REG)
> +				  (match_operand 1 "immediate_operand" "")))
> +		    (set (reg:SI RETURN_ADDR_REGNUM)
> +			 (mem:SI
> +			  (plus:SI (reg:SI SP_REG)
> +				   (match_operand 2 "immediate_operand" ""))))])]
> +  "TARGET_CODE_DENSITY"
> +  {
> +   int len = XVECLEN (operands[0], 0);
> +   rtx tmp = XVECEXP (operands[0], 0, len - 1);
> +   if (XEXP (tmp, 0) != frame_pointer_rtx)
> +     {
> +      operands[3] = XEXP (tmp, 0);
> +      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
> +      return "leave_s\\t{r13-%3, blink} ; sp=sp+%1";
> +     }
> +   else
> +     {
> +      tmp = XVECEXP (operands[0], 0, len - 2);
> +      operands[3] = XEXP (tmp, 0);
> +      return "leave_s\\t{r13-%3, fp, blink} ; sp=sp+%1";
> +     }
> +  }
> +  [(set_attr "type" "call_no_delay_slot")
> +   (set_attr "length" "2")])
> +
> +(define_insn "*pop_multi_fp_ret"
> +  [(match_parallel 0 "pop_multi_operand"
> +		   [(return)
> +		    (set (reg:SI SP_REG)
> +			 (plus:SI (reg:SI SP_REG)
> +				  (match_operand 1 "immediate_operand" "")))
> +		    (set (reg:SI 13)
> +			 (mem:SI
> +			  (plus:SI (reg:SI SP_REG)
> +				   (match_operand 2 "immediate_operand" ""))))])]
> +  "TARGET_CODE_DENSITY"
> +  {
> +   int len = XVECLEN (operands[0], 0);
> +   rtx tmp = XVECEXP (operands[0], 0, len - 1);
> +   if (XEXP (tmp, 0) != frame_pointer_rtx)
> +     {
> +      operands[3] = XEXP (tmp, 0);
> +      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
> +      return "leave_s\\t{r13-%3, pcl} ; sp=sp+%1";
> +     }
> +   else
> +     {
> +      tmp = XVECEXP (operands[0], 0, len - 2);
> +      operands[3] = XEXP (tmp, 0);
> +      return "leave_s\\t{r13-%3, fp, pcl} ; sp=sp+%1";
> +     }
> +  }
> +  [(set_attr "type" "call_no_delay_slot")
> +   (set_attr "length" "2")])
> +
> +(define_insn "*pop_multi_fp_blink_ret"
> +  [(match_parallel 0 "pop_multi_operand"
> +		   [(return)
> +		    (set (reg:SI SP_REG)
> +			 (plus:SI (reg:SI SP_REG)
> +				  (match_operand 1 "immediate_operand" "")))
> +		    (set (reg:SI RETURN_ADDR_REGNUM)
> +			 (mem:SI
> +			  (plus:SI (reg:SI SP_REG)
> +				   (match_operand 2 "immediate_operand" ""))))])]
> +  "TARGET_CODE_DENSITY"
> +  {
> +   int len = XVECLEN (operands[0], 0);
> +   rtx tmp = XVECEXP (operands[0], 0, len - 1);
> +   if (XEXP (tmp, 0) != frame_pointer_rtx)
> +     {
> +      operands[3] = XEXP (tmp, 0);
> +      gcc_assert (INTVAL (operands[1]) == INTVAL (operands[2]));
> +      return "leave_s\\t{r13-%3, blink, pcl} ; sp=sp+%1";
> +     }
> +   else
> +     {
> +      tmp = XVECEXP (operands[0], 0, len - 2);
> +      operands[3] = XEXP (tmp, 0);
> +      return "leave_s\\t{r13-%3, fp, blink, pcl} ; sp=sp+%1";
> +     }
> +  }
> +  [(set_attr "type" "call_no_delay_slot")
> +   (set_attr "length" "2")])
> +
>  ;; include the arc-FPX instructions
>  (include "fpx.md")
>  
> diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
> index 3e96b58375d..93e18af1d27 100644
> --- a/gcc/config/arc/arc.opt
> +++ b/gcc/config/arc/arc.opt
> @@ -193,9 +193,9 @@ msdata
>  Target Report InverseMask(NO_SDATA_SET)
>  Generate sdata references.  This is the default, unless you compile for PIC.
>  
> -mno-millicode
> -Target Report Mask(NO_MILLICODE_THUNK_SET)
> -Do not generate millicode thunks (needed only with -Os).
> +mmillicode
> +Target Report Mask(MILLICODE_THUNK_SET)
> +Generate millicode thunks.
>  
>  mspfp
>  Target Report Mask(SPFP_COMPACT_SET)
> @@ -532,3 +532,7 @@ Enable 16-entry register file.
>  mbranch-index
>  Target Report Var(TARGET_BRANCH_INDEX) Init(DEFAULT_BRANCH_INDEX)
>  Enable use of BI/BIH instructions when available.
> +
> +mcode-density-frame
> +Target Report Var(TARGET_CODE_DENSITY_FRAME)
> +Enable ENTER_S and LEAVE_S opcodes for ARCv2.
> diff --git a/gcc/config/arc/predicates.md b/gcc/config/arc/predicates.md
> index 0abfc839b07..efa3650e1fa 100644
> --- a/gcc/config/arc/predicates.md
> +++ b/gcc/config/arc/predicates.md
> @@ -800,3 +800,15 @@
>  (define_predicate "arc_short_operand"
>    (ior (match_test "register_operand (op, mode)")
>         (match_test "short_unsigned_const_operand (op, mode)")))
> +
> +(define_special_predicate "push_multi_operand"
> +  (match_code "parallel")
> +  {
> +   return arc_check_multi (op, true);
> +})
> +
> +(define_special_predicate "pop_multi_operand"
> +  (match_code "parallel")
> +  {
> +   return arc_check_multi (op, false);
> +})
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 454587310c8..5cdd8ba23e9 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -646,9 +646,9 @@ Objective-C and Objective-C++ Dialects}.
>  -malign-call  -mauto-modify-reg  -mbbit-peephole  -mno-brcc @gol
>  -mcase-vector-pcrel  -mcompact-casesi  -mno-cond-exec  -mearly-cbranchsi @gol
>  -mexpand-adddi  -mindexed-loads  -mlra  -mlra-priority-none @gol
> --mlra-priority-compact mlra-priority-noncompact  -mno-millicode @gol
> +-mlra-priority-compact mlra-priority-noncompact  -mmillicode @gol
>  -mmixed-code  -mq-class  -mRcq  -mRcw  -msize-level=@var{level} @gol
> --mtune=@var{cpu}  -mmultcost=@var{num} @gol
> +-mtune=@var{cpu}  -mmultcost=@var{num} -mcode-density-frame @gol
>  -munalign-prob-threshold=@var{probability}  -mmpy-option=@var{multo} @gol
>  -mdiv-rem  -mcode-density  -mll64  -mfpu=@var{fpu} -mrf16 -mbranch-index}
>  
> @@ -16042,15 +16042,21 @@ Indicate target register priority for r0..r3 / r12..r15.
>  @opindex mlra-priority-noncompact
>  Reduce target register priority for r0..r3 / r12..r15.
>  
> -@item -mno-millicode
> -@opindex mno-millicode
> +@item -mmillicode
> +@opindex mmillicode
>  When optimizing for size (using @option{-Os}), prologues and epilogues
>  that have to save or restore a large number of registers are often
>  shortened by using call to a special function in libgcc; this is
>  referred to as a @emph{millicode} call.  As these calls can pose
>  performance issues, and/or cause linking issues when linking in a
> -nonstandard way, this option is provided to turn off millicode call
> -generation.
> +nonstandard way, this option is provided to turn on or off millicode
> +call generation.
> +
> +@item -mcode-density-frame
> +@opindex mcode-density-frame
> +This option enable the compiler to emit @code{enter} and @code{leave}
> +instructions.  These instructions are only valid for CPUs with
> +code-density feature.
>  
>  @item -mmixed-code
>  @opindex mmixed-code
> diff --git a/gcc/testsuite/gcc.target/arc/firq-1.c b/gcc/testsuite/gcc.target/arc/firq-1.c
> index 87f408793dc..34d2fedcb45 100644
> --- a/gcc/testsuite/gcc.target/arc/firq-1.c
> +++ b/gcc/testsuite/gcc.target/arc/firq-1.c
> @@ -20,8 +20,8 @@ handler1 (void)
>         : "r0", "r1", "r2", "r3", "r4",
>  	 "r5", "r6", "r7", "r8", "r9");
>  }
> -/* { dg-final { scan-assembler-times "r2,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r4,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r6,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r8,\\\[sp,\[0-9\]+\\\]" 2 } } */
> +/* { dg-final { scan-assembler-times "r2,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r4,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r6,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r8,\\\[sp" 2 } } */
>  /* { dg-final { scan-assembler "rtie" } } */
> diff --git a/gcc/testsuite/gcc.target/arc/firq-3.c b/gcc/testsuite/gcc.target/arc/firq-3.c
> index cfd37949780..2d45f3c0af5 100644
> --- a/gcc/testsuite/gcc.target/arc/firq-3.c
> +++ b/gcc/testsuite/gcc.target/arc/firq-3.c
> @@ -28,13 +28,13 @@ handler1 (void)
>  /* { dg-final { scan-assembler-not "r15,\\\[sp" } } */
>  
>  /* { dg-final { scan-assembler-times "r4,\\\[sp" 2 } } */
> -/* { dg-final { scan-assembler-times "r6,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r8,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r10,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r16,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r18,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r20,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r24,\\\[sp,\[0-9\]+\\\]" 2 } } */
> +/* { dg-final { scan-assembler-times "r6,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r8,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r10,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r16,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r18,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r20,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r24,\\\[sp" 2 } } */
>  /* { dg-final { scan-assembler-times "fp,\\\[sp," 2 } } */
>  
>  /* { dg-final { scan-assembler "rtie" } } */
> diff --git a/gcc/testsuite/gcc.target/arc/firq-4.c b/gcc/testsuite/gcc.target/arc/firq-4.c
> index 2531c001bef..828facddf08 100644
> --- a/gcc/testsuite/gcc.target/arc/firq-4.c
> +++ b/gcc/testsuite/gcc.target/arc/firq-4.c
> @@ -18,13 +18,13 @@ handler1 (void)
>  		  "r25");
>  }
>  /* { dg-final { scan-assembler-times "r4,\\\[sp" 2 } } */
> -/* { dg-final { scan-assembler-times "r6,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r8,\\\[sp,\[0-9\]+\\\]" 2 } } */
> +/* { dg-final { scan-assembler-times "r6,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r8,\\\[sp" 2 } } */
>  
> -/* { dg-final { scan-assembler-times "r16,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r18,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r20,\\\[sp,\[0-9\]+\\\]" 2 } } */
> -/* { dg-final { scan-assembler-times "r24,\\\[sp,\[0-9\]+\\\]" 2 } } */
> +/* { dg-final { scan-assembler-times "r16,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r18,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r20,\\\[sp" 2 } } */
> +/* { dg-final { scan-assembler-times "r24,\\\[sp" 2 } } */
>  
>  /* { dg-final { scan-assembler-not "fp,\\\[sp" } } */
>  /* { dg-final { scan-assembler-not "push.*fp" } } */
> diff --git a/gcc/testsuite/gcc.target/arc/interrupt-6.c b/gcc/testsuite/gcc.target/arc/interrupt-6.c
> index 509ff302124..d82bd67edd8 100644
> --- a/gcc/testsuite/gcc.target/arc/interrupt-6.c
> +++ b/gcc/testsuite/gcc.target/arc/interrupt-6.c
> @@ -18,5 +18,5 @@ foo(void)
>    bar (p);
>  }
>  /* { dg-final { scan-assembler-not ".*fp,\\\[sp" } } */
> -/* { dg-final { scan-assembler "ld.*blink,\\\[sp\\\]" } } */
> +/* { dg-final { scan-assembler "ld.*blink,\\\[sp" } } */
>  /* { dg-final { scan-assembler "push_s.*blink" } } */
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads
  2018-10-22 18:15   ` Andrew Burgess
@ 2018-10-22 23:29     ` Bernhard Reutner-Fischer
  2018-10-31  9:11       ` claziss
  0 siblings, 1 reply; 25+ messages in thread
From: Bernhard Reutner-Fischer @ 2018-10-22 23:29 UTC (permalink / raw)
  To: gcc-patches, Andrew Burgess, Claudiu Zissulescu; +Cc: fbedard, claziss

On 22 October 2018 19:49:35 CEST, Andrew Burgess <andrew.burgess@embecosm.com> wrote:
>* Claudiu Zissulescu <claziss@gmail.com> [2018-10-10 11:00:14 +0300]:

>> --- a/gcc/config/arc/arc.c
>> +++ b/gcc/config/arc/arc.c
>> @@ -10803,6 +10803,169 @@ arc_cannot_substitute_mem_equiv_p (rtx)
>>    return true;
>>  }
>>  
>> +/* Checks whether the operands are valid for use in an LDD/STD
>> +   instruction.	 Assumes that RT, RT2, and RN are REG.	This is
>> +   guaranteed by the patterns.	Assumes that the address in the base
>> +   register RN is word aligned.	 Pattern guarantees that both memory
>> +   accesses use the same base register, the offsets are constants
>> +   within the range, and the gap between the offsets is 4.  If
>preload
>> +   complete then check that registers are legal.  WBACK indicates
>> +   whether address is updated.	*/
>
>You've got tabs instead of whitespace inside both this comment block,
>and others within this patch.  It should be a period and two spaces
>at the end of each sentence.

See contrib/check_GNU_style.py

Also:

s/If preload/If reload/

thanks,

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 6/6] [ARC] Handle store cacheline hazard.
  2018-10-10  8:01 ` [PATCH 6/6] [ARC] Handle store cacheline hazard Claudiu Zissulescu
@ 2018-10-30 10:13   ` Andrew Burgess
  2018-10-31  9:19     ` claziss
  2018-10-31 13:43     ` claziss
  0 siblings, 2 replies; 25+ messages in thread
From: Andrew Burgess @ 2018-10-30 10:13 UTC (permalink / raw)
  To: Claudiu Zissulescu; +Cc: gcc-patches, fbedard, claziss

* Claudiu Zissulescu <claziss@gmail.com> [2018-10-10 11:00:16 +0300]:

> Handle store cacheline hazard for A700 cpus by inserting two NOP_S
> between ST ST LD or their logical equivalent (like ST ST NOP_S NOP_S
> J_L.D LD)
> 
> gcc/
> 2016-08-01  Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* config/arc/arc-arch.h (ARC_TUNE_ARC7XX): New tune value.
> 	* config/arc/arc.c (arc_active_insn): New function.
> 	(check_store_cacheline_hazard): Likewise.
> 	(workaround_arc_anomaly): Use check_store_cacheline_hazard.
> 	(arc_override_options): Disable delay slot scheduler for older
> 	A7.
> 	(arc_store_addr_hazard_p): New implementation, old one renamed to
> 	...
> 	(arc_store_addr_hazard_internal_p): Renamed.
> 	(arc_reorg): Don't combine into brcc instructions which are part
> 	of hardware hazard solution.
> 	* config/arc/arc.md (attr tune): Consider new arc7xx tune value.
> 	(tune_arc700): Likewise.
> 	* config/arc/arc.opt (arc7xx): New tune value.
> 	* config/arc/arc700.md: Improve A7 scheduler.

Basically happy with this, mostly just a few missing header comments on
new functions.

Thanks,
Andrew


> ---
>  gcc/config/arc/arc-arch.h |   1 +
>  gcc/config/arc/arc.c      | 142 ++++++++++++++++++++++++++++++++------
>  gcc/config/arc/arc.md     |   8 ++-
>  gcc/config/arc/arc.opt    |   3 +
>  gcc/config/arc/arc700.md  |  18 +----
>  5 files changed, 132 insertions(+), 40 deletions(-)
> 
> diff --git a/gcc/config/arc/arc-arch.h b/gcc/config/arc/arc-arch.h
> index 859af0684b8..ad540607e55 100644
> --- a/gcc/config/arc/arc-arch.h
> +++ b/gcc/config/arc/arc-arch.h
> @@ -71,6 +71,7 @@ enum arc_tune_attr
>    {
>      ARC_TUNE_NONE,
>      ARC_TUNE_ARC600,
> +    ARC_TUNE_ARC7XX,
>      ARC_TUNE_ARC700_4_2_STD,
>      ARC_TUNE_ARC700_4_2_XMAC,
>      ARC_TUNE_CORE_3,
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index ab7735d6b38..90454928379 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -1308,6 +1308,10 @@ arc_override_options (void)
>    if (TARGET_LONG_CALLS_SET)
>      target_flags &= ~MASK_MILLICODE_THUNK_SET;
>  
> +  /* A7 has an issue with delay slots.  */
> +  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
> +    flag_delayed_branch = 0;
> +
>    /* These need to be done at start up.  It's convenient to do them here.  */
>    arc_init ();
>  }
> @@ -7529,11 +7533,91 @@ arc_invalid_within_doloop (const rtx_insn *insn)
>    return NULL;
>  }
>  
> +static rtx_insn *
> +arc_active_insn (rtx_insn *insn)

Missing header comment.

> +{
> +  rtx_insn *nxt = next_active_insn (insn);
> +
> +  if (nxt && GET_CODE (PATTERN (nxt)) == ASM_INPUT)
> +    nxt = next_active_insn (nxt);
> +  return nxt;
> +}
> +
> +/* Search for a sequence made out of two stores and a given number of
> +   loads, insert a nop if required.  */
> +
> +static void
> +check_store_cacheline_hazard (void)
> +{
> +  rtx_insn *insn, *succ0, *insn1;
> +  bool found = false;
> +
> +  for (insn = get_insns (); insn; insn = arc_active_insn (insn))
> +    {
> +      succ0 = arc_active_insn (insn);
> +
> +      if (!succ0)
> +	return;
> +
> +      if (!single_set (insn) || !single_set (succ0))
> +	continue;
> +
> +      if ((get_attr_type (insn) != TYPE_STORE)
> +	  || (get_attr_type (succ0) != TYPE_STORE))
> +	continue;
> +
> +      /* Found at least two consecutive stores.  Goto the end of the
> +	 store sequence.  */
> +      for (insn1 = succ0; insn1; insn1 = arc_active_insn (insn1))
> +	if (!single_set (insn1) || get_attr_type (insn1) != TYPE_STORE)
> +	  break;
> +
> +      /* Now, check the next two instructions for the following cases:
> +         1. next instruction is a LD => insert 2 nops between store
> +	    sequence and load.
> +	 2. next-next instruction is a LD => inset 1 nop after the store
> +	    sequence.  */
> +      if (insn1 && single_set (insn1)
> +	  && (get_attr_type (insn1) == TYPE_LOAD))
> +	{
> +	  found = true;
> +	  emit_insn_before (gen_nopv (), insn1);
> +	  emit_insn_before (gen_nopv (), insn1);
> +	}
> +      else
> +	{
> +	  if (insn1 && (get_attr_type (insn1) == TYPE_COMPARE))
> +	    {
> +	      /* REG_SAVE_NOTE is used by Haifa scheduler, we are in
> +		 reorg, so it is safe to reuse it for avoiding the
> +		 current compare insn to be part of a BRcc
> +		 optimization.  */
> +	      add_reg_note (insn1, REG_SAVE_NOTE, GEN_INT (3));
> +	    }
> +	  insn1 = arc_active_insn (insn1);
> +	  if (insn1 && single_set (insn1)
> +	      && (get_attr_type (insn1) == TYPE_LOAD))
> +	    {
> +	      found = true;
> +	      emit_insn_before (gen_nopv (), insn1);
> +	    }
> +	}
> +
> +      insn = insn1;
> +      if (found)
> +	{
> +	  /* warning (0, "Potential lockup sequence found, patching"); */

I'm not a fan of this approach.  I'd rather the comment explain what
problem was found and patched, and why displaying a warning is not
appropriate.  The commented out code just leaves me asking ... why?

> +	  found = false;
> +	}
> +    }
> +}
> +
>  /* Return true if a load instruction (CONSUMER) uses the same address as a
>     store instruction (PRODUCER).  This function is used to avoid st/ld
>     address hazard in ARC700 cores.  */
> -bool
> -arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
> +
> +static bool
> +arc_store_addr_hazard_internal_p (rtx_insn* producer, rtx_insn* consumer)
>  {
>    rtx in_set, out_set;
>    rtx out_addr, in_addr;
> @@ -7581,6 +7665,14 @@ arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
>    return false;
>  }
>  
> +bool
> +arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
> +{

Missing header comment.

> +  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
> +    return true;
> +  return arc_store_addr_hazard_internal_p (producer, consumer);
> +}
> +
>  /* The same functionality as arc_hazard.  It is called in machine
>     reorg before any other optimization.  Hence, the NOP size is taken
>     into account when doing branch shortening.  */
> @@ -7589,6 +7681,7 @@ static void
>  workaround_arc_anomaly (void)
>  {
>    rtx_insn *insn, *succ0;
> +  rtx_insn *succ1;
>  
>    /* For any architecture: call arc_hazard here.  */
>    for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> @@ -7600,27 +7693,30 @@ workaround_arc_anomaly (void)
>  	}
>      }
>  
> -  if (TARGET_ARC700)
> -    {
> -      rtx_insn *succ1;
> +  if (!TARGET_ARC700)
> +    return;
>  
> -      for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> -	{
> -	  succ0 = next_real_insn (insn);
> -	  if (arc_store_addr_hazard_p (insn, succ0))
> -	    {
> -	      emit_insn_after (gen_nopv (), insn);
> -	      emit_insn_after (gen_nopv (), insn);
> -	      continue;
> -	    }
> +  /* Old A7 are suffering of a cache hazard, and we need to insert two
> +     nops between any sequence of stores and a load.  */
> +  if (arc_tune != ARC_TUNE_ARC7XX)
> +    check_store_cacheline_hazard ();
>  
> -	  /* Avoid adding nops if the instruction between the ST and LD is
> -	     a call or jump.  */
> -	  succ1 = next_real_insn (succ0);
> -	  if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
> -	      && arc_store_addr_hazard_p (insn, succ1))
> -	    emit_insn_after (gen_nopv (), insn);
> +  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> +    {
> +      succ0 = next_real_insn (insn);
> +      if (arc_store_addr_hazard_internal_p (insn, succ0))
> +	{
> +	  emit_insn_after (gen_nopv (), insn);
> +	  emit_insn_after (gen_nopv (), insn);
> +	  continue;
>  	}
> +
> +      /* Avoid adding nops if the instruction between the ST and LD is
> +	 a call or jump.  */
> +      succ1 = next_real_insn (succ0);
> +      if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
> +	  && arc_store_addr_hazard_internal_p (insn, succ1))
> +	emit_insn_after (gen_nopv (), insn);
>      }
>  }
>  
> @@ -8291,11 +8387,15 @@ arc_reorg (void)
>  	      if (!link_insn)
>  		continue;
>  	      else
> -		/* Check if this is a data dependency.  */
>  		{
> +		  /* Check if this is a data dependency.  */
>  		  rtx op, cc_clob_rtx, op0, op1, brcc_insn, note;
>  		  rtx cmp0, cmp1;
>  
> +		  /* Make sure we can use it for brcc insns.  */
> +		  if (find_reg_note (link_insn, REG_SAVE_NOTE, GEN_INT (3)))
> +		    continue;
> +
>  		  /* Ok this is the set cc. copy args here.  */
>  		  op = XEXP (pc_target, 0);
>  
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index fb8a1c9ee09..caf7deda505 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -600,11 +600,13 @@
>  ;;   somehow modify them to become inelegible for delay slots if a decision
>  ;;   is made that makes conditional execution required.
>  
> -(define_attr "tune" "none,arc600,arc700_4_2_std,arc700_4_2_xmac, core_3, \
> -archs4x, archs4xd, archs4xd_slow"
> +(define_attr "tune" "none,arc600,arc7xx,arc700_4_2_std,arc700_4_2_xmac, \
> +core_3, archs4x, archs4xd, archs4xd_slow"
>    (const
>     (cond [(symbol_ref "arc_tune == TUNE_ARC600")
>  	  (const_string "arc600")
> +	  (symbol_ref "arc_tune == ARC_TUNE_ARC7XX")
> +	  (const_string "arc7xx")
>  	  (symbol_ref "arc_tune == TUNE_ARC700_4_2_STD")
>  	  (const_string "arc700_4_2_std")
>  	  (symbol_ref "arc_tune == TUNE_ARC700_4_2_XMAC")
> @@ -619,7 +621,7 @@ archs4x, archs4xd, archs4xd_slow"
>  	 (const_string "none"))))
>  
>  (define_attr "tune_arc700" "false,true"
> -  (if_then_else (eq_attr "tune" "arc700_4_2_std, arc700_4_2_xmac")
> +  (if_then_else (eq_attr "tune" "arc7xx, arc700_4_2_std, arc700_4_2_xmac")
>  		(const_string "true")
>  		(const_string "false")))
>  
> diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
> index 93e18af1d27..bcffb2720ba 100644
> --- a/gcc/config/arc/arc.opt
> +++ b/gcc/config/arc/arc.opt
> @@ -262,6 +262,9 @@ Enum(arc_tune_attr) String(arc600) Value(ARC_TUNE_ARC600)
>  EnumValue
>  Enum(arc_tune_attr) String(arc601) Value(ARC_TUNE_ARC600)
>  
> +EnumValue
> +Enum(arc_tune_attr) String(arc7xx) Value(ARC_TUNE_ARC7XX)
> +
>  EnumValue
>  Enum(arc_tune_attr) String(arc700) Value(ARC_TUNE_ARC700_4_2_STD)
>  
> diff --git a/gcc/config/arc/arc700.md b/gcc/config/arc/arc700.md
> index a0f9f74a9f2..cbb868d8dcd 100644
> --- a/gcc/config/arc/arc700.md
> +++ b/gcc/config/arc/arc700.md
> @@ -145,28 +145,14 @@
>  ; no functional unit runs when blockage is reserved
>  (exclusion_set "blockage" "core, multiplier")
>  
> -(define_insn_reservation "data_load_DI" 4
> -  (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "load")
> -       (match_operand:DI 0 "" ""))
> -  "issue+dmp, issue+dmp, dmp_write_port, dmp_write_port")
> -
>  (define_insn_reservation "data_load" 3
>    (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "load")
> -       (not (match_operand:DI 0 "" "")))
> +       (eq_attr "type" "load"))
>    "issue+dmp, nothing, dmp_write_port")
>  
> -(define_insn_reservation "data_store_DI" 2
> -  (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "store")
> -       (match_operand:DI 0 "" ""))
> -  "issue+dmp_write_port, issue+dmp_write_port")
> -
>  (define_insn_reservation "data_store" 1
>    (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "store")
> -       (not (match_operand:DI 0 "" "")))
> +       (eq_attr "type" "store"))
>    "issue+dmp_write_port")
>  
>  (define_bypass 3 "data_store" "data_load" "arc_store_addr_hazard_p")
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads
  2018-10-22 23:29     ` Bernhard Reutner-Fischer
@ 2018-10-31  9:11       ` claziss
  2018-11-12 11:33         ` claziss
  2018-11-13 10:00         ` Andrew Burgess
  0 siblings, 2 replies; 25+ messages in thread
From: claziss @ 2018-10-31  9:11 UTC (permalink / raw)
  To: Bernhard Reutner-Fischer, gcc-patches, Andrew Burgess; +Cc: fbedard, claziss

[-- Attachment #1: Type: text/plain, Size: 132 bytes --]

Thank you for your review. Please find attached a new respin patch with
your feedback in.

Please let me know if it is ok,
Claudiu 

[-- Attachment #2: 0001-ARC-Add-peephole-rules-to-combine-store-loads-into-d.patch --]
[-- Type: text/x-patch, Size: 9277 bytes --]

From 4ff7d8419783eceeffbaf27df017d0a93c3af942 Mon Sep 17 00:00:00 2001
From: Claudiu Zissulescu <claziss@gmail.com>
Date: Thu, 9 Aug 2018 14:29:05 +0300
Subject: [PATCH] [ARC] Add peephole rules to combine store/loads into double
 store/loads

Simple peephole rules which combine multiple ld/st instructions into
64-bit load/store instructions.  This only works for architectures
which have the double load/store option enabled.

gcc/
	Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/arc-protos.h (gen_operands_ldd_std): Add.
	* config/arc/arc.c (operands_ok_ldd_std): New function.
	(mem_ok_for_ldd_std): Likewise.
	(gen_operands_ldd_std): Likewise.
	* config/arc/arc.md: Add peephole2 rules for std/ldd.
---
 gcc/config/arc/arc-protos.h |   1 +
 gcc/config/arc/arc.c        | 161 ++++++++++++++++++++++++++++++++++++
 gcc/config/arc/arc.md       |  69 ++++++++++++++++
 3 files changed, 231 insertions(+)

diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h
index 24bea6e1efb..55f8ed4c643 100644
--- a/gcc/config/arc/arc-protos.h
+++ b/gcc/config/arc/arc-protos.h
@@ -46,6 +46,7 @@ extern int arc_return_address_register (unsigned int);
 extern unsigned int arc_compute_function_type (struct function *);
 extern bool arc_is_uncached_mem_p (rtx);
 extern bool arc_lra_p (void);
+extern bool gen_operands_ldd_std (rtx *operands, bool load, bool commute);
 #endif /* RTX_CODE */
 
 extern unsigned int arc_compute_frame_size (int);
diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index 18dd0de6af7..daf785dbdb8 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -10803,6 +10803,167 @@ arc_cannot_substitute_mem_equiv_p (rtx)
   return true;
 }
 
+/* Checks whether the operands are valid for use in an LDD/STD
+   instruction.  Assumes that RT, and RT2 are REG.  This is guaranteed
+   by the patterns.  Assumes that the address in the base register RN
+   is word aligned.  Pattern guarantees that both memory accesses use
+   the same base register, the offsets are constants within the range,
+   and the gap between the offsets is 4.  If reload complete then
+   check that registers are legal.  */
+
+static bool
+operands_ok_ldd_std (rtx rt, rtx rt2, HOST_WIDE_INT offset)
+{
+  unsigned int t, t2;
+
+  if (!reload_completed)
+    return true;
+
+  if (!(SMALL_INT_RANGE (offset, (GET_MODE_SIZE (DImode) - 1) & (~0x03),
+			 (offset & (GET_MODE_SIZE (DImode) - 1) & 3
+			  ? 0 : -(-GET_MODE_SIZE (DImode) | (~0x03)) >> 1))))
+    return false;
+
+  t = REGNO (rt);
+  t2 = REGNO (rt2);
+
+  if ((t2 == PROGRAM_COUNTER_REGNO)
+      || (t % 2 != 0)	/* First destination register is not even.  */
+      || (t2 != t + 1))
+      return false;
+
+  return true;
+}
+
+/* Helper for gen_operands_ldd_std.  Returns true iff the memory
+   operand MEM's address contains an immediate offset from the base
+   register and has no side effects, in which case it sets BASE and
+   OFFSET accordingly.  */
+
+static bool
+mem_ok_for_ldd_std (rtx mem, rtx *base, rtx *offset)
+{
+  rtx addr;
+
+  gcc_assert (base != NULL && offset != NULL);
+
+  /* TODO: Handle more general memory operand patterns, such as
+     PRE_DEC and PRE_INC.  */
+
+  if (side_effects_p (mem))
+    return false;
+
+  /* Can't deal with subregs.  */
+  if (GET_CODE (mem) == SUBREG)
+    return false;
+
+  gcc_assert (MEM_P (mem));
+
+  *offset = const0_rtx;
+
+  addr = XEXP (mem, 0);
+
+  /* If addr isn't valid for DImode, then we can't handle it.  */
+  if (!arc_legitimate_address_p (DImode, addr,
+				reload_in_progress || reload_completed))
+    return false;
+
+  if (REG_P (addr))
+    {
+      *base = addr;
+      return true;
+    }
+  else if (GET_CODE (addr) == PLUS || GET_CODE (addr) == MINUS)
+    {
+      *base = XEXP (addr, 0);
+      *offset = XEXP (addr, 1);
+      return (REG_P (*base) && CONST_INT_P (*offset));
+    }
+
+  return false;
+}
+
+/* Called from peephole2 to replace two word-size accesses with a
+   single LDD/STD instruction.  Returns true iff we can generate a new
+   instruction sequence.  That is, both accesses use the same base
+   register and the gap between constant offsets is 4.  OPERANDS are
+   the operands found by the peephole matcher; OPERANDS[0,1] are
+   register operands, and OPERANDS[2,3] are the corresponding memory
+   operands.  LOAD indicates whether the access is load or store.  */
+
+bool
+gen_operands_ldd_std (rtx *operands, bool load, bool commute)
+{
+  int i, gap;
+  HOST_WIDE_INT offsets[2], offset;
+  int nops = 2;
+  rtx cur_base, cur_offset, tmp;
+  rtx base = NULL_RTX;
+
+  /* Check that the memory references are immediate offsets from the
+     same base register.  Extract the base register, the destination
+     registers, and the corresponding memory offsets.  */
+  for (i = 0; i < nops; i++)
+    {
+      if (!mem_ok_for_ldd_std (operands[nops+i], &cur_base, &cur_offset))
+	return false;
+
+      if (i == 0)
+	base = cur_base;
+      else if (REGNO (base) != REGNO (cur_base))
+	return false;
+
+      offsets[i] = INTVAL (cur_offset);
+      if (GET_CODE (operands[i]) == SUBREG)
+	{
+	  tmp = SUBREG_REG (operands[i]);
+	  gcc_assert (GET_MODE (operands[i]) == GET_MODE (tmp));
+	  operands[i] = tmp;
+	}
+    }
+
+  /* Make sure there is no dependency between the individual loads.  */
+  if (load && REGNO (operands[0]) == REGNO (base))
+    return false; /* RAW.  */
+
+  if (load && REGNO (operands[0]) == REGNO (operands[1]))
+    return false; /* WAW.  */
+
+  /* Make sure the instructions are ordered with lower memory access first.  */
+  if (offsets[0] > offsets[1])
+    {
+      gap = offsets[0] - offsets[1];
+      offset = offsets[1];
+
+      /* Swap the instructions such that lower memory is accessed first.  */
+      std::swap (operands[0], operands[1]);
+      std::swap (operands[2], operands[3]);
+    }
+  else
+    {
+      gap = offsets[1] - offsets[0];
+      offset = offsets[0];
+    }
+
+  /* Make sure accesses are to consecutive memory locations.  */
+  if (gap != 4)
+    return false;
+
+  /* Make sure we generate legal instructions.  */
+  if (operands_ok_ldd_std (operands[0], operands[1], offset))
+    return true;
+
+  if (load && commute)
+    {
+      /* Try reordering registers.  */
+      std::swap (operands[0], operands[1]);
+      if (operands_ok_ldd_std (operands[0], operands[1], offset))
+	return true;
+    }
+
+  return false;
+}
+
 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P arc_use_anchors_for_symbol_p
 
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index 1ed230fa5f0..526fd17a0cf 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -6363,6 +6363,75 @@ archs4x, archs4xd, archs4xd_slow"
   [(set (reg:CC CC_REG) (compare:CC (match_dup 3)
 				    (ashift:SI (match_dup 1) (match_dup 2))))])
 
+(define_peephole2 ; std
+  [(set (match_operand:SI 2 "memory_operand" "")
+	(match_operand:SI 0 "register_operand" ""))
+   (set (match_operand:SI 3 "memory_operand" "")
+	(match_operand:SI 1 "register_operand" ""))]
+  "TARGET_LL64"
+  [(const_int 0)]
+{
+  if (!gen_operands_ldd_std (operands, false, false))
+    FAIL;
+  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
+  operands[2] = adjust_address (operands[2], DImode, 0);
+  emit_insn (gen_rtx_SET (operands[2], operands[0]));
+  DONE;
+})
+
+(define_peephole2 ; ldd
+  [(set (match_operand:SI 0 "register_operand" "")
+	(match_operand:SI 2 "memory_operand" ""))
+   (set (match_operand:SI 1 "register_operand" "")
+	(match_operand:SI 3 "memory_operand" ""))]
+  "TARGET_LL64"
+  [(const_int 0)]
+{
+  if (!gen_operands_ldd_std (operands, true, false))
+    FAIL;
+  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
+  operands[2] = adjust_address (operands[2], DImode, 0);
+  emit_insn (gen_rtx_SET (operands[0], operands[2]));
+  DONE;
+})
+
+;; We require consecutive registers for LDD instruction.  Check if we
+;; can reorder them and use an LDD.
+
+(define_peephole2 ; swap the destination registers of two loads
+		  ; before a commutative operation.
+  [(set (match_operand:SI 0 "register_operand" "")
+	(match_operand:SI 2 "memory_operand" ""))
+   (set (match_operand:SI 1 "register_operand" "")
+	(match_operand:SI 3 "memory_operand" ""))
+   (set (match_operand:SI 4 "register_operand" "")
+	(match_operator:SI 5 "commutative_operator"
+			   [(match_operand 6 "register_operand" "")
+			    (match_operand 7 "register_operand" "") ]))]
+  "TARGET_LL64
+   && (((rtx_equal_p (operands[0], operands[6]))
+	 && (rtx_equal_p (operands[1], operands[7])))
+	|| ((rtx_equal_p (operands[0], operands[7]))
+	     && (rtx_equal_p (operands[1], operands[6]))))
+   && (peep2_reg_dead_p (3, operands[0])
+       || rtx_equal_p (operands[0], operands[4]))
+   && (peep2_reg_dead_p (3, operands[1])
+       || rtx_equal_p (operands[1], operands[4]))"
+  [(set (match_dup 0) (match_dup 2))
+   (set (match_dup 4) (match_op_dup 5 [(match_dup 6) (match_dup 7)]))]
+  {
+    if (!gen_operands_ldd_std (operands, true, true))
+     {
+	FAIL;
+     }
+    else
+     {
+	operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
+	operands[2] = adjust_address (operands[2], DImode, 0);
+     }
+   }
+)
+
 ;; include the arc-FPX instructions
 (include "fpx.md")
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 6/6] [ARC] Handle store cacheline hazard.
  2018-10-30 10:13   ` Andrew Burgess
@ 2018-10-31  9:19     ` claziss
  2018-10-31 13:43     ` claziss
  1 sibling, 0 replies; 25+ messages in thread
From: claziss @ 2018-10-31  9:19 UTC (permalink / raw)
  To: Andrew Burgess; +Cc: gcc-patches, fbedard, claziss


> I'm not a fan of this approach.  I'd rather the comment explain what
> problem was found and patched, and why displaying a warning is not
> appropriate.  The commented out code just leaves me asking ... why?
> 

Having the warning here breaks a number of builds, such as the Linux
kernel build. On the other hand, users were curious whether the lockup
sequence was common or not. I'll remove the commented-out warning for
clarity, and I will provide curious users with a patch to turn the
warning back on for their needs.

Thanks,
Claudiu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/6] [ARC] Add BI/BIH instruction support.
  2018-10-17  7:19   ` Sandra Loosemore
@ 2018-10-31 12:31     ` claziss
  0 siblings, 0 replies; 25+ messages in thread
From: claziss @ 2018-10-31 12:31 UTC (permalink / raw)
  To: Sandra Loosemore, gcc-patches; +Cc: andrew.burgess, fbedard, claziss

Thank you all for your review. The patch is pushed with your input in.

//Claudiu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/6] [ARC] Remove non-standard function calls.
  2018-10-11 10:14   ` Andrew Burgess
@ 2018-10-31 12:40     ` claziss
  0 siblings, 0 replies; 25+ messages in thread
From: claziss @ 2018-10-31 12:40 UTC (permalink / raw)
  To: Andrew Burgess; +Cc: gcc-patches, fbedard, claziss

Thank you for your review. Patch pushed,
Claudiu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/6] [ARC] Add BI/BIH instruction support.
  2018-10-16 23:19   ` Andrew Burgess
  2018-10-17 17:21     ` Claudiu Zissulescu
@ 2018-10-31 12:59     ` claziss
  1 sibling, 0 replies; 25+ messages in thread
From: claziss @ 2018-10-31 12:59 UTC (permalink / raw)
  To: Andrew Burgess; +Cc: gcc-patches, fbedard, claziss

Committed with your feedback in. Thank you,
Claudiu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/6] [ARC] Cleanup TLS implementation.
  2018-10-11 10:13   ` Andrew Burgess
@ 2018-10-31 13:11     ` claziss
  0 siblings, 0 replies; 25+ messages in thread
From: claziss @ 2018-10-31 13:11 UTC (permalink / raw)
  To: Andrew Burgess; +Cc: gcc-patches, fbedard, claziss

Committed with your feedback in. Thank you,
Claudiu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 6/6] [ARC] Handle store cacheline hazard.
  2018-10-30 10:13   ` Andrew Burgess
  2018-10-31  9:19     ` claziss
@ 2018-10-31 13:43     ` claziss
  1 sibling, 0 replies; 25+ messages in thread
From: claziss @ 2018-10-31 13:43 UTC (permalink / raw)
  To: Andrew Burgess; +Cc: gcc-patches, fbedard, claziss

Committed with feedback in. Thank you,
Claudiu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads
  2018-10-31  9:11       ` claziss
@ 2018-11-12 11:33         ` claziss
  2018-11-13 10:00         ` Andrew Burgess
  1 sibling, 0 replies; 25+ messages in thread
From: claziss @ 2018-11-12 11:33 UTC (permalink / raw)
  To: gcc-patches, Andrew Burgess; +Cc: fbedard, claziss

PING.

On Wed, 2018-10-31 at 10:33 +0200, claziss@gmail.com wrote:
> Thank you for your review. Please find attached a new respin patch
> with
> your feedback in.
> 
> Please let me know if it is ok,
> Claudiu 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads
  2018-10-31  9:11       ` claziss
  2018-11-12 11:33         ` claziss
@ 2018-11-13 10:00         ` Andrew Burgess
  1 sibling, 0 replies; 25+ messages in thread
From: Andrew Burgess @ 2018-11-13 10:00 UTC (permalink / raw)
  To: claziss; +Cc: Bernhard Reutner-Fischer, gcc-patches, fbedard, claziss

* claziss@gmail.com <claziss@gmail.com> [2018-10-31 10:33:33 +0200]:

> Thank you for your review. Please find attached a new respin patch with
> your feedback in.
> 
> Please let me know if it is ok,
> Claudiu 

> From 4ff7d8419783eceeffbaf27df017d0a93c3af942 Mon Sep 17 00:00:00 2001
> From: Claudiu Zissulescu <claziss@gmail.com>
> Date: Thu, 9 Aug 2018 14:29:05 +0300
> Subject: [PATCH] [ARC] Add peephole rules to combine store/loads into double
>  store/loads
> 
> Simple peephole rules which combines multiple ld/st instructions into
> 64-bit load/store instructions. It only works for architectures which
> are having double load/store option on.
> 
> gcc/
> 	Claudiu Zissulescu  <claziss@synopsys.com>
> 
> 	* config/arc/arc-protos.h (gen_operands_ldd_std): Add.
> 	* config/arc/arc.c (operands_ok_ldd_std): New function.
> 	(mem_ok_for_ldd_std): Likewise.
> 	(gen_operands_ldd_std): Likewise.
> 	* config/arc/arc.md: Add peephole2 rules for std/ldd.

Looks good.

Thanks,
Andrew


> ---
>  gcc/config/arc/arc-protos.h |   1 +
>  gcc/config/arc/arc.c        | 161 ++++++++++++++++++++++++++++++++++++
>  gcc/config/arc/arc.md       |  69 ++++++++++++++++
>  3 files changed, 231 insertions(+)
> 
> diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h
> index 24bea6e1efb..55f8ed4c643 100644
> --- a/gcc/config/arc/arc-protos.h
> +++ b/gcc/config/arc/arc-protos.h
> @@ -46,6 +46,7 @@ extern int arc_return_address_register (unsigned int);
>  extern unsigned int arc_compute_function_type (struct function *);
>  extern bool arc_is_uncached_mem_p (rtx);
>  extern bool arc_lra_p (void);
> +extern bool gen_operands_ldd_std (rtx *operands, bool load, bool commute);
>  #endif /* RTX_CODE */
>  
>  extern unsigned int arc_compute_frame_size (int);
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index 18dd0de6af7..daf785dbdb8 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -10803,6 +10803,167 @@ arc_cannot_substitute_mem_equiv_p (rtx)
>    return true;
>  }
>  
> +/* Checks whether the operands are valid for use in an LDD/STD
> +   instruction.  Assumes that RT, and RT2 are REG.  This is guaranteed
> +   by the patterns.  Assumes that the address in the base register RN
> +   is word aligned.  Pattern guarantees that both memory accesses use
> +   the same base register, the offsets are constants within the range,
> +   and the gap between the offsets is 4.  If reload complete then
> +   check that registers are legal.  */
> +
> +static bool
> +operands_ok_ldd_std (rtx rt, rtx rt2, HOST_WIDE_INT offset)
> +{
> +  unsigned int t, t2;
> +
> +  if (!reload_completed)
> +    return true;
> +
> +  if (!(SMALL_INT_RANGE (offset, (GET_MODE_SIZE (DImode) - 1) & (~0x03),
> +			 (offset & (GET_MODE_SIZE (DImode) - 1) & 3
> +			  ? 0 : -(-GET_MODE_SIZE (DImode) | (~0x03)) >> 1))))
> +    return false;
> +
> +  t = REGNO (rt);
> +  t2 = REGNO (rt2);
> +
> +  if ((t2 == PROGRAM_COUNTER_REGNO)
> +      || (t % 2 != 0)	/* First destination register is not even.  */
> +      || (t2 != t + 1))
> +      return false;
> +
> +  return true;
> +}
> +
> +/* Helper for gen_operands_ldd_std.  Returns true iff the memory
> +   operand MEM's address contains an immediate offset from the base
> +   register and has no side effects, in which case it sets BASE and
> +   OFFSET accordingly.  */
> +
> +static bool
> +mem_ok_for_ldd_std (rtx mem, rtx *base, rtx *offset)
> +{
> +  rtx addr;
> +
> +  gcc_assert (base != NULL && offset != NULL);
> +
> +  /* TODO: Handle more general memory operand patterns, such as
> +     PRE_DEC and PRE_INC.  */
> +
> +  if (side_effects_p (mem))
> +    return false;
> +
> +  /* Can't deal with subregs.  */
> +  if (GET_CODE (mem) == SUBREG)
> +    return false;
> +
> +  gcc_assert (MEM_P (mem));
> +
> +  *offset = const0_rtx;
> +
> +  addr = XEXP (mem, 0);
> +
> +  /* If addr isn't valid for DImode, then we can't handle it.  */
> +  if (!arc_legitimate_address_p (DImode, addr,
> +				reload_in_progress || reload_completed))
> +    return false;
> +
> +  if (REG_P (addr))
> +    {
> +      *base = addr;
> +      return true;
> +    }
> +  else if (GET_CODE (addr) == PLUS || GET_CODE (addr) == MINUS)
> +    {
> +      *base = XEXP (addr, 0);
> +      *offset = XEXP (addr, 1);
> +      return (REG_P (*base) && CONST_INT_P (*offset));
> +    }
> +
> +  return false;
> +}
> +
> +/* Called from peephole2 to replace two word-size accesses with a
> +   single LDD/STD instruction.  Returns true iff we can generate a new
> +   instruction sequence.  That is, both accesses use the same base
> +   register and the gap between constant offsets is 4.  OPERANDS are
> +   the operands found by the peephole matcher; OPERANDS[0,1] are
> +   register operands, and OPERANDS[2,3] are the corresponding memory
> +   operands.  LOAD indicates whether the access is load or store.  */
> +
> +bool
> +gen_operands_ldd_std (rtx *operands, bool load, bool commute)
> +{
> +  int i, gap;
> +  HOST_WIDE_INT offsets[2], offset;
> +  int nops = 2;
> +  rtx cur_base, cur_offset, tmp;
> +  rtx base = NULL_RTX;
> +
> +  /* Check that the memory references are immediate offsets from the
> +     same base register.  Extract the base register, the destination
> +     registers, and the corresponding memory offsets.  */
> +  for (i = 0; i < nops; i++)
> +    {
> +      if (!mem_ok_for_ldd_std (operands[nops+i], &cur_base, &cur_offset))
> +	return false;
> +
> +      if (i == 0)
> +	base = cur_base;
> +      else if (REGNO (base) != REGNO (cur_base))
> +	return false;
> +
> +      offsets[i] = INTVAL (cur_offset);
> +      if (GET_CODE (operands[i]) == SUBREG)
> +	{
> +	  tmp = SUBREG_REG (operands[i]);
> +	  gcc_assert (GET_MODE (operands[i]) == GET_MODE (tmp));
> +	  operands[i] = tmp;
> +	}
> +    }
> +
> +  /* Make sure there is no dependency between the individual loads.  */
> +  if (load && REGNO (operands[0]) == REGNO (base))
> +    return false; /* RAW.  */
> +
> +  if (load && REGNO (operands[0]) == REGNO (operands[1]))
> +    return false; /* WAW.  */
> +
> +  /* Make sure the instructions are ordered with lower memory access first.  */
> +  if (offsets[0] > offsets[1])
> +    {
> +      gap = offsets[0] - offsets[1];
> +      offset = offsets[1];
> +
> +      /* Swap the instructions such that lower memory is accessed first.  */
> +      std::swap (operands[0], operands[1]);
> +      std::swap (operands[2], operands[3]);
> +    }
> +  else
> +    {
> +      gap = offsets[1] - offsets[0];
> +      offset = offsets[0];
> +    }
> +
> +  /* Make sure accesses are to consecutive memory locations.  */
> +  if (gap != 4)
> +    return false;
> +
> +  /* Make sure we generate legal instructions.  */
> +  if (operands_ok_ldd_std (operands[0], operands[1], offset))
> +    return true;
> +
> +  if (load && commute)
> +    {
> +      /* Try reordering registers.  */
> +      std::swap (operands[0], operands[1]);
> +      if (operands_ok_ldd_std (operands[0], operands[1], offset))
> +	return true;
> +    }
> +
> +  return false;
> +}
> +
>  #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
>  #define TARGET_USE_ANCHORS_FOR_SYMBOL_P arc_use_anchors_for_symbol_p
>  
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index 1ed230fa5f0..526fd17a0cf 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -6363,6 +6363,75 @@ archs4x, archs4xd, archs4xd_slow"
>    [(set (reg:CC CC_REG) (compare:CC (match_dup 3)
>  				    (ashift:SI (match_dup 1) (match_dup 2))))])
>  
> +(define_peephole2 ; std
> +  [(set (match_operand:SI 2 "memory_operand" "")
> +	(match_operand:SI 0 "register_operand" ""))
> +   (set (match_operand:SI 3 "memory_operand" "")
> +	(match_operand:SI 1 "register_operand" ""))]
> +  "TARGET_LL64"
> +  [(const_int 0)]
> +{
> +  if (!gen_operands_ldd_std (operands, false, false))
> +    FAIL;
> +  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
> +  operands[2] = adjust_address (operands[2], DImode, 0);
> +  emit_insn (gen_rtx_SET (operands[2], operands[0]));
> +  DONE;
> +})
> +
> +(define_peephole2 ; ldd
> +  [(set (match_operand:SI 0 "register_operand" "")
> +	(match_operand:SI 2 "memory_operand" ""))
> +   (set (match_operand:SI 1 "register_operand" "")
> +	(match_operand:SI 3 "memory_operand" ""))]
> +  "TARGET_LL64"
> +  [(const_int 0)]
> +{
> +  if (!gen_operands_ldd_std (operands, true, false))
> +    FAIL;
> +  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
> +  operands[2] = adjust_address (operands[2], DImode, 0);
> +  emit_insn (gen_rtx_SET (operands[0], operands[2]));
> +  DONE;
> +})
> +
> +;; We require consecutive registers for LDD instruction.  Check if we
> +;; can reorder them and use an LDD.
> +
> +(define_peephole2 ; swap the destination registers of two loads
> +		  ; before a commutative operation.
> +  [(set (match_operand:SI 0 "register_operand" "")
> +	(match_operand:SI 2 "memory_operand" ""))
> +   (set (match_operand:SI 1 "register_operand" "")
> +	(match_operand:SI 3 "memory_operand" ""))
> +   (set (match_operand:SI 4 "register_operand" "")
> +	(match_operator:SI 5 "commutative_operator"
> +			   [(match_operand 6 "register_operand" "")
> +			    (match_operand 7 "register_operand" "") ]))]
> +  "TARGET_LL64
> +   && (((rtx_equal_p (operands[0], operands[6]))
> +	 && (rtx_equal_p (operands[1], operands[7])))
> +	|| ((rtx_equal_p (operands[0], operands[7]))
> +	     && (rtx_equal_p (operands[1], operands[6]))))
> +   && (peep2_reg_dead_p (3, operands[0])
> +       || rtx_equal_p (operands[0], operands[4]))
> +   && (peep2_reg_dead_p (3, operands[1])
> +       || rtx_equal_p (operands[1], operands[4]))"
> +  [(set (match_dup 0) (match_dup 2))
> +   (set (match_dup 4) (match_op_dup 5 [(match_dup 6) (match_dup 7)]))]
> +  {
> +    if (!gen_operands_ldd_std (operands, true, true))
> +     {
> +	FAIL;
> +     }
> +    else
> +     {
> +	operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
> +	operands[2] = adjust_address (operands[2], DImode, 0);
> +     }
> +   }
> +)
> +
>  ;; include the arc-FPX instructions
>  (include "fpx.md")
>  
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2018-11-13 10:00 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-10  8:01 [PATCH 0/6] ARC updates Claudiu Zissulescu
2018-10-10  8:01 ` [PATCH 6/6] [ARC] Handle store cacheline hazard Claudiu Zissulescu
2018-10-30 10:13   ` Andrew Burgess
2018-10-31  9:19     ` claziss
2018-10-31 13:43     ` claziss
2018-10-10  8:01 ` [PATCH 1/6] [ARC] Remove non standard functions calls Claudiu Zissulescu
2018-10-11 10:14   ` Andrew Burgess
2018-10-31 12:40     ` claziss
2018-10-10  8:01 ` [PATCH 4/6] [ARC] Add peephole rules to combine store/loads into double store/loads Claudiu Zissulescu
2018-10-22 18:15   ` Andrew Burgess
2018-10-22 23:29     ` Bernhard Reutner-Fischer
2018-10-31  9:11       ` claziss
2018-11-12 11:33         ` claziss
2018-11-13 10:00         ` Andrew Burgess
2018-10-10  8:01 ` [PATCH 2/6] [ARC] Cleanup TLS implementation Claudiu Zissulescu
2018-10-11 10:13   ` Andrew Burgess
2018-10-31 13:11     ` claziss
2018-10-10  8:49 ` [PATCH 3/6] [ARC] Add BI/BIH instruction support Claudiu Zissulescu
2018-10-16 23:19   ` Andrew Burgess
2018-10-17 17:21     ` Claudiu Zissulescu
2018-10-31 12:59     ` claziss
2018-10-17  7:19   ` Sandra Loosemore
2018-10-31 12:31     ` claziss
2018-10-10  9:05 ` [PATCH 5/6] [ARC] Refurbish and improve prologue/epilogue functions Claudiu Zissulescu
2018-10-22 18:26   ` Andrew Burgess

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).