From: "Ian Bolton"
Subject: [PATCH, ARM] Suppress Redundant Flag Setting for Cortex-A15
Date: Fri, 24 Jan 2014 17:16:00 -0000

Hi there!

An existing optimisation for Thumb-2 converts t32 encodings to t16
encodings to reduce code size, at the expense of causing redundant flag
setting for ADD, AND, etc.  This redundant flag setting can have a
negative performance impact on Cortex-A15.
This patch introduces two new tuning options so that the conversion from
t32 to t16, which takes place in thumb2_reorg, can be suppressed for
Cortex-A15.  To maintain some of the original benefit (reduced code
size), the suppression is only done where the enclosing basic block is
deemed worth optimising for speed.

This has been tested with no regressions, and performance has improved
for the workloads tested on Cortex-A15.  (It might be beneficial to
other processors too, but that has not been investigated yet.)

OK for stage 1?

Cheers,
Ian

2014-01-24  Ian Bolton

gcc/
	* config/arm/arm-protos.h (tune_params): New struct members.
	* config/arm/arm.c: Initialise tune_params per processor.
	(thumb2_reorg): Suppress conversion from t32 to t16 when
	optimizing for speed, based on new tune_params.

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 13874ee..74645ee 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -272,6 +272,11 @@ struct tune_params
   const struct cpu_vec_costs* vec_costs;
   /* Prefer Neon for 64-bit bitops.  */
   bool prefer_neon_for_64bits;
+  /* Prefer 32-bit encoding instead of flag-setting 16-bit encoding.  */
+  bool disparage_flag_setting_t16_encodings;
+  /* Prefer 32-bit encoding instead of 16-bit encoding where subset of flags
+     would be set.  */
+  bool disparage_partial_flag_setting_t16_encodings;
 };
 
 extern const struct tune_params *current_tune;
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index fc81bf6..1ebaf84 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1481,7 +1481,8 @@ const struct tune_params arm_slowmul_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,               /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_fastmul_tune =
@@ -1497,7 +1498,8 @@ const struct tune_params arm_fastmul_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 /* StrongARM has early execution of branches, so a sequence that is worth
@@ -1516,7 +1518,8 @@ const struct tune_params arm_strongarm_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_xscale_tune =
@@ -1532,7 +1535,8 @@ const struct tune_params arm_xscale_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_9e_tune =
@@ -1548,7 +1552,8 @@ const struct tune_params arm_9e_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_v6t2_tune =
@@ -1564,7 +1569,8 @@ const struct tune_params arm_v6t2_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 /* Generic Cortex tuning.  Use more specific tunings if appropriate.  */
@@ -1581,7 +1587,8 @@ const struct tune_params arm_cortex_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_cortex_a7_tune =
@@ -1597,7 +1604,8 @@ const struct tune_params arm_cortex_a7_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_cortex_a15_tune =
@@ -1613,7 +1621,8 @@ const struct tune_params arm_cortex_a15_tune =
   true,                                 /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  true, true                            /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_cortex_a53_tune =
@@ -1629,7 +1638,8 @@ const struct tune_params arm_cortex_a53_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 /* Branches can be dual-issued on Cortex-A5, so conditional execution is
@@ -1648,7 +1658,8 @@ const struct tune_params arm_cortex_a5_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {false, false},                       /* Prefer non short circuit.  */
   &arm_default_vec_cost,               /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_cortex_a9_tune =
@@ -1664,7 +1675,8 @@ const struct tune_params arm_cortex_a9_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_cortex_a12_tune =
@@ -1703,7 +1715,8 @@ const struct tune_params arm_v7m_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {false, false},                       /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@@ -1721,7 +1734,8 @@ const struct tune_params arm_v6m_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {false, false},                       /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 const struct tune_params arm_fa726te_tune =
@@ -1737,7 +1751,8 @@ const struct tune_params arm_fa726te_tune =
   false,                                /* Prefer LDRD/STRD.  */
   {true, true},                         /* Prefer non short circuit.  */
   &arm_default_vec_cost,                /* Vectorizer costs.  */
-  false                                 /* Prefer Neon for 64-bits bitops.  */
+  false,                                /* Prefer Neon for 64-bits bitops.  */
+  false, false                          /* Prefer 32-bit encodings.  */
 };
 
 
@@ -16763,9 +16778,20 @@ thumb2_reorg (void)
   compute_bb_for_insn ();
   df_analyze ();
 
+  enum Convert_Action {SKIP, CONV, SWAP_CONV};
+
   FOR_EACH_BB_FN (bb, cfun)
     {
+      if (current_tune->disparage_flag_setting_t16_encodings
+	  && optimize_bb_for_speed_p (bb))
+	continue;
+
       rtx insn;
+      Convert_Action action = SKIP;
+      Convert_Action action_for_partial_flag_setting
+	= (current_tune->disparage_partial_flag_setting_t16_encodings
+	   && optimize_bb_for_speed_p (bb))
+	  ? SKIP : CONV;
 
       COPY_REG_SET (&live, DF_LR_OUT (bb));
       df_simulate_initialize_backwards (bb, &live);
@@ -16775,7 +16801,7 @@ thumb2_reorg (void)
 	  && !REGNO_REG_SET_P (&live, CC_REGNUM)
 	  && GET_CODE (PATTERN (insn)) == SET)
 	{
-	  enum {SKIP, CONV, SWAP_CONV} action = SKIP;
+	  action = SKIP;
 	  rtx pat = PATTERN (insn);
 	  rtx dst = XEXP (pat, 0);
 	  rtx src = XEXP (pat, 1);
@@ -16856,10 +16882,11 @@ thumb2_reorg (void)
 	      /* ANDS <Rdn>,<Rm>  */
 	      if (rtx_equal_p (dst, op0)
 		  && low_register_operand (op1, SImode))
-		action = CONV;
+		action = action_for_partial_flag_setting;
 	      else if (rtx_equal_p (dst, op1)
 		       && low_register_operand (op0, SImode))
-		action = SWAP_CONV;
+		action = action_for_partial_flag_setting == SKIP
+			 ? SKIP : SWAP_CONV;
 	      break;
 
 	    case ASHIFTRT:
@@ -16870,26 +16897,30 @@ thumb2_reorg (void)
 	      /* LSLS <Rdn>,<Rm> */
 	      if (rtx_equal_p (dst, op0)
 		  && low_register_operand (op1, SImode))
-		action = CONV;
+		action = action_for_partial_flag_setting;
 	      /* ASRS <Rd>,<Rm>,#<imm5> */
 	      /* LSRS <Rd>,<Rm>,#<imm5> */
 	      /* LSLS <Rd>,<Rm>,#<imm5> */
 	      else if (low_register_operand (op0, SImode)
 		       && CONST_INT_P (op1)
 		       && IN_RANGE (INTVAL (op1), 0, 31))
-		action = CONV;
+		action = action_for_partial_flag_setting;
 	      break;
 
 	    case ROTATERT:
 	      /* RORS <Rdn>,<Rm>  */
 	      if (rtx_equal_p (dst, op0)
 		  && low_register_operand (op1, SImode))
-		action = CONV;
+		action = action_for_partial_flag_setting;
 	      break;
 
 	    case NOT:
-	    case NEG:
 	      /* MVNS <Rd>,<Rm>  */
+	      if (low_register_operand (op0, SImode))
+		action = action_for_partial_flag_setting;
+	      break;
+
+	    case NEG:  /* NEGS <Rd>,<Rm>  (a.k.a RSBS)  */
 	      if (low_register_operand (op0, SImode))
 		action = CONV;
@@ -16899,7 +16930,7 @@ thumb2_reorg (void)
 	      /* MOVS <Rd>,#<imm8>  */
 	      if (CONST_INT_P (src)
 		  && IN_RANGE (INTVAL (src), 0, 255))
-		action = CONV;
+		action = action_for_partial_flag_setting;
 	      break;
 
 	    case REG: