public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH ARM]Refine scaled address expression on ARM
@ 2013-08-28  7:12 bin.cheng
  2013-08-29 13:15 ` Richard Earnshaw
  0 siblings, 1 reply; 18+ messages in thread
From: bin.cheng @ 2013-08-28  7:12 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 877 bytes --]

Hi,

This patch refines scaled address expression on ARM.  It supports
"base+index*scale" in arm_legitimate_address_outer_p.  It also tries to
legitimize "base + index * scale + offset" with "reg <- base + offset;  reg
+ index * scale" by introducing thumb2_legitimize_address.  For now function
thumb2_legitimize_address is a kind of placeholder and just does the
mentioned transformation by calling to try_multiplier_address.  Hoping we
can improve it in the future.

With this patch:
1) "base+index*scale" is recognized.
2) PR57540 is fixed.

Bootstrapped and Tested on A15.  Is it OK?

Thanks.
Bin

2013-08-28  Bin Cheng  <bin.cheng@arm.com>

	* config/arm/arm.c (arm_legitimate_address_outer_p):
	Support addressing mode like "base + index * scale".
	(try_multiplier_address): New function.
	(arm_legitimize_address): Call try_multiplier_address.

[-- Attachment #2: 6-arm-scaled_address-20130828.txt --]
[-- Type: text/plain, Size: 4337 bytes --]

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 200774)
+++ gcc/config/arm/arm.c	(working copy)
@@ -5931,7 +5931,9 @@ arm_legitimate_address_outer_p (enum machine_mode
 		    && arm_legitimate_index_p (mode, xop1, outer, strict_p))
 		   || (!strict_p && will_be_in_index_register (xop1))))
 	      || (arm_address_register_rtx_p (xop1, strict_p)
-		  && arm_legitimate_index_p (mode, xop0, outer, strict_p)));
+		  && arm_legitimate_index_p (mode, xop0, outer, strict_p))
+	      || (arm_address_register_rtx_p (xop0, strict_p)
+		  && arm_legitimate_index_p (mode, xop1, outer, strict_p)));
     }
 
 #if 0
@@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
     }
 }
 
+/* Try to find address expression like base + index * scale + offset
+   in X.  If we find one, force base + offset into register and
+   construct new expression reg + index * scale; return the new
+   address expression if it's valid.  Otherwise return X.  */
+static rtx
+try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
+{
+  rtx tmp, base_reg, new_rtx;
+  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
+
+  gcc_assert (GET_CODE (x) == PLUS);
+
+  /* Try to find and record base/index/scale/offset in X. */
+  if (GET_CODE (XEXP (x, 1)) == MULT)
+    {
+      tmp = XEXP (x, 0);
+      index = XEXP (XEXP (x, 1), 0);
+      scale = XEXP (XEXP (x, 1), 1);
+      if (GET_CODE (tmp) != PLUS)
+	return x;
+
+      base = XEXP (tmp, 0);
+      offset = XEXP (tmp, 1);
+    }
+  else
+    {
+      tmp = XEXP (x, 0);
+      offset = XEXP (x, 1);
+      if (GET_CODE (tmp) != PLUS)
+	return x;
+
+      base = XEXP (tmp, 0);
+      scale = XEXP (tmp, 1);
+      if (GET_CODE (base) == MULT)
+	{
+	  tmp = base;
+	  base = scale;
+	  scale = tmp;
+	}
+      if (GET_CODE (scale) != MULT)
+	return x;
+
+      index = XEXP (scale, 0);
+      scale = XEXP (scale, 1);
+    }
+
+  if (CONST_INT_P (base))
+    {
+      tmp = base;
+      base = offset;
+      offset = tmp;
+    }
+
+  if (CONST_INT_P (index))
+    {
+      tmp = index;
+      index = scale;
+      scale = tmp;
+    }
+
+  /* ARM only supports constant scale in address.  */
+  if (!CONST_INT_P (scale))
+    return x;
+
+  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
+    return x;
+
+  /* Only register/constant are allowed in each part.  */
+  if (!symbol_mentioned_p (base)
+      && !symbol_mentioned_p (offset)
+      && !symbol_mentioned_p (index)
+      && !symbol_mentioned_p (scale))
+    {
+      /* Force "base+offset" into register and construct
+	 "register+index*scale".  Return the new expression
+	 only if it's valid.  */
+      tmp = gen_rtx_PLUS (SImode, base, offset);
+      base_reg = force_reg (SImode, tmp);
+      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
+      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
+      return new_rtx;
+    }
+
+  return x;
+}
+
+/* Try machine-dependent ways of modifying an illegitimate Thumb2 address
+   to be legitimate.  If we find one, return the new address.
+
+   TODO: legitimize_address for Thumb2.  */
+static rtx
+thumb2_legitimize_address (rtx x, rtx orig_x ATTRIBUTE_UNUSED,
+			   enum machine_mode mode)
+{
+  if (GET_CODE (x) == PLUS)
+    return try_multiplier_address (x, mode);
+
+  return x;
+}
+
 /* Try machine-dependent ways of modifying an illegitimate address
    to be legitimate.  If we find one, return the new, valid address.  */
 rtx
@@ -6659,9 +6761,9 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma
 {
   if (!TARGET_ARM)
     {
-      /* TODO: legitimize_address for Thumb2.  */
       if (TARGET_THUMB2)
-        return x;
+	return thumb2_legitimize_address (x, orig_x, mode);
+
       return thumb_legitimize_address (x, orig_x, mode);
     }
 
@@ -6673,6 +6775,10 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma
       rtx xop0 = XEXP (x, 0);
       rtx xop1 = XEXP (x, 1);
 
+      rtx new_rtx = try_multiplier_address (x, mode);
+      if (new_rtx != x)
+	return new_rtx;
+
       if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0))
 	xop0 = force_reg (SImode, xop0);
 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-08-28  7:12 [PATCH ARM]Refine scaled address expression on ARM bin.cheng
@ 2013-08-29 13:15 ` Richard Earnshaw
  2013-09-02  7:08   ` bin.cheng
  0 siblings, 1 reply; 18+ messages in thread
From: Richard Earnshaw @ 2013-08-29 13:15 UTC (permalink / raw)
  To: bin.cheng; +Cc: gcc-patches

On 28/08/13 08:00, bin.cheng wrote:
> Hi,
> 
> This patch refines scaled address expression on ARM.  It supports
> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries to
> legitimize "base + index * scale + offset" with "reg <- base + offset;  reg
> + index * scale" by introducing thumb2_legitimize_address.  For now function
> thumb2_legitimize_address is a kind of placeholder and just does the
> mentioned transformation by calling to try_multiplier_address.  Hoping we
> can improve it in the future.
> 
> With this patch:
> 1) "base+index*scale" is recognized.

That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
 So this shouldn't be necessary.  Can you identify where this
non-canonical form is being generated?

R.

> 2) PR57540 is fixed.
> 
> Bootstrapped and Tested on A15.  Is it OK?
> 
> Thanks.
> Bin
> 
> 2013-08-28  Bin Cheng  <bin.cheng@arm.com>
> 
> 	* config/arm/arm.c (arm_legitimate_address_outer_p):
> 	Support addressing mode like "base + index * scale".
> 	(try_multiplier_address): New function.
> 	(arm_legitimize_address): Call try_multiplier_address.
> 
> 
> 6-arm-scaled_address-20130828.txt
> 
> 
> Index: gcc/config/arm/arm.c
> ===================================================================
> --- gcc/config/arm/arm.c	(revision 200774)
> +++ gcc/config/arm/arm.c	(working copy)
> @@ -5931,7 +5931,9 @@ arm_legitimate_address_outer_p (enum machine_mode
>  		    && arm_legitimate_index_p (mode, xop1, outer, strict_p))
>  		   || (!strict_p && will_be_in_index_register (xop1))))
>  	      || (arm_address_register_rtx_p (xop1, strict_p)
> -		  && arm_legitimate_index_p (mode, xop0, outer, strict_p)));
> +		  && arm_legitimate_index_p (mode, xop0, outer, strict_p))
> +	      || (arm_address_register_rtx_p (xop0, strict_p)
> +		  && arm_legitimate_index_p (mode, xop1, outer, strict_p)));
>      }
>  
>  #if 0
> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>      }
>  }
>  
> +/* Try to find address expression like base + index * scale + offset
> +   in X.  If we find one, force base + offset into register and
> +   construct new expression reg + index * scale; return the new
> +   address expression if it's valid.  Otherwise return X.  */
> +static rtx
> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
> +{
> +  rtx tmp, base_reg, new_rtx;
> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
> +
> +  gcc_assert (GET_CODE (x) == PLUS);
> +
> +  /* Try to find and record base/index/scale/offset in X. */
> +  if (GET_CODE (XEXP (x, 1)) == MULT)
> +    {
> +      tmp = XEXP (x, 0);
> +      index = XEXP (XEXP (x, 1), 0);
> +      scale = XEXP (XEXP (x, 1), 1);
> +      if (GET_CODE (tmp) != PLUS)
> +	return x;
> +
> +      base = XEXP (tmp, 0);
> +      offset = XEXP (tmp, 1);
> +    }
> +  else
> +    {
> +      tmp = XEXP (x, 0);
> +      offset = XEXP (x, 1);
> +      if (GET_CODE (tmp) != PLUS)
> +	return x;
> +
> +      base = XEXP (tmp, 0);
> +      scale = XEXP (tmp, 1);
> +      if (GET_CODE (base) == MULT)
> +	{
> +	  tmp = base;
> +	  base = scale;
> +	  scale = tmp;
> +	}
> +      if (GET_CODE (scale) != MULT)
> +	return x;
> +
> +      index = XEXP (scale, 0);
> +      scale = XEXP (scale, 1);
> +    }
> +
> +  if (CONST_INT_P (base))
> +    {
> +      tmp = base;
> +      base = offset;
> +      offset = tmp;
> +    }
> +
> +  if (CONST_INT_P (index))
> +    {
> +      tmp = index;
> +      index = scale;
> +      scale = tmp;
> +    }
> +
> +  /* ARM only supports constant scale in address.  */
> +  if (!CONST_INT_P (scale))
> +    return x;
> +
> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
> +    return x;
> +
> +  /* Only register/constant are allowed in each part.  */
> +  if (!symbol_mentioned_p (base)
> +      && !symbol_mentioned_p (offset)
> +      && !symbol_mentioned_p (index)
> +      && !symbol_mentioned_p (scale))
> +    {
> +      /* Force "base+offset" into register and construct
> +	 "register+index*scale".  Return the new expression
> +	 only if it's valid.  */
> +      tmp = gen_rtx_PLUS (SImode, base, offset);
> +      base_reg = force_reg (SImode, tmp);
> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
> +      return new_rtx;
> +    }
> +
> +  return x;
> +}
> +
> +/* Try machine-dependent ways of modifying an illegitimate Thumb2 address
> +   to be legitimate.  If we find one, return the new address.
> +
> +   TODO: legitimize_address for Thumb2.  */
> +static rtx
> +thumb2_legitimize_address (rtx x, rtx orig_x ATTRIBUTE_UNUSED,
> +			   enum machine_mode mode)
> +{
> +  if (GET_CODE (x) == PLUS)
> +    return try_multiplier_address (x, mode);
> +
> +  return x;
> +}
> +
>  /* Try machine-dependent ways of modifying an illegitimate address
>     to be legitimate.  If we find one, return the new, valid address.  */
>  rtx
> @@ -6659,9 +6761,9 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma
>  {
>    if (!TARGET_ARM)
>      {
> -      /* TODO: legitimize_address for Thumb2.  */
>        if (TARGET_THUMB2)
> -        return x;
> +	return thumb2_legitimize_address (x, orig_x, mode);
> +
>        return thumb_legitimize_address (x, orig_x, mode);
>      }
>  
> @@ -6673,6 +6775,10 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma
>        rtx xop0 = XEXP (x, 0);
>        rtx xop1 = XEXP (x, 1);
>  
> +      rtx new_rtx = try_multiplier_address (x, mode);
> +      if (new_rtx != x)
> +	return new_rtx;
> +
>        if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0))
>  	xop0 = force_reg (SImode, xop0);
>  
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH ARM]Refine scaled address expression on ARM
  2013-08-29 13:15 ` Richard Earnshaw
@ 2013-09-02  7:08   ` bin.cheng
  2013-09-18  9:38     ` bin.cheng
  0 siblings, 1 reply; 18+ messages in thread
From: bin.cheng @ 2013-09-02  7:08 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches



> -----Original Message-----
> From: Richard Earnshaw 
> Sent: Thursday, August 29, 2013 9:06 PM
> To: Bin Cheng
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>
> On 28/08/13 08:00, bin.cheng wrote:
> > Hi,
> > 
> > This patch refines scaled address expression on ARM.  It supports 
> > "base+index*scale" in arm_legitimate_address_outer_p.  It also tries 
> > to legitimize "base + index * scale + offset" with "reg <- base + 
> > offset;  reg
> > + index * scale" by introducing thumb2_legitimize_address.  For now 
> > + function
> > thumb2_legitimize_address is a kind of placeholder and just does the 
> > mentioned transformation by calling to try_multiplier_address.  Hoping 
> > we can improve it in the future.
> > 
> > With this patch:
> > 1) "base+index*scale" is recognized.
>
> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>  So this shouldn't be necessary.  Can you identify where this
non-canoncial form is being generated?
>

Oh, for now ivopt constructs "index*scale" to test whether backend supports
scaled addressing mode, which is not valid on ARM, so I was going to
construct "base + index*scale" instead.  Since "base + index * scale" is not
canonical form, I will construct the canonical form and drop this part of
the patch.

Is rest of this patch OK?

Thanks.
bin



^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH ARM]Refine scaled address expression on ARM
  2013-09-02  7:08   ` bin.cheng
@ 2013-09-18  9:38     ` bin.cheng
  2013-11-28 13:15       ` Richard Earnshaw
  0 siblings, 1 reply; 18+ messages in thread
From: bin.cheng @ 2013-09-18  9:38 UTC (permalink / raw)
  To: Bin Cheng, Richard Earnshaw; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2131 bytes --]



> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of bin.cheng
> Sent: Monday, September 02, 2013 3:09 PM
> To: Richard Earnshaw
> Cc: gcc-patches@gcc.gnu.org
> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
> 
> 
> 
> > -----Original Message-----
> > From: Richard Earnshaw
> > Sent: Thursday, August 29, 2013 9:06 PM
> > To: Bin Cheng
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
> >
> > On 28/08/13 08:00, bin.cheng wrote:
> > > Hi,
> > >
> > > This patch refines scaled address expression on ARM.  It supports
> > > "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
> > > to legitimize "base + index * scale + offset" with "reg <- base +
> > > offset;  reg
> > > + index * scale" by introducing thumb2_legitimize_address.  For now
> > > + function
> > > thumb2_legitimize_address is a kind of placeholder and just does the
> > > mentioned transformation by calling to try_multiplier_address.
> > > Hoping we can improve it in the future.
> > >
> > > With this patch:
> > > 1) "base+index*scale" is recognized.
> >
> > That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
> >  So this shouldn't be necessary.  Can you identify where this
> non-canoncial form is being generated?
> >
> 
> Oh, for now ivopt constructs "index*scale" to test whether backend
> supports scaled addressing mode, which is not valid on ARM, so I was going
> to construct "base + index*scale" instead.  Since "base + index * scale"
is not
> canonical form, I will construct the canonical form and drop this part of
the
> patch.
> 
> Is rest of this patch OK?
> 
Hi Richard, I removed the part over which you concerned and created this
updated patch.

Is it OK?

Thanks.
bin

2013-09-18  Bin Cheng  <bin.cheng@arm.com>

	* config/arm/arm.c (try_multiplier_address): New function.
	(thumb2_legitimize_address): New function.
	(arm_legitimize_address): Call try_multiplier_address and
	thumb2_legitimize_address.

[-- Attachment #2: 6-arm-scaled_address-20130918.txt --]
[-- Type: text/plain, Size: 3813 bytes --]

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 200774)
+++ gcc/config/arm/arm.c	(working copy)
@@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
     }
 }
 
+/* Try to find address expression like base + index * scale + offset
+   in X.  If we find one, force base + offset into register and
+   construct new expression reg + index * scale; return the new
+   address expression if it's valid.  Otherwise return X.  */
+static rtx
+try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
+{
+  rtx tmp, base_reg, new_rtx;
+  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
+
+  gcc_assert (GET_CODE (x) == PLUS);
+
+  /* Try to find and record base/index/scale/offset in X. */
+  if (GET_CODE (XEXP (x, 1)) == MULT)
+    {
+      tmp = XEXP (x, 0);
+      index = XEXP (XEXP (x, 1), 0);
+      scale = XEXP (XEXP (x, 1), 1);
+      if (GET_CODE (tmp) != PLUS)
+	return x;
+
+      base = XEXP (tmp, 0);
+      offset = XEXP (tmp, 1);
+    }
+  else
+    {
+      tmp = XEXP (x, 0);
+      offset = XEXP (x, 1);
+      if (GET_CODE (tmp) != PLUS)
+	return x;
+
+      base = XEXP (tmp, 0);
+      scale = XEXP (tmp, 1);
+      if (GET_CODE (base) == MULT)
+	{
+	  tmp = base;
+	  base = scale;
+	  scale = tmp;
+	}
+      if (GET_CODE (scale) != MULT)
+	return x;
+
+      index = XEXP (scale, 0);
+      scale = XEXP (scale, 1);
+    }
+
+  if (CONST_INT_P (base))
+    {
+      tmp = base;
+      base = offset;
+      offset = tmp;
+    }
+
+  if (CONST_INT_P (index))
+    {
+      tmp = index;
+      index = scale;
+      scale = tmp;
+    }
+
+  /* ARM only supports constant scale in address.  */
+  if (!CONST_INT_P (scale))
+    return x;
+
+  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
+    return x;
+
+  /* Only register/constant are allowed in each part.  */
+  if (!symbol_mentioned_p (base)
+      && !symbol_mentioned_p (offset)
+      && !symbol_mentioned_p (index)
+      && !symbol_mentioned_p (scale))
+    {
+      /* Force "base+offset" into register and construct
+	 "register+index*scale".  Return the new expression
+	 only if it's valid.  */
+      tmp = gen_rtx_PLUS (SImode, base, offset);
+      base_reg = force_reg (SImode, tmp);
+      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
+      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
+      return new_rtx;
+    }
+
+  return x;
+}
+
+/* Try machine-dependent ways of modifying an illegitimate Thumb2 address
+   to be legitimate.  If we find one, return the new address.
+
+   TODO: legitimize_address for Thumb2.  */
+static rtx
+thumb2_legitimize_address (rtx x, rtx orig_x ATTRIBUTE_UNUSED,
+			   enum machine_mode mode)
+{
+  if (GET_CODE (x) == PLUS)
+    return try_multiplier_address (x, mode);
+
+  return x;
+}
+
 /* Try machine-dependent ways of modifying an illegitimate address
    to be legitimate.  If we find one, return the new, valid address.  */
 rtx
@@ -6659,9 +6761,9 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma
 {
   if (!TARGET_ARM)
     {
-      /* TODO: legitimize_address for Thumb2.  */
       if (TARGET_THUMB2)
-        return x;
+	return thumb2_legitimize_address (x, orig_x, mode);
+
       return thumb_legitimize_address (x, orig_x, mode);
     }
 
@@ -6673,6 +6775,10 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma
       rtx xop0 = XEXP (x, 0);
       rtx xop1 = XEXP (x, 1);
 
+      rtx new_rtx = try_multiplier_address (x, mode);
+      if (new_rtx != x)
+	return new_rtx;
+
       if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0))
 	xop0 = force_reg (SImode, xop0);
 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-09-18  9:38     ` bin.cheng
@ 2013-11-28 13:15       ` Richard Earnshaw
  2013-11-28 14:28         ` Bin.Cheng
  0 siblings, 1 reply; 18+ messages in thread
From: Richard Earnshaw @ 2013-11-28 13:15 UTC (permalink / raw)
  To: bin.cheng; +Cc: gcc-patches

On 18/09/13 10:15, bin.cheng wrote:
> 
> 
>> -----Original Message-----
>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>> Sent: Monday, September 02, 2013 3:09 PM
>> To: Richard Earnshaw
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>
>>
>>
>>> -----Original Message-----
>>> From: Richard Earnshaw
>>> Sent: Thursday, August 29, 2013 9:06 PM
>>> To: Bin Cheng
>>> Cc: gcc-patches@gcc.gnu.org
>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>
>>> On 28/08/13 08:00, bin.cheng wrote:
>>>> Hi,
>>>>
>>>> This patch refines scaled address expression on ARM.  It supports
>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>> to legitimize "base + index * scale + offset" with "reg <- base +
>>>> offset;  reg
>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>> + function
>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>> mentioned transformation by calling to try_multiplier_address.
>>>> Hoping we can improve it in the future.
>>>>
>>>> With this patch:
>>>> 1) "base+index*scale" is recognized.
>>>
>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>  So this shouldn't be necessary.  Can you identify where this
>> non-canoncial form is being generated?
>>>
>>
>> Oh, for now ivopt constructs "index*scale" to test whether backend
>> supports scaled addressing mode, which is not valid on ARM, so I was going
>> to construct "base + index*scale" instead.  Since "base + index * scale"
> is not
>> canonical form, I will construct the canonical form and drop this part of
> the
>> patch.
>>
>> Is rest of this patch OK?
>>
> Hi Richard, I removed the part over which you concerned and created this
> updated patch.
> 
> Is it OK?
> 
> Thanks.
> bin
> 
> 2013-09-18  Bin Cheng  <bin.cheng@arm.com>
> 
> 	* config/arm/arm.c (try_multiplier_address): New function.
> 	(thumb2_legitimize_address): New function.
> 	(arm_legitimize_address): Call try_multiplier_address and
> 	thumb2_legitimize_address.
> 
> 
> 6-arm-scaled_address-20130918.txt
> 
> 
> Index: gcc/config/arm/arm.c
> ===================================================================
> --- gcc/config/arm/arm.c	(revision 200774)
> +++ gcc/config/arm/arm.c	(working copy)
> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>      }
>  }
>  
> +/* Try to find address expression like base + index * scale + offset
> +   in X.  If we find one, force base + offset into register and
> +   construct new expression reg + index * scale; return the new
> +   address expression if it's valid.  Otherwise return X.  */
> +static rtx
> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
> +{
> +  rtx tmp, base_reg, new_rtx;
> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
> +
> +  gcc_assert (GET_CODE (x) == PLUS);
> +
> +  /* Try to find and record base/index/scale/offset in X. */
> +  if (GET_CODE (XEXP (x, 1)) == MULT)
> +    {
> +      tmp = XEXP (x, 0);
> +      index = XEXP (XEXP (x, 1), 0);
> +      scale = XEXP (XEXP (x, 1), 1);
> +      if (GET_CODE (tmp) != PLUS)
> +	return x;
> +
> +      base = XEXP (tmp, 0);
> +      offset = XEXP (tmp, 1);
> +    }
> +  else
> +    {
> +      tmp = XEXP (x, 0);
> +      offset = XEXP (x, 1);
> +      if (GET_CODE (tmp) != PLUS)
> +	return x;
> +
> +      base = XEXP (tmp, 0);
> +      scale = XEXP (tmp, 1);
> +      if (GET_CODE (base) == MULT)
> +	{
> +	  tmp = base;
> +	  base = scale;
> +	  scale = tmp;
> +	}
> +      if (GET_CODE (scale) != MULT)
> +	return x;
> +
> +      index = XEXP (scale, 0);
> +      scale = XEXP (scale, 1);
> +    }
> +
> +  if (CONST_INT_P (base))
> +    {
> +      tmp = base;
> +      base = offset;
> +      offset = tmp;
> +    }
> +
> +  if (CONST_INT_P (index))
> +    {
> +      tmp = index;
> +      index = scale;
> +      scale = tmp;
> +    }
> +
> +  /* ARM only supports constant scale in address.  */
> +  if (!CONST_INT_P (scale))
> +    return x;
> +
> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
> +    return x;
> +
> +  /* Only register/constant are allowed in each part.  */
> +  if (!symbol_mentioned_p (base)
> +      && !symbol_mentioned_p (offset)
> +      && !symbol_mentioned_p (index)
> +      && !symbol_mentioned_p (scale))
> +    {

It would be easier to do this at the top of the function --
  if (symbol_mentioned_p (x))
    return x;


> +      /* Force "base+offset" into register and construct
> +	 "register+index*scale".  Return the new expression
> +	 only if it's valid.  */
> +      tmp = gen_rtx_PLUS (SImode, base, offset);
> +      base_reg = force_reg (SImode, tmp);
> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
> +      return new_rtx;

I can't help thinking that this is backwards.  That is, you want to
split out the mult expression and use offset addressing in the addresses
itself.  That's likely to lead to either better CSE, or more induction
vars.  Furthermore, scaled addressing modes are likely to be more
expensive than simple offset addressing modes on at least some cores.

R.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-28 13:15       ` Richard Earnshaw
@ 2013-11-28 14:28         ` Bin.Cheng
  2013-11-29 11:23           ` Bin.Cheng
  0 siblings, 1 reply; 18+ messages in thread
From: Bin.Cheng @ 2013-11-28 14:28 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: bin.cheng, gcc-patches

On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote:
> On 18/09/13 10:15, bin.cheng wrote:
>>
>>
>>> -----Original Message-----
>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>> Sent: Monday, September 02, 2013 3:09 PM
>>> To: Richard Earnshaw
>>> Cc: gcc-patches@gcc.gnu.org
>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Richard Earnshaw
>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>> To: Bin Cheng
>>>> Cc: gcc-patches@gcc.gnu.org
>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>
>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>> Hi,
>>>>>
>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>> to legitimize "base + index * scale + offset" with "reg <- base +
>>>>> offset;  reg
>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>> + function
>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>> Hoping we can improve it in the future.
>>>>>
>>>>> With this patch:
>>>>> 1) "base+index*scale" is recognized.
>>>>
>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>>  So this shouldn't be necessary.  Can you identify where this
>>> non-canoncial form is being generated?
>>>>
>>>
>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>> supports scaled addressing mode, which is not valid on ARM, so I was going
>>> to construct "base + index*scale" instead.  Since "base + index * scale"
>> is not
>>> canonical form, I will construct the canonical form and drop this part of
>> the
>>> patch.
>>>
>>> Is rest of this patch OK?
>>>
>> Hi Richard, I removed the part over which you concerned and created this
>> updated patch.
>>
>> Is it OK?
>>
>> Thanks.
>> bin
>>
>> 2013-09-18  Bin Cheng  <bin.cheng@arm.com>
>>
>>       * config/arm/arm.c (try_multiplier_address): New function.
>>       (thumb2_legitimize_address): New function.
>>       (arm_legitimize_address): Call try_multiplier_address and
>>       thumb2_legitimize_address.
>>
>>
>> 6-arm-scaled_address-20130918.txt
>>
>>
>> Index: gcc/config/arm/arm.c
>> ===================================================================
>> --- gcc/config/arm/arm.c      (revision 200774)
>> +++ gcc/config/arm/arm.c      (working copy)
>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>      }
>>  }
>>
>> +/* Try to find address expression like base + index * scale + offset
>> +   in X.  If we find one, force base + offset into register and
>> +   construct new expression reg + index * scale; return the new
>> +   address expression if it's valid.  Otherwise return X.  */
>> +static rtx
>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
>> +{
>> +  rtx tmp, base_reg, new_rtx;
>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
>> +
>> +  gcc_assert (GET_CODE (x) == PLUS);
>> +
>> +  /* Try to find and record base/index/scale/offset in X. */
>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>> +    {
>> +      tmp = XEXP (x, 0);
>> +      index = XEXP (XEXP (x, 1), 0);
>> +      scale = XEXP (XEXP (x, 1), 1);
>> +      if (GET_CODE (tmp) != PLUS)
>> +     return x;
>> +
>> +      base = XEXP (tmp, 0);
>> +      offset = XEXP (tmp, 1);
>> +    }
>> +  else
>> +    {
>> +      tmp = XEXP (x, 0);
>> +      offset = XEXP (x, 1);
>> +      if (GET_CODE (tmp) != PLUS)
>> +     return x;
>> +
>> +      base = XEXP (tmp, 0);
>> +      scale = XEXP (tmp, 1);
>> +      if (GET_CODE (base) == MULT)
>> +     {
>> +       tmp = base;
>> +       base = scale;
>> +       scale = tmp;
>> +     }
>> +      if (GET_CODE (scale) != MULT)
>> +     return x;
>> +
>> +      index = XEXP (scale, 0);
>> +      scale = XEXP (scale, 1);
>> +    }
>> +
>> +  if (CONST_INT_P (base))
>> +    {
>> +      tmp = base;
>> +      base = offset;
>> +      offset = tmp;
>> +    }
>> +
>> +  if (CONST_INT_P (index))
>> +    {
>> +      tmp = index;
>> +      index = scale;
>> +      scale = tmp;
>> +    }
>> +
>> +  /* ARM only supports constant scale in address.  */
>> +  if (!CONST_INT_P (scale))
>> +    return x;
>> +
>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>> +    return x;
>> +
>> +  /* Only register/constant are allowed in each part.  */
>> +  if (!symbol_mentioned_p (base)
>> +      && !symbol_mentioned_p (offset)
>> +      && !symbol_mentioned_p (index)
>> +      && !symbol_mentioned_p (scale))
>> +    {
>
> It would be easier to do this at the top of the function --
>   if (symbol_mentioned_p (x))
>     return x;
>
>
>> +      /* Force "base+offset" into register and construct
>> +      "register+index*scale".  Return the new expression
>> +      only if it's valid.  */
>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>> +      base_reg = force_reg (SImode, tmp);
>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>> +      return new_rtx;
>
> I can't help thinking that this is backwards.  That is, you want to
> split out the mult expression and use offset addressing in the addresses
> itself.  That's likely to lead to either better CSE, or more induction
Thanks to your review.

Actually base+offset is more likely loop invariant than scaled
expression, just as reported in pr57540.  The bug is observed in
spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
variable doesn't matter that much comparing to invariant because we
are in RTL now.

> vars.  Furthermore, scaled addressing modes are likely to be more
> expensive than simple offset addressing modes on at least some cores.
The purpose is to CSE (as you pointed out) or hoist loop invariants.
As far as the addressing mode is concerned, though we may be able to
guide the transformation once we have a reliable address cost model,
we don't have the information about whether base+offset is invariant,
so it's difficult to handle in the backend, right?

What do you think about this?

Thanks,
bin



-- 
Best Regards.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-28 14:28         ` Bin.Cheng
@ 2013-11-29 11:23           ` Bin.Cheng
  2013-11-29 12:19             ` Richard Biener
  2013-11-29 13:31             ` Yufeng Zhang
  0 siblings, 2 replies; 18+ messages in thread
From: Bin.Cheng @ 2013-11-29 11:23 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: bin.cheng, gcc-patches, Yufeng Zhang, Richard Biener

On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote:
>> On 18/09/13 10:15, bin.cheng wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>> To: Richard Earnshaw
>>>> Cc: gcc-patches@gcc.gnu.org
>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Richard Earnshaw
>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>> To: Bin Cheng
>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>
>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>> Hi,
>>>>>>
>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>> to legitimize "base + index * scale + offset" with "reg <- base +
>>>>>> offset;  reg
>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>> + function
>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>> Hoping we can improve it in the future.
>>>>>>
>>>>>> With this patch:
>>>>>> 1) "base+index*scale" is recognized.
>>>>>
>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>>>  So this shouldn't be necessary.  Can you identify where this
>>>> non-canoncial form is being generated?
>>>>>
>>>>
>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>> supports scaled addressing mode, which is not valid on ARM, so I was going
>>>> to construct "base + index*scale" instead.  Since "base + index * scale"
>>> is not
>>>> canonical form, I will construct the canonical form and drop this part of
>>> the
>>>> patch.
>>>>
>>>> Is rest of this patch OK?
>>>>
>>> Hi Richard, I removed the part over which you concerned and created this
>>> updated patch.
>>>
>>> Is it OK?
>>>
>>> Thanks.
>>> bin
>>>
>>> 2013-09-18  Bin Cheng  <bin.cheng@arm.com>
>>>
>>>       * config/arm/arm.c (try_multiplier_address): New function.
>>>       (thumb2_legitimize_address): New function.
>>>       (arm_legitimize_address): Call try_multiplier_address and
>>>       thumb2_legitimize_address.
>>>
>>>
>>> 6-arm-scaled_address-20130918.txt
>>>
>>>
>>> Index: gcc/config/arm/arm.c
>>> ===================================================================
>>> --- gcc/config/arm/arm.c      (revision 200774)
>>> +++ gcc/config/arm/arm.c      (working copy)
>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>      }
>>>  }
>>>
>>> +/* Try to find address expression like base + index * scale + offset
>>> +   in X.  If we find one, force base + offset into register and
>>> +   construct new expression reg + index * scale; return the new
>>> +   address expression if it's valid.  Otherwise return X.  */
>>> +static rtx
>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
>>> +{
>>> +  rtx tmp, base_reg, new_rtx;
>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
>>> +
>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>> +
>>> +  /* Try to find and record base/index/scale/offset in X. */
>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>> +    {
>>> +      tmp = XEXP (x, 0);
>>> +      index = XEXP (XEXP (x, 1), 0);
>>> +      scale = XEXP (XEXP (x, 1), 1);
>>> +      if (GET_CODE (tmp) != PLUS)
>>> +     return x;
>>> +
>>> +      base = XEXP (tmp, 0);
>>> +      offset = XEXP (tmp, 1);
>>> +    }
>>> +  else
>>> +    {
>>> +      tmp = XEXP (x, 0);
>>> +      offset = XEXP (x, 1);
>>> +      if (GET_CODE (tmp) != PLUS)
>>> +     return x;
>>> +
>>> +      base = XEXP (tmp, 0);
>>> +      scale = XEXP (tmp, 1);
>>> +      if (GET_CODE (base) == MULT)
>>> +     {
>>> +       tmp = base;
>>> +       base = scale;
>>> +       scale = tmp;
>>> +     }
>>> +      if (GET_CODE (scale) != MULT)
>>> +     return x;
>>> +
>>> +      index = XEXP (scale, 0);
>>> +      scale = XEXP (scale, 1);
>>> +    }
>>> +
>>> +  if (CONST_INT_P (base))
>>> +    {
>>> +      tmp = base;
>>> +      base = offset;
>>> +      offset = tmp;
>>> +    }
>>> +
>>> +  if (CONST_INT_P (index))
>>> +    {
>>> +      tmp = index;
>>> +      index = scale;
>>> +      scale = tmp;
>>> +    }
>>> +
>>> +  /* ARM only supports constant scale in address.  */
>>> +  if (!CONST_INT_P (scale))
>>> +    return x;
>>> +
>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>> +    return x;
>>> +
>>> +  /* Only register/constant are allowed in each part.  */
>>> +  if (!symbol_mentioned_p (base)
>>> +      && !symbol_mentioned_p (offset)
>>> +      && !symbol_mentioned_p (index)
>>> +      && !symbol_mentioned_p (scale))
>>> +    {
>>
>> It would be easier to do this at the top of the function --
>>   if (symbol_mentioned_p (x))
>>     return x;
>>
>>
>>> +      /* Force "base+offset" into register and construct
>>> +      "register+index*scale".  Return the new expression
>>> +      only if it's valid.  */
>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>> +      base_reg = force_reg (SImode, tmp);
>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>> +      return new_rtx;
>>
>> I can't help thinking that this is backwards.  That is, you want to
>> split out the mult expression and use offset addressing in the addresses
>> itself.  That's likely to lead to either better CSE, or more induction
> Thanks to your review.
>
> Actually base+offset is more likely loop invariant than scaled
> expression, just as reported in pr57540.  The bug is observed in
> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
> variable doesn't matter that much comparing to invariant because we
> are in RTL now.
>
>> vars.  Furthermore, scaled addressing modes are likely to be more
>> expensive than simple offset addressing modes on at least some cores.
> The purpose is to CSE (as you pointed out) or hoist loop invariant.
> As for addressing mode is concerned, Though we may guide the
> transformation once we have reliable address cost mode, we don't have
> the information if base+offset is invariant, so it's difficult to
> handle in backend, right?
>
> What do you think about this?
>

Additionally, there is no induction variable issue here.  The memory
references we are facing during expand were not lowered by gimple
IVOPT, which means either they are outside a loop, or they don't have
induction-variable addresses.

Generally, there are three passes which lower memory references:
gimple strength reduction picks up opportunities outside loops; gimple
IVOPT handles references with induction-variable addresses; references
not handled by SLSR/IVOPT are handled by RTL expand, which makes bad
choices of address re-association.  I think Yufeng also noticed the
problem and is trying patches like:
http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html

After thinking twice, I somewhat think we should not re-associate
addresses during expanding, because of the lack of context information.
 Take base + scaled_index + offset as an example in PR57540: we just
don't know whether "base+offset" is loop invariant from either the
backend or the RTL expander.

Any comments?

Thanks,
bin



-- 
Best Regards.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 11:23           ` Bin.Cheng
@ 2013-11-29 12:19             ` Richard Biener
  2013-11-29 12:31               ` Bin.Cheng
  2013-11-29 19:35               ` Yufeng Zhang
  2013-11-29 13:31             ` Yufeng Zhang
  1 sibling, 2 replies; 18+ messages in thread
From: Richard Biener @ 2013-11-29 12:19 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: Richard Earnshaw, bin.cheng, gcc-patches, Yufeng Zhang

On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote:
>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote:
>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>> To: Richard Earnshaw
>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Richard Earnshaw
>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>> To: Bin Cheng
>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>
>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>>> to legitimize "base + index * scale + offset" with "reg <- base +
>>>>>>> offset;  reg
>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>> + function
>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>> Hoping we can improve it in the future.
>>>>>>>
>>>>>>> With this patch:
>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>
>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>>>>  So this shouldn't be necessary.  Can you identify where this
>>>>> non-canoncial form is being generated?
>>>>>>
>>>>>
>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going
>>>>> to construct "base + index*scale" instead.  Since "base + index * scale"
>>>> is not
>>>>> canonical form, I will construct the canonical form and drop this part of
>>>> the
>>>>> patch.
>>>>>
>>>>> Is rest of this patch OK?
>>>>>
>>>> Hi Richard, I removed the part over which you concerned and created this
>>>> updated patch.
>>>>
>>>> Is it OK?
>>>>
>>>> Thanks.
>>>> bin
>>>>
>>>> 2013-09-18  Bin Cheng  <bin.cheng@arm.com>
>>>>
>>>>       * config/arm/arm.c (try_multiplier_address): New function.
>>>>       (thumb2_legitimize_address): New function.
>>>>       (arm_legitimize_address): Call try_multiplier_address and
>>>>       thumb2_legitimize_address.
>>>>
>>>>
>>>> 6-arm-scaled_address-20130918.txt
>>>>
>>>>
>>>> Index: gcc/config/arm/arm.c
>>>> ===================================================================
>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>      }
>>>>  }
>>>>
>>>> +/* Try to find address expression like base + index * scale + offset
>>>> +   in X.  If we find one, force base + offset into register and
>>>> +   construct new expression reg + index * scale; return the new
>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>> +static rtx
>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
>>>> +{
>>>> +  rtx tmp, base_reg, new_rtx;
>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
>>>> +
>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>> +
>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>> +    {
>>>> +      tmp = XEXP (x, 0);
>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>> +      if (GET_CODE (tmp) != PLUS)
>>>> +     return x;
>>>> +
>>>> +      base = XEXP (tmp, 0);
>>>> +      offset = XEXP (tmp, 1);
>>>> +    }
>>>> +  else
>>>> +    {
>>>> +      tmp = XEXP (x, 0);
>>>> +      offset = XEXP (x, 1);
>>>> +      if (GET_CODE (tmp) != PLUS)
>>>> +     return x;
>>>> +
>>>> +      base = XEXP (tmp, 0);
>>>> +      scale = XEXP (tmp, 1);
>>>> +      if (GET_CODE (base) == MULT)
>>>> +     {
>>>> +       tmp = base;
>>>> +       base = scale;
>>>> +       scale = tmp;
>>>> +     }
>>>> +      if (GET_CODE (scale) != MULT)
>>>> +     return x;
>>>> +
>>>> +      index = XEXP (scale, 0);
>>>> +      scale = XEXP (scale, 1);
>>>> +    }
>>>> +
>>>> +  if (CONST_INT_P (base))
>>>> +    {
>>>> +      tmp = base;
>>>> +      base = offset;
>>>> +      offset = tmp;
>>>> +    }
>>>> +
>>>> +  if (CONST_INT_P (index))
>>>> +    {
>>>> +      tmp = index;
>>>> +      index = scale;
>>>> +      scale = tmp;
>>>> +    }
>>>> +
>>>> +  /* ARM only supports constant scale in address.  */
>>>> +  if (!CONST_INT_P (scale))
>>>> +    return x;
>>>> +
>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>> +    return x;
>>>> +
>>>> +  /* Only register/constant are allowed in each part.  */
>>>> +  if (!symbol_mentioned_p (base)
>>>> +      && !symbol_mentioned_p (offset)
>>>> +      && !symbol_mentioned_p (index)
>>>> +      && !symbol_mentioned_p (scale))
>>>> +    {
>>>
>>> It would be easier to do this at the top of the function --
>>>   if (symbol_mentioned_p (x))
>>>     return x;
>>>
>>>
>>>> +      /* Force "base+offset" into register and construct
>>>> +      "register+index*scale".  Return the new expression
>>>> +      only if it's valid.  */
>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>> +      base_reg = force_reg (SImode, tmp);
>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>> +      return new_rtx;
>>>
>>> I can't help thinking that this is backwards.  That is, you want to
>>> split out the mult expression and use offset addressing in the addresses
>>> itself.  That's likely to lead to either better CSE, or more induction
>> Thanks to your review.
>>
>> Actually base+offset is more likely loop invariant than scaled
>> expression, just as reported in pr57540.  The bug is observed in
>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>> variable doesn't matter that much comparing to invariant because we
>> are in RTL now.
>>
>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>> expensive than simple offset addressing modes on at least some cores.
>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>> As for addressing mode is concerned, Though we may guide the
>> transformation once we have reliable address cost mode, we don't have
>> the information if base+offset is invariant, so it's difficult to
>> handle in backend, right?
>>
>> What do you think about this?
>>
>
> Additionally, there is no induction variable issue here.  The memory
> reference we are facing during expand are not lowered by gimple IVOPT,
> which means either it's outside loop, or doesn't have induction
> variable address.
>
> Generally, there are three passes which lowers memory reference:
> gimple strength reduction picks opportunity outside loop; gimple IVOPT
> handles references with induction variable addresses; references not
> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
> choice of address re-association.  I think Yufeng also noticed the
> problem and are trying with patch like:
> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html
>
> After thinking twice, I some kind of think we should not re-associate
> addresses during expanding, because of lacking of context information.
>  Take base + scaled_index + offset as an example in PR57540, we just
> don't know if "base+offset" is loop invariant from either backend or
> RTL expander.
>
> Any comments?

The issue is that re-association and address lowering is not integrated
with optimizers such as CSE and invariant motion.  This means it's
hard to arrive at optimal results and all passes are just "guessing"
and applying heuristics that the programmer thought are appropriate
for his machine.

I have no easy way out but think that we lower addresses too late
(at least fully).  Loop optimizers would like to see the complex
memory reference forms but starting with IVOPTs lowering all
addresses would likely be a win in the end (there are CSE, LIM
and re-association passes around after/at this point as well).

Back in the 4.3 days when I for the first time attempted to introduce
MEM_REF I forcefully lowered all memory accesses and addresses
very early (during gimplification basically).  That didn't work out well.

So my suggestion to anyone who wants to try something is
to apply an additional lowering to the whole GIMPLE, lowering
things like handled-component references and addresses
(and also things like bitfield accesses).

Richard.

> Thanks,
> bin
>
>
>
> --
> Best Regards.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 12:19             ` Richard Biener
@ 2013-11-29 12:31               ` Bin.Cheng
  2013-11-29 12:59                 ` Richard Biener
  2013-11-29 19:35               ` Yufeng Zhang
  1 sibling, 1 reply; 18+ messages in thread
From: Bin.Cheng @ 2013-11-29 12:31 UTC (permalink / raw)
  To: Richard Biener; +Cc: Richard Earnshaw, bin.cheng, gcc-patches, Yufeng Zhang

On Fri, Nov 29, 2013 at 6:44 PM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng <amker.cheng@gmail.com> wrote:
>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote:
>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote:
>>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>>> To: Richard Earnshaw
>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Richard Earnshaw
>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>>> To: Bin Cheng
>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>
>>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>>>> to legitimize "base + index * scale + offset" with "reg <- base +
>>>>>>>> offset;  reg
>>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>>> + function
>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>>> Hoping we can improve it in the future.
>>>>>>>>
>>>>>>>> With this patch:
>>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>>
>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>>>>>  So this shouldn't be necessary.  Can you identify where this
>>>>>> non-canoncial form is being generated?
>>>>>>>
>>>>>>
>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going
>>>>>> to construct "base + index*scale" instead.  Since "base + index * scale"
>>>>> is not
>>>>>> canonical form, I will construct the canonical form and drop this part of
>>>>> the
>>>>>> patch.
>>>>>>
>>>>>> Is rest of this patch OK?
>>>>>>
>>>>> Hi Richard, I removed the part over which you concerned and created this
>>>>> updated patch.
>>>>>
>>>>> Is it OK?
>>>>>
>>>>> Thanks.
>>>>> bin
>>>>>
>>>>> 2013-09-18  Bin Cheng  <bin.cheng@arm.com>
>>>>>
>>>>>       * config/arm/arm.c (try_multiplier_address): New function.
>>>>>       (thumb2_legitimize_address): New function.
>>>>>       (arm_legitimize_address): Call try_multiplier_address and
>>>>>       thumb2_legitimize_address.
>>>>>
>>>>>
>>>>> 6-arm-scaled_address-20130918.txt
>>>>>
>>>>>
>>>>> Index: gcc/config/arm/arm.c
>>>>> ===================================================================
>>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>>      }
>>>>>  }
>>>>>
>>>>> +/* Try to find address expression like base + index * scale + offset
>>>>> +   in X.  If we find one, force base + offset into register and
>>>>> +   construct new expression reg + index * scale; return the new
>>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>>> +static rtx
>>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
>>>>> +{
>>>>> +  rtx tmp, base_reg, new_rtx;
>>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
>>>>> +
>>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>>> +
>>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>>> +    {
>>>>> +      tmp = XEXP (x, 0);
>>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>> +     return x;
>>>>> +
>>>>> +      base = XEXP (tmp, 0);
>>>>> +      offset = XEXP (tmp, 1);
>>>>> +    }
>>>>> +  else
>>>>> +    {
>>>>> +      tmp = XEXP (x, 0);
>>>>> +      offset = XEXP (x, 1);
>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>> +     return x;
>>>>> +
>>>>> +      base = XEXP (tmp, 0);
>>>>> +      scale = XEXP (tmp, 1);
>>>>> +      if (GET_CODE (base) == MULT)
>>>>> +     {
>>>>> +       tmp = base;
>>>>> +       base = scale;
>>>>> +       scale = tmp;
>>>>> +     }
>>>>> +      if (GET_CODE (scale) != MULT)
>>>>> +     return x;
>>>>> +
>>>>> +      index = XEXP (scale, 0);
>>>>> +      scale = XEXP (scale, 1);
>>>>> +    }
>>>>> +
>>>>> +  if (CONST_INT_P (base))
>>>>> +    {
>>>>> +      tmp = base;
>>>>> +      base = offset;
>>>>> +      offset = tmp;
>>>>> +    }
>>>>> +
>>>>> +  if (CONST_INT_P (index))
>>>>> +    {
>>>>> +      tmp = index;
>>>>> +      index = scale;
>>>>> +      scale = tmp;
>>>>> +    }
>>>>> +
>>>>> +  /* ARM only supports constant scale in address.  */
>>>>> +  if (!CONST_INT_P (scale))
>>>>> +    return x;
>>>>> +
>>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>>> +    return x;
>>>>> +
>>>>> +  /* Only register/constant are allowed in each part.  */
>>>>> +  if (!symbol_mentioned_p (base)
>>>>> +      && !symbol_mentioned_p (offset)
>>>>> +      && !symbol_mentioned_p (index)
>>>>> +      && !symbol_mentioned_p (scale))
>>>>> +    {
>>>>
>>>> It would be easier to do this at the top of the function --
>>>>   if (symbol_mentioned_p (x))
>>>>     return x;
>>>>
>>>>
>>>>> +      /* Force "base+offset" into register and construct
>>>>> +      "register+index*scale".  Return the new expression
>>>>> +      only if it's valid.  */
>>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>>> +      base_reg = force_reg (SImode, tmp);
>>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>>> +      return new_rtx;
>>>>
>>>> I can't help thinking that this is backwards.  That is, you want to
>>>> split out the mult expression and use offset addressing in the addresses
>>>> itself.  That's likely to lead to either better CSE, or more induction
>>> Thanks to your review.
>>>
>>> Actually base+offset is more likely loop invariant than scaled
>>> expression, just as reported in pr57540.  The bug is observed in
>>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>>> variable doesn't matter that much comparing to invariant because we
>>> are in RTL now.
>>>
>>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>>> expensive than simple offset addressing modes on at least some cores.
>>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>>> As for addressing mode is concerned, Though we may guide the
>>> transformation once we have reliable address cost mode, we don't have
>>> the information if base+offset is invariant, so it's difficult to
>>> handle in backend, right?
>>>
>>> What do you think about this?
>>>
>>
>> Additionally, there is no induction variable issue here.  The memory
>> reference we are facing during expand are not lowered by gimple IVOPT,
>> which means either it's outside loop, or doesn't have induction
>> variable address.
>>
>> Generally, there are three passes which lowers memory reference:
>> gimple strength reduction picks opportunity outside loop; gimple IVOPT
>> handles references with induction variable addresses; references not
>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
>> choice of address re-association.  I think Yufeng also noticed the
>> problem and are trying with patch like:
>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html
>>
>> After thinking twice, I some kind of think we should not re-associate
>> addresses during expanding, because of lacking of context information.
>>  Take base + scaled_index + offset as an example in PR57540, we just
>> don't know if "base+offset" is loop invariant from either backend or
>> RTL expander.
>>
>> Any comments?
>
> The issue is that re-association and address lowering is not integrated
> with optimizers such as CSE and invariant motion.  This means it's
> hard to arrive at optimal results and all passes are just "guessing"
> and applying heuristics that the programmer thought are appropriate
> for his machine.
>
> I have no easy way out but think that we lower addresses too late
> (at least fully).  Loop optimizers would like to see the complex
> memory reference forms but starting with IVOPTs lowering all
> addresses would likely be a win in the end (there are CSE, LIM
> and re-association passes around after/at this point as well).
>
> Back in the 4.3 days when I for the first time attempted to introduce
> MEM_REF I forcefully lowered all memory accesses and addresses
> very early (during gimplification basically).  That didn't work out well.
>
> So my suggestion to anyone who wants to try something is
> to apply an additional lowering to the whole GIMPLE, lowering
> things like handled-component references and addresses
> (and also things like bitfield accesses).
Yes, once in the expander, there is not enough information.  Since we
don't need to handle references with induction-variable addresses
after IVOPT, it's not difficult to take invariants into consideration
when re-associating.  Yet I have no clue how CSE should be considered.

Nevertheless, forcing "base+scaled*index+offset" into a register and
using register-indirect addressing as in PR57540 during expand is not good.

Thanks,
bin

-- 
Best Regards.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 12:31               ` Bin.Cheng
@ 2013-11-29 12:59                 ` Richard Biener
  0 siblings, 0 replies; 18+ messages in thread
From: Richard Biener @ 2013-11-29 12:59 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: Richard Earnshaw, bin.cheng, gcc-patches, Yufeng Zhang

On Fri, Nov 29, 2013 at 12:03 PM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> On Fri, Nov 29, 2013 at 6:44 PM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng <amker.cheng@gmail.com> wrote:
>>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote:
>>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote:
>>>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>>>> To: Richard Earnshaw
>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Richard Earnshaw
>>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>>>> To: Bin Cheng
>>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>>
>>>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>>>>> to legitimize "base + index * scale + offset" with "reg <- base +
>>>>>>>>> offset;  reg
>>>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>>>> + function
>>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>>>> Hoping we can improve it in the future.
>>>>>>>>>
>>>>>>>>> With this patch:
>>>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>>>
>>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>>>>>>  So this shouldn't be necessary.  Can you identify where this
>>>>>>> non-canoncial form is being generated?
>>>>>>>>
>>>>>>>
>>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going
>>>>>>> to construct "base + index*scale" instead.  Since "base + index * scale"
>>>>>> is not
>>>>>>> canonical form, I will construct the canonical form and drop this part of
>>>>>> the
>>>>>>> patch.
>>>>>>>
>>>>>>> Is rest of this patch OK?
>>>>>>>
>>>>>> Hi Richard, I removed the part over which you concerned and created this
>>>>>> updated patch.
>>>>>>
>>>>>> Is it OK?
>>>>>>
>>>>>> Thanks.
>>>>>> bin
>>>>>>
>>>>>> 2013-09-18  Bin Cheng  <bin.cheng@arm.com>
>>>>>>
>>>>>>       * config/arm/arm.c (try_multiplier_address): New function.
>>>>>>       (thumb2_legitimize_address): New function.
>>>>>>       (arm_legitimize_address): Call try_multiplier_address and
>>>>>>       thumb2_legitimize_address.
>>>>>>
>>>>>>
>>>>>> 6-arm-scaled_address-20130918.txt
>>>>>>
>>>>>>
>>>>>> Index: gcc/config/arm/arm.c
>>>>>> ===================================================================
>>>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>>>      }
>>>>>>  }
>>>>>>
>>>>>> +/* Try to find address expression like base + index * scale + offset
>>>>>> +   in X.  If we find one, force base + offset into register and
>>>>>> +   construct new expression reg + index * scale; return the new
>>>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>>>> +static rtx
>>>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
>>>>>> +{
>>>>>> +  rtx tmp, base_reg, new_rtx;
>>>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
>>>>>> +
>>>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>>>> +
>>>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>>>> +    {
>>>>>> +      tmp = XEXP (x, 0);
>>>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      base = XEXP (tmp, 0);
>>>>>> +      offset = XEXP (tmp, 1);
>>>>>> +    }
>>>>>> +  else
>>>>>> +    {
>>>>>> +      tmp = XEXP (x, 0);
>>>>>> +      offset = XEXP (x, 1);
>>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      base = XEXP (tmp, 0);
>>>>>> +      scale = XEXP (tmp, 1);
>>>>>> +      if (GET_CODE (base) == MULT)
>>>>>> +     {
>>>>>> +       tmp = base;
>>>>>> +       base = scale;
>>>>>> +       scale = tmp;
>>>>>> +     }
>>>>>> +      if (GET_CODE (scale) != MULT)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      index = XEXP (scale, 0);
>>>>>> +      scale = XEXP (scale, 1);
>>>>>> +    }
>>>>>> +
>>>>>> +  if (CONST_INT_P (base))
>>>>>> +    {
>>>>>> +      tmp = base;
>>>>>> +      base = offset;
>>>>>> +      offset = tmp;
>>>>>> +    }
>>>>>> +
>>>>>> +  if (CONST_INT_P (index))
>>>>>> +    {
>>>>>> +      tmp = index;
>>>>>> +      index = scale;
>>>>>> +      scale = tmp;
>>>>>> +    }
>>>>>> +
>>>>>> +  /* ARM only supports constant scale in address.  */
>>>>>> +  if (!CONST_INT_P (scale))
>>>>>> +    return x;
>>>>>> +
>>>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>>>> +    return x;
>>>>>> +
>>>>>> +  /* Only register/constant are allowed in each part.  */
>>>>>> +  if (!symbol_mentioned_p (base)
>>>>>> +      && !symbol_mentioned_p (offset)
>>>>>> +      && !symbol_mentioned_p (index)
>>>>>> +      && !symbol_mentioned_p (scale))
>>>>>> +    {
>>>>>
>>>>> It would be easier to do this at the top of the function --
>>>>>   if (symbol_mentioned_p (x))
>>>>>     return x;
>>>>>
>>>>>
>>>>>> +      /* Force "base+offset" into register and construct
>>>>>> +      "register+index*scale".  Return the new expression
>>>>>> +      only if it's valid.  */
>>>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>>>> +      base_reg = force_reg (SImode, tmp);
>>>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>>>> +      return new_rtx;
>>>>>
>>>>> I can't help thinking that this is backwards.  That is, you want to
>>>>> split out the mult expression and use offset addressing in the addresses
>>>>> itself.  That's likely to lead to either better CSE, or more induction
>>>> Thanks to your review.
>>>>
>>>> Actually base+offset is more likely loop invariant than scaled
>>>> expression, just as reported in pr57540.  The bug is observed in
>>>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>>>> variable doesn't matter that much comparing to invariant because we
>>>> are in RTL now.
>>>>
>>>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>>>> expensive than simple offset addressing modes on at least some cores.
>>>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>>>> As for addressing mode is concerned, Though we may guide the
>>>> transformation once we have reliable address cost mode, we don't have
>>>> the information if base+offset is invariant, so it's difficult to
>>>> handle in backend, right?
>>>>
>>>> What do you think about this?
>>>>
>>>
>>> Additionally, there is no induction variable issue here.  The memory
>>> reference we are facing during expand are not lowered by gimple IVOPT,
>>> which means either it's outside loop, or doesn't have induction
>>> variable address.
>>>
>>> Generally, there are three passes which lowers memory reference:
>>> gimple strength reduction picks opportunity outside loop; gimple IVOPT
>>> handles references with induction variable addresses; references not
>>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
>>> choice of address re-association.  I think Yufeng also noticed the
>>> problem and are trying with patch like:
>>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
>>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html
>>>
>>> After thinking twice, I some kind of think we should not re-associate
>>> addresses during expanding, because of lacking of context information.
>>>  Take base + scaled_index + offset as an example in PR57540, we just
>>> don't know if "base+offset" is loop invariant from either backend or
>>> RTL expander.
>>>
>>> Any comments?
>>
>> The issue is that re-association and address lowering is not integrated
>> with optimizers such as CSE and invariant motion.  This means it's
>> hard to arrive at optimal results and all passes are just "guessing"
>> and applying heuristics that the programmer thought are appropriate
>> for his machine.
>>
>> I have no easy way out but think that we lower addresses too late
>> (at least fully).  Loop optimizers would like to see the complex
>> memory reference forms but starting with IVOPTs lowering all
>> addresses would likely be a win in the end (there are CSE, LIM
>> and re-association passes around after/at this point as well).
>>
>> Back in the 4.3 days when I for the first time attempted to introduce
>> MEM_REF I forcefully lowered all memory accesses and addresses
>> very early (during gimplification basically).  That didn't work out well.
>>
>> So my suggestion to anyone who wants to try something is
>> to apply an additional lowering to the whole GIMPLE, lowering
>> things like handled-component references and addresses
>> (and also things like bitfield accesses).
> Yes, once in expander, there will be no enough information.  Since we
> don't need to handle references with induction variable addresses
> after IVOPT, it's not difficult to take invariant into consideration
> when re-associating.  Yet I have no clue how CSE should be considered.

You'd need to detect that (a + b)*c can be computed differently
if both a*c and b*c are available for example.  There is even a PR
about this somewhere.

> Nevertheless, force "base+scaled*index+offset" into register and use
> reg directed addressing mode like PR57540 in expand is not good.

Sure ... this is why we have IVOPTs and SLSR.  But both would
work ok with lowered input I think and would simply build up
a "leveled" IL suitable for the target.  IVOPTs also does some
simple CSE considerations AFAIK.  In the end what matters for
speed is loops after all ;)

Richard.

> Thanks,
> bin
>
> --
> Best Regards.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 11:23           ` Bin.Cheng
  2013-11-29 12:19             ` Richard Biener
@ 2013-11-29 13:31             ` Yufeng Zhang
  2013-11-29 13:46               ` Richard Biener
  2013-11-29 19:55               ` Richard Earnshaw
  1 sibling, 2 replies; 18+ messages in thread
From: Yufeng Zhang @ 2013-11-29 13:31 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: Richard Earnshaw, Bin Cheng, gcc-patches, Richard Biener

On 11/29/13 07:52, Bin.Cheng wrote:
> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com>  wrote:
>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com>  wrote:
>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>> To: Richard Earnshaw
>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Richard Earnshaw
>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>> To: Bin Cheng
>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>
>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base +
>>>>>>> offset;  reg
>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>> + function
>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>> Hoping we can improve it in the future.
>>>>>>>
>>>>>>> With this patch:
>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>
>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>>>>   So this shouldn't be necessary.  Can you identify where this
>>>>> non-canoncial form is being generated?
>>>>>>
>>>>>
>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going
>>>>> to construct "base + index*scale" instead.  Since "base + index * scale"
>>>> is not
>>>>> canonical form, I will construct the canonical form and drop this part of
>>>> the
>>>>> patch.
>>>>>
>>>>> Is rest of this patch OK?
>>>>>
>>>> Hi Richard, I removed the part over which you concerned and created this
>>>> updated patch.
>>>>
>>>> Is it OK?
>>>>
>>>> Thanks.
>>>> bin
>>>>
>>>> 2013-09-18  Bin Cheng<bin.cheng@arm.com>
>>>>
>>>>        * config/arm/arm.c (try_multiplier_address): New function.
>>>>        (thumb2_legitimize_address): New function.
>>>>        (arm_legitimize_address): Call try_multiplier_address and
>>>>        thumb2_legitimize_address.
>>>>
>>>>
>>>> 6-arm-scaled_address-20130918.txt
>>>>
>>>>
>>>> Index: gcc/config/arm/arm.c
>>>> ===================================================================
>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>       }
>>>>   }
>>>>
>>>> +/* Try to find address expression like base + index * scale + offset
>>>> +   in X.  If we find one, force base + offset into register and
>>>> +   construct new expression reg + index * scale; return the new
>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>> +static rtx
>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
>>>> +{
>>>> +  rtx tmp, base_reg, new_rtx;
>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
>>>> +
>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>> +
>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>> +    {
>>>> +      tmp = XEXP (x, 0);
>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>> +      if (GET_CODE (tmp) != PLUS)
>>>> +     return x;
>>>> +
>>>> +      base = XEXP (tmp, 0);
>>>> +      offset = XEXP (tmp, 1);
>>>> +    }
>>>> +  else
>>>> +    {
>>>> +      tmp = XEXP (x, 0);
>>>> +      offset = XEXP (x, 1);
>>>> +      if (GET_CODE (tmp) != PLUS)
>>>> +     return x;
>>>> +
>>>> +      base = XEXP (tmp, 0);
>>>> +      scale = XEXP (tmp, 1);
>>>> +      if (GET_CODE (base) == MULT)
>>>> +     {
>>>> +       tmp = base;
>>>> +       base = scale;
>>>> +       scale = tmp;
>>>> +     }
>>>> +      if (GET_CODE (scale) != MULT)
>>>> +     return x;
>>>> +
>>>> +      index = XEXP (scale, 0);
>>>> +      scale = XEXP (scale, 1);
>>>> +    }
>>>> +
>>>> +  if (CONST_INT_P (base))
>>>> +    {
>>>> +      tmp = base;
>>>> +      base = offset;
>>>> +      offset = tmp;
>>>> +    }
>>>> +
>>>> +  if (CONST_INT_P (index))
>>>> +    {
>>>> +      tmp = index;
>>>> +      index = scale;
>>>> +      scale = tmp;
>>>> +    }
>>>> +
>>>> +  /* ARM only supports constant scale in address.  */
>>>> +  if (!CONST_INT_P (scale))
>>>> +    return x;
>>>> +
>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>> +    return x;
>>>> +
>>>> +  /* Only register/constant are allowed in each part.  */
>>>> +  if (!symbol_mentioned_p (base)
>>>> +&&  !symbol_mentioned_p (offset)
>>>> +&&  !symbol_mentioned_p (index)
>>>> +&&  !symbol_mentioned_p (scale))
>>>> +    {
>>>
>>> It would be easier to do this at the top of the function --
>>>    if (symbol_mentioned_p (x))
>>>      return x;
>>>
>>>
>>>> +      /* Force "base+offset" into register and construct
>>>> +      "register+index*scale".  Return the new expression
>>>> +      only if it's valid.  */
>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>> +      base_reg = force_reg (SImode, tmp);
>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>> +      return new_rtx;
>>>
>>> I can't help thinking that this is backwards.  That is, you want to
>>> split out the mult expression and use offset addressing in the addresses
>>> itself.  That's likely to lead to either better CSE, or more induction
>> Thanks to your review.
>>
>> Actually base+offset is more likely loop invariant than scaled
>> expression, just as reported in pr57540.  The bug is observed in
>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>> variable doesn't matter that much comparing to invariant because we
>> are in RTL now.
>>
>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>> expensive than simple offset addressing modes on at least some cores.
>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>> As for addressing mode is concerned, Though we may guide the
>> transformation once we have reliable address cost mode, we don't have
>> the information if base+offset is invariant, so it's difficult to
>> handle in backend, right?
>>
>> What do you think about this?
>>
>
> Additionally, there is no induction variable issue here.  The memory
> reference we are facing during expand are not lowered by gimple IVOPT,
> which means either it's outside loop, or doesn't have induction
> variable address.
>
> Generally, there are three passes which lowers memory reference:
> gimple strength reduction picks opportunity outside loop; gimple IVOPT
> handles references with induction variable addresses; references not
> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
> choice of address re-association.  I think Yufeng also noticed the
> problem and are trying with patch like:
> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html

Yes, when it comes to addressing expression, the re-association in RTL 
expand is quite sensitive to the available address modes on the target 
and its address legitimization routines.  Both patches I proposed try to 
make the RTL expansion more canonicalized on addressing expressions, 
especially on ARM.  It is done by essentially enabling 
simplify_gen_binary () to work on a bigger RTX node.

> After thinking twice, I some kind of think we should not re-associate
> addresses during expanding, because of lacking of context information.
>   Take base + scaled_index + offset as an example in PR57540, we just
> don't know if "base+offset" is loop invariant from either backend or
> RTL expander.

I'm getting less convinced by re-associating base with offset 
unconditionally.  One counter example is

typedef int arr_1[20];
void foo (arr_1 a1, int i)
{
   a1[i+10] = 1;
}

I'm experimenting a patch to get the immediate offset in the above 
example to be the last addend in the address computing (as mentioned in 
http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the 
following code-gen:

         add     r1, r0, r1, asl #2
         mov     r3, #1
         str     r3, [r1, #40]

With your patch applied, the effort will be reverted to

         add     r0, r0, #40
         mov     r3, #1
         str     r3, [r0, r1, asl #2]


I briefly looked into PR57540.  I noticed that you are trying to tackle 
a loop invariant in a hot loop:

.L5:
	add	lr, sp, #2064            ////loop invariant
	add	r2, r2, #1
	add	r3, lr, r3, asl #2
	ldr	r3, [r3, #-2064]
	cmp	r3, #0
	bge	.L5
	uxtb	r2, r2

Looking into the example code:

void
foo ()
{
   int parent [ 258 * 2 ];
   for (i = 1; i <= alphaSize; i++)
     {
       while (parent[k] >= 0)
	{
	  k = parent[k];
	  ...
	}
       ...

The loop invariant is part of address computing for a stack variable. 
The immediate 2064 is the offset of the variable on the stack frame 
rather than what we normally expect, e.g. part of indexing; it is a 
back-end specific issue and there is nothing the mid-end can do (the 
mem_ref lowering cannot help either).  One possible solution may be to 
force the base address of an array on stack to a REG, before the RTL 
expand does anything 'smart'.  Is it something you think worth giving a try?

Just my two cents.

Thanks,
Yufeng

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 13:31             ` Yufeng Zhang
@ 2013-11-29 13:46               ` Richard Biener
  2013-11-29 17:26                 ` Yufeng Zhang
  2013-11-29 19:55               ` Richard Earnshaw
  1 sibling, 1 reply; 18+ messages in thread
From: Richard Biener @ 2013-11-29 13:46 UTC (permalink / raw)
  To: Yufeng Zhang; +Cc: Bin.Cheng, Richard Earnshaw, Bin Cheng, gcc-patches

On Fri, Nov 29, 2013 at 12:46 PM, Yufeng Zhang <Yufeng.Zhang@arm.com> wrote:
> On 11/29/13 07:52, Bin.Cheng wrote:
>>
>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com>  wrote:
>>>
>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com>
>>> wrote:
>>>>
>>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>>> To: Richard Earnshaw
>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Richard Earnshaw
>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>>> To: Bin Cheng
>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>
>>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>>>
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base +
>>>>>>>> offset;  reg
>>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>>> + function
>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>>> Hoping we can improve it in the future.
>>>>>>>>
>>>>>>>> With this patch:
>>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>>
>>>>>>>
>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical
>>>>>>> form.
>>>>>>>   So this shouldn't be necessary.  Can you identify where this
>>>>>>
>>>>>> non-canoncial form is being generated?
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was
>>>>>> going
>>>>>> to construct "base + index*scale" instead.  Since "base + index *
>>>>>> scale"
>>>>>
>>>>> is not
>>>>>>
>>>>>> canonical form, I will construct the canonical form and drop this part
>>>>>> of
>>>>>
>>>>> the
>>>>>>
>>>>>> patch.
>>>>>>
>>>>>> Is rest of this patch OK?
>>>>>>
>>>>> Hi Richard, I removed the part over which you concerned and created
>>>>> this
>>>>> updated patch.
>>>>>
>>>>> Is it OK?
>>>>>
>>>>> Thanks.
>>>>> bin
>>>>>
>>>>> 2013-09-18  Bin Cheng<bin.cheng@arm.com>
>>>>>
>>>>>        * config/arm/arm.c (try_multiplier_address): New function.
>>>>>        (thumb2_legitimize_address): New function.
>>>>>        (arm_legitimize_address): Call try_multiplier_address and
>>>>>        thumb2_legitimize_address.
>>>>>
>>>>>
>>>>> 6-arm-scaled_address-20130918.txt
>>>>>
>>>>>
>>>>> Index: gcc/config/arm/arm.c
>>>>> ===================================================================
>>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>>       }
>>>>>   }
>>>>>
>>>>> +/* Try to find address expression like base + index * scale + offset
>>>>> +   in X.  If we find one, force base + offset into register and
>>>>> +   construct new expression reg + index * scale; return the new
>>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>>> +static rtx
>>>>> +try_multiplier_address (rtx x, enum machine_mode mode
>>>>> ATTRIBUTE_UNUSED)
>>>>> +{
>>>>> +  rtx tmp, base_reg, new_rtx;
>>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset =
>>>>> NULL_RTX;
>>>>> +
>>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>>> +
>>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>>> +    {
>>>>> +      tmp = XEXP (x, 0);
>>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>> +     return x;
>>>>> +
>>>>> +      base = XEXP (tmp, 0);
>>>>> +      offset = XEXP (tmp, 1);
>>>>> +    }
>>>>> +  else
>>>>> +    {
>>>>> +      tmp = XEXP (x, 0);
>>>>> +      offset = XEXP (x, 1);
>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>> +     return x;
>>>>> +
>>>>> +      base = XEXP (tmp, 0);
>>>>> +      scale = XEXP (tmp, 1);
>>>>> +      if (GET_CODE (base) == MULT)
>>>>> +     {
>>>>> +       tmp = base;
>>>>> +       base = scale;
>>>>> +       scale = tmp;
>>>>> +     }
>>>>> +      if (GET_CODE (scale) != MULT)
>>>>> +     return x;
>>>>> +
>>>>> +      index = XEXP (scale, 0);
>>>>> +      scale = XEXP (scale, 1);
>>>>> +    }
>>>>> +
>>>>> +  if (CONST_INT_P (base))
>>>>> +    {
>>>>> +      tmp = base;
>>>>> +      base = offset;
>>>>> +      offset = tmp;
>>>>> +    }
>>>>> +
>>>>> +  if (CONST_INT_P (index))
>>>>> +    {
>>>>> +      tmp = index;
>>>>> +      index = scale;
>>>>> +      scale = tmp;
>>>>> +    }
>>>>> +
>>>>> +  /* ARM only supports constant scale in address.  */
>>>>> +  if (!CONST_INT_P (scale))
>>>>> +    return x;
>>>>> +
>>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>>> +    return x;
>>>>> +
>>>>> +  /* Only register/constant are allowed in each part.  */
>>>>> +  if (!symbol_mentioned_p (base)
>>>>> +&&  !symbol_mentioned_p (offset)
>>>>> +&&  !symbol_mentioned_p (index)
>>>>> +&&  !symbol_mentioned_p (scale))
>>>>> +    {
>>>>
>>>>
>>>> It would be easier to do this at the top of the function --
>>>>    if (symbol_mentioned_p (x))
>>>>      return x;
>>>>
>>>>
>>>>> +      /* Force "base+offset" into register and construct
>>>>> +      "register+index*scale".  Return the new expression
>>>>> +      only if it's valid.  */
>>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>>> +      base_reg = force_reg (SImode, tmp);
>>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>>> +      return new_rtx;
>>>>
>>>>
>>>> I can't help thinking that this is backwards.  That is, you want to
>>>> split out the mult expression and use offset addressing in the addresses
>>>> itself.  That's likely to lead to either better CSE, or more induction
>>>
>>> Thanks to your review.
>>>
>>> Actually base+offset is more likely loop invariant than scaled
>>> expression, just as reported in pr57540.  The bug is observed in
>>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>>> variable doesn't matter that much comparing to invariant because we
>>> are in RTL now.
>>>
>>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>>> expensive than simple offset addressing modes on at least some cores.
>>>
>>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>>> As for addressing mode is concerned, Though we may guide the
>>> transformation once we have reliable address cost mode, we don't have
>>> the information if base+offset is invariant, so it's difficult to
>>> handle in backend, right?
>>>
>>> What do you think about this?
>>>
>>
>> Additionally, there is no induction variable issue here.  The memory
>> reference we are facing during expand are not lowered by gimple IVOPT,
>> which means either it's outside loop, or doesn't have induction
>> variable address.
>>
>> Generally, there are three passes which lowers memory reference:
>> gimple strength reduction picks opportunity outside loop; gimple IVOPT
>> handles references with induction variable addresses; references not
>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
>> choice of address re-association.  I think Yufeng also noticed the
>> problem and are trying with patch like:
>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html
>
>
> Yes, when it comes to addressing expression, the re-association in RTL
> expand is quite sensitive to the available address modes on the target and
> its address legitimization routines.  Both patches I proposed try to make
> the RTL expansion more canonicalized on addressing expressions, especially
> on ARM.  It is done by essentially enabling simplify_gen_binary () to work
> on a bigger RTX node.
>
>
>> After thinking twice, I some kind of think we should not re-associate
>> addresses during expanding, because of lacking of context information.
>>   Take base + scaled_index + offset as an example in PR57540, we just
>> don't know if "base+offset" is loop invariant from either backend or
>> RTL expander.
>
>
> I'm getting less convinced by re-associating base with offset
> unconditionally.  One counter example is
>
> typedef int arr_1[20];
> void foo (arr_1 a1, int i)
> {
>   a1[i+10] = 1;
> }
>
> I'm experimenting a patch to get the immediate offset in the above example
> to be the last addend in the address computing (as mentioned in
> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the
> following code-gen:
>
>         add     r1, r0, r1, asl #2
>         mov     r3, #1
>         str     r3, [r1, #40]
>
> With your patch applied, the effort will be reverted to
>
>         add     r0, r0, #40
>         mov     r3, #1
>         str     r3, [r0, r1, asl #2]
>
>
> I briefly looked into PR57540.  I noticed that you are trying to tackle a
> loop invariant in a hot loop:
>
> .L5:
>         add     lr, sp, #2064            ////loop invariant
>         add     r2, r2, #1
>         add     r3, lr, r3, asl #2
>         ldr     r3, [r3, #-2064]
>         cmp     r3, #0
>         bge     .L5
>         uxtb    r2, r2

Why does RTL invariant motion not move it?

Richard.

> Looking into the example code:
>
> void
> foo ()
> {
>   int parent [ 258 * 2 ];
>   for (i = 1; i <= alphaSize; i++)
>     {
>       while (parent[k] >= 0)
>         {
>           k = parent[k];
>           ...
>         }
>       ...
>
> The loop invariant is part of address computing for a stack variable. The
> immediate 2064 is the offset of the variable on the stack frame rather than
> what we normally expect, e.g. part of indexing; it is a back-end specific
> issue and there is nothing the mid-end can do (the mem_ref lowering cannot
> help either).  One possible solution may be to force the base address of an
> array on stack to a REG, before the RTL expand does anything 'smart'.  Is it
> something you think worth giving a try?
>
> Just my two cents.
>
> Thanks,
> Yufeng
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 13:46               ` Richard Biener
@ 2013-11-29 17:26                 ` Yufeng Zhang
  0 siblings, 0 replies; 18+ messages in thread
From: Yufeng Zhang @ 2013-11-29 17:26 UTC (permalink / raw)
  To: Richard Biener; +Cc: Bin.Cheng, Richard Earnshaw, Bin Cheng, gcc-patches

On 11/29/13 12:02, Richard Biener wrote:
> On Fri, Nov 29, 2013 at 12:46 PM, Yufeng Zhang<Yufeng.Zhang@arm.com>  wrote:
>> On 11/29/13 07:52, Bin.Cheng wrote:
>>>
>>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com>   wrote:
>>>>
>>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com>
>>>> wrote:
>>>>>
>>>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>>>> To: Richard Earnshaw
>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Richard Earnshaw
>>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>>>> To: Bin Cheng
>>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>>
>>>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>>>>
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base +
>>>>>>>>> offset;  reg
>>>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>>>> + function
>>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>>>> Hoping we can improve it in the future.
>>>>>>>>>
>>>>>>>>> With this patch:
>>>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>>>
>>>>>>>>
>>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical
>>>>>>>> form.
>>>>>>>>    So this shouldn't be necessary.  Can you identify where this
>>>>>>>
>>>>>>> non-canoncial form is being generated?
>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was
>>>>>>> going
>>>>>>> to construct "base + index*scale" instead.  Since "base + index *
>>>>>>> scale"
>>>>>>
>>>>>> is not
>>>>>>>
>>>>>>> canonical form, I will construct the canonical form and drop this part
>>>>>>> of
>>>>>>
>>>>>> the
>>>>>>>
>>>>>>> patch.
>>>>>>>
>>>>>>> Is rest of this patch OK?
>>>>>>>
>>>>>> Hi Richard, I removed the part over which you concerned and created
>>>>>> this
>>>>>> updated patch.
>>>>>>
>>>>>> Is it OK?
>>>>>>
>>>>>> Thanks.
>>>>>> bin
>>>>>>
>>>>>> 2013-09-18  Bin Cheng<bin.cheng@arm.com>
>>>>>>
>>>>>>         * config/arm/arm.c (try_multiplier_address): New function.
>>>>>>         (thumb2_legitimize_address): New function.
>>>>>>         (arm_legitimize_address): Call try_multiplier_address and
>>>>>>         thumb2_legitimize_address.
>>>>>>
>>>>>>
>>>>>> 6-arm-scaled_address-20130918.txt
>>>>>>
>>>>>>
>>>>>> Index: gcc/config/arm/arm.c
>>>>>> ===================================================================
>>>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>>>        }
>>>>>>    }
>>>>>>
>>>>>> +/* Try to find address expression like base + index * scale + offset
>>>>>> +   in X.  If we find one, force base + offset into register and
>>>>>> +   construct new expression reg + index * scale; return the new
>>>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>>>> +static rtx
>>>>>> +try_multiplier_address (rtx x, enum machine_mode mode
>>>>>> ATTRIBUTE_UNUSED)
>>>>>> +{
>>>>>> +  rtx tmp, base_reg, new_rtx;
>>>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset =
>>>>>> NULL_RTX;
>>>>>> +
>>>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>>>> +
>>>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>>>> +    {
>>>>>> +      tmp = XEXP (x, 0);
>>>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      base = XEXP (tmp, 0);
>>>>>> +      offset = XEXP (tmp, 1);
>>>>>> +    }
>>>>>> +  else
>>>>>> +    {
>>>>>> +      tmp = XEXP (x, 0);
>>>>>> +      offset = XEXP (x, 1);
>>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      base = XEXP (tmp, 0);
>>>>>> +      scale = XEXP (tmp, 1);
>>>>>> +      if (GET_CODE (base) == MULT)
>>>>>> +     {
>>>>>> +       tmp = base;
>>>>>> +       base = scale;
>>>>>> +       scale = tmp;
>>>>>> +     }
>>>>>> +      if (GET_CODE (scale) != MULT)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      index = XEXP (scale, 0);
>>>>>> +      scale = XEXP (scale, 1);
>>>>>> +    }
>>>>>> +
>>>>>> +  if (CONST_INT_P (base))
>>>>>> +    {
>>>>>> +      tmp = base;
>>>>>> +      base = offset;
>>>>>> +      offset = tmp;
>>>>>> +    }
>>>>>> +
>>>>>> +  if (CONST_INT_P (index))
>>>>>> +    {
>>>>>> +      tmp = index;
>>>>>> +      index = scale;
>>>>>> +      scale = tmp;
>>>>>> +    }
>>>>>> +
>>>>>> +  /* ARM only supports constant scale in address.  */
>>>>>> +  if (!CONST_INT_P (scale))
>>>>>> +    return x;
>>>>>> +
>>>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>>>> +    return x;
>>>>>> +
>>>>>> +  /* Only register/constant are allowed in each part.  */
>>>>>> +  if (!symbol_mentioned_p (base)
>>>>>> +&&   !symbol_mentioned_p (offset)
>>>>>> +&&   !symbol_mentioned_p (index)
>>>>>> +&&   !symbol_mentioned_p (scale))
>>>>>> +    {
>>>>>
>>>>>
>>>>> It would be easier to do this at the top of the function --
>>>>>     if (symbol_mentioned_p (x))
>>>>>       return x;
>>>>>
>>>>>
>>>>>> +      /* Force "base+offset" into register and construct
>>>>>> +      "register+index*scale".  Return the new expression
>>>>>> +      only if it's valid.  */
>>>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>>>> +      base_reg = force_reg (SImode, tmp);
>>>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>>>> +      return new_rtx;
>>>>>
>>>>>
>>>>> I can't help thinking that this is backwards.  That is, you want to
>>>>> split out the mult expression and use offset addressing in the addresses
>>>>> itself.  That's likely to lead to either better CSE, or more induction
>>>>
>>>> Thanks to your review.
>>>>
>>>> Actually base+offset is more likely loop invariant than scaled
>>>> expression, just as reported in pr57540.  The bug is observed in
>>>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>>>> variable doesn't matter that much comparing to invariant because we
>>>> are in RTL now.
>>>>
>>>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>>>> expensive than simple offset addressing modes on at least some cores.
>>>>
>>>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>>>> As for addressing mode is concerned, Though we may guide the
>>>> transformation once we have reliable address cost mode, we don't have
>>>> the information if base+offset is invariant, so it's difficult to
>>>> handle in backend, right?
>>>>
>>>> What do you think about this?
>>>>
>>>
>>> Additionally, there is no induction variable issue here.  The memory
>>> reference we are facing during expand are not lowered by gimple IVOPT,
>>> which means either it's outside loop, or doesn't have induction
>>> variable address.
>>>
>>> Generally, there are three passes which lowers memory reference:
>>> gimple strength reduction picks opportunity outside loop; gimple IVOPT
>>> handles references with induction variable addresses; references not
>>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
>>> choice of address re-association.  I think Yufeng also noticed the
>>> problem and are trying with patch like:
>>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
>>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html
>>
>>
>> Yes, when it comes to addressing expression, the re-association in RTL
>> expand is quite sensitive to the available address modes on the target and
>> its address legitimization routines.  Both patches I proposed try to make
>> the RTL expansion more canonicalized on addressing expressions, especially
>> on ARM.  It is done by essentially enabling simplify_gen_binary () to work
>> on a bigger RTX node.
>>
>>
>>> After thinking twice, I some kind of think we should not re-associate
>>> addresses during expanding, because of lacking of context information.
>>>    Take base + scaled_index + offset as an example in PR57540, we just
>>> don't know if "base+offset" is loop invariant from either backend or
>>> RTL expander.
>>
>>
>> I'm getting less convinced by re-associating base with offset
>> unconditionally.  One counter example is
>>
>> typedef int arr_1[20];
>> void foo (arr_1 a1, int i)
>> {
>>    a1[i+10] = 1;
>> }
>>
>> I'm experimenting a patch to get the immediate offset in the above example
>> to be the last addend in the address computing (as mentioned in
>> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the
>> following code-gen:
>>
>>          add     r1, r0, r1, asl #2
>>          mov     r3, #1
>>          str     r3, [r1, #40]
>>
>> With your patch applied, the effort will be reverted to
>>
>>          add     r0, r0, #40
>>          mov     r3, #1
>>          str     r3, [r0, r1, asl #2]
>>
>>
>> I briefly looked into PR57540.  I noticed that you are trying to tackle a
>> loop invariant in a hot loop:
>>
>> .L5:
>>          add     lr, sp, #2064            ////loop invariant
>>          add     r2, r2, #1
>>          add     r3, lr, r3, asl #2
>>          ldr     r3, [r3, #-2064]
>>          cmp     r3, #0
>>          bge     .L5
>>          uxtb    r2, r2
>
> Why does RTL invariant motion not move it?

I haven't looked at the issue in detail.  I think Bin will be able to 
answer it.

Invariant motion alone won't solve the entire problem, as ideally we'd like 
to see the '- 2064' folding happen earlier, so that we'll get

.L5:
           add     r3, lr, r3, asl #2
           ldr     r3, [sp, r3, asl #2]
           cmp     r3, #0
           bge     .L5
           uxtb    r2, r2

Thanks,
Yufeng

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 12:19             ` Richard Biener
  2013-11-29 12:31               ` Bin.Cheng
@ 2013-11-29 19:35               ` Yufeng Zhang
  2013-12-02 11:26                 ` Richard Biener
  1 sibling, 1 reply; 18+ messages in thread
From: Yufeng Zhang @ 2013-11-29 19:35 UTC (permalink / raw)
  To: Richard Biener; +Cc: Bin.Cheng, Richard Earnshaw, Bin Cheng, gcc-patches

On 11/29/13 10:44, Richard Biener wrote:
> On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng<amker.cheng@gmail.com>  wrote:
>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com>  wrote:
>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com>  wrote:
>>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>>> To: Richard Earnshaw
>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Richard Earnshaw
>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>>> To: Bin Cheng
>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>
>>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also tries
>>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base +
>>>>>>>> offset;  reg
>>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>>> + function
>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the
>>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>>> Hoping we can improve it in the future.
>>>>>>>>
>>>>>>>> With this patch:
>>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>>
>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form.
>>>>>>>   So this shouldn't be necessary.  Can you identify where this
>>>>>> non-canoncial form is being generated?
>>>>>>>
>>>>>>
>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going
>>>>>> to construct "base + index*scale" instead.  Since "base + index * scale"
>>>>> is not
>>>>>> canonical form, I will construct the canonical form and drop this part of
>>>>> the
>>>>>> patch.
>>>>>>
>>>>>> Is rest of this patch OK?
>>>>>>
>>>>> Hi Richard, I removed the part over which you concerned and created this
>>>>> updated patch.
>>>>>
>>>>> Is it OK?
>>>>>
>>>>> Thanks.
>>>>> bin
>>>>>
>>>>> 2013-09-18  Bin Cheng<bin.cheng@arm.com>
>>>>>
>>>>>        * config/arm/arm.c (try_multiplier_address): New function.
>>>>>        (thumb2_legitimize_address): New function.
>>>>>        (arm_legitimize_address): Call try_multiplier_address and
>>>>>        thumb2_legitimize_address.
>>>>>
>>>>>
>>>>> 6-arm-scaled_address-20130918.txt
>>>>>
>>>>>
>>>>> Index: gcc/config/arm/arm.c
>>>>> ===================================================================
>>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>>       }
>>>>>   }
>>>>>
>>>>> +/* Try to find address expression like base + index * scale + offset
>>>>> +   in X.  If we find one, force base + offset into register and
>>>>> +   construct new expression reg + index * scale; return the new
>>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>>> +static rtx
>>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
>>>>> +{
>>>>> +  rtx tmp, base_reg, new_rtx;
>>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX;
>>>>> +
>>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>>> +
>>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>>> +    {
>>>>> +      tmp = XEXP (x, 0);
>>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>> +     return x;
>>>>> +
>>>>> +      base = XEXP (tmp, 0);
>>>>> +      offset = XEXP (tmp, 1);
>>>>> +    }
>>>>> +  else
>>>>> +    {
>>>>> +      tmp = XEXP (x, 0);
>>>>> +      offset = XEXP (x, 1);
>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>> +     return x;
>>>>> +
>>>>> +      base = XEXP (tmp, 0);
>>>>> +      scale = XEXP (tmp, 1);
>>>>> +      if (GET_CODE (base) == MULT)
>>>>> +     {
>>>>> +       tmp = base;
>>>>> +       base = scale;
>>>>> +       scale = tmp;
>>>>> +     }
>>>>> +      if (GET_CODE (scale) != MULT)
>>>>> +     return x;
>>>>> +
>>>>> +      index = XEXP (scale, 0);
>>>>> +      scale = XEXP (scale, 1);
>>>>> +    }
>>>>> +
>>>>> +  if (CONST_INT_P (base))
>>>>> +    {
>>>>> +      tmp = base;
>>>>> +      base = offset;
>>>>> +      offset = tmp;
>>>>> +    }
>>>>> +
>>>>> +  if (CONST_INT_P (index))
>>>>> +    {
>>>>> +      tmp = index;
>>>>> +      index = scale;
>>>>> +      scale = tmp;
>>>>> +    }
>>>>> +
>>>>> +  /* ARM only supports constant scale in address.  */
>>>>> +  if (!CONST_INT_P (scale))
>>>>> +    return x;
>>>>> +
>>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>>> +    return x;
>>>>> +
>>>>> +  /* Only register/constant are allowed in each part.  */
>>>>> +  if (!symbol_mentioned_p (base)
>>>>> +&&  !symbol_mentioned_p (offset)
>>>>> +&&  !symbol_mentioned_p (index)
>>>>> +&&  !symbol_mentioned_p (scale))
>>>>> +    {
>>>>
>>>> It would be easier to do this at the top of the function --
>>>>    if (symbol_mentioned_p (x))
>>>>      return x;
>>>>
>>>>
>>>>> +      /* Force "base+offset" into register and construct
>>>>> +      "register+index*scale".  Return the new expression
>>>>> +      only if it's valid.  */
>>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>>> +      base_reg = force_reg (SImode, tmp);
>>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>>> +      return new_rtx;
>>>>
>>>> I can't help thinking that this is backwards.  That is, you want to
>>>> split out the mult expression and use offset addressing in the addresses
>>>> itself.  That's likely to lead to either better CSE, or more induction
>>> Thanks to your review.
>>>
>>> Actually base+offset is more likely loop invariant than scaled
>>> expression, just as reported in pr57540.  The bug is observed in
>>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>>> variable doesn't matter that much comparing to invariant because we
>>> are in RTL now.
>>>
>>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>>> expensive than simple offset addressing modes on at least some cores.
>>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>>> As for addressing mode is concerned, Though we may guide the
>>> transformation once we have reliable address cost mode, we don't have
>>> the information if base+offset is invariant, so it's difficult to
>>> handle in backend, right?
>>>
>>> What do you think about this?
>>>
>>
>> Additionally, there is no induction variable issue here.  The memory
>> reference we are facing during expand are not lowered by gimple IVOPT,
>> which means either it's outside loop, or doesn't have induction
>> variable address.
>>
>> Generally, there are three passes which lowers memory reference:
>> gimple strength reduction picks opportunity outside loop; gimple IVOPT
>> handles references with induction variable addresses; references not
>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
>> choice of address re-association.  I think Yufeng also noticed the
>> problem and are trying with patch like:
>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html
>>
>> After thinking twice, I some kind of think we should not re-associate
>> addresses during expanding, because of lacking of context information.
>>   Take base + scaled_index + offset as an example in PR57540, we just
>> don't know if "base+offset" is loop invariant from either backend or
>> RTL expander.
>>
>> Any comments?
>
> The issue is that re-association and address lowering is not integrated
> with optimizers such as CSE and invariant motion.  This means it's
> hard to arrive at optimal results and all passes are just "guessing"
> and applying heuristics that the programmer thought are appropriate
> for his machine.
>
> I have no easy way out but think that we lower addresses too late
> (at least fully).  Loop optimizers would like to see the complex
> memory reference forms but starting with IVOPTs lowering all
> addresses would likely be a win in the end (there are CSE, LIM
> and re-association passes around after/at this point as well).
>
> Back in the 4.3 days when I for the first time attempted to introduce
> MEM_REF I forcefully lowered all memory accesses and addresses
> very early (during gimplification basically).  That didn't work out well.
>
> So my suggestion to anyone who wants to try something is
> to apply an additional lowering to the whole GIMPLE, lowering
> things like handled-component references and addresses
> (and also things like bitfield accesses).

This idea of additional lowering is interesting.  I found this wiki page 
describing the memory reference flattening in GIMPLE:

   http://gcc.gnu.org/wiki/MemRef

I wonder whether there is more information or there are notes you can share, 
things like common pitfalls, implications for alias analysis, etc.  I'd 
like to do some experiments with it when I have time.

Regards,
Yufeng

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 13:31             ` Yufeng Zhang
  2013-11-29 13:46               ` Richard Biener
@ 2013-11-29 19:55               ` Richard Earnshaw
  2013-11-30  8:08                 ` Bin.Cheng
  2013-12-02  3:22                 ` Bin.Cheng
  1 sibling, 2 replies; 18+ messages in thread
From: Richard Earnshaw @ 2013-11-29 19:55 UTC (permalink / raw)
  To: Yufeng Zhang, Bin Cheng; +Cc: Bin.Cheng, gcc-patches, Richard Biener

On 29/11/13 11:46, Yufeng Zhang wrote:
> On 11/29/13 07:52, Bin.Cheng wrote:
>> After thinking twice, I some kind of think we should not re-associate
>> addresses during expanding, because of lacking of context information.
>>   Take base + scaled_index + offset as an example in PR57540, we just
>> don't know if "base+offset" is loop invariant from either backend or
>> RTL expander.
> 
> I'm getting less convinced by re-associating base with offset 
> unconditionally.  One counter example is
> 
> typedef int arr_1[20];
> void foo (arr_1 a1, int i)
> {
>    a1[i+10] = 1;
> }
> 
> I'm experimenting a patch to get the immediate offset in the above 
> example to be the last addend in the address computing (as mentioned in 
> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the 
> following code-gen:
> 
>          add     r1, r0, r1, asl #2
>          mov     r3, #1
>          str     r3, [r1, #40]
> 
> With your patch applied, the effort will be reverted to
> 
>          add     r0, r0, #40
>          mov     r3, #1
>          str     r3, [r0, r1, asl #2]
> 

And another one is:



typedef int arr_1[20];
void foo (arr_1 a1, int i)
{
   a1[i+10] = 1;
   a1[i+11] = 1;
}

This should compile to:

	add	r1, r0, r1, asl #2
	mov	r3, #1
	str	r3, [r1, #40]
	str	r3, [r1, #44]

And which on Thumb2 should then collapse to:

	add	r1, r0, r1, asl #2
	mov	r3, #1
	strd	r3, r3, [r1, #40]

With your patch I don't see any chance of being able to get to this
situation.

(BTW, we currently generate:

	mov	r3, #1
	add	r1, r1, #10
	add	r2, r0, r1, asl #2
	str	r3, [r0, r1, asl #2]
	str	r3, [r2, #4]

which is insane).

I think I see where you're coming from on the original testcase, but I
think you're trying to solve the wrong problem.   In your test case the
base is an eliminable register, which is likely to be replaced with an
offset expression during register allocation.  The problem then, I
think, is that the cost of these virtual registers is treated the same
as any other pseudo register, when it may really have the cost of a PLUS
expression.

Perhaps the cost of using an eliminable register should be raised in
rtx_costs() (treating them as equivalent to (PLUS (reg) (CONST_INT
(TBD))), so that loop optimizations will try to hoist suitable
sub-expressions out the loop and replace them with real pseudos.

R.








^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 19:55               ` Richard Earnshaw
@ 2013-11-30  8:08                 ` Bin.Cheng
  2013-12-02  3:22                 ` Bin.Cheng
  1 sibling, 0 replies; 18+ messages in thread
From: Bin.Cheng @ 2013-11-30  8:08 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: Yufeng Zhang, Bin Cheng, gcc-patches, Richard Biener

On Sat, Nov 30, 2013 at 12:34 AM, Richard Earnshaw <rearnsha@arm.com> wrote:
> On 29/11/13 11:46, Yufeng Zhang wrote:
>> On 11/29/13 07:52, Bin.Cheng wrote:
>>> After thinking twice, I some kind of think we should not re-associate
>>> addresses during expanding, because of lacking of context information.
>>>   Take base + scaled_index + offset as an example in PR57540, we just
>>> don't know if "base+offset" is loop invariant from either backend or
>>> RTL expander.
>>
>> I'm getting less convinced by re-associating base with offset
>> unconditionally.  One counter example is
>>
>> typedef int arr_1[20];
>> void foo (arr_1 a1, int i)
>> {
>>    a1[i+10] = 1;
>> }
>>
>> I'm experimenting a patch to get the immediate offset in the above
>> example to be the last addend in the address computing (as mentioned in
>> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the
>> following code-gen:
>>
>>          add     r1, r0, r1, asl #2
>>          mov     r3, #1
>>          str     r3, [r1, #40]
>>
>> With your patch applied, the effort will be reverted to
>>
>>          add     r0, r0, #40
>>          mov     r3, #1
>>          str     r3, [r0, r1, asl #2]
>>
>
> And another one is:
>
>
>
> typedef int arr_1[20];
> void foo (arr_1 a1, int i)
> {
>    a1[i+10] = 1;
>    a1[i+11] = 1;
> }
>
> This should compile to:
>
>         add     r1, r0, r1, asl #2
>         mov     r3, #1
>         str     r3, [r1, #40]
>         str     r3, [r1, #44]
>
> And which on Thumb2 should then collapse to:
>
>         add     r1, r0, r1, asl #2
>         mov     r3, #1
>         strd    r3, r3, [r1, #40]
>
> With your patch I don't see any chance of being able to get to this
> situation.
>
> (BTW, we currently generate:
>
>         mov     r3, #1
>         add     r1, r1, #10
>         add     r2, r0, r1, asl #2
>         str     r3, [r0, r1, asl #2]
>         str     r3, [r2, #4]
>
> which is insane).
The two memory references share common sub-expressions; SLSR is
designed to handle this case, and it should be improved to handle it.
The original patch is only used to pick up cases not handled by SLSR
and IVOPT.  Anyway, as you saw from the previous message, doing the
refactoring during expand is not a good practice: without enough
CSE/invariant information, there will always be both caught and missed
opportunities.  That's why I think another lowering besides SLSR/IVOPT
on gimple might be a win.

Thanks,
bin

>
> I think I see where you're coming from on the original testcase, but I
> think you're trying to solve the wrong problem.   In your test case the
> base is an eliminable register, which is likely to be replaced with an
> offset expression during register allocation.  The problem then, I
> think, is that the cost of these virtual registers is treated the same
> as any other pseudo register, when it may really have the cost of a PLUS
> expression.
>
> Perhaps the cost of using an eliminable register should be raised in
> rtx_costs() (treating them as equivalent to (PLUS (reg) (CONST_INT
> (TBD))), so that loop optimizations will try to hoist suitable
> sub-expressions out the loop and replace them with real pseudos.
>
> R.
>
>
>
>
>
>
>
>



-- 
Best Regards.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 19:55               ` Richard Earnshaw
  2013-11-30  8:08                 ` Bin.Cheng
@ 2013-12-02  3:22                 ` Bin.Cheng
  1 sibling, 0 replies; 18+ messages in thread
From: Bin.Cheng @ 2013-12-02  3:22 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: Yufeng Zhang, Bin Cheng, gcc-patches, Richard Biener

On Sat, Nov 30, 2013 at 12:34 AM, Richard Earnshaw <rearnsha@arm.com> wrote:
> On 29/11/13 11:46, Yufeng Zhang wrote:
>> On 11/29/13 07:52, Bin.Cheng wrote:
>>> After thinking twice, I some kind of think we should not re-associate
>>> addresses during expanding, because of lacking of context information.
>>>   Take base + scaled_index + offset as an example in PR57540, we just
>>> don't know if "base+offset" is loop invariant from either backend or
>>> RTL expander.
>>
>> I'm getting less convinced by re-associating base with offset
>> unconditionally.  One counter example is
>>
>> typedef int arr_1[20];
>> void foo (arr_1 a1, int i)
>> {
>>    a1[i+10] = 1;
>> }
>>
>> I'm experimenting a patch to get the immediate offset in the above
>> example to be the last addend in the address computing (as mentioned in
>> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the
>> following code-gen:
>>
>>          add     r1, r0, r1, asl #2
>>          mov     r3, #1
>>          str     r3, [r1, #40]
>>
>> With your patch applied, the effort will be reverted to
>>
>>          add     r0, r0, #40
>>          mov     r3, #1
>>          str     r3, [r0, r1, asl #2]
>>
>
> And another one is:
>
>
>
> typedef int arr_1[20];
> void foo (arr_1 a1, int i)
> {
>    a1[i+10] = 1;
>    a1[i+11] = 1;
> }
>
> This should compile to:
>
>         add     r1, r0, r1, asl #2
>         mov     r3, #1
>         str     r3, [r1, #40]
>         str     r3, [r1, #44]
>
> And which on Thumb2 should then collapse to:
>
>         add     r1, r0, r1, asl #2
>         mov     r3, #1
>         strd    r3, r3, [r1, #40]
>
> With your patch I don't see any chance of being able to get to this
> situation.
>
> (BTW, we currently generate:
>
>         mov     r3, #1
>         add     r1, r1, #10
>         add     r2, r0, r1, asl #2
>         str     r3, [r0, r1, asl #2]
>         str     r3, [r2, #4]
>
> which is insane).
>
> I think I see where you're coming from on the original testcase, but I
> think you're trying to solve the wrong problem.   In your test case the
> base is an eliminable register, which is likely to be replaced with an
> offset expression during register allocation.  The problem then, I
> think, is that the cost of these virtual registers is treated the same
> as any other pseudo register, when it may really have the cost of a PLUS
> expression.
>
> Perhaps the cost of using an eliminable register should be raised in
> rtx_costs() (treating them as equivalent to (PLUS (reg) (CONST_INT
> (TBD))), so that loop optimizations will try to hoist suitable
> sub-expressions out the loop and replace them with real pseudos.
>
I now have access to the code.
The gimple before expanding is like:
  <bb 6>:
  # j_26 = PHI <j_9(8), 0(5)>
  # k_29 = PHI <k_8(8), k_24(5)>
  j_9 = j_26 + 1;
  k_8 = parent[k_29];
  if (k_8 >= 0)
    goto <bb 8>;
  else
    goto <bb 7>;

The rtl generated after expanding is like:
   88: NOTE_INSN_BASIC_BLOCK 7
   89: r174:SI=r174:SI+0x1
   90: r191:SI=r173:SI<<0x2
   91: r190:SI=r105:SI+r191:SI
   92: r173:SI=[r190:SI-0x810]

with r105 == virtual_stack_vars_rtx, and it will be instantiated into
frame_pointer_rtx in vregs pass:
   88: NOTE_INSN_BASIC_BLOCK 7
   89: r174:SI=r174:SI+0x1
   90: r191:SI=r173:SI<<0x2
   91: r190:SI=sfp:SI+r191:SI
   92: r173:SI=[r190:SI-0x810]

As you pointed out, sfp is not hoisted as a high-cost invariant.  I am
not sure if loop-invariant motion will hoist a single pseudo reg even
if it's assigned a higher cost.
But aside from the invariant problem, the choice made by RTL expand is
bad because it hides the CSE opportunity: since (sfp + r173<<2 - 0x810)
== (sp + r173<<2), (sfp - 0x810) can be folded into (sp), and then we
can embed the shift instruction in the scaled addressing mode [sp +
r173 << 2].

Thanks,
bin

-- 
Best Regards.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH ARM]Refine scaled address expression on ARM
  2013-11-29 19:35               ` Yufeng Zhang
@ 2013-12-02 11:26                 ` Richard Biener
  0 siblings, 0 replies; 18+ messages in thread
From: Richard Biener @ 2013-12-02 11:26 UTC (permalink / raw)
  To: Yufeng Zhang; +Cc: Bin.Cheng, Richard Earnshaw, Bin Cheng, gcc-patches

On Fri, Nov 29, 2013 at 5:10 PM, Yufeng Zhang <Yufeng.Zhang@arm.com> wrote:
> On 11/29/13 10:44, Richard Biener wrote:
>>
>> On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng<amker.cheng@gmail.com>  wrote:
>>>
>>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com>  wrote:
>>>>
>>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com>
>>>> wrote:
>>>>>
>>>>> On 18/09/13 10:15, bin.cheng wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng
>>>>>>> Sent: Monday, September 02, 2013 3:09 PM
>>>>>>> To: Richard Earnshaw
>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Richard Earnshaw
>>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM
>>>>>>>> To: Bin Cheng
>>>>>>>> Cc: gcc-patches@gcc.gnu.org
>>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM
>>>>>>>>
>>>>>>>> On 28/08/13 08:00, bin.cheng wrote:
>>>>>>>>>
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> This patch refines scaled address expression on ARM.  It supports
>>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p.  It also
>>>>>>>>> tries
>>>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base +
>>>>>>>>> offset;  reg
>>>>>>>>> + index * scale" by introducing thumb2_legitimize_address.  For now
>>>>>>>>> + function
>>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does
>>>>>>>>> the
>>>>>>>>> mentioned transformation by calling to try_multiplier_address.
>>>>>>>>> Hoping we can improve it in the future.
>>>>>>>>>
>>>>>>>>> With this patch:
>>>>>>>>> 1) "base+index*scale" is recognized.
>>>>>>>>
>>>>>>>>
>>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical
>>>>>>>> form.
>>>>>>>>   So this shouldn't be necessary.  Can you identify where this
>>>>>>>
>>>>>>> non-canoncial form is being generated?
>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend
>>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was
>>>>>>> going
>>>>>>> to construct "base + index*scale" instead.  Since "base + index *
>>>>>>> scale"
>>>>>>
>>>>>> is not
>>>>>>>
>>>>>>> canonical form, I will construct the canonical form and drop this
>>>>>>> part of
>>>>>>
>>>>>> the
>>>>>>>
>>>>>>> patch.
>>>>>>>
>>>>>>> Is rest of this patch OK?
>>>>>>>
>>>>>> Hi Richard, I removed the part over which you concerned and created
>>>>>> this
>>>>>> updated patch.
>>>>>>
>>>>>> Is it OK?
>>>>>>
>>>>>> Thanks.
>>>>>> bin
>>>>>>
>>>>>> 2013-09-18  Bin Cheng<bin.cheng@arm.com>
>>>>>>
>>>>>>        * config/arm/arm.c (try_multiplier_address): New function.
>>>>>>        (thumb2_legitimize_address): New function.
>>>>>>        (arm_legitimize_address): Call try_multiplier_address and
>>>>>>        thumb2_legitimize_address.
>>>>>>
>>>>>>
>>>>>> 6-arm-scaled_address-20130918.txt
>>>>>>
>>>>>>
>>>>>> Index: gcc/config/arm/arm.c
>>>>>> ===================================================================
>>>>>> --- gcc/config/arm/arm.c      (revision 200774)
>>>>>> +++ gcc/config/arm/arm.c      (working copy)
>>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg)
>>>>>>       }
>>>>>>   }
>>>>>>
>>>>>> +/* Try to find address expression like base + index * scale + offset
>>>>>> +   in X.  If we find one, force base + offset into register and
>>>>>> +   construct new expression reg + index * scale; return the new
>>>>>> +   address expression if it's valid.  Otherwise return X.  */
>>>>>> +static rtx
>>>>>> +try_multiplier_address (rtx x, enum machine_mode mode
>>>>>> ATTRIBUTE_UNUSED)
>>>>>> +{
>>>>>> +  rtx tmp, base_reg, new_rtx;
>>>>>> +  rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset =
>>>>>> NULL_RTX;
>>>>>> +
>>>>>> +  gcc_assert (GET_CODE (x) == PLUS);
>>>>>> +
>>>>>> +  /* Try to find and record base/index/scale/offset in X. */
>>>>>> +  if (GET_CODE (XEXP (x, 1)) == MULT)
>>>>>> +    {
>>>>>> +      tmp = XEXP (x, 0);
>>>>>> +      index = XEXP (XEXP (x, 1), 0);
>>>>>> +      scale = XEXP (XEXP (x, 1), 1);
>>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      base = XEXP (tmp, 0);
>>>>>> +      offset = XEXP (tmp, 1);
>>>>>> +    }
>>>>>> +  else
>>>>>> +    {
>>>>>> +      tmp = XEXP (x, 0);
>>>>>> +      offset = XEXP (x, 1);
>>>>>> +      if (GET_CODE (tmp) != PLUS)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      base = XEXP (tmp, 0);
>>>>>> +      scale = XEXP (tmp, 1);
>>>>>> +      if (GET_CODE (base) == MULT)
>>>>>> +     {
>>>>>> +       tmp = base;
>>>>>> +       base = scale;
>>>>>> +       scale = tmp;
>>>>>> +     }
>>>>>> +      if (GET_CODE (scale) != MULT)
>>>>>> +     return x;
>>>>>> +
>>>>>> +      index = XEXP (scale, 0);
>>>>>> +      scale = XEXP (scale, 1);
>>>>>> +    }
>>>>>> +
>>>>>> +  if (CONST_INT_P (base))
>>>>>> +    {
>>>>>> +      tmp = base;
>>>>>> +      base = offset;
>>>>>> +      offset = tmp;
>>>>>> +    }
>>>>>> +
>>>>>> +  if (CONST_INT_P (index))
>>>>>> +    {
>>>>>> +      tmp = index;
>>>>>> +      index = scale;
>>>>>> +      scale = tmp;
>>>>>> +    }
>>>>>> +
>>>>>> +  /* ARM only supports constant scale in address.  */
>>>>>> +  if (!CONST_INT_P (scale))
>>>>>> +    return x;
>>>>>> +
>>>>>> +  if (GET_MODE (base) != SImode || GET_MODE (index) != SImode)
>>>>>> +    return x;
>>>>>> +
>>>>>> +  /* Only register/constant are allowed in each part.  */
>>>>>> +  if (!symbol_mentioned_p (base)
>>>>>> +&&  !symbol_mentioned_p (offset)
>>>>>> +&&  !symbol_mentioned_p (index)
>>>>>> +&&  !symbol_mentioned_p (scale))
>>>>>> +    {
>>>>>
>>>>>
>>>>> It would be easier to do this at the top of the function --
>>>>>    if (symbol_mentioned_p (x))
>>>>>      return x;
>>>>>
>>>>>
>>>>>> +      /* Force "base+offset" into register and construct
>>>>>> +      "register+index*scale".  Return the new expression
>>>>>> +      only if it's valid.  */
>>>>>> +      tmp = gen_rtx_PLUS (SImode, base, offset);
>>>>>> +      base_reg = force_reg (SImode, tmp);
>>>>>> +      tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale);
>>>>>> +      new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp);
>>>>>> +      return new_rtx;
>>>>>
>>>>>
>>>>> I can't help thinking that this is backwards.  That is, you want to
>>>>> split out the mult expression and use offset addressing in the
>>>>> addresses
>>>>> itself.  That's likely to lead to either better CSE, or more induction
>>>>
>>>> Thanks to your review.
>>>>
>>>> Actually base+offset is more likely loop invariant than scaled
>>>> expression, just as reported in pr57540.  The bug is observed in
>>>> spec2k bzip/gzip, and hurts arm in hot loops.  The loop induction
>>>> variable doesn't matter that much comparing to invariant because we
>>>> are in RTL now.
>>>>
>>>>> vars.  Furthermore, scaled addressing modes are likely to be more
>>>>> expensive than simple offset addressing modes on at least some cores.
>>>>
>>>> The purpose is to CSE (as you pointed out) or hoist loop invariant.
>>>> As for addressing mode is concerned, Though we may guide the
>>>> transformation once we have reliable address cost mode, we don't have
>>>> the information if base+offset is invariant, so it's difficult to
>>>> handle in backend, right?
>>>>
>>>> What do you think about this?
>>>>
>>>
>>> Additionally, there is no induction variable issue here.  The memory
>>> reference we are facing during expand are not lowered by gimple IVOPT,
>>> which means either it's outside loop, or doesn't have induction
>>> variable address.
>>>
>>> Generally, there are three passes which lowers memory reference:
>>> gimple strength reduction picks opportunity outside loop; gimple IVOPT
>>> handles references with induction variable addresses; references not
>>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad
>>> choice of address re-association.  I think Yufeng also noticed the
>>> problem and are trying with patch like:
>>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html
>>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html
>>>
>>> After thinking twice, I some kind of think we should not re-associate
>>> addresses during expanding, because of lacking of context information.
>>>   Take base + scaled_index + offset as an example in PR57540, we just
>>> don't know if "base+offset" is loop invariant from either backend or
>>> RTL expander.
>>>
>>> Any comments?
>>
>>
>> The issue is that re-association and address lowering is not integrated
>> with optimizers such as CSE and invariant motion.  This means it's
>> hard to arrive at optimal results and all passes are just "guessing"
>> and applying heuristics that the programmer thought are appropriate
>> for his machine.
>>
>> I have no easy way out but think that we lower addresses too late
>> (at least fully).  Loop optimizers would like to see the complex
>> memory reference forms but starting with IVOPTs lowering all
>> addresses would likely be a win in the end (there are CSE, LIM
>> and re-association passes around after/at this point as well).
>>
>> Back in the 4.3 days when I for the first time attempted to introduce
>> MEM_REF I forcefully lowered all memory accesses and addresses
>> very early (during gimplification basically).  That didn't work out well.
>>
>> So my suggestion to anyone who wants to try something is
>> to apply an additional lowering to the whole GIMPLE, lowering
>> things like handled-component references and addresses
>> (and also things like bitfield accesses).
>
>
> This idea of additional lowering is interesting.  I found this wiki page
> describing the memory reference flattening in GIMPLE:
>
>   http://gcc.gnu.org/wiki/MemRef
>
> I wonder whether there is more information or notes you can share, things
> like common pitfalls, implication in alias analysis, etc.  I'd like to do
> some experiment with it when I have time.

The most notable pitfall is that you lose disambiguations that work
on access-paths (disambiguations look at a single 'tree' only).  The
later you do the lowering, the smaller the effect - but it will definitely
hit the instruction scheduler, for example.

Currently we save the full "tree" memory access in MEM_EXPR
during expansion, which is where we lower the accesses to RTL.
If you lower earlier you cannot save that fancy MEM_EXPR.

The other pitfall is that more variables become address-taken
if they are variable-indexed as the memory access becomes
indirect in the IL.  This then requires proper points-to information
to disambiguate against other indirect references (but this points
to info should be available, so this isn't a big concern IMHO).

That said, "late" lowering in a target specific way (that is, with
TARGET_MEM_REFs) is still worthwhile to try.

Richard.

> Regards,
> Yufeng
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2013-12-02 11:26 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-28  7:12 [PATCH ARM]Refine scaled address expression on ARM bin.cheng
2013-08-29 13:15 ` Richard Earnshaw
2013-09-02  7:08   ` bin.cheng
2013-09-18  9:38     ` bin.cheng
2013-11-28 13:15       ` Richard Earnshaw
2013-11-28 14:28         ` Bin.Cheng
2013-11-29 11:23           ` Bin.Cheng
2013-11-29 12:19             ` Richard Biener
2013-11-29 12:31               ` Bin.Cheng
2013-11-29 12:59                 ` Richard Biener
2013-11-29 19:35               ` Yufeng Zhang
2013-12-02 11:26                 ` Richard Biener
2013-11-29 13:31             ` Yufeng Zhang
2013-11-29 13:46               ` Richard Biener
2013-11-29 17:26                 ` Yufeng Zhang
2013-11-29 19:55               ` Richard Earnshaw
2013-11-30  8:08                 ` Bin.Cheng
2013-12-02  3:22                 ` Bin.Cheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).