* [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift
@ 2016-12-31 8:03 Andrew Pinski
2017-05-07 21:39 ` Andrew Pinski
2017-06-07 17:16 ` James Greenhalgh
0 siblings, 2 replies; 6+ messages in thread
From: Andrew Pinski @ 2016-12-31 8:03 UTC (permalink / raw)
To: GCC Patches
[-- Attachment #1: Type: text/plain, Size: 1909 bytes --]
Hi,
Currently for the following function:
int f(int a, int b)
{
return a + (b <<7);
}
GCC produces:
add w0, w0, w1, lsl 7
But for ThunderX 1, it is better if the instruction is split, allowing
better scheduling to happen in most cases; the latency is the same. I
get a small improvement in CoreMark, ~1%.
Currently the code does not take into account Arith_shift even though
the comment:
/* Strip any extend, leave shifts behind as we will
cost them through mult_cost. */
says it does not strip out the shift, aarch64_strip_extend does, and it
always has since the back-end was added to GCC.
Once I fixed the code around aarch64_strip_extend, I got a regression
for ThunderX 1 as some shifts/extends (left shifts <=4 and/or zero
extends) are considered free so I needed to add a new tuning flag.
Note I will get an even bigger improvement for ThunderX 2 CN99XX, but I
have not measured it yet as I have not made the change to
aarch64-cost-tables.h yet as I am waiting for approval of the renaming
patch first before submitting any of the cost table changes. Also I
noticed this problem with this tuning first and then looked back at
what I needed to do for ThunderX 1.
OK? Bootstrapped and tested on aarch64-linux-gnu without any
regressions (both with and without --with-cpu=thunderx).
Thanks,
Andrew
ChangeLog:
* config/aarch64/aarch64-cost-tables.h (thunderx_extra_costs):
Increment Arith_shift and Arith_shift_reg by 1.
* config/aarch64/aarch64-tuning-flags.def (easy_shift_extend): New tuning flag.
* config/aarch64/aarch64.c (thunderx_tunings): Enable
AARCH64_EXTRA_TUNE_EASY_SHIFT_EXTEND.
(aarch64_strip_extend): Add new argument and test for it.
(aarch64_easy_mult_shift_p): New function.
(aarch64_rtx_mult_cost): Call aarch64_easy_mult_shift_p and don't add
a cost if it is true.
Update calls to aarch64_strip_extend.
(aarch64_rtx_costs): Update calls to aarch64_strip_extend.
[-- Attachment #2: improve-thunderx-cost.diff.txt --]
[-- Type: text/plain, Size: 5589 bytes --]
Index: config/aarch64/aarch64-cost-tables.h
===================================================================
--- config/aarch64/aarch64-cost-tables.h (revision 243974)
+++ config/aarch64/aarch64-cost-tables.h (working copy)
@@ -32,8 +32,8 @@ const struct cpu_cost_table thunderx_ext
0, /* Logical. */
0, /* Shift. */
0, /* Shift_reg. */
- COSTS_N_INSNS (1), /* Arith_shift. */
- COSTS_N_INSNS (1), /* Arith_shift_reg. */
+ COSTS_N_INSNS (1)+1, /* Arith_shift. */
+ COSTS_N_INSNS (1)+1, /* Arith_shift_reg. */
COSTS_N_INSNS (1), /* UNUSED: Log_shift. */
COSTS_N_INSNS (1), /* UNUSED: Log_shift_reg. */
0, /* Extend. */
Index: config/aarch64/aarch64-tuning-flags.def
===================================================================
--- config/aarch64/aarch64-tuning-flags.def (revision 243974)
+++ config/aarch64/aarch64-tuning-flags.def (working copy)
@@ -35,4 +35,8 @@ two load/stores are not at least 8 byte
pairs. */
AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
+/* Logical shift left <=4 with/without zero extend are considered easy
+ extended, also zero extends without the shift. */
+AARCH64_EXTRA_TUNING_OPTION ("easy_shift_extend", EASY_SHIFT_EXTEND)
+
#undef AARCH64_EXTRA_TUNING_OPTION
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c (revision 243974)
+++ config/aarch64/aarch64.c (working copy)
@@ -714,7 +714,8 @@ static const struct tune_params thunderx
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
+ | AARCH64_EXTRA_TUNE_EASY_SHIFT_EXTEND) /* tune_flags. */
};
static const struct tune_params xgene1_tunings =
@@ -5918,9 +5919,10 @@ aarch64_strip_shift (rtx x)
/* Helper function for rtx cost calculation. Strip an extend
expression from X. Returns the inner operand if successful, or the
original expression on failure. We deal with a number of possible
- canonicalization variations here. */
+ canonicalization variations here. If STRIP_SHIFT is true, then
+ we can strip off a shift also. */
static rtx
-aarch64_strip_extend (rtx x)
+aarch64_strip_extend (rtx x, bool strip_shift)
{
rtx op = x;
@@ -5944,7 +5946,8 @@ aarch64_strip_extend (rtx x)
/* Now handle extended register, as this may also have an optional
left shift by 1..4. */
- if (GET_CODE (op) == ASHIFT
+ if (strip_shift
+ && GET_CODE (op) == ASHIFT
&& CONST_INT_P (XEXP (op, 1))
&& ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
op = XEXP (op, 0);
@@ -5968,6 +5971,39 @@ aarch64_shift_p (enum rtx_code code)
return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
+
+/* Return true iff X is an easy shift without a sign extend. */
+
+static bool
+aarch64_easy_mult_shift_p (rtx x)
+{
+ rtx op0, op1;
+
+ op0 = XEXP (x, 0);
+ op1 = XEXP (x, 1);
+
+ if (!(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_EASY_SHIFT_EXTEND))
+ return false;
+
+ if (GET_CODE (op0) == SIGN_EXTEND)
+ return false;
+
+ if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
+ && UINTVAL (op1) <= 4)
+ return true;
+
+ if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
+ return false;
+
+ HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
+
+ if (l2 > 0 && l2 <= 4)
+ return true;
+
+ return false;
+}
+
/* Helper function for rtx cost calculation. Calculate the cost of
a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
Return the calculated cost of the expression, recursing manually in to
@@ -6005,7 +6041,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_c
{
if (compound_p)
{
- if (REG_P (op1))
+ /* If the shift is considered easy,
+ then don't add any cost. */
+ if (aarch64_easy_mult_shift_p (x))
+ ;
+ else if (REG_P (op1))
/* ARITH + shift-by-register. */
cost += extra_cost->alu.arith_shift_reg;
else if (is_extend)
@@ -6023,7 +6063,7 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_c
}
/* Strip extends as we will have costed them in the case above. */
if (is_extend)
- op0 = aarch64_strip_extend (op0);
+ op0 = aarch64_strip_extend (op0, true);
cost += rtx_cost (op0, VOIDmode, code, 0, speed);
@@ -6867,13 +6907,13 @@ cost_minus:
if (speed)
*cost += extra_cost->alu.extend_arith;
- op1 = aarch64_strip_extend (op1);
+ op1 = aarch64_strip_extend (op1, true);
*cost += rtx_cost (op1, VOIDmode,
(enum rtx_code) GET_CODE (op1), 0, speed);
return true;
}
- rtx new_op1 = aarch64_strip_extend (op1);
+ rtx new_op1 = aarch64_strip_extend (op1, false);
/* Cost this as an FMA-alike operation. */
if ((GET_CODE (new_op1) == MULT
@@ -6946,7 +6986,7 @@ cost_plus:
if (speed)
*cost += extra_cost->alu.extend_arith;
- op0 = aarch64_strip_extend (op0);
+ op0 = aarch64_strip_extend (op0, true);
*cost += rtx_cost (op0, VOIDmode,
(enum rtx_code) GET_CODE (op0), 0, speed);
return true;
@@ -6954,7 +6994,7 @@ cost_plus:
/* Strip any extend, leave shifts behind as we will
cost them through mult_cost. */
- new_op0 = aarch64_strip_extend (op0);
+ new_op0 = aarch64_strip_extend (op0, false);
if (GET_CODE (new_op0) == MULT
|| aarch64_shift_p (GET_CODE (new_op0)))
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift
2016-12-31 8:03 [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift Andrew Pinski
@ 2017-05-07 21:39 ` Andrew Pinski
2017-06-07 17:16 ` James Greenhalgh
1 sibling, 0 replies; 6+ messages in thread
From: Andrew Pinski @ 2017-05-07 21:39 UTC (permalink / raw)
To: GCC Patches
On Fri, Dec 30, 2016 at 10:05 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> Hi,
> Currently for the following function:
> int f(int a, int b)
> {
> return a + (b <<7);
> }
>
> GCC produces:
> add w0, w0, w1, lsl 7
> But for ThunderX 1, it is better if the instruction was split allowing
> better scheduling to happen in most cases, the latency is the same. I
> get a small improvement in coremarks, ~1%.
>
> Currently the code does not take into account Arith_shift even though
> the comment:
> /* Strip any extend, leave shifts behind as we will
> cost them through mult_cost. */
> Say it does not strip out the shift, aarch64_strip_extend does and has
> always has since the back-end was added to GCC.
>
> Once I fixed the code around aarch64_strip_extend, I got a regression
> for ThunderX 1 as some shifts/extends (left shifts <=4 and/or zero
> extends) are considered free so I needed to add a new tuning flag.
>
> Note I will get an even more improvement for ThunderX 2 CN99XX, but I
> have not measured it yet as I have not made the change to
> aarch64-cost-tables.h yet as I am waiting for approval of the renaming
> patch first before submitting any of the cost table changes. Also I
> noticed this problem with this tuning first and then looked back at
> what I needed to do for ThunderX 1.
>
> OK? Bootstrapped and tested on aarch64-linux-gnu without any
> regressions (both with and without --with-cpu=thunderx).
Ping? This has not been reviewed for over 5 months now :(.
Thanks,
Andrew
>
> Thanks,
> Andrew
>
> ChangeLog:
> * config/aarch64/aarch64-cost-tables.h (thunderx_extra_costs):
> Increment Arith_shift and Arith_shift_reg by 1.
> * config/aarch64/aarch64-tuning-flags.def (easy_shift_extend): New tuning flag.
> * config/aarch64/aarch64.c (thunderx_tunings): Enable
> AARCH64_EXTRA_TUNE_EASY_SHIFT_EXTEND.
> (aarch64_strip_extend): Add new argument and test for it.
> (aarch64_easy_mult_shift_p): New function.
> (aarch64_rtx_mult_cost): Call aarch64_easy_mult_shift_p and don't add
> a cost if it is true.
> Update calls to aarch64_strip_extend.
> (aarch64_rtx_costs): Update calls to aarch64_strip_extend.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift
2016-12-31 8:03 [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift Andrew Pinski
2017-05-07 21:39 ` Andrew Pinski
@ 2017-06-07 17:16 ` James Greenhalgh
2017-06-19 21:00 ` Andrew Pinski
1 sibling, 1 reply; 6+ messages in thread
From: James Greenhalgh @ 2017-06-07 17:16 UTC (permalink / raw)
To: Andrew Pinski; +Cc: GCC Patches, nd
On Fri, Dec 30, 2016 at 10:05:26PM -0800, Andrew Pinski wrote:
> Hi,
> Currently for the following function:
> int f(int a, int b)
> {
> return a + (b <<7);
> }
>
> GCC produces:
> add w0, w0, w1, lsl 7
> But for ThunderX 1, it is better if the instruction was split allowing
> better scheduling to happen in most cases, the latency is the same. I
> get a small improvement in coremarks, ~1%.
>
> Currently the code does not take into account Arith_shift even though
> the comment:
> /* Strip any extend, leave shifts behind as we will
> cost them through mult_cost. */
> Say it does not strip out the shift, aarch64_strip_extend does and has
> always has since the back-end was added to GCC.
>
> Once I fixed the code around aarch64_strip_extend, I got a regression
> for ThunderX 1 as some shifts/extends (left shifts <=4 and/or zero
> extends) are considered free so I needed to add a new tuning flag.
>
> Note I will get an even more improvement for ThunderX 2 CN99XX, but I
> have not measured it yet as I have not made the change to
> aarch64-cost-tables.h yet as I am waiting for approval of the renaming
> patch first before submitting any of the cost table changes. Also I
> noticed this problem with this tuning first and then looked back at
> what I needed to do for ThunderX 1.
>
> OK? Bootstrapped and tested on aarch64-linux-gnu without any
> regressions (both with and without --with-cpu=thunderx).
This is mostly OK, but I don't like the name "easy"_shift_extend. Cheap
or free seems better. I have some other minor points below.
> Index: config/aarch64/aarch64-tuning-flags.def
> ===================================================================
> --- config/aarch64/aarch64-tuning-flags.def (revision 243974)
> +++ config/aarch64/aarch64-tuning-flags.def (working copy)
> @@ -35,4 +35,8 @@ two load/stores are not at least 8 byte
> pairs. */
> AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
>
> +/* Logical shift left <=4 with/without zero extend are considered easy
> + extended, also zero extends without the shift. */
I'm struggling to parse this comment. "also zero extends without the shift"
is what is getting me. I'm also not certain I follow when I should set this
flag. If all shifts are cheap/free on my platform, should I set this flag?
> +AARCH64_EXTRA_TUNING_OPTION ("easy_shift_extend", EASY_SHIFT_EXTEND)
> +
> #undef AARCH64_EXTRA_TUNING_OPTION
> +
> +/* Return true iff X is an easy shift without a sign extend. */
> +
Again I don't like calling <= 4 "easy", it feels imprecise.
Thanks,
James
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift
2017-06-07 17:16 ` James Greenhalgh
@ 2017-06-19 21:00 ` Andrew Pinski
2017-06-20 21:07 ` Andrew Pinski
0 siblings, 1 reply; 6+ messages in thread
From: Andrew Pinski @ 2017-06-19 21:00 UTC (permalink / raw)
To: James Greenhalgh; +Cc: GCC Patches, nd
On Wed, Jun 7, 2017 at 10:16 AM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
> On Fri, Dec 30, 2016 at 10:05:26PM -0800, Andrew Pinski wrote:
>> Hi,
>> Currently for the following function:
>> int f(int a, int b)
>> {
>> return a + (b <<7);
>> }
>>
>> GCC produces:
>> add w0, w0, w1, lsl 7
>> But for ThunderX 1, it is better if the instruction was split allowing
>> better scheduling to happen in most cases, the latency is the same. I
>> get a small improvement in coremarks, ~1%.
>>
>> Currently the code does not take into account Arith_shift even though
>> the comment:
>> /* Strip any extend, leave shifts behind as we will
>> cost them through mult_cost. */
>> Say it does not strip out the shift, aarch64_strip_extend does and has
>> always has since the back-end was added to GCC.
>>
>> Once I fixed the code around aarch64_strip_extend, I got a regression
>> for ThunderX 1 as some shifts/extends (left shifts <=4 and/or zero
>> extends) are considered free so I needed to add a new tuning flag.
>>
>> Note I will get an even more improvement for ThunderX 2 CN99XX, but I
>> have not measured it yet as I have not made the change to
>> aarch64-cost-tables.h yet as I am waiting for approval of the renaming
>> patch first before submitting any of the cost table changes. Also I
>> noticed this problem with this tuning first and then looked back at
>> what I needed to do for ThunderX 1.
>>
>> OK? Bootstrapped and tested on aarch64-linux-gnu without any
>> regressions (both with and without --with-cpu=thunderx).
>
> This is mostly OK, but I don't like the name "easy"_shift_extend. Cheap
> or free seems better. I have some other minor points below.
Ok, that seems like a good idea. I used easy since that was the
wording our hardware folks had come up with. I am changing the
comments to make clearer when this flag should be used.
I should have a new patch out by the end of today.
Thanks,
Andrew
>
>> Index: config/aarch64/aarch64-tuning-flags.def
>> ===================================================================
>> --- config/aarch64/aarch64-tuning-flags.def (revision 243974)
>> +++ config/aarch64/aarch64-tuning-flags.def (working copy)
>> @@ -35,4 +35,8 @@ two load/stores are not at least 8 byte
>> pairs. */
>> AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
>>
>> +/* Logical shift left <=4 with/without zero extend are considered easy
>> + extended, also zero extends without the shift. */
>
>
> I'm struggling to parse this comment. "also zero extends without the shift"
> is what is getting me. I'm also not certain I follow when I should set this
> flag. If all shifts are cheap/free on my platform, should I set this flag?
>
>> +AARCH64_EXTRA_TUNING_OPTION ("easy_shift_extend", EASY_SHIFT_EXTEND)
>> +
>> #undef AARCH64_EXTRA_TUNING_OPTION
>
>
>> +
>> +/* Return true iff X is an easy shift without a sign extend. */
>> +
>
> Again I don't like calling <= 4 "easy", it feels imprecise.
>
> Thanks,
> James
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift
2017-06-19 21:00 ` Andrew Pinski
@ 2017-06-20 21:07 ` Andrew Pinski
2017-06-21 11:13 ` James Greenhalgh
0 siblings, 1 reply; 6+ messages in thread
From: Andrew Pinski @ 2017-06-20 21:07 UTC (permalink / raw)
To: James Greenhalgh; +Cc: GCC Patches, nd
[-- Attachment #1: Type: text/plain, Size: 4011 bytes --]
On Mon, Jun 19, 2017 at 2:00 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Wed, Jun 7, 2017 at 10:16 AM, James Greenhalgh
> <james.greenhalgh@arm.com> wrote:
>> On Fri, Dec 30, 2016 at 10:05:26PM -0800, Andrew Pinski wrote:
>>> Hi,
>>> Currently for the following function:
>>> int f(int a, int b)
>>> {
>>> return a + (b <<7);
>>> }
>>>
>>> GCC produces:
>>> add w0, w0, w1, lsl 7
>>> But for ThunderX 1, it is better if the instruction was split allowing
>>> better scheduling to happen in most cases, the latency is the same. I
>>> get a small improvement in coremarks, ~1%.
>>>
>>> Currently the code does not take into account Arith_shift even though
>>> the comment:
>>> /* Strip any extend, leave shifts behind as we will
>>> cost them through mult_cost. */
>>> Say it does not strip out the shift, aarch64_strip_extend does and has
>>> always has since the back-end was added to GCC.
>>>
>>> Once I fixed the code around aarch64_strip_extend, I got a regression
>>> for ThunderX 1 as some shifts/extends (left shifts <=4 and/or zero
>>> extends) are considered free so I needed to add a new tuning flag.
>>>
>>> Note I will get an even more improvement for ThunderX 2 CN99XX, but I
>>> have not measured it yet as I have not made the change to
>>> aarch64-cost-tables.h yet as I am waiting for approval of the renaming
>>> patch first before submitting any of the cost table changes. Also I
>>> noticed this problem with this tuning first and then looked back at
>>> what I needed to do for ThunderX 1.
>>>
>>> OK? Bootstrapped and tested on aarch64-linux-gnu without any
>>> regressions (both with and without --with-cpu=thunderx).
>>
>> This is mostly OK, but I don't like the name "easy"_shift_extend. Cheap
>> or free seems better. I have some other minor points below.
>
>
> Ok, that seems like a good idea. I used easy since that was the
> wording our hardware folks had came up with. I am changing the
> comments to make clearer when this flag should be used.
> I should a new patch out by the end of today.
Due to the LSE ICE which I reported in the other thread, it took me
longer to send out a new patch.
Anyways here is the updated patch with the changes requested.
OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
Thanks,
Andrew Pinski
* config/aarch64/aarch64-cost-tables.h (thunderx_extra_costs):
Increment Arith_shift and Arith_shift_reg by 1.
* config/aarch64/aarch64-tuning-flags.def (cheap_shift_extend): New tuning flag.
* config/aarch64/aarch64.c (thunderx_tunings): Enable
AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND.
(aarch64_strip_extend): Add new argument and test for it.
(aarch64_cheap_mult_shift_p): New function.
(aarch64_rtx_mult_cost): Call aarch64_cheap_mult_shift_p and don't add
a cost if it is true.
Update calls to aarch64_strip_extend.
(aarch64_rtx_costs): Update calls to aarch64_strip_extend.
>
> Thanks,
> Andrew
>
>
>>
>>> Index: config/aarch64/aarch64-tuning-flags.def
>>> ===================================================================
>>> --- config/aarch64/aarch64-tuning-flags.def (revision 243974)
>>> +++ config/aarch64/aarch64-tuning-flags.def (working copy)
>>> @@ -35,4 +35,8 @@ two load/stores are not at least 8 byte
>>> pairs. */
>>> AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
>>>
>>> +/* Logical shift left <=4 with/without zero extend are considered easy
>>> + extended, also zero extends without the shift. */
>>
>>
>> I'm struggling to parse this comment. "also zero extends without the shift"
>> is what is getting me. I'm also not certain I follow when I should set this
>> flag. If all shifts are cheap/free on my platform, should I set this flag?
>>
>>> +AARCH64_EXTRA_TUNING_OPTION ("easy_shift_extend", EASY_SHIFT_EXTEND)
>>> +
>>> #undef AARCH64_EXTRA_TUNING_OPTION
>>
>>
>>> +
>>> +/* Return true iff X is an easy shift without a sign extend. */
>>> +
>>
>> Again I don't like calling <= 4 "easy", it feels imprecise.
>>
>> Thanks,
>> James
>>
[-- Attachment #2: improvecost.diff.txt --]
[-- Type: text/plain, Size: 5733 bytes --]
Index: gcc/config/aarch64/aarch64-cost-tables.h
===================================================================
--- gcc/config/aarch64/aarch64-cost-tables.h (revision 249424)
+++ gcc/config/aarch64/aarch64-cost-tables.h (working copy)
@@ -136,8 +136,8 @@ const struct cpu_cost_table thunderx_ext
0, /* Logical. */
0, /* Shift. */
0, /* Shift_reg. */
- COSTS_N_INSNS (1), /* Arith_shift. */
- COSTS_N_INSNS (1), /* Arith_shift_reg. */
+ COSTS_N_INSNS (1)+1, /* Arith_shift. */
+ COSTS_N_INSNS (1)+1, /* Arith_shift_reg. */
COSTS_N_INSNS (1), /* UNUSED: Log_shift. */
COSTS_N_INSNS (1), /* UNUSED: Log_shift_reg. */
0, /* Extend. */
Index: gcc/config/aarch64/aarch64-tuning-flags.def
===================================================================
--- gcc/config/aarch64/aarch64-tuning-flags.def (revision 249424)
+++ gcc/config/aarch64/aarch64-tuning-flags.def (working copy)
@@ -35,4 +35,10 @@ two load/stores are not at least 8 byte
pairs. */
AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
+/* Some of the optional shifts on some arithmetic instructions are
+ considered cheap. Logical shift left <=4 with or without a
+ zero extend are considered cheap. Sign extend; non logical shift left
+ are not considered cheap. */
+AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
+
#undef AARCH64_EXTRA_TUNING_OPTION
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c (revision 249424)
+++ gcc/config/aarch64/aarch64.c (working copy)
@@ -766,7 +766,8 @@ static const struct tune_params thunderx
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
+ | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
&generic_prefetch_tune
};
@@ -6077,9 +6078,10 @@ aarch64_strip_shift (rtx x)
/* Helper function for rtx cost calculation. Strip an extend
expression from X. Returns the inner operand if successful, or the
original expression on failure. We deal with a number of possible
- canonicalization variations here. */
+ canonicalization variations here. If STRIP_SHIFT is true, then
+ we can strip off a shift also. */
static rtx
-aarch64_strip_extend (rtx x)
+aarch64_strip_extend (rtx x, bool strip_shift)
{
rtx op = x;
@@ -6103,7 +6105,8 @@ aarch64_strip_extend (rtx x)
/* Now handle extended register, as this may also have an optional
left shift by 1..4. */
- if (GET_CODE (op) == ASHIFT
+ if (strip_shift
+ && GET_CODE (op) == ASHIFT
&& CONST_INT_P (XEXP (op, 1))
&& ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
op = XEXP (op, 0);
@@ -6127,6 +6130,39 @@ aarch64_shift_p (enum rtx_code code)
return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
+
+/* Return true iff X is an cheap shift without a sign extend. */
+
+static bool
+aarch64_cheap_mult_shift_p (rtx x)
+{
+ rtx op0, op1;
+
+ op0 = XEXP (x, 0);
+ op1 = XEXP (x, 1);
+
+ if (!(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
+ return false;
+
+ if (GET_CODE (op0) == SIGN_EXTEND)
+ return false;
+
+ if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
+ && UINTVAL (op1) <= 4)
+ return true;
+
+ if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
+ return false;
+
+ HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
+
+ if (l2 > 0 && l2 <= 4)
+ return true;
+
+ return false;
+}
+
/* Helper function for rtx cost calculation. Calculate the cost of
a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
Return the calculated cost of the expression, recursing manually in to
@@ -6164,7 +6200,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_c
{
if (compound_p)
{
- if (REG_P (op1))
+ /* If the shift is considered cheap,
+ then don't add any cost. */
+ if (aarch64_cheap_mult_shift_p (x))
+ ;
+ else if (REG_P (op1))
/* ARITH + shift-by-register. */
cost += extra_cost->alu.arith_shift_reg;
else if (is_extend)
@@ -6182,7 +6222,7 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_c
}
/* Strip extends as we will have costed them in the case above. */
if (is_extend)
- op0 = aarch64_strip_extend (op0);
+ op0 = aarch64_strip_extend (op0, true);
cost += rtx_cost (op0, VOIDmode, code, 0, speed);
@@ -7026,13 +7066,13 @@ cost_minus:
if (speed)
*cost += extra_cost->alu.extend_arith;
- op1 = aarch64_strip_extend (op1);
+ op1 = aarch64_strip_extend (op1, true);
*cost += rtx_cost (op1, VOIDmode,
(enum rtx_code) GET_CODE (op1), 0, speed);
return true;
}
- rtx new_op1 = aarch64_strip_extend (op1);
+ rtx new_op1 = aarch64_strip_extend (op1, false);
/* Cost this as an FMA-alike operation. */
if ((GET_CODE (new_op1) == MULT
@@ -7105,7 +7145,7 @@ cost_plus:
if (speed)
*cost += extra_cost->alu.extend_arith;
- op0 = aarch64_strip_extend (op0);
+ op0 = aarch64_strip_extend (op0, true);
*cost += rtx_cost (op0, VOIDmode,
(enum rtx_code) GET_CODE (op0), 0, speed);
return true;
@@ -7113,7 +7153,7 @@ cost_plus:
/* Strip any extend, leave shifts behind as we will
cost them through mult_cost. */
- new_op0 = aarch64_strip_extend (op0);
+ new_op0 = aarch64_strip_extend (op0, false);
if (GET_CODE (new_op0) == MULT
|| aarch64_shift_p (GET_CODE (new_op0)))
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift
2017-06-20 21:07 ` Andrew Pinski
@ 2017-06-21 11:13 ` James Greenhalgh
0 siblings, 0 replies; 6+ messages in thread
From: James Greenhalgh @ 2017-06-21 11:13 UTC (permalink / raw)
To: Andrew Pinski; +Cc: GCC Patches, nd
On Tue, Jun 20, 2017 at 02:07:22PM -0700, Andrew Pinski wrote:
> On Mon, Jun 19, 2017 at 2:00 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> > On Wed, Jun 7, 2017 at 10:16 AM, James Greenhalgh
> > <james.greenhalgh@arm.com> wrote:
> >> On Fri, Dec 30, 2016 at 10:05:26PM -0800, Andrew Pinski wrote:
> >>> Hi,
> >>> Currently for the following function:
> >>> int f(int a, int b)
> >>> {
> >>> return a + (b <<7);
> >>> }
> >>>
> >>> GCC produces:
> >>> add w0, w0, w1, lsl 7
> >>> But for ThunderX 1, it is better if the instruction was split allowing
> >>> better scheduling to happen in most cases, the latency is the same. I
> >>> get a small improvement in coremarks, ~1%.
> >>>
> >>> Currently the code does not take into account Arith_shift even though
> >>> the comment:
> >>> /* Strip any extend, leave shifts behind as we will
> >>> cost them through mult_cost. */
> >>> Say it does not strip out the shift, aarch64_strip_extend does and has
> >>> always has since the back-end was added to GCC.
> >>>
> >>> Once I fixed the code around aarch64_strip_extend, I got a regression
> >>> for ThunderX 1 as some shifts/extends (left shifts <=4 and/or zero
> >>> extends) are considered free so I needed to add a new tuning flag.
> >>>
> >>> Note I will get an even more improvement for ThunderX 2 CN99XX, but I
> >>> have not measured it yet as I have not made the change to
> >>> aarch64-cost-tables.h yet as I am waiting for approval of the renaming
> >>> patch first before submitting any of the cost table changes. Also I
> >>> noticed this problem with this tuning first and then looked back at
> >>> what I needed to do for ThunderX 1.
> >>>
> >>> OK? Bootstrapped and tested on aarch64-linux-gnu without any
> >>> regressions (both with and without --with-cpu=thunderx).
> >>
> >> This is mostly OK, but I don't like the name "easy"_shift_extend. Cheap
> >> or free seems better. I have some other minor points below.
> >
> >
> > Ok, that seems like a good idea. I used easy since that was the
> > wording our hardware folks had came up with. I am changing the
> > comments to make clearer when this flag should be used.
> > I should a new patch out by the end of today.
>
> Due to the LSE ICE which I reported in the other thread, it took me
> longer to send out a new patch.
> Anyways here is the updated patch with the changes requested.
>
>
> OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
One grammar fix inline below, otherwise this is OK.
Thanks,
James
> * config/aarch64/aarch64-cost-tables.h (thunderx_extra_costs):
> Increment Arith_shift and Arith_shift_reg by 1.
> * config/aarch64/aarch64-tuning-flags.def (cheap_shift_extend): New tuning flag.
> * config/aarch64/aarch64.c (thunderx_tunings): Enable
> AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND.
> (aarch64_strip_extend): Add new argument and test for it.
> (aarch64_cheap_mult_shift_p): New function.
> (aarch64_rtx_mult_cost): Call aarch64_cheap_mult_shift_p and don't add
> a cost if it is true.
> Update calls to aarch64_strip_extend.
> (aarch64_rtx_costs): Update calls to aarch64_strip_extend.
>
> +
> +/* Return true iff X is an cheap shift without a sign extend. */
s/an cheap/a cheap/
> +
> +static bool
> +aarch64_cheap_mult_shift_p (rtx x)
> +{
> + rtx op0, op1;
> +
> + op0 = XEXP (x, 0);
> + op1 = XEXP (x, 1);
> +
> + if (!(aarch64_tune_params.extra_tuning_flags
> + & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
> + return false;
> +
> + if (GET_CODE (op0) == SIGN_EXTEND)
> + return false;
> +
> + if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
> + && UINTVAL (op1) <= 4)
> + return true;
> +
> + if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
> + return false;
> +
> + HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
> +
> + if (l2 > 0 && l2 <= 4)
> + return true;
> +
> + return false;
> +}
> +
> /* Helper function for rtx cost calculation. Calculate the cost of
> a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
> Return the calculated cost of the expression, recursing manually in to
> @@ -6164,7 +6200,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_c
> {
> if (compound_p)
> {
> - if (REG_P (op1))
> + /* If the shift is considered cheap,
> + then don't add any cost. */
> + if (aarch64_cheap_mult_shift_p (x))
> + ;
> + else if (REG_P (op1))
> /* ARITH + shift-by-register. */
> cost += extra_cost->alu.arith_shift_reg;
> else if (is_extend)
> @@ -6182,7 +6222,7 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_c
> }
> /* Strip extends as we will have costed them in the case above. */
> if (is_extend)
> - op0 = aarch64_strip_extend (op0);
> + op0 = aarch64_strip_extend (op0, true);
>
> cost += rtx_cost (op0, VOIDmode, code, 0, speed);
>
> @@ -7026,13 +7066,13 @@ cost_minus:
> if (speed)
> *cost += extra_cost->alu.extend_arith;
>
> - op1 = aarch64_strip_extend (op1);
> + op1 = aarch64_strip_extend (op1, true);
> *cost += rtx_cost (op1, VOIDmode,
> (enum rtx_code) GET_CODE (op1), 0, speed);
> return true;
> }
>
> - rtx new_op1 = aarch64_strip_extend (op1);
> + rtx new_op1 = aarch64_strip_extend (op1, false);
>
> /* Cost this as an FMA-alike operation. */
> if ((GET_CODE (new_op1) == MULT
> @@ -7105,7 +7145,7 @@ cost_plus:
> if (speed)
> *cost += extra_cost->alu.extend_arith;
>
> - op0 = aarch64_strip_extend (op0);
> + op0 = aarch64_strip_extend (op0, true);
> *cost += rtx_cost (op0, VOIDmode,
> (enum rtx_code) GET_CODE (op0), 0, speed);
> return true;
> @@ -7113,7 +7153,7 @@ cost_plus:
>
> /* Strip any extend, leave shifts behind as we will
> cost them through mult_cost. */
> - new_op0 = aarch64_strip_extend (op0);
> + new_op0 = aarch64_strip_extend (op0, false);
>
> if (GET_CODE (new_op0) == MULT
> || aarch64_shift_p (GET_CODE (new_op0)))
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2017-06-21 11:13 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-31 8:03 [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift Andrew Pinski
2017-05-07 21:39 ` Andrew Pinski
2017-06-07 17:16 ` James Greenhalgh
2017-06-19 21:00 ` Andrew Pinski
2017-06-20 21:07 ` Andrew Pinski
2017-06-21 11:13 ` James Greenhalgh
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).