* [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR. @ 2021-07-28 8:35 liuhongt 2021-07-28 12:36 ` Richard Biener 0 siblings, 1 reply; 3+ messages in thread From: liuhongt @ 2021-07-28 8:35 UTC (permalink / raw) To: gcc-patches Hi: As described in PR 39821, WIDEN_MULT_EXPR should use a different cost model from MULT_EXPR, this patch add ix86_widen_mult_cost for that. Reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn. Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. gcc/ChangeLog: * config/i386/i386.c (ix86_widen_mult_cost): New function. (ix86_add_stmt_cost): Use ix86_widen_mult_cost for WIDEN_MULT_EXPR. gcc/testsuite/ChangeLog: * gcc.target/i386/sse2-pr39821.c: New test. * gcc.target/i386/sse4-pr39821.c: New test. --- gcc/config/i386/i386.c | 48 +++++++++++++++++++- gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++ gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 876a19f4c1f..281b5fe2706 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost) return cost; } +/* Return cost of vec_widen_<s>mult_hi/lo_<mode>, + vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. */ +static int +ix86_widen_mult_cost (const struct processor_costs *cost, + enum machine_mode mode, bool uns_p) +{ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); + int extra_cost = 0; + int basic_cost = 0; + switch (mode) + { + case V8HImode: + case V16HImode: + if (!uns_p || mode == V16HImode) + extra_cost = cost->sse_op * 2; + basic_cost = cost->mulss * 2 + cost->sse_op * 4; + break; + case V4SImode: + case V8SImode: + /* pmulhw/pmullw can be used. 
*/ + basic_cost = cost->mulss * 2 + cost->sse_op * 2; + break; + case V2DImode: + /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, + require extra 4 mul, 4 add, 4 cmp and 2 shift. */ + if (!TARGET_SSE4_1 && !uns_p) + extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 + + cost->sse_op * 2; + /* Fallthru. */ + case V4DImode: + basic_cost = cost->mulss * 2 + cost->sse_op * 4; + break; + default: + gcc_unreachable(); + } + return ix86_vec_cost (mode, basic_cost + extra_cost); +} + /* Return cost of multiplication in MODE. */ static int @@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, break; case MULT_EXPR: - case WIDEN_MULT_EXPR: + /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw, + take it as MULT_EXPR. */ case MULT_HIGHPART_EXPR: stmt_cost = ix86_multiplication_cost (ix86_cost, mode); break; + /* There's no direct instruction for WIDEN_MULT_EXPR, + take emulation into account. */ + case WIDEN_MULT_EXPR: + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode, + TYPE_UNSIGNED (vectype)); + break; + case NEGATE_EXPR: if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) stmt_cost = ix86_cost->sse_op; diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c new file mode 100644 index 00000000000..bcd4b772c98 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */ +#include<stdint.h> +void +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order) +{ + while (order--) + *v3++ = (int16_t) *v1++ * *v2++; +} + +void +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order) +{ + while (order--) + *v3++ = (uint16_t) *v1++ * *v2++; +} + +void +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order) +{ + while 
(order--) + *v3++ = (int32_t) *v1++ * *v2++; +} + +void +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order) +{ + while (order--) + *v3++ = (uint32_t) *v1++ * *v2++; +} + +void +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order) +{ + while (order--) + *v3++ = (int64_t) *v1++ * *v2++; +} + +void +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order) +{ + while (order--) + *v3++ = (uint64_t) *v1++ * *v2++; +} diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c new file mode 100644 index 00000000000..4456c31e43e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c @@ -0,0 +1,4 @@ +/* { dg-do compile } */ +/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */ +#include "sse2-pr39821.c" -- 2.27.0 ^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR. 2021-07-28 8:35 [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR liuhongt @ 2021-07-28 12:36 ` Richard Biener 2021-07-29 1:11 ` Hongtao Liu 0 siblings, 1 reply; 3+ messages in thread From: Richard Biener @ 2021-07-28 12:36 UTC (permalink / raw) To: liuhongt; +Cc: GCC Patches, Hongtao Liu On Wed, Jul 28, 2021 at 10:35 AM liuhongt <hongtao.liu@intel.com> wrote: > > Hi: > As described in PR 39821, WIDEN_MULT_EXPR should use a different cost > model from MULT_EXPR, this patch add ix86_widen_mult_cost for that. > Reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn. > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > gcc/ChangeLog: can you reference PR target/39821 please? > * config/i386/i386.c (ix86_widen_mult_cost): New function. > (ix86_add_stmt_cost): Use ix86_widen_mult_cost for > WIDEN_MULT_EXPR. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/sse2-pr39821.c: New test. > * gcc.target/i386/sse4-pr39821.c: New test. > --- > gcc/config/i386/i386.c | 48 +++++++++++++++++++- > gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++ > gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++ > 3 files changed, 96 insertions(+), 1 deletion(-) > create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c > create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 876a19f4c1f..281b5fe2706 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost) > return cost; > } > > +/* Return cost of vec_widen_<s>mult_hi/lo_<mode>, > + vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. 
*/ > +static int > +ix86_widen_mult_cost (const struct processor_costs *cost, > + enum machine_mode mode, bool uns_p) > +{ > + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); > + int extra_cost = 0; > + int basic_cost = 0; > + switch (mode) > + { > + case V8HImode: > + case V16HImode: > + if (!uns_p || mode == V16HImode) > + extra_cost = cost->sse_op * 2; > + basic_cost = cost->mulss * 2 + cost->sse_op * 4; > + break; > + case V4SImode: > + case V8SImode: > + /* pmulhw/pmullw can be used. */ > + basic_cost = cost->mulss * 2 + cost->sse_op * 2; > + break; > + case V2DImode: > + /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, > + require extra 4 mul, 4 add, 4 cmp and 2 shift. */ > + if (!TARGET_SSE4_1 && !uns_p) > + extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 > + + cost->sse_op * 2; > + /* Fallthru. */ > + case V4DImode: > + basic_cost = cost->mulss * 2 + cost->sse_op * 4; > + break; > + default: > + gcc_unreachable(); > + } > + return ix86_vec_cost (mode, basic_cost + extra_cost); > +} > + > /* Return cost of multiplication in MODE. */ > > static int > @@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, > break; > > case MULT_EXPR: > - case WIDEN_MULT_EXPR: > + /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw, Space after /* otherwise OK. > + take it as MULT_EXPR. */ > case MULT_HIGHPART_EXPR: > stmt_cost = ix86_multiplication_cost (ix86_cost, mode); > break; > + /* There's no direct instruction for WIDEN_MULT_EXPR, > + take emulation into account. 
*/ > + case WIDEN_MULT_EXPR: > + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode, > + TYPE_UNSIGNED (vectype)); > + break; > + > case NEGATE_EXPR: > if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) > stmt_cost = ix86_cost->sse_op; > diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c > new file mode 100644 > index 00000000000..bcd4b772c98 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c > @@ -0,0 +1,45 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */ > +#include<stdint.h> > +void > +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order) > +{ > + while (order--) > + *v3++ = (int16_t) *v1++ * *v2++; > +} > + > +void > +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order) > +{ > + while (order--) > + *v3++ = (uint16_t) *v1++ * *v2++; > +} > + > +void > +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order) > +{ > + while (order--) > + *v3++ = (int32_t) *v1++ * *v2++; > +} > + > +void > +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order) > +{ > + while (order--) > + *v3++ = (uint32_t) *v1++ * *v2++; > +} > + > +void > +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order) > +{ > + while (order--) > + *v3++ = (int64_t) *v1++ * *v2++; > +} > + > +void > +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order) > +{ > + while (order--) > + *v3++ = (uint64_t) *v1++ * *v2++; > +} > diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c > new file mode 100644 > index 00000000000..4456c31e43e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c > @@ -0,0 +1,4 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */ 
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */ > +#include "sse2-pr39821.c" > -- > 2.27.0 > ^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR. 2021-07-28 12:36 ` Richard Biener @ 2021-07-29 1:11 ` Hongtao Liu 0 siblings, 0 replies; 3+ messages in thread From: Hongtao Liu @ 2021-07-29 1:11 UTC (permalink / raw) To: Richard Biener; +Cc: liuhongt, GCC Patches [-- Attachment #1: Type: text/plain, Size: 6065 bytes --] On Wed, Jul 28, 2021 at 8:36 PM Richard Biener <richard.guenther@gmail.com> wrote: > > On Wed, Jul 28, 2021 at 10:35 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > Hi: > > As described in PR 39821, WIDEN_MULT_EXPR should use a different cost > > model from MULT_EXPR, this patch add ix86_widen_mult_cost for that. > > Reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn. > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > > > gcc/ChangeLog: > > can you reference PR target/39821 please? > Added. > > * config/i386/i386.c (ix86_widen_mult_cost): New function. > > (ix86_add_stmt_cost): Use ix86_widen_mult_cost for > > WIDEN_MULT_EXPR. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/sse2-pr39821.c: New test. > > * gcc.target/i386/sse4-pr39821.c: New test. > > --- > > gcc/config/i386/i386.c | 48 +++++++++++++++++++- > > gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++ > > 3 files changed, 96 insertions(+), 1 deletion(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c > > create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > index 876a19f4c1f..281b5fe2706 100644 > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost) > > return cost; > > } > > > > +/* Return cost of vec_widen_<s>mult_hi/lo_<mode>, > > + vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. 
*/ > > +static int > > +ix86_widen_mult_cost (const struct processor_costs *cost, > > + enum machine_mode mode, bool uns_p) > > +{ > > + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); > > + int extra_cost = 0; > > + int basic_cost = 0; > > + switch (mode) > > + { > > + case V8HImode: > > + case V16HImode: > > + if (!uns_p || mode == V16HImode) > > + extra_cost = cost->sse_op * 2; > > + basic_cost = cost->mulss * 2 + cost->sse_op * 4; > > + break; > > + case V4SImode: > > + case V8SImode: > > + /* pmulhw/pmullw can be used. */ > > + basic_cost = cost->mulss * 2 + cost->sse_op * 2; > > + break; > > + case V2DImode: > > + /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, > > + require extra 4 mul, 4 add, 4 cmp and 2 shift. */ > > + if (!TARGET_SSE4_1 && !uns_p) > > + extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 > > + + cost->sse_op * 2; > > + /* Fallthru. */ > > + case V4DImode: > > + basic_cost = cost->mulss * 2 + cost->sse_op * 4; > > + break; > > + default: > > + gcc_unreachable(); > > + } > > + return ix86_vec_cost (mode, basic_cost + extra_cost); > > +} > > + > > /* Return cost of multiplication in MODE. */ > > > > static int > > @@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, > > break; > > > > case MULT_EXPR: > > - case WIDEN_MULT_EXPR: > > + /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw, > > Space after /* > Changed. > otherwise OK. Thanks for the review, attach the patch i'm going to check in. > > > + take it as MULT_EXPR. */ > > case MULT_HIGHPART_EXPR: > > stmt_cost = ix86_multiplication_cost (ix86_cost, mode); > > break; > > + /* There's no direct instruction for WIDEN_MULT_EXPR, > > + take emulation into account. 
*/ > > + case WIDEN_MULT_EXPR: > > + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode, > > + TYPE_UNSIGNED (vectype)); > > + break; > > + > > case NEGATE_EXPR: > > if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) > > stmt_cost = ix86_cost->sse_op; > > diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c > > new file mode 100644 > > index 00000000000..bcd4b772c98 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c > > @@ -0,0 +1,45 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */ > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */ > > +#include<stdint.h> > > +void > > +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order) > > +{ > > + while (order--) > > + *v3++ = (int16_t) *v1++ * *v2++; > > +} > > + > > +void > > +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order) > > +{ > > + while (order--) > > + *v3++ = (uint16_t) *v1++ * *v2++; > > +} > > + > > +void > > +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order) > > +{ > > + while (order--) > > + *v3++ = (int32_t) *v1++ * *v2++; > > +} > > + > > +void > > +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order) > > +{ > > + while (order--) > > + *v3++ = (uint32_t) *v1++ * *v2++; > > +} > > + > > +void > > +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order) > > +{ > > + while (order--) > > + *v3++ = (int64_t) *v1++ * *v2++; > > +} > > + > > +void > > +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order) > > +{ > > + while (order--) > > + *v3++ = (uint64_t) *v1++ * *v2++; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c > > new file mode 100644 > > index 00000000000..4456c31e43e > > --- /dev/null > > +++ 
b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c > > @@ -0,0 +1,4 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */ > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */ > > +#include "sse2-pr39821.c" > > -- > > 2.27.0 > > -- BR, Hongtao [-- Attachment #2: 0001-i386-Add-a-separate-function-to-calculate-cost-for-W.patch --] [-- Type: text/x-patch, Size: 4889 bytes --] From 0bae4cdd18a1645b4f70248e8986f2c83a814e8f Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao.liu@intel.com> Date: Wed, 28 Jul 2021 16:24:52 +0800 Subject: [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR. gcc/ChangeLog: PR target/39821 * config/i386/i386.c (ix86_widen_mult_cost): New function. (ix86_add_stmt_cost): Use ix86_widen_mult_cost for WIDEN_MULT_EXPR. gcc/testsuite/ChangeLog: PR target/39821 * gcc.target/i386/sse2-pr39821.c: New test. * gcc.target/i386/sse4-pr39821.c: New test. --- gcc/config/i386/i386.c | 48 +++++++++++++++++++- gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++ gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 876a19f4c1f..5d49e0b45db 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost) return cost; } +/* Return cost of vec_widen_<s>mult_hi/lo_<mode>, + vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. 
*/ +static int +ix86_widen_mult_cost (const struct processor_costs *cost, + enum machine_mode mode, bool uns_p) +{ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); + int extra_cost = 0; + int basic_cost = 0; + switch (mode) + { + case V8HImode: + case V16HImode: + if (!uns_p || mode == V16HImode) + extra_cost = cost->sse_op * 2; + basic_cost = cost->mulss * 2 + cost->sse_op * 4; + break; + case V4SImode: + case V8SImode: + /* pmulhw/pmullw can be used. */ + basic_cost = cost->mulss * 2 + cost->sse_op * 2; + break; + case V2DImode: + /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, + require extra 4 mul, 4 add, 4 cmp and 2 shift. */ + if (!TARGET_SSE4_1 && !uns_p) + extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 + + cost->sse_op * 2; + /* Fallthru. */ + case V4DImode: + basic_cost = cost->mulss * 2 + cost->sse_op * 4; + break; + default: + gcc_unreachable(); + } + return ix86_vec_cost (mode, basic_cost + extra_cost); +} + /* Return cost of multiplication in MODE. */ static int @@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count, break; case MULT_EXPR: - case WIDEN_MULT_EXPR: + /* For MULT_HIGHPART_EXPR, x86 only supports pmulhw, + take it as MULT_EXPR. */ case MULT_HIGHPART_EXPR: stmt_cost = ix86_multiplication_cost (ix86_cost, mode); break; + /* There's no direct instruction for WIDEN_MULT_EXPR, + take emulation into account. 
*/ + case WIDEN_MULT_EXPR: + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode, + TYPE_UNSIGNED (vectype)); + break; + case NEGATE_EXPR: if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) stmt_cost = ix86_cost->sse_op; diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c new file mode 100644 index 00000000000..bcd4b772c98 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */ +#include<stdint.h> +void +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order) +{ + while (order--) + *v3++ = (int16_t) *v1++ * *v2++; +} + +void +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order) +{ + while (order--) + *v3++ = (uint16_t) *v1++ * *v2++; +} + +void +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order) +{ + while (order--) + *v3++ = (int32_t) *v1++ * *v2++; +} + +void +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order) +{ + while (order--) + *v3++ = (uint32_t) *v1++ * *v2++; +} + +void +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order) +{ + while (order--) + *v3++ = (int64_t) *v1++ * *v2++; +} + +void +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order) +{ + while (order--) + *v3++ = (uint64_t) *v1++ * *v2++; +} diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c new file mode 100644 index 00000000000..4456c31e43e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c @@ -0,0 +1,4 @@ +/* { dg-do compile } */ +/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */ +#include "sse2-pr39821.c" -- 2.27.0 ^ permalink 
raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2021-07-29 1:06 UTC | newest] Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2021-07-28 8:35 [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR liuhongt 2021-07-28 12:36 ` Richard Biener 2021-07-29 1:11 ` Hongtao Liu
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as the URLs for read-only IMAP folder(s) and NNTP newsgroup(s).