* [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR.
@ 2021-07-28 8:35 liuhongt
2021-07-28 12:36 ` Richard Biener
0 siblings, 1 reply; 3+ messages in thread
From: liuhongt @ 2021-07-28 8:35 UTC (permalink / raw)
To: gcc-patches
Hi:
As described in PR 39821, WIDEN_MULT_EXPR should use a different cost
model from MULT_EXPR; this patch adds ix86_widen_mult_cost for that.
Reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
gcc/ChangeLog:
* config/i386/i386.c (ix86_widen_mult_cost): New function.
(ix86_add_stmt_cost): Use ix86_widen_mult_cost for
WIDEN_MULT_EXPR.
gcc/testsuite/ChangeLog:
* gcc.target/i386/sse2-pr39821.c: New test.
* gcc.target/i386/sse4-pr39821.c: New test.
---
gcc/config/i386/i386.c | 48 +++++++++++++++++++-
gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++
gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++
3 files changed, 96 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c
create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 876a19f4c1f..281b5fe2706 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost)
return cost;
}
+/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
+ vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. */
+static int
+ix86_widen_mult_cost (const struct processor_costs *cost,
+ enum machine_mode mode, bool uns_p)
+{
+ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+ int extra_cost = 0;
+ int basic_cost = 0;
+ switch (mode)
+ {
+ case V8HImode:
+ case V16HImode:
+ if (!uns_p || mode == V16HImode)
+ extra_cost = cost->sse_op * 2;
+ basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+ break;
+ case V4SImode:
+ case V8SImode:
+ /* pmulhw/pmullw can be used. */
+ basic_cost = cost->mulss * 2 + cost->sse_op * 2;
+ break;
+ case V2DImode:
+ /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
+ require extra 4 mul, 4 add, 4 cmp and 2 shift. */
+ if (!TARGET_SSE4_1 && !uns_p)
+ extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+ + cost->sse_op * 2;
+ /* Fallthru. */
+ case V4DImode:
+ basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+ break;
+ default:
+ gcc_unreachable();
+ }
+ return ix86_vec_cost (mode, basic_cost + extra_cost);
+}
+
/* Return cost of multiplication in MODE. */
static int
@@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
break;
case MULT_EXPR:
- case WIDEN_MULT_EXPR:
+ /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
+ take it as MULT_EXPR. */
case MULT_HIGHPART_EXPR:
stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
break;
+ /* There's no direct instruction for WIDEN_MULT_EXPR,
+ take emulation into account. */
+ case WIDEN_MULT_EXPR:
+ stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
+ TYPE_UNSIGNED (vectype));
+ break;
+
case NEGATE_EXPR:
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
stmt_cost = ix86_cost->sse_op;
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
new file mode 100644
index 00000000000..bcd4b772c98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */
+#include<stdint.h>
+void
+vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (int16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (uint16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (int32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (uint32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (int64_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (uint64_t) *v1++ * *v2++;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
new file mode 100644
index 00000000000..4456c31e43e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */
+#include "sse2-pr39821.c"
--
2.27.0
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR.
2021-07-28 8:35 [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR liuhongt
@ 2021-07-28 12:36 ` Richard Biener
2021-07-29 1:11 ` Hongtao Liu
0 siblings, 1 reply; 3+ messages in thread
From: Richard Biener @ 2021-07-28 12:36 UTC (permalink / raw)
To: liuhongt; +Cc: GCC Patches, Hongtao Liu
On Wed, Jul 28, 2021 at 10:35 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Hi:
> As described in PR 39821, WIDEN_MULT_EXPR should use a different cost
> model from MULT_EXPR, this patch add ix86_widen_mult_cost for that.
> Reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn.
>
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>
> gcc/ChangeLog:
can you reference PR target/39821 please?
> * config/i386/i386.c (ix86_widen_mult_cost): New function.
> (ix86_add_stmt_cost): Use ix86_widen_mult_cost for
> WIDEN_MULT_EXPR.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/sse2-pr39821.c: New test.
> * gcc.target/i386/sse4-pr39821.c: New test.
> ---
> gcc/config/i386/i386.c | 48 +++++++++++++++++++-
> gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++
> gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++
> 3 files changed, 96 insertions(+), 1 deletion(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 876a19f4c1f..281b5fe2706 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost)
> return cost;
> }
>
> +/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
> + vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. */
> +static int
> +ix86_widen_mult_cost (const struct processor_costs *cost,
> + enum machine_mode mode, bool uns_p)
> +{
> + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
> + int extra_cost = 0;
> + int basic_cost = 0;
> + switch (mode)
> + {
> + case V8HImode:
> + case V16HImode:
> + if (!uns_p || mode == V16HImode)
> + extra_cost = cost->sse_op * 2;
> + basic_cost = cost->mulss * 2 + cost->sse_op * 4;
> + break;
> + case V4SImode:
> + case V8SImode:
> + /* pmulhw/pmullw can be used. */
> + basic_cost = cost->mulss * 2 + cost->sse_op * 2;
> + break;
> + case V2DImode:
> + /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
> + require extra 4 mul, 4 add, 4 cmp and 2 shift. */
> + if (!TARGET_SSE4_1 && !uns_p)
> + extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
> + + cost->sse_op * 2;
> + /* Fallthru. */
> + case V4DImode:
> + basic_cost = cost->mulss * 2 + cost->sse_op * 4;
> + break;
> + default:
> + gcc_unreachable();
> + }
> + return ix86_vec_cost (mode, basic_cost + extra_cost);
> +}
> +
> /* Return cost of multiplication in MODE. */
>
> static int
> @@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
> break;
>
> case MULT_EXPR:
> - case WIDEN_MULT_EXPR:
> + /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
Space after /*
otherwise OK.
> + take it as MULT_EXPR. */
> case MULT_HIGHPART_EXPR:
> stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
> break;
> + /* There's no direct instruction for WIDEN_MULT_EXPR,
> + take emulation into account. */
> + case WIDEN_MULT_EXPR:
> + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
> + TYPE_UNSIGNED (vectype));
> + break;
> +
> case NEGATE_EXPR:
> if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
> stmt_cost = ix86_cost->sse_op;
> diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> new file mode 100644
> index 00000000000..bcd4b772c98
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */
> +#include<stdint.h>
> +void
> +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
> +{
> + while (order--)
> + *v3++ = (int16_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
> +{
> + while (order--)
> + *v3++ = (uint16_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
> +{
> + while (order--)
> + *v3++ = (int32_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order)
> +{
> + while (order--)
> + *v3++ = (uint32_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
> +{
> + while (order--)
> + *v3++ = (int64_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order)
> +{
> + while (order--)
> + *v3++ = (uint64_t) *v1++ * *v2++;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
> new file mode 100644
> index 00000000000..4456c31e43e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
> @@ -0,0 +1,4 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */
> +#include "sse2-pr39821.c"
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR.
2021-07-28 12:36 ` Richard Biener
@ 2021-07-29 1:11 ` Hongtao Liu
0 siblings, 0 replies; 3+ messages in thread
From: Hongtao Liu @ 2021-07-29 1:11 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, GCC Patches
[-- Attachment #1: Type: text/plain, Size: 6065 bytes --]
On Wed, Jul 28, 2021 at 8:36 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Jul 28, 2021 at 10:35 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Hi:
> > As described in PR 39821, WIDEN_MULT_EXPR should use a different cost
> > model from MULT_EXPR, this patch add ix86_widen_mult_cost for that.
> > Reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn.
> >
> > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> >
> > gcc/ChangeLog:
>
> can you reference PR target/39821 please?
>
Added.
> > * config/i386/i386.c (ix86_widen_mult_cost): New function.
> > (ix86_add_stmt_cost): Use ix86_widen_mult_cost for
> > WIDEN_MULT_EXPR.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/sse2-pr39821.c: New test.
> > * gcc.target/i386/sse4-pr39821.c: New test.
> > ---
> > gcc/config/i386/i386.c | 48 +++++++++++++++++++-
> > gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++
> > gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++
> > 3 files changed, 96 insertions(+), 1 deletion(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 876a19f4c1f..281b5fe2706 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost)
> > return cost;
> > }
> >
> > +/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
> > + vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. */
> > +static int
> > +ix86_widen_mult_cost (const struct processor_costs *cost,
> > + enum machine_mode mode, bool uns_p)
> > +{
> > + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
> > + int extra_cost = 0;
> > + int basic_cost = 0;
> > + switch (mode)
> > + {
> > + case V8HImode:
> > + case V16HImode:
> > + if (!uns_p || mode == V16HImode)
> > + extra_cost = cost->sse_op * 2;
> > + basic_cost = cost->mulss * 2 + cost->sse_op * 4;
> > + break;
> > + case V4SImode:
> > + case V8SImode:
> > + /* pmulhw/pmullw can be used. */
> > + basic_cost = cost->mulss * 2 + cost->sse_op * 2;
> > + break;
> > + case V2DImode:
> > + /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
> > + require extra 4 mul, 4 add, 4 cmp and 2 shift. */
> > + if (!TARGET_SSE4_1 && !uns_p)
> > + extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
> > + + cost->sse_op * 2;
> > + /* Fallthru. */
> > + case V4DImode:
> > + basic_cost = cost->mulss * 2 + cost->sse_op * 4;
> > + break;
> > + default:
> > + gcc_unreachable();
> > + }
> > + return ix86_vec_cost (mode, basic_cost + extra_cost);
> > +}
> > +
> > /* Return cost of multiplication in MODE. */
> >
> > static int
> > @@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
> > break;
> >
> > case MULT_EXPR:
> > - case WIDEN_MULT_EXPR:
> > + /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
>
> Space after /*
>
Changed.
> otherwise OK.
Thanks for the review; attached is the patch I'm going to check in.
>
> > + take it as MULT_EXPR. */
> > case MULT_HIGHPART_EXPR:
> > stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
> > break;
> > + /* There's no direct instruction for WIDEN_MULT_EXPR,
> > + take emulation into account. */
> > + case WIDEN_MULT_EXPR:
> > + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
> > + TYPE_UNSIGNED (vectype));
> > + break;
> > +
> > case NEGATE_EXPR:
> > if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
> > stmt_cost = ix86_cost->sse_op;
> > diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> > new file mode 100644
> > index 00000000000..bcd4b772c98
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> > @@ -0,0 +1,45 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */
> > +#include<stdint.h>
> > +void
> > +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
> > +{
> > + while (order--)
> > + *v3++ = (int16_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
> > +{
> > + while (order--)
> > + *v3++ = (uint16_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
> > +{
> > + while (order--)
> > + *v3++ = (int32_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order)
> > +{
> > + while (order--)
> > + *v3++ = (uint32_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
> > +{
> > + while (order--)
> > + *v3++ = (int64_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order)
> > +{
> > + while (order--)
> > + *v3++ = (uint64_t) *v1++ * *v2++;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
> > new file mode 100644
> > index 00000000000..4456c31e43e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
> > @@ -0,0 +1,4 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */
> > +#include "sse2-pr39821.c"
> > --
> > 2.27.0
> >
--
BR,
Hongtao
[-- Attachment #2: 0001-i386-Add-a-separate-function-to-calculate-cost-for-W.patch --]
[-- Type: text/x-patch, Size: 4889 bytes --]
From 0bae4cdd18a1645b4f70248e8986f2c83a814e8f Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 28 Jul 2021 16:24:52 +0800
Subject: [PATCH] [i386] Add a separate function to calculate cost for
WIDEN_MULT_EXPR.
gcc/ChangeLog:
PR target/39821
* config/i386/i386.c (ix86_widen_mult_cost): New function.
(ix86_add_stmt_cost): Use ix86_widen_mult_cost for
WIDEN_MULT_EXPR.
gcc/testsuite/ChangeLog:
PR target/39821
* gcc.target/i386/sse2-pr39821.c: New test.
* gcc.target/i386/sse4-pr39821.c: New test.
---
gcc/config/i386/i386.c | 48 +++++++++++++++++++-
gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++++++++++++++++++
gcc/testsuite/gcc.target/i386/sse4-pr39821.c | 4 ++
3 files changed, 96 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c
create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 876a19f4c1f..5d49e0b45db 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost)
return cost;
}
+/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
+ vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2. */
+static int
+ix86_widen_mult_cost (const struct processor_costs *cost,
+ enum machine_mode mode, bool uns_p)
+{
+ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+ int extra_cost = 0;
+ int basic_cost = 0;
+ switch (mode)
+ {
+ case V8HImode:
+ case V16HImode:
+ if (!uns_p || mode == V16HImode)
+ extra_cost = cost->sse_op * 2;
+ basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+ break;
+ case V4SImode:
+ case V8SImode:
+ /* pmulhw/pmullw can be used. */
+ basic_cost = cost->mulss * 2 + cost->sse_op * 2;
+ break;
+ case V2DImode:
+ /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
+ require extra 4 mul, 4 add, 4 cmp and 2 shift. */
+ if (!TARGET_SSE4_1 && !uns_p)
+ extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+ + cost->sse_op * 2;
+ /* Fallthru. */
+ case V4DImode:
+ basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+ break;
+ default:
+ gcc_unreachable();
+ }
+ return ix86_vec_cost (mode, basic_cost + extra_cost);
+}
+
/* Return cost of multiplication in MODE. */
static int
@@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
break;
case MULT_EXPR:
- case WIDEN_MULT_EXPR:
+ /* For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
+ take it as MULT_EXPR. */
case MULT_HIGHPART_EXPR:
stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
break;
+ /* There's no direct instruction for WIDEN_MULT_EXPR,
+ take emulation into account. */
+ case WIDEN_MULT_EXPR:
+ stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
+ TYPE_UNSIGNED (vectype));
+ break;
+
case NEGATE_EXPR:
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
stmt_cost = ix86_cost->sse_op;
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
new file mode 100644
index 00000000000..bcd4b772c98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */
+#include<stdint.h>
+void
+vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (int16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (uint16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (int32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (uint32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (int64_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order)
+{
+ while (order--)
+ *v3++ = (uint64_t) *v1++ * *v2++;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
new file mode 100644
index 00000000000..4456c31e43e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */
+#include "sse2-pr39821.c"
--
2.27.0
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2021-07-29 1:06 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-28 8:35 [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR liuhongt
2021-07-28 12:36 ` Richard Biener
2021-07-29 1:11 ` Hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).