[PATCH] i386: Sync move_max/store_max with prefer-vector-width [PR112824]

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] i386: Sync move_max/store_max with prefer-vector-width [PR112824]
@ 2023-12-14  7:54 Hongyu Wang
  2023-12-15  2:35 ` Hongtao Liu
  0 siblings, 1 reply; 2+ messages in thread
From: Hongyu Wang @ 2023-12-14  7:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: hjl.tools, hongtao.liu

Hi,

Currently move_max follows the tuning feature first, but ideally it
should sync with prefer-vector-width when it is explicitly set to keep
vector move and operation with same vector size.

Bootstrapped/regtested on x86-64-pc-linux-gnu{-m32,}

OK for trunk?

gcc/ChangeLog:

	PR target/112824
	* config/i386/i386-options.cc (ix86_option_override_internal):
	Sync ix86_move_max/ix86_store_max with prefer_vector_width when
	it is explicitly set.

gcc/testsuite/ChangeLog:

	PR target/112824
	* gcc.target/i386/pieces-memset-45.c: Remove
	-mprefer-vector-width=256.
	* g++.target/i386/pr112824-1.C: New test.
---
 gcc/config/i386/i386-options.cc               |   8 +-
 gcc/testsuite/g++.target/i386/pr112824-1.C    | 113 ++++++++++++++++++
 .../gcc.target/i386/pieces-memset-45.c        |   2 +-
 3 files changed, 120 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr112824-1.C

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 588a0878c0d..440ef59ffff 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -3012,7 +3012,9 @@ ix86_option_override_internal (bool main_args_p,
     {
       /* Set the maximum number of bits can be moved from memory to
 	 memory efficiently.  */
-      if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
+      if (opts_set->x_prefer_vector_width_type != PVW_NONE)
+	opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
+      else if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
 	opts->x_ix86_move_max = PVW_AVX512;
       else if (ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES])
 	opts->x_ix86_move_max = PVW_AVX256;
@@ -3034,7 +3036,9 @@ ix86_option_override_internal (bool main_args_p,
     {
       /* Set the maximum number of bits can be stored to memory
 	 efficiently.  */
-      if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
+      if (opts_set->x_prefer_vector_width_type != PVW_NONE)
+	opts->x_ix86_store_max = opts->x_prefer_vector_width_type;
+      else if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
 	opts->x_ix86_store_max = PVW_AVX512;
       else if (ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES])
 	opts->x_ix86_store_max = PVW_AVX256;
diff --git a/gcc/testsuite/g++.target/i386/pr112824-1.C b/gcc/testsuite/g++.target/i386/pr112824-1.C
new file mode 100644
index 00000000000..fccaf23c530
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr112824-1.C
@@ -0,0 +1,113 @@
+/* PR target/112824 */
+/* { dg-do compile } */
+/* { dg-options "-std=c++23 -O3 -march=skylake-avx512 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-not "vmov(?:dqu|apd)\[ \\t\]+\[^\n\]*%ymm" } } */
+
+
+#include <bit>
+#include <concepts>
+#include <cstddef>
+#include <cstdint>
+
+template <ptrdiff_t W, typename T>
+using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
+
+// Omitted: 16 without AVX, 32 without AVX512F,
+// or for forward compatibility some AVX10 may also mean 32-only
+static constexpr ptrdiff_t VectorBytes = 64;
+template<typename T>
+static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T);
+
+template <typename T, ptrdiff_t N> struct Vector{
+    static constexpr ptrdiff_t L = N;
+    T data[L];
+    static constexpr auto size()->ptrdiff_t{return N;}
+};
+template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{
+    static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> : ptrdiff_t(std::bit_ceil(size_t(N))); 
+    static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
+    using V = Vec<W,T>;
+    V data[L];
+    static constexpr auto size()->ptrdiff_t{return N;}
+};
+/// should be trivially copyable
+/// codegen is worse when passing by value, even though it seems like it should make
+/// aliasing simpler to analyze?
+template<typename T, ptrdiff_t N>
+[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
+    Vector<T,N> z;
+    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] + y.data[n];
+    return z;
+}
+template<typename T, ptrdiff_t N>
+[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
+    Vector<T,N> z;
+    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] * y.data[n];
+    return z;
+}
+template<typename T, ptrdiff_t N>
+[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) -> Vector<T,N> {
+    Vector<T,N> z;
+    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
+    return z;
+}
+template<typename T, ptrdiff_t N>
+[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) -> Vector<T,N> {
+    Vector<T,N> z;
+    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
+    return z;
+}
+
+
+
+template <typename T, ptrdiff_t N> struct Dual {
+  T value;
+  Vector<T, N> partials;
+};
+// Here we have a specialization for non-power-of-2 `N`
+template <typename T, ptrdiff_t N> 
+requires(std::floating_point<T> && (std::popcount(size_t(N))>1))
+struct Dual<T,N> {
+  Vector<T, N+1> data;
+};
+
+template<ptrdiff_t W, typename T>
+consteval auto firstoff(){
+    static_assert(std::same_as<T,double>, "type not implemented");
+    if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0;
+    else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0;
+    else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0;
+    else static_assert(false, "vector width not implemented");
+}
+
+template <typename T, ptrdiff_t N>
+[[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a,
+                                                Dual<T, N> b)
+  -> Dual<T, N> {
+  if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
+    Dual<T,N> c;
+    for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l)
+      c.data.data[l] = a.data.data[l] + b.data.data[l]; 
+    return c;
+  } else return {a.value + b.value, a.partials + b.partials};
+}
+
+template <typename T, ptrdiff_t N>
+[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a,
+                                                Dual<T, N> b)
+  -> Dual<T, N> {
+  if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
+    using V = typename Vector<T,N>::V;
+    V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0];
+    V x = va * b.data.data[0];
+    Dual<T,N> c;
+    c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x;
+    for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l)
+      c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l]; 
+    return c;
+  } else return {a.value * b.value, a.value * b.partials + b.value * a.partials};
+}
+
+void prod(Dual<Dual<double,7>,2> &c, const Dual<Dual<double,7>,2> &a, const Dual<Dual<double,7>,2>&b){
+    c = a*b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c
index 70c80e5064b..e8ce7c23256 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=x86-64 -mprefer-vector-width=256 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */
+/* { dg-options "-O2 -march=x86-64 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */
 
 extern char *dst;
 
-- 
2.31.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] i386: Sync move_max/store_max with prefer-vector-width [PR112824]
  2023-12-14  7:54 [PATCH] i386: Sync move_max/store_max with prefer-vector-width [PR112824] Hongyu Wang
@ 2023-12-15  2:35 ` Hongtao Liu
  0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2023-12-15  2:35 UTC (permalink / raw)
  To: Hongyu Wang; +Cc: gcc-patches, hjl.tools, hongtao.liu

On Thu, Dec 14, 2023 at 3:54 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> Hi,
>
> Currently move_max follows the tuning feature first, but ideally it
> should sync with prefer-vector-width when it is explicitly set to keep
> vector move and operation with same vector size.
>
> Bootstrapped/regtested on x86-64-pc-linux-gnu{-m32,}
>
> OK for trunk?
>
> gcc/ChangeLog:
>
>         PR target/112824
>         * config/i386/i386-options.cc (ix86_option_override_internal):
>         Sync ix86_move_max/ix86_store_max with prefer_vector_width when
>         it is explicitly set.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/112824
>         * gcc.target/i386/pieces-memset-45.c: Remove
>         -mprefer-vector-width=256.
>         * g++.target/i386/pr112824-1.C: New test.
> ---
>  gcc/config/i386/i386-options.cc               |   8 +-
>  gcc/testsuite/g++.target/i386/pr112824-1.C    | 113 ++++++++++++++++++
>  .../gcc.target/i386/pieces-memset-45.c        |   2 +-
>  3 files changed, 120 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr112824-1.C
>
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index 588a0878c0d..440ef59ffff 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -3012,7 +3012,9 @@ ix86_option_override_internal (bool main_args_p,
>      {
>        /* Set the maximum number of bits can be moved from memory to
>          memory efficiently.  */
> -      if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
> +      if (opts_set->x_prefer_vector_width_type != PVW_NONE)
> +       opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
> +      else if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
>         opts->x_ix86_move_max = PVW_AVX512;
>        else if (ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES])
>         opts->x_ix86_move_max = PVW_AVX256;
> @@ -3034,7 +3036,9 @@ ix86_option_override_internal (bool main_args_p,
>      {
>        /* Set the maximum number of bits can be stored to memory
>          efficiently.  */
> -      if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
> +      if (opts_set->x_prefer_vector_width_type != PVW_NONE)
> +       opts->x_ix86_store_max = opts->x_prefer_vector_width_type;
> +      else if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
>         opts->x_ix86_store_max = PVW_AVX512;
>        else if (ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES])
>         opts->x_ix86_store_max = PVW_AVX256;
> diff --git a/gcc/testsuite/g++.target/i386/pr112824-1.C b/gcc/testsuite/g++.target/i386/pr112824-1.C
> new file mode 100644
> index 00000000000..fccaf23c530
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr112824-1.C
> @@ -0,0 +1,113 @@
> +/* PR target/112824 */
> +/* { dg-do compile } */
> +/* { dg-options "-std=c++23 -O3 -march=skylake-avx512 -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler-not "vmov(?:dqu|apd)\[ \\t\]+\[^\n\]*%ymm" } } */
> +
> +
remove empty line.
> +#include <bit>
> +#include <concepts>
> +#include <cstddef>
> +#include <cstdint>
> +
> +template <ptrdiff_t W, typename T>
> +using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
> +
> +// Omitted: 16 without AVX, 32 without AVX512F,
> +// or for forward compatibility some AVX10 may also mean 32-only
> +static constexpr ptrdiff_t VectorBytes = 64;
> +template<typename T>
> +static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T);
> +
> +template <typename T, ptrdiff_t N> struct Vector{
> +    static constexpr ptrdiff_t L = N;
> +    T data[L];
> +    static constexpr auto size()->ptrdiff_t{return N;}
> +};
> +template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{
> +    static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> : ptrdiff_t(std::bit_ceil(size_t(N)));
> +    static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
> +    using V = Vec<W,T>;
> +    V data[L];
> +    static constexpr auto size()->ptrdiff_t{return N;}
> +};
> +/// should be trivially copyable
> +/// codegen is worse when passing by value, even though it seems like it should make
> +/// aliasing simpler to analyze?
> +template<typename T, ptrdiff_t N>
> +[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
> +    Vector<T,N> z;
> +    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] + y.data[n];
> +    return z;
> +}
> +template<typename T, ptrdiff_t N>
> +[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
> +    Vector<T,N> z;
> +    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] * y.data[n];
> +    return z;
> +}
> +template<typename T, ptrdiff_t N>
> +[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) -> Vector<T,N> {
> +    Vector<T,N> z;
> +    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
> +    return z;
> +}
> +template<typename T, ptrdiff_t N>
> +[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) -> Vector<T,N> {
> +    Vector<T,N> z;
> +    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
> +    return z;
> +}
> +
> +
> +
Ditto.
> +template <typename T, ptrdiff_t N> struct Dual {
> +  T value;
> +  Vector<T, N> partials;
> +};
> +// Here we have a specialization for non-power-of-2 `N`
> +template <typename T, ptrdiff_t N>
> +requires(std::floating_point<T> && (std::popcount(size_t(N))>1))
> +struct Dual<T,N> {
> +  Vector<T, N+1> data;
> +};
> +
> +template<ptrdiff_t W, typename T>
> +consteval auto firstoff(){
> +    static_assert(std::same_as<T,double>, "type not implemented");
> +    if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0;
> +    else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0;
> +    else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0;
> +    else static_assert(false, "vector width not implemented");
> +}
> +
> +template <typename T, ptrdiff_t N>
> +[[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a,
> +                                                Dual<T, N> b)
> +  -> Dual<T, N> {
> +  if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
> +    Dual<T,N> c;
> +    for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l)
> +      c.data.data[l] = a.data.data[l] + b.data.data[l];
> +    return c;
> +  } else return {a.value + b.value, a.partials + b.partials};
> +}
> +
> +template <typename T, ptrdiff_t N>
> +[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a,
> +                                                Dual<T, N> b)
> +  -> Dual<T, N> {
> +  if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
> +    using V = typename Vector<T,N>::V;
> +    V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0];
> +    V x = va * b.data.data[0];
> +    Dual<T,N> c;
> +    c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x;
> +    for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l)
> +      c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l];
> +    return c;
> +  } else return {a.value * b.value, a.value * b.partials + b.value * a.partials};
> +}
> +
> +void prod(Dual<Dual<double,7>,2> &c, const Dual<Dual<double,7>,2> &a, const Dual<Dual<double,7>,2>&b){
> +    c = a*b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c
> index 70c80e5064b..e8ce7c23256 100644
> --- a/gcc/testsuite/gcc.target/i386/pieces-memset-45.c
> +++ b/gcc/testsuite/gcc.target/i386/pieces-memset-45.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -march=x86-64 -mprefer-vector-width=256 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */
> +/* { dg-options "-O2 -march=x86-64 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */
>
>  extern char *dst;
>
> --
> 2.31.1
>

Others LGTM.

-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-12-15  2:26 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-14  7:54 [PATCH] i386: Sync move_max/store_max with prefer-vector-width [PR112824] Hongyu Wang
2023-12-15  2:35 ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).