public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] i386: Fix up _mm256_vzeroupper() handling [PR99563]
@ 2021-03-16  9:50 Jakub Jelinek
  2021-03-16 10:04 ` Uros Bizjak
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2021-03-16  9:50 UTC (permalink / raw)
  To: Uros Bizjak, H.J. Lu, Kirill Yukhin; +Cc: gcc-patches

Hi!

My r10-6451-gb7b3378f91c0641f2ef4d88db22af62a571c9359 fix for
vzeroupper vs. ms ABI apparently broke the explicit vzeroupper handling
when the implicit vzeroupper handling is disabled.
The epilogue_completed splitter for vzeroupper now adds clobbers for all
registers which don't have explicit sets in the pattern and the sets are
added during vzeroupper pass.  Before my changes, for explicit user
vzeroupper, we just weren't modelling its effects at all; it was just an
unspec that didn't indicate that it clobbers the upper parts of all XMM < %xmm16
registers.  But now the splitter will even for those add clobbers and as
it has no sets, it will add clobbers for all registers, which means
we optimize away anything that lived across that vzeroupper.

The vzeroupper pass has two parts, one is the mode switching that computes
where to put the implicit vzeroupper calls and puts them there, and then
another that uses df to figure out what sets to add to all the vzeroupper.
The former part should be done only under the conditions we have in the
gate, but the latter as this PR shows needs to happen either if we perform
the implicit vzeroupper additions, or if there are (or could be) any
explicit vzeroupper instructions.  As that function does df_analyze and
walks the whole IL, I think it would be too expensive to run it always
whenever TARGET_AVX, so this patch remembers if we've expanded at least
one __builtin_ia32_vzeroupper in the function and runs that part of the
vzeroupper pass either when the old condition is true or when this new
flag is set.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-03-16  Jakub Jelinek  <jakub@redhat.com>

	PR target/99563
	* config/i386/i386.h (struct machine_function): Add
	has_explicit_vzeroupper bitfield.
	* config/i386/i386-expand.c (ix86_expand_builtin): Set
	cfun->machine->has_explicit_vzeroupper when expanding
	IX86_BUILTIN_VZEROUPPER.
	* config/i386/i386-features.c (rest_of_handle_insert_vzeroupper):
	Do the mode switching only when TARGET_VZEROUPPER, expensive
	optimizations turned on and not optimizing for size.
	(pass_insert_vzeroupper::gate): Enable even when
	cfun->machine->has_explicit_vzeroupper is set.

	* gcc.target/i386/avx-pr99563.c: New test.

--- gcc/config/i386/i386.h.jj	2021-02-22 17:54:05.617799002 +0100
+++ gcc/config/i386/i386.h	2021-03-15 12:30:00.814841624 +0100
@@ -2941,6 +2941,10 @@ struct GTY(()) machine_function {
   /* True if the function needs a stack frame.  */
   BOOL_BITFIELD stack_frame_required : 1;
 
+  /* True if __builtin_ia32_vzeroupper () has been expanded in current
+     function.  */
+  BOOL_BITFIELD has_explicit_vzeroupper : 1;
+
   /* The largest alignment, in bytes, of stack slot actually used.  */
   unsigned int max_used_stack_alignment;
 
--- gcc/config/i386/i386-expand.c.jj	2021-02-09 12:28:14.069323264 +0100
+++ gcc/config/i386/i386-expand.c	2021-03-15 12:34:26.549901726 +0100
@@ -13210,6 +13210,10 @@ rdseed_step:
 
       return 0;
 
+    case IX86_BUILTIN_VZEROUPPER:
+      cfun->machine->has_explicit_vzeroupper = true;
+      break;
+
     default:
       break;
     }
--- gcc/config/i386/i386-features.c.jj	2021-02-01 09:55:45.953519272 +0100
+++ gcc/config/i386/i386-features.c	2021-03-15 12:37:07.886116827 +0100
@@ -1837,19 +1837,22 @@ ix86_add_reg_usage_to_vzerouppers (void)
 static unsigned int
 rest_of_handle_insert_vzeroupper (void)
 {
-  int i;
-
-  /* vzeroupper instructions are inserted immediately after reload to
-     account for possible spills from 256bit or 512bit registers.  The pass
-     reuses mode switching infrastructure by re-running mode insertion
-     pass, so disable entities that have already been processed.  */
-  for (i = 0; i < MAX_386_ENTITIES; i++)
-    ix86_optimize_mode_switching[i] = 0;
+  if (TARGET_VZEROUPPER
+      && flag_expensive_optimizations
+      && !optimize_size)
+    {
+      /* vzeroupper instructions are inserted immediately after reload to
+	 account for possible spills from 256bit or 512bit registers.  The pass
+	 reuses mode switching infrastructure by re-running mode insertion
+	 pass, so disable entities that have already been processed.  */
+      for (int i = 0; i < MAX_386_ENTITIES; i++)
+	ix86_optimize_mode_switching[i] = 0;
 
-  ix86_optimize_mode_switching[AVX_U128] = 1;
+      ix86_optimize_mode_switching[AVX_U128] = 1;
 
-  /* Call optimize_mode_switching.  */
-  g->get_passes ()->execute_pass_mode_switching ();
+      /* Call optimize_mode_switching.  */
+      g->get_passes ()->execute_pass_mode_switching ();
+    }
   ix86_add_reg_usage_to_vzerouppers ();
   return 0;
 }
@@ -1880,8 +1883,10 @@ public:
   virtual bool gate (function *)
     {
       return TARGET_AVX
-	     && TARGET_VZEROUPPER && flag_expensive_optimizations
-	     && !optimize_size;
+	     && ((TARGET_VZEROUPPER
+		  && flag_expensive_optimizations
+		  && !optimize_size)
+		 || cfun->machine->has_explicit_vzeroupper);
     }
 
   virtual unsigned int execute (function *)
--- gcc/testsuite/gcc.target/i386/avx-pr99563.c.jj	2021-03-15 13:18:08.896950279 +0100
+++ gcc/testsuite/gcc.target/i386/avx-pr99563.c	2021-03-15 13:17:28.881392012 +0100
@@ -0,0 +1,38 @@
+/* PR target/99563 */
+/* { dg-do run { target avx } } */
+/* { dg-options "-O2 -mavx -mno-vzeroupper" } */
+
+#include "avx-check.h"
+#include <immintrin.h>
+
+
+__attribute__((noipa)) float
+compute_generic (void)
+{
+  return 0.0f;
+}
+
+static inline __attribute__((always_inline))
+float compute_avx (unsigned long block_count)
+{
+  __m128d mm_res = _mm_set1_pd (256.0);
+  float res = (float) (_mm_cvtsd_f64 (mm_res) / (double) block_count);
+  _mm256_zeroupper ();
+  return res;
+}
+
+__attribute__((noipa)) float
+compute (unsigned long block_count)
+{
+  if (block_count >= 64)
+    return compute_avx (block_count);
+  else
+    return compute_generic ();
+}
+
+static void
+avx_test (void)
+{
+  if (compute (128) != 2.0f || compute (32) != 0.0f)
+    abort ();
+}

	Jakub


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] i386: Fix up _mm256_vzeroupper() handling [PR99563]
  2021-03-16  9:50 [PATCH] i386: Fix up _mm256_vzeroupper() handling [PR99563] Jakub Jelinek
@ 2021-03-16 10:04 ` Uros Bizjak
  0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2021-03-16 10:04 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: H.J. Lu, Kirill Yukhin, gcc-patches

On Tue, Mar 16, 2021 at 10:51 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> My r10-6451-gb7b3378f91c0641f2ef4d88db22af62a571c9359 fix for
> vzeroupper vs. ms ABI apparently broke the explicit vzeroupper handling
> when the implicit vzeroupper handling is disabled.
> The epilogue_completed splitter for vzeroupper now adds clobbers for all
> registers which don't have explicit sets in the pattern and the sets are
> added during vzeroupper pass.  Before my changes, for explicit user
> vzeroupper, we just weren't modelling its effects at all, it was just
> unspec that didn't tell that it clobbers the upper parts of all XMM < %xmm16
> registers.  But now the splitter will even for those add clobbers and as
> it has no sets, it will add clobbers for all registers, which means
> we optimize away anything that lived across that vzeroupper.
>
> The vzeroupper pass has two parts, one is the mode switching that computes
> where to put the implicit vzeroupper calls and puts them there, and then
> another that uses df to figure out what sets to add to all the vzeroupper.
> The former part should be done only under the conditions we have in the
> gate, but the latter as this PR shows needs to happen either if we perform
> the implicit vzeroupper additions, or if there are (or could be) any
> explicit vzeroupper instructions.  As that function does df_analyze and
> walks the whole IL, I think it would be too expensive to run it always
> whenever TARGET_AVX, so this patch remembers if we've expanded at least
> one __builtin_ia32_vzeroupper in the function and runs that part of the
> vzeroupper pass both when the old condition is true or when this new
> flag is set.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-03-16  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/99563
>         * config/i386/i386.h (struct machine_function): Add
>         has_explicit_vzeroupper bitfield.
>         * config/i386/i386-expand.c (ix86_expand_builtin): Set
>         cfun->machine->has_explicit_vzeroupper when expanding
>         IX86_BUILTIN_VZEROUPPER.
>         * config/i386/i386-features.c (rest_of_handle_insert_vzeroupper):
>         Do the mode switching only when TARGET_VZEROUPPER, expensive
>         optimizations turned on and not optimizing for size.
>         (pass_insert_vzeroupper::gate): Enable even when
>         cfun->machine->has_explicit_vzeroupper is set.
>
>         * gcc.target/i386/avx-pr99563.c: New test.

OK.

Thanks,
Uros.

>
> --- gcc/config/i386/i386.h.jj   2021-02-22 17:54:05.617799002 +0100
> +++ gcc/config/i386/i386.h      2021-03-15 12:30:00.814841624 +0100
> @@ -2941,6 +2941,10 @@ struct GTY(()) machine_function {
>    /* True if the function needs a stack frame.  */
>    BOOL_BITFIELD stack_frame_required : 1;
>
> +  /* True if __builtin_ia32_vzeroupper () has been expanded in current
> +     function.  */
> +  BOOL_BITFIELD has_explicit_vzeroupper : 1;
> +
>    /* The largest alignment, in bytes, of stack slot actually used.  */
>    unsigned int max_used_stack_alignment;
>
> --- gcc/config/i386/i386-expand.c.jj    2021-02-09 12:28:14.069323264 +0100
> +++ gcc/config/i386/i386-expand.c       2021-03-15 12:34:26.549901726 +0100
> @@ -13210,6 +13210,10 @@ rdseed_step:
>
>        return 0;
>
> +    case IX86_BUILTIN_VZEROUPPER:
> +      cfun->machine->has_explicit_vzeroupper = true;
> +      break;
> +
>      default:
>        break;
>      }
> --- gcc/config/i386/i386-features.c.jj  2021-02-01 09:55:45.953519272 +0100
> +++ gcc/config/i386/i386-features.c     2021-03-15 12:37:07.886116827 +0100
> @@ -1837,19 +1837,22 @@ ix86_add_reg_usage_to_vzerouppers (void)
>  static unsigned int
>  rest_of_handle_insert_vzeroupper (void)
>  {
> -  int i;
> -
> -  /* vzeroupper instructions are inserted immediately after reload to
> -     account for possible spills from 256bit or 512bit registers.  The pass
> -     reuses mode switching infrastructure by re-running mode insertion
> -     pass, so disable entities that have already been processed.  */
> -  for (i = 0; i < MAX_386_ENTITIES; i++)
> -    ix86_optimize_mode_switching[i] = 0;
> +  if (TARGET_VZEROUPPER
> +      && flag_expensive_optimizations
> +      && !optimize_size)
> +    {
> +      /* vzeroupper instructions are inserted immediately after reload to
> +        account for possible spills from 256bit or 512bit registers.  The pass
> +        reuses mode switching infrastructure by re-running mode insertion
> +        pass, so disable entities that have already been processed.  */
> +      for (int i = 0; i < MAX_386_ENTITIES; i++)
> +       ix86_optimize_mode_switching[i] = 0;
>
> -  ix86_optimize_mode_switching[AVX_U128] = 1;
> +      ix86_optimize_mode_switching[AVX_U128] = 1;
>
> -  /* Call optimize_mode_switching.  */
> -  g->get_passes ()->execute_pass_mode_switching ();
> +      /* Call optimize_mode_switching.  */
> +      g->get_passes ()->execute_pass_mode_switching ();
> +    }
>    ix86_add_reg_usage_to_vzerouppers ();
>    return 0;
>  }
> @@ -1880,8 +1883,10 @@ public:
>    virtual bool gate (function *)
>      {
>        return TARGET_AVX
> -            && TARGET_VZEROUPPER && flag_expensive_optimizations
> -            && !optimize_size;
> +            && ((TARGET_VZEROUPPER
> +                 && flag_expensive_optimizations
> +                 && !optimize_size)
> +                || cfun->machine->has_explicit_vzeroupper);
>      }
>
>    virtual unsigned int execute (function *)
> --- gcc/testsuite/gcc.target/i386/avx-pr99563.c.jj      2021-03-15 13:18:08.896950279 +0100
> +++ gcc/testsuite/gcc.target/i386/avx-pr99563.c 2021-03-15 13:17:28.881392012 +0100
> @@ -0,0 +1,38 @@
> +/* PR target/99563 */
> +/* { dg-do run { target avx } } */
> +/* { dg-options "-O2 -mavx -mno-vzeroupper" } */
> +
> +#include "avx-check.h"
> +#include <immintrin.h>
> +
> +
> +__attribute__((noipa)) float
> +compute_generic (void)
> +{
> +  return 0.0f;
> +}
> +
> +static inline __attribute__((always_inline))
> +float compute_avx (unsigned long block_count)
> +{
> +  __m128d mm_res = _mm_set1_pd (256.0);
> +  float res = (float) (_mm_cvtsd_f64 (mm_res) / (double) block_count);
> +  _mm256_zeroupper ();
> +  return res;
> +}
> +
> +__attribute__((noipa)) float
> +compute (unsigned long block_count)
> +{
> +  if (block_count >= 64)
> +    return compute_avx (block_count);
> +  else
> +    return compute_generic ();
> +}
> +
> +static void
> +avx_test (void)
> +{
> +  if (compute (128) != 2.0f || compute (32) != 0.0f)
> +    abort ();
> +}
>
>         Jakub
>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-03-16 10:05 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-16  9:50 [PATCH] i386: Fix up _mm256_vzeroupper() handling [PR99563] Jakub Jelinek
2021-03-16 10:04 ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).