* [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
@ 2019-01-07 17:40 H.J. Lu
2019-01-08 7:12 ` Uros Bizjak
0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-01-07 17:40 UTC (permalink / raw)
To: gcc-patches; +Cc: Uros Bizjak
There is no need to generate vzeroupper if caller uses upper bits of
AVX/AVX512 registers, We track caller's avx_u128_state and avoid
vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
Tested on i686 and x86-64 with and without --with-arch=native.
OK for trunk?
Thanks.
H.J.
---
gcc/
PR target/88717
* config/i386/i386.c (ix86_avx_u128_mode_entry): Set
caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY.
(ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if
caller is AVX_U128_DIRTY.
* config/i386/i386.h (machine_function): Add
caller_avx_u128_dirty.
gcc/testsuite/
PR target/88717
* gcc.target/i386/pr88717.c: New test.
---
gcc/config/i386/i386.c | 10 +++++++++-
gcc/config/i386/i386.h | 3 +++
gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++
3 files changed, 36 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d01278d866f..9b49a2c1d9c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void)
rtx incoming = DECL_INCOMING_RTL (arg);
if (incoming && ix86_check_avx_upper_register (incoming))
- return AVX_U128_DIRTY;
+ {
+ /* Caller is AVX_U128_DIRTY. */
+ cfun->machine->caller_avx_u128_dirty = true;
+ return AVX_U128_DIRTY;
+ }
}
return AVX_U128_CLEAN;
@@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity)
static int
ix86_avx_u128_mode_exit (void)
{
+ /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY. */
+ if (cfun->machine->caller_avx_u128_dirty)
+ return AVX_U128_DIRTY;
+
rtx reg = crtl->return_rtx;
/* Exit mode is set to AVX_U128_DIRTY if there are 256bit
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 83b025e0cf5..c053b657a55 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2747,6 +2747,9 @@ struct GTY(()) machine_function {
/* If true, ENDBR is queued at function entrance. */
BOOL_BITFIELD endbr_queued_at_entrance : 1;
+ /* If true, caller is AVX_U128_DIRTY. */
+ BOOL_BITFIELD caller_avx_u128_dirty : 1;
+
/* The largest alignment, in bytes, of stack slot actually used. */
unsigned int max_used_stack_alignment;
diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c
new file mode 100644
index 00000000000..01680998f1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88717.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mvzeroupper" } */
+
+#include <immintrin.h>
+
+__m128
+foo1 (__m256 x)
+{
+ return _mm256_castps256_ps128 (x);
+}
+
+void
+foo2 (float *p, __m256 x)
+{
+ *p = ((__v8sf)x)[0];
+}
+
+void
+foo3 (float *p, __m512 x)
+{
+ *p = ((__v16sf)x)[0];
+}
+
+/* { dg-final { scan-assembler-not "vzeroupper" } } */
--
2.20.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
2019-01-07 17:40 [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY H.J. Lu
@ 2019-01-08 7:12 ` Uros Bizjak
2019-01-08 14:39 ` H.J. Lu
0 siblings, 1 reply; 7+ messages in thread
From: Uros Bizjak @ 2019-01-08 7:12 UTC (permalink / raw)
To: H.J. Lu; +Cc: gcc-patches
On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
>
> There is no need to generate vzeroupper if caller uses upper bits of
> AVX/AVX512 registers, We track caller's avx_u128_state and avoid
> vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
>
> Tested on i686 and x86-64 with and without --with-arch=native.
>
> OK for trunk?
In principle OK, but I think we don't have to cache the result of
ix86_avx_u128_mode_entry. Simply call the function from
ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
afford to re-call it one more time per function.
Uros.
> Thanks.
>
> H.J.
> ---
> gcc/
>
> PR target/88717
> * config/i386/i386.c (ix86_avx_u128_mode_entry): Set
> caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY.
> (ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if
> caller is AVX_U128_DIRTY.
> * config/i386/i386.h (machine_function): Add
> caller_avx_u128_dirty.
>
> gcc/testsuite/
>
> PR target/88717
> * gcc.target/i386/pr88717.c: New test.
> ---
> gcc/config/i386/i386.c | 10 +++++++++-
> gcc/config/i386/i386.h | 3 +++
> gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++
> 3 files changed, 36 insertions(+), 1 deletion(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index d01278d866f..9b49a2c1d9c 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void)
> rtx incoming = DECL_INCOMING_RTL (arg);
>
> if (incoming && ix86_check_avx_upper_register (incoming))
> - return AVX_U128_DIRTY;
> + {
> + /* Caller is AVX_U128_DIRTY. */
> + cfun->machine->caller_avx_u128_dirty = true;
> + return AVX_U128_DIRTY;
> + }
> }
>
> return AVX_U128_CLEAN;
> @@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity)
> static int
> ix86_avx_u128_mode_exit (void)
> {
> + /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY. */
> + if (cfun->machine->caller_avx_u128_dirty)
> + return AVX_U128_DIRTY;
> +
> rtx reg = crtl->return_rtx;
>
> /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 83b025e0cf5..c053b657a55 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2747,6 +2747,9 @@ struct GTY(()) machine_function {
> /* If true, ENDBR is queued at function entrance. */
> BOOL_BITFIELD endbr_queued_at_entrance : 1;
>
> + /* If true, caller is AVX_U128_DIRTY. */
> + BOOL_BITFIELD caller_avx_u128_dirty : 1;
> +
> /* The largest alignment, in bytes, of stack slot actually used. */
> unsigned int max_used_stack_alignment;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c
> new file mode 100644
> index 00000000000..01680998f1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88717.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f -mvzeroupper" } */
> +
> +#include <immintrin.h>
> +
> +__m128
> +foo1 (__m256 x)
> +{
> + return _mm256_castps256_ps128 (x);
> +}
> +
> +void
> +foo2 (float *p, __m256 x)
> +{
> + *p = ((__v8sf)x)[0];
> +}
> +
> +void
> +foo3 (float *p, __m512 x)
> +{
> + *p = ((__v16sf)x)[0];
> +}
> +
> +/* { dg-final { scan-assembler-not "vzeroupper" } } */
> --
> 2.20.1
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
2019-01-08 7:12 ` Uros Bizjak
@ 2019-01-08 14:39 ` H.J. Lu
2019-01-08 14:54 ` Uros Bizjak
0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-01-08 14:39 UTC (permalink / raw)
To: Uros Bizjak; +Cc: gcc-patches
On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
> >
> > There is no need to generate vzeroupper if caller uses upper bits of
> > AVX/AVX512 registers, We track caller's avx_u128_state and avoid
> > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
> >
> > Tested on i686 and x86-64 with and without --with-arch=native.
> >
> > OK for trunk?
>
> In principle OK, but I think we don't have to cache the result of
> ix86_avx_u128_mode_entry. Simply call the function from
> ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
> afford to re-call it one more time per function.
Do we really need ix86_avx_u128_mode_entry? We can just
set entry state to AVX_U128_CLEAN and set exit state to
AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes
AVX/AVX512 registers to callee.
Does this patch look OK?
Thanks.
H.J.
--
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d01278d866f..1ac89fd2eb5 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19087,25 +19087,6 @@ ix86_dirflag_mode_entry (void)
return X86_DIRFLAG_RESET;
}
-static int
-ix86_avx_u128_mode_entry (void)
-{
- tree arg;
-
- /* Entry mode is set to AVX_U128_DIRTY if there are
- 256bit or 512bit modes used in function arguments. */
- for (arg = DECL_ARGUMENTS (current_function_decl); arg;
- arg = TREE_CHAIN (arg))
- {
- rtx incoming = DECL_INCOMING_RTL (arg);
-
- if (incoming && ix86_check_avx_upper_register (incoming))
- return AVX_U128_DIRTY;
- }
-
- return AVX_U128_CLEAN;
-}
-
/* Return a mode that ENTITY is assumed to be
switched to at function entry. */
@@ -19117,7 +19098,7 @@ ix86_mode_entry (int entity)
case X86_DIRFLAG:
return ix86_dirflag_mode_entry ();
case AVX_U128:
- return ix86_avx_u128_mode_entry ();
+ return AVX_U128_CLEAN;
case I387_TRUNC:
case I387_FLOOR:
case I387_CEIL:
@@ -19130,13 +19111,24 @@ ix86_mode_entry (int entity)
static int
ix86_avx_u128_mode_exit (void)
{
+ /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
+ modes used in function arguments or function return.. */
rtx reg = crtl->return_rtx;
- /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
- or 512 bit modes used in the function return register. */
if (reg && ix86_check_avx_upper_register (reg))
return AVX_U128_DIRTY;
+ tree arg;
+
+ for (arg = DECL_ARGUMENTS (current_function_decl); arg;
+ arg = TREE_CHAIN (arg))
+ {
+ rtx incoming = DECL_INCOMING_RTL (arg);
+
+ if (incoming && ix86_check_avx_upper_register (incoming))
+ return AVX_U128_DIRTY;
+ }
+
return AVX_U128_CLEAN;
}
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
2019-01-08 14:39 ` H.J. Lu
@ 2019-01-08 14:54 ` Uros Bizjak
2019-01-08 16:17 ` V2 " H.J. Lu
0 siblings, 1 reply; 7+ messages in thread
From: Uros Bizjak @ 2019-01-08 14:54 UTC (permalink / raw)
To: H.J. Lu; +Cc: gcc-patches
On Tue, Jan 8, 2019 at 3:39 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
> > >
> > > There is no need to generate vzeroupper if caller uses upper bits of
> > > AVX/AVX512 registers, We track caller's avx_u128_state and avoid
> > > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
> > >
> > > Tested on i686 and x86-64 with and without --with-arch=native.
> > >
> > > OK for trunk?
> >
> > In principle OK, but I think we don't have to cache the result of
> > ix86_avx_u128_mode_entry. Simply call the function from
> > ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
> > afford to re-call it one more time per function.
>
> Do we really need ix86_avx_u128_mode_entry? We can just
> set entry state to AVX_U128_CLEAN and set exit state to
> AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes
> AVX/AVX512 registers to callee.
>
> Does this patch look OK?
No, the compiler is then free to move optimal insertion point at the
beginning of the function.
Uros.
> Thanks.
>
> H.J.
> --
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index d01278d866f..1ac89fd2eb5 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19087,25 +19087,6 @@ ix86_dirflag_mode_entry (void)
> return X86_DIRFLAG_RESET;
> }
>
> -static int
> -ix86_avx_u128_mode_entry (void)
> -{
> - tree arg;
> -
> - /* Entry mode is set to AVX_U128_DIRTY if there are
> - 256bit or 512bit modes used in function arguments. */
> - for (arg = DECL_ARGUMENTS (current_function_decl); arg;
> - arg = TREE_CHAIN (arg))
> - {
> - rtx incoming = DECL_INCOMING_RTL (arg);
> -
> - if (incoming && ix86_check_avx_upper_register (incoming))
> - return AVX_U128_DIRTY;
> - }
> -
> - return AVX_U128_CLEAN;
> -}
> -
> /* Return a mode that ENTITY is assumed to be
> switched to at function entry. */
>
> @@ -19117,7 +19098,7 @@ ix86_mode_entry (int entity)
> case X86_DIRFLAG:
> return ix86_dirflag_mode_entry ();
> case AVX_U128:
> - return ix86_avx_u128_mode_entry ();
> + return AVX_U128_CLEAN;
> case I387_TRUNC:
> case I387_FLOOR:
> case I387_CEIL:
> @@ -19130,13 +19111,24 @@ ix86_mode_entry (int entity)
> static int
> ix86_avx_u128_mode_exit (void)
> {
> + /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
> + modes used in function arguments or function return.. */
> rtx reg = crtl->return_rtx;
>
> - /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
> - or 512 bit modes used in the function return register. */
> if (reg && ix86_check_avx_upper_register (reg))
> return AVX_U128_DIRTY;
>
> + tree arg;
> +
> + for (arg = DECL_ARGUMENTS (current_function_decl); arg;
> + arg = TREE_CHAIN (arg))
> + {
> + rtx incoming = DECL_INCOMING_RTL (arg);
> +
> + if (incoming && ix86_check_avx_upper_register (incoming))
> + return AVX_U128_DIRTY;
> + }
> +
> return AVX_U128_CLEAN;
> }
^ permalink raw reply [flat|nested] 7+ messages in thread
* V2 [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
2019-01-08 14:54 ` Uros Bizjak
@ 2019-01-08 16:17 ` H.J. Lu
2019-01-08 17:30 ` Uros Bizjak
0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-01-08 16:17 UTC (permalink / raw)
To: Uros Bizjak, Hongtao Liu; +Cc: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 1333 bytes --]
On Tue, Jan 8, 2019 at 6:54 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Jan 8, 2019 at 3:39 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
> > > >
> > > > There is no need to generate vzeroupper if caller uses upper bits of
> > > > AVX/AVX512 registers, We track caller's avx_u128_state and avoid
> > > > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
> > > >
> > > > Tested on i686 and x86-64 with and without --with-arch=native.
> > > >
> > > > OK for trunk?
> > >
> > > In principle OK, but I think we don't have to cache the result of
> > > ix86_avx_u128_mode_entry. Simply call the function from
> > > ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
> > > afford to re-call it one more time per function.
> >
> > Do we really need ix86_avx_u128_mode_entry? We can just
> > set entry state to AVX_U128_CLEAN and set exit state to
> > AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes
> > AVX/AVX512 registers to callee.
> >
> > Does this patch look OK?
>
> No, the compiler is then free to move optimal insertion point at the
> beginning of the function.
>
Here is the updated patch. OK for trunk?
Thanks.
--
H.J.
[-- Attachment #2: 0001-x86-Don-t-generate-vzeroupper-if-caller-passes-AVX-A.patch --]
[-- Type: text/x-patch, Size: 1930 bytes --]
From 702ece14923f9922be5a6ed835a8efbe24e890ba Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 7 Jan 2019 06:56:44 -0800
Subject: [PATCH] x86: Don't generate vzeroupper if caller passes AVX/AVX512
registers
There is no need to generate vzeroupper if caller passes arguments in
AVX/AVX512 registers.
Tested on i686 and x86-64 with and without --with-arch=native.
gcc/
PR target/88717
* config/i386/i386.c (ix86_avx_u128_mode_exit): Call
ix86_avx_u128_mode_entry.
gcc/testsuite/
PR target/88717
* gcc.target/i386/pr88717.c: New test.
---
gcc/config/i386/i386.c | 4 +++-
gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++
2 files changed, 27 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d01278d866f..7d82a241143 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19137,7 +19137,9 @@ ix86_avx_u128_mode_exit (void)
if (reg && ix86_check_avx_upper_register (reg))
return AVX_U128_DIRTY;
- return AVX_U128_CLEAN;
+ /* Entry mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
+ modes used in function arguments. */
+ return ix86_avx_u128_mode_entry ();
}
/* Return a mode that ENTITY is assumed to be
diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c
new file mode 100644
index 00000000000..01680998f1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88717.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mvzeroupper" } */
+
+#include <immintrin.h>
+
+__m128
+foo1 (__m256 x)
+{
+ return _mm256_castps256_ps128 (x);
+}
+
+void
+foo2 (float *p, __m256 x)
+{
+ *p = ((__v8sf)x)[0];
+}
+
+void
+foo3 (float *p, __m512 x)
+{
+ *p = ((__v16sf)x)[0];
+}
+
+/* { dg-final { scan-assembler-not "vzeroupper" } } */
--
2.20.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: V2 [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
2019-01-08 16:17 ` V2 " H.J. Lu
@ 2019-01-08 17:30 ` Uros Bizjak
2019-01-08 17:37 ` H.J. Lu
0 siblings, 1 reply; 7+ messages in thread
From: Uros Bizjak @ 2019-01-08 17:30 UTC (permalink / raw)
To: H.J. Lu; +Cc: Hongtao Liu, gcc-patches
On Tue, Jan 8, 2019 at 5:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jan 8, 2019 at 6:54 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Tue, Jan 8, 2019 at 3:39 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
> > > > >
> > > > > There is no need to generate vzeroupper if caller uses upper bits of
> > > > > AVX/AVX512 registers, We track caller's avx_u128_state and avoid
> > > > > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
> > > > >
> > > > > Tested on i686 and x86-64 with and without --with-arch=native.
> > > > >
> > > > > OK for trunk?
> > > >
> > > > In principle OK, but I think we don't have to cache the result of
> > > > ix86_avx_u128_mode_entry. Simply call the function from
> > > > ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
> > > > afford to re-call it one more time per function.
> > >
> > > Do we really need ix86_avx_u128_mode_entry? We can just
> > > set entry state to AVX_U128_CLEAN and set exit state to
> > > AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes
> > > AVX/AVX512 registers to callee.
> > >
> > > Does this patch look OK?
> >
> > No, the compiler is then free to move optimal insertion point at the
> > beginning of the function.
> >
>
> Here is the updated patch. OK for trunk?
OK with the comment fix.
Thanks,
Uros.
- return AVX_U128_CLEAN;
+ /* Entry mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
s/Entry/Exit/
+ modes used in function arguments. */
... , otherwise return AVX_U128_CLEAN.
+ return ix86_avx_u128_mode_entry ();
}
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: V2 [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
2019-01-08 17:30 ` Uros Bizjak
@ 2019-01-08 17:37 ` H.J. Lu
0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2019-01-08 17:37 UTC (permalink / raw)
To: Uros Bizjak; +Cc: Hongtao Liu, gcc-patches
[-- Attachment #1: Type: text/plain, Size: 1952 bytes --]
On Tue, Jan 8, 2019 at 9:29 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Jan 8, 2019 at 5:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jan 8, 2019 at 6:54 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Tue, Jan 8, 2019 at 3:39 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
> > > > > >
> > > > > > There is no need to generate vzeroupper if caller uses upper bits of
> > > > > > AVX/AVX512 registers, We track caller's avx_u128_state and avoid
> > > > > > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
> > > > > >
> > > > > > Tested on i686 and x86-64 with and without --with-arch=native.
> > > > > >
> > > > > > OK for trunk?
> > > > >
> > > > > In principle OK, but I think we don't have to cache the result of
> > > > > ix86_avx_u128_mode_entry. Simply call the function from
> > > > > ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
> > > > > afford to re-call it one more time per function.
> > > >
> > > > Do we really need ix86_avx_u128_mode_entry? We can just
> > > > set entry state to AVX_U128_CLEAN and set exit state to
> > > > AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes
> > > > AVX/AVX512 registers to callee.
> > > >
> > > > Does this patch look OK?
> > >
> > > No, the compiler is then free to move optimal insertion point at the
> > > beginning of the function.
> > >
> >
> > Here is the updated patch. OK for trunk?
>
> OK with the comment fix.
>
> Thanks,
> Uros.
>
> - return AVX_U128_CLEAN;
> + /* Entry mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
>
> s/Entry/Exit/
>
> + modes used in function arguments. */
>
> ... , otherwise return AVX_U128_CLEAN.
>
> + return ix86_avx_u128_mode_entry ();
> }
This is what I am checking in.
Thanks.
--
H.J.
[-- Attachment #2: 0001-x86-Don-t-generate-vzeroupper-if-caller-passes-AVX-A.patch --]
[-- Type: text/x-patch, Size: 1967 bytes --]
From 315e6eadf7021748de375c59da9cf451351c9597 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 7 Jan 2019 06:56:44 -0800
Subject: [PATCH] x86: Don't generate vzeroupper if caller passes AVX/AVX512
registers
There is no need to generate vzeroupper if caller passes arguments in
AVX/AVX512 registers.
Tested on i686 and x86-64 with and without --with-arch=native.
gcc/
PR target/88717
* config/i386/i386.c (ix86_avx_u128_mode_exit): Call
ix86_avx_u128_mode_entry.
gcc/testsuite/
PR target/88717
* gcc.target/i386/pr88717.c: New test.
---
gcc/config/i386/i386.c | 5 ++++-
gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++
2 files changed, 28 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d01278d866f..bd48e080f46 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19137,7 +19137,10 @@ ix86_avx_u128_mode_exit (void)
if (reg && ix86_check_avx_upper_register (reg))
return AVX_U128_DIRTY;
- return AVX_U128_CLEAN;
+ /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
+ modes used in function arguments, otherwise return AVX_U128_CLEAN.
+ */
+ return ix86_avx_u128_mode_entry ();
}
/* Return a mode that ENTITY is assumed to be
diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c
new file mode 100644
index 00000000000..01680998f1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88717.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mvzeroupper" } */
+
+#include <immintrin.h>
+
+__m128
+foo1 (__m256 x)
+{
+ return _mm256_castps256_ps128 (x);
+}
+
+void
+foo2 (float *p, __m256 x)
+{
+ *p = ((__v8sf)x)[0];
+}
+
+void
+foo3 (float *p, __m512 x)
+{
+ *p = ((__v16sf)x)[0];
+}
+
+/* { dg-final { scan-assembler-not "vzeroupper" } } */
--
2.20.1
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2019-01-08 17:37 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-07 17:40 [PATCH] x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY H.J. Lu
2019-01-08 7:12 ` Uros Bizjak
2019-01-08 14:39 ` H.J. Lu
2019-01-08 14:54 ` Uros Bizjak
2019-01-08 16:17 ` V2 " H.J. Lu
2019-01-08 17:30 ` Uros Bizjak
2019-01-08 17:37 ` H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).