public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] [PR91400] build only one __cpu_model variable
@ 2021-05-03  8:39 Ivan Sorokin
  2021-05-03  9:08 ` Ivan Sorokin
  2021-05-05  7:36 ` Richard Biener
  0 siblings, 2 replies; 5+ messages in thread
From: Ivan Sorokin @ 2021-05-03  8:39 UTC (permalink / raw)
  To: gcc-patches

Prior to this commit GCC -O2 generated quite bad code for this
function:

bool f()
{
    return __builtin_cpu_supports("popcnt")
        && __builtin_cpu_supports("ssse3");
}

f:
        movl    __cpu_model+12(%rip), %eax
        xorl    %r8d, %r8d
        testb   $4, %al
        je      .L1
        shrl    $6, %eax
        movl    %eax, %r8d
        andl    $1, %r8d
.L1:
        movl    %r8d, %eax
        ret

The problem was caused by the fact that internally every invocation
of __builtin_cpu_supports built a new variable __cpu_model and a new
type __processor_model. Because of this GIMPLE level optimizers
weren't able to CSE the loads of __cpu_model and optimize
bit-operations properly.

This commit fixes the problem by caching created __cpu_model
variable and __processor_model type. Now the GCC -O2 generates:

f:
        movl    __cpu_model+12(%rip), %eax
        andl    $68, %eax
        cmpl    $68, %eax
        sete    %al
        ret

gcc/ChangeLog:

	PR target/91400
	* config/i386/i386-builtins.c (fold_builtin_cpu): Extract
	building of __cpu_model and __processor_model into new
	function.
	* config/i386/i386-builtins.c (init_cpu_model_var): New.
	Cache creation of __cpu_model and __processor_model.

gcc/testsuite/Changelog:

	PR target/91400
	* gcc.target/i386/pr91400.c: New.
---
 gcc/config/i386/i386-builtins.c         | 27 ++++++++++++++++++-------
 gcc/testsuite/gcc.target/i386/pr91400.c | 11 ++++++++++
 2 files changed, 31 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91400.c

diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
index b66911082ab..b7d9dd18b03 100644
--- a/gcc/config/i386/i386-builtins.c
+++ b/gcc/config/i386/i386-builtins.c
@@ -2103,6 +2103,25 @@ make_var_decl (tree type, const char *name)
   return new_decl;
 }
 
+static GTY(()) tree __cpu_model_var;
+static GTY(()) tree __processor_model_type;
+
+static void
+init_cpu_model_var()
+{
+  if (__cpu_model_var != NULL_TREE)
+    {
+      gcc_assert (__processor_model_type != NULL_TREE);
+      return;
+    }
+
+  __processor_model_type = build_processor_model_struct ();
+  __cpu_model_var = make_var_decl (__processor_model_type,
+				   "__cpu_model");
+
+  varpool_node::add (__cpu_model_var);
+}
+
 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
    into an integer defined in libgcc/config/i386/cpuinfo.c */
 
@@ -2114,13 +2133,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
     = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
   tree param_string_cst = NULL;
 
-  tree __processor_model_type = build_processor_model_struct ();
-  tree __cpu_model_var = make_var_decl (__processor_model_type,
-					"__cpu_model");
-
-
-  varpool_node::add (__cpu_model_var);
-
+  init_cpu_model_var ();
   gcc_assert ((args != NULL) && (*args != NULL));
 
   param_string_cst = *args;
diff --git a/gcc/testsuite/gcc.target/i386/pr91400.c b/gcc/testsuite/gcc.target/i386/pr91400.c
new file mode 100644
index 00000000000..e8b7d9285f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91400.c
@@ -0,0 +1,11 @@
+/* PR target/91400 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "andl" 1 } } */
+/* { dg-final { scan-assembler-times "68" 2 } } */
+/* { dg-final { scan-assembler-not "je" } } */
+
+_Bool f()
+{
+    return __builtin_cpu_supports("popcnt") && __builtin_cpu_supports("ssse3");
+}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] [PR91400] build only one __cpu_model variable
  2021-05-03  8:39 [PATCH] [PR91400] build only one __cpu_model variable Ivan Sorokin
@ 2021-05-03  9:08 ` Ivan Sorokin
  2021-05-05  7:36 ` Richard Biener
  1 sibling, 0 replies; 5+ messages in thread
From: Ivan Sorokin @ 2021-05-03  9:08 UTC (permalink / raw)
  To: gcc-patches

As GCC 11 is released and the development of GCC 12 begins I would like
to send this patch again.

It is basically unchanged since the previous submission in February. The 
only change I made is fixing formatting by adding a space after gcc_assert.

Please note, that the original author of the code no longer actively 
contribute to GCC, so he is unlikely to review this patch. It would be 
great if someone else takes a look at it.

On 03.05.2021 11:39, Ivan Sorokin wrote:
> Prior to this commit GCC -O2 generated quite bad code for this
> function:
> 
> bool f()
> {
>      return __builtin_cpu_supports("popcnt")
>          && __builtin_cpu_supports("ssse3");
> }
> 
> f:
>          movl    __cpu_model+12(%rip), %eax
>          xorl    %r8d, %r8d
>          testb   $4, %al
>          je      .L1
>          shrl    $6, %eax
>          movl    %eax, %r8d
>          andl    $1, %r8d
> .L1:
>          movl    %r8d, %eax
>          ret
> 
> The problem was caused by the fact that internally every invocation
> of __builtin_cpu_supports built a new variable __cpu_model and a new
> type __processor_model. Because of this GIMPLE level optimizers
> weren't able to CSE the loads of __cpu_model and optimize
> bit-operations properly.
> 
> This commit fixes the problem by caching created __cpu_model
> variable and __processor_model type. Now the GCC -O2 generates:
> 
> f:
>          movl    __cpu_model+12(%rip), %eax
>          andl    $68, %eax
>          cmpl    $68, %eax
>          sete    %al
>          ret
> 
> gcc/ChangeLog:
> 
> 	PR target/91400
> 	* config/i386/i386-builtins.c (fold_builtin_cpu): Extract
> 	building of __cpu_model and __processor_model into new
> 	function.
> 	* config/i386/i386-builtins.c (init_cpu_model_var): New.
> 	Cache creation of __cpu_model and __processor_model.
> 
> gcc/testsuite/Changelog:
> 
> 	PR target/91400
> 	* gcc.target/i386/pr91400.c: New.
> ---
>   gcc/config/i386/i386-builtins.c         | 27 ++++++++++++++++++-------
>   gcc/testsuite/gcc.target/i386/pr91400.c | 11 ++++++++++
>   2 files changed, 31 insertions(+), 7 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/i386/pr91400.c
> 
> diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
> index b66911082ab..b7d9dd18b03 100644
> --- a/gcc/config/i386/i386-builtins.c
> +++ b/gcc/config/i386/i386-builtins.c
> @@ -2103,6 +2103,25 @@ make_var_decl (tree type, const char *name)
>     return new_decl;
>   }
>   
> +static GTY(()) tree __cpu_model_var;
> +static GTY(()) tree __processor_model_type;
> +
> +static void
> +init_cpu_model_var()
> +{
> +  if (__cpu_model_var != NULL_TREE)
> +    {
> +      gcc_assert (__processor_model_type != NULL_TREE);
> +      return;
> +    }
> +
> +  __processor_model_type = build_processor_model_struct ();
> +  __cpu_model_var = make_var_decl (__processor_model_type,
> +				   "__cpu_model");
> +
> +  varpool_node::add (__cpu_model_var);
> +}
> +
>   /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
>      into an integer defined in libgcc/config/i386/cpuinfo.c */
>   
> @@ -2114,13 +2133,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
>       = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
>     tree param_string_cst = NULL;
>   
> -  tree __processor_model_type = build_processor_model_struct ();
> -  tree __cpu_model_var = make_var_decl (__processor_model_type,
> -					"__cpu_model");
> -
> -
> -  varpool_node::add (__cpu_model_var);
> -
> +  init_cpu_model_var ();
>     gcc_assert ((args != NULL) && (*args != NULL));
>   
>     param_string_cst = *args;
> diff --git a/gcc/testsuite/gcc.target/i386/pr91400.c b/gcc/testsuite/gcc.target/i386/pr91400.c
> new file mode 100644
> index 00000000000..e8b7d9285f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91400.c
> @@ -0,0 +1,11 @@
> +/* PR target/91400 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "andl" 1 } } */
> +/* { dg-final { scan-assembler-times "68" 2 } } */
> +/* { dg-final { scan-assembler-not "je" } } */
> +
> +_Bool f()
> +{
> +    return __builtin_cpu_supports("popcnt") && __builtin_cpu_supports("ssse3");
> +}
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] [PR91400] build only one __cpu_model variable
  2021-05-03  8:39 [PATCH] [PR91400] build only one __cpu_model variable Ivan Sorokin
  2021-05-03  9:08 ` Ivan Sorokin
@ 2021-05-05  7:36 ` Richard Biener
  2021-05-05 14:50   ` [PATCH v2] x86: Build only one __cpu_model/__cpu_features2 variables H.J. Lu
  1 sibling, 1 reply; 5+ messages in thread
From: Richard Biener @ 2021-05-05  7:36 UTC (permalink / raw)
  To: Ivan Sorokin, Uros Bizjak, H. J. Lu; +Cc: GCC Patches

On Mon, May 3, 2021 at 11:31 AM Ivan Sorokin via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Prior to this commit GCC -O2 generated quite bad code for this
> function:
>
> bool f()
> {
>     return __builtin_cpu_supports("popcnt")
>         && __builtin_cpu_supports("ssse3");
> }
>
> f:
>         movl    __cpu_model+12(%rip), %eax
>         xorl    %r8d, %r8d
>         testb   $4, %al
>         je      .L1
>         shrl    $6, %eax
>         movl    %eax, %r8d
>         andl    $1, %r8d
> .L1:
>         movl    %r8d, %eax
>         ret
>
> The problem was caused by the fact that internally every invocation
> of __builtin_cpu_supports built a new variable __cpu_model and a new
> type __processor_model. Because of this GIMPLE level optimizers
> weren't able to CSE the loads of __cpu_model and optimize
> bit-operations properly.
>
> This commit fixes the problem by caching created __cpu_model
> variable and __processor_model type. Now the GCC -O2 generates:
>
> f:
>         movl    __cpu_model+12(%rip), %eax
>         andl    $68, %eax
>         cmpl    $68, %eax
>         sete    %al
>         ret

The patch looks good, the function could need a comment
and the global variables better names, not starting with __

Up to the x86 maintainers - HJ, can you pick up this work?

Thanks,
Richard.

> gcc/ChangeLog:
>
>         PR target/91400
>         * config/i386/i386-builtins.c (fold_builtin_cpu): Extract
>         building of __cpu_model and __processor_model into new
>         function.
>         * config/i386/i386-builtins.c (init_cpu_model_var): New.
>         Cache creation of __cpu_model and __processor_model.
>
> gcc/testsuite/Changelog:
>
>         PR target/91400
>         * gcc.target/i386/pr91400.c: New.
> ---
>  gcc/config/i386/i386-builtins.c         | 27 ++++++++++++++++++-------
>  gcc/testsuite/gcc.target/i386/pr91400.c | 11 ++++++++++
>  2 files changed, 31 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91400.c
>
> diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
> index b66911082ab..b7d9dd18b03 100644
> --- a/gcc/config/i386/i386-builtins.c
> +++ b/gcc/config/i386/i386-builtins.c
> @@ -2103,6 +2103,25 @@ make_var_decl (tree type, const char *name)
>    return new_decl;
>  }
>
> +static GTY(()) tree __cpu_model_var;
> +static GTY(()) tree __processor_model_type;
> +
> +static void
> +init_cpu_model_var()
> +{
> +  if (__cpu_model_var != NULL_TREE)
> +    {
> +      gcc_assert (__processor_model_type != NULL_TREE);
> +      return;
> +    }
> +
> +  __processor_model_type = build_processor_model_struct ();
> +  __cpu_model_var = make_var_decl (__processor_model_type,
> +                                  "__cpu_model");
> +
> +  varpool_node::add (__cpu_model_var);
> +}
> +
>  /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
>     into an integer defined in libgcc/config/i386/cpuinfo.c */
>
> @@ -2114,13 +2133,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
>      = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
>    tree param_string_cst = NULL;
>
> -  tree __processor_model_type = build_processor_model_struct ();
> -  tree __cpu_model_var = make_var_decl (__processor_model_type,
> -                                       "__cpu_model");
> -
> -
> -  varpool_node::add (__cpu_model_var);
> -
> +  init_cpu_model_var ();
>    gcc_assert ((args != NULL) && (*args != NULL));
>
>    param_string_cst = *args;
> diff --git a/gcc/testsuite/gcc.target/i386/pr91400.c b/gcc/testsuite/gcc.target/i386/pr91400.c
> new file mode 100644
> index 00000000000..e8b7d9285f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91400.c
> @@ -0,0 +1,11 @@
> +/* PR target/91400 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "andl" 1 } } */
> +/* { dg-final { scan-assembler-times "68" 2 } } */
> +/* { dg-final { scan-assembler-not "je" } } */
> +
> +_Bool f()
> +{
> +    return __builtin_cpu_supports("popcnt") && __builtin_cpu_supports("ssse3");
> +}
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2] x86: Build only one __cpu_model/__cpu_features2 variables
  2021-05-05  7:36 ` Richard Biener
@ 2021-05-05 14:50   ` H.J. Lu
  2021-05-05 17:59     ` Uros Bizjak
  0 siblings, 1 reply; 5+ messages in thread
From: H.J. Lu @ 2021-05-05 14:50 UTC (permalink / raw)
  To: Richard Biener; +Cc: Ivan Sorokin, Uros Bizjak, GCC Patches

On Wed, May 05, 2021 at 09:36:16AM +0200, Richard Biener wrote:
> On Mon, May 3, 2021 at 11:31 AM Ivan Sorokin via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Prior to this commit GCC -O2 generated quite bad code for this
> > function:
> >
> > bool f()
> > {
> >     return __builtin_cpu_supports("popcnt")
> >         && __builtin_cpu_supports("ssse3");
> > }
> >
> > f:
> >         movl    __cpu_model+12(%rip), %eax
> >         xorl    %r8d, %r8d
> >         testb   $4, %al
> >         je      .L1
> >         shrl    $6, %eax
> >         movl    %eax, %r8d
> >         andl    $1, %r8d
> > .L1:
> >         movl    %r8d, %eax
> >         ret
> >
> > The problem was caused by the fact that internally every invocation
> > of __builtin_cpu_supports built a new variable __cpu_model and a new
> > type __processor_model. Because of this GIMPLE level optimizers
> > weren't able to CSE the loads of __cpu_model and optimize
> > bit-operations properly.
> >
> > This commit fixes the problem by caching created __cpu_model
> > variable and __processor_model type. Now the GCC -O2 generates:
> >
> > f:
> >         movl    __cpu_model+12(%rip), %eax
> >         andl    $68, %eax
> >         cmpl    $68, %eax
> >         sete    %al
> >         ret
> 
> The patch looks good, the function could need a comment
> and the global variables better names, not starting with __
> 
> Up to the x86 maintainers - HJ, can you pick up this work?
> 

Here is the updated patch to also handle and__cpu_features2.
OK for master?

Thanks.

H.J.
---
GCC -O2 generated quite bad code for this function:

bool
f (void)
{
  return __builtin_cpu_supports("popcnt")
	 && __builtin_cpu_supports("ssse3");
}

f:
	movl	__cpu_model+12(%rip), %edx
	movl	%edx, %eax
	shrl	$6, %eax
	andl	$1, %eax
	andl	$4, %edx
	movl	$0, %edx
	cmove	%edx, %eax
	ret

The problem was caused by the fact that internally every invocation of
__builtin_cpu_supports built a new variable __cpu_model and a new type
__processor_model.  Because of this, GIMPLE level optimizers weren't able
to CSE the loads of __cpu_model and optimize bit-operations properly.

Improve GCC -O2 code generation by caching __cpu_model and__cpu_features2
variables as well as their types:

f:
        movl    __cpu_model+12(%rip), %eax
        andl    $68, %eax
        cmpl    $68, %eax
        sete    %al
        ret

gcc/ChangeLog:

2021-05-05  Ivan Sorokin <vanyacpp@gmail.com>
	    H.J. Lu <hjl.tools@gmail.com>

	PR target/91400
	* config/i386/i386-builtins.c (ix86_cpu_model_type_node): New.
	(ix86_cpu_model_var): Likewise.
	(ix86_cpu_features2_type_node): Likewise.
	(ix86_cpu_features2_var): Likewise.
	(fold_builtin_cpu): Cache __cpu_model and __cpu_features2 with
	their types.

gcc/testsuite/Changelog:

2021-05-05  Ivan Sorokin <vanyacpp@gmail.com>
	    H.J. Lu <hjl.tools@gmail.com>

	PR target/91400
	* gcc.target/i386/pr91400-1.c: New test.
	* gcc.target/i386/pr91400-2.c: Likewise.
---
 gcc/config/i386/i386-builtins.c           | 52 +++++++++++++++--------
 gcc/testsuite/gcc.target/i386/pr91400-1.c | 14 ++++++
 gcc/testsuite/gcc.target/i386/pr91400-2.c | 14 ++++++
 3 files changed, 63 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91400-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr91400-2.c

diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
index b66911082ab..8036aedebac 100644
--- a/gcc/config/i386/i386-builtins.c
+++ b/gcc/config/i386/i386-builtins.c
@@ -2103,6 +2103,11 @@ make_var_decl (tree type, const char *name)
   return new_decl;
 }
 
+static GTY(()) tree ix86_cpu_model_type_node;
+static GTY(()) tree ix86_cpu_model_var;
+static GTY(()) tree ix86_cpu_features2_type_node;
+static GTY(()) tree ix86_cpu_features2_var;
+
 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
    into an integer defined in libgcc/config/i386/cpuinfo.c */
 
@@ -2114,12 +2119,16 @@ fold_builtin_cpu (tree fndecl, tree *args)
     = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
   tree param_string_cst = NULL;
 
-  tree __processor_model_type = build_processor_model_struct ();
-  tree __cpu_model_var = make_var_decl (__processor_model_type,
-					"__cpu_model");
-
-
-  varpool_node::add (__cpu_model_var);
+  if (ix86_cpu_model_var == nullptr)
+    {
+      /* Build a single __cpu_model variable for all references to
+	 __cpu_model so that GIMPLE level optimizers can CSE the loads
+	 of __cpu_model and optimize bit-operations properly.  */
+      ix86_cpu_model_type_node = build_processor_model_struct ();
+      ix86_cpu_model_var = make_var_decl (ix86_cpu_model_type_node,
+					  "__cpu_model");
+      varpool_node::add (ix86_cpu_model_var);
+    }
 
   gcc_assert ((args != NULL) && (*args != NULL));
 
@@ -2160,7 +2169,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
 	  return integer_zero_node;
 	}
 
-      field = TYPE_FIELDS (__processor_model_type);
+      field = TYPE_FIELDS (ix86_cpu_model_type_node);
       field_val = processor_alias_table[i].model;
 
       /* CPU types are stored in the next field.  */
@@ -2179,7 +2188,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
 	}
 
       /* Get the appropriate field in __cpu_model.  */
-      ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
+      ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var,
 		    field, NULL_TREE);
 
       /* Check the value.  */
@@ -2212,13 +2221,22 @@ fold_builtin_cpu (tree fndecl, tree *args)
 
       if (isa_names_table[i].feature >= 32)
 	{
-	  tree index_type
-	    = build_index_type (size_int (SIZE_OF_CPU_FEATURES));
-	  tree type = build_array_type (unsigned_type_node, index_type);
-	  tree __cpu_features2_var = make_var_decl (type,
-						    "__cpu_features2");
+	  if (ix86_cpu_features2_var == nullptr)
+	    {
+	      /* Build a single __cpu_features2 variable for all
+		 references to __cpu_features2 so that GIMPLE level
+		 optimizers can CSE the loads of __cpu_features2 and
+		 optimize bit-operations properly.  */
+	      tree index_type
+		= build_index_type (size_int (SIZE_OF_CPU_FEATURES));
+	      ix86_cpu_features2_type_node
+		= build_array_type (unsigned_type_node, index_type);
+	      ix86_cpu_features2_var
+		= make_var_decl (ix86_cpu_features2_type_node,
+				 "__cpu_features2");
+	      varpool_node::add (ix86_cpu_features2_var);
+	    }
 
-	  varpool_node::add (__cpu_features2_var);
 	  for (unsigned int j = 0; j < SIZE_OF_CPU_FEATURES; j++)
 	    if (isa_names_table[i].feature < (32 + 32 + j * 32))
 	      {
@@ -2226,7 +2244,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
 				    - (32 + j * 32)));
 		tree index = size_int (j);
 		array_elt = build4 (ARRAY_REF, unsigned_type_node,
-				    __cpu_features2_var,
+				    ix86_cpu_features2_var,
 				    index, NULL_TREE, NULL_TREE);
 		/* Return __cpu_features2[index] & field_val  */
 		final = build2 (BIT_AND_EXPR, unsigned_type_node,
@@ -2237,13 +2255,13 @@ fold_builtin_cpu (tree fndecl, tree *args)
 	      }
 	}
 
-      field = TYPE_FIELDS (__processor_model_type);
+      field = TYPE_FIELDS (ix86_cpu_model_type_node);
       /* Get the last field, which is __cpu_features.  */
       while (DECL_CHAIN (field))
         field = DECL_CHAIN (field);
 
       /* Get the appropriate field: __cpu_model.__cpu_features  */
-      ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
+      ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var,
 		    field, NULL_TREE);
 
       /* Access the 0th element of __cpu_features array.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr91400-1.c b/gcc/testsuite/gcc.target/i386/pr91400-1.c
new file mode 100644
index 00000000000..6124058dc1e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91400-1.c
@@ -0,0 +1,14 @@
+/* PR target/91400 */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "andl" 1 } } */
+/* { dg-final { scan-assembler-times "cmpl" 1 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-not "cmove" } } */
+
+_Bool
+f (void)
+{
+  return __builtin_cpu_supports("popcnt")
+	 && __builtin_cpu_supports("ssse3");
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr91400-2.c b/gcc/testsuite/gcc.target/i386/pr91400-2.c
new file mode 100644
index 00000000000..1af5a2f41cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91400-2.c
@@ -0,0 +1,14 @@
+/* PR target/91400 */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "andl" 1 } } */
+/* { dg-final { scan-assembler-times "cmpl" 1 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-not "cmove" } } */
+
+_Bool
+f (void)
+{
+  return __builtin_cpu_supports("avx512vnni")
+	 && __builtin_cpu_supports("3dnow");
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] x86: Build only one __cpu_model/__cpu_features2 variables
  2021-05-05 14:50   ` [PATCH v2] x86: Build only one __cpu_model/__cpu_features2 variables H.J. Lu
@ 2021-05-05 17:59     ` Uros Bizjak
  0 siblings, 0 replies; 5+ messages in thread
From: Uros Bizjak @ 2021-05-05 17:59 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Richard Biener, Ivan Sorokin, GCC Patches

On Wed, May 5, 2021 at 4:50 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, May 05, 2021 at 09:36:16AM +0200, Richard Biener wrote:
> > On Mon, May 3, 2021 at 11:31 AM Ivan Sorokin via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Prior to this commit GCC -O2 generated quite bad code for this
> > > function:
> > >
> > > bool f()
> > > {
> > >     return __builtin_cpu_supports("popcnt")
> > >         && __builtin_cpu_supports("ssse3");
> > > }
> > >
> > > f:
> > >         movl    __cpu_model+12(%rip), %eax
> > >         xorl    %r8d, %r8d
> > >         testb   $4, %al
> > >         je      .L1
> > >         shrl    $6, %eax
> > >         movl    %eax, %r8d
> > >         andl    $1, %r8d
> > > .L1:
> > >         movl    %r8d, %eax
> > >         ret
> > >
> > > The problem was caused by the fact that internally every invocation
> > > of __builtin_cpu_supports built a new variable __cpu_model and a new
> > > type __processor_model. Because of this GIMPLE level optimizers
> > > weren't able to CSE the loads of __cpu_model and optimize
> > > bit-operations properly.
> > >
> > > This commit fixes the problem by caching created __cpu_model
> > > variable and __processor_model type. Now the GCC -O2 generates:
> > >
> > > f:
> > >         movl    __cpu_model+12(%rip), %eax
> > >         andl    $68, %eax
> > >         cmpl    $68, %eax
> > >         sete    %al
> > >         ret
> >
> > The patch looks good, the function could need a comment
> > and the global variables better names, not starting with __
> >
> > Up to the x86 maintainers - HJ, can you pick up this work?
> >
>
> Here is the updated patch to also handle and__cpu_features2.
> OK for master?

LGTM (Richi effectively approved tree parts).

Thanks,
Uros.

>
> Thanks.
>
> H.J.
> ---
> GCC -O2 generated quite bad code for this function:
>
> bool
> f (void)
> {
>   return __builtin_cpu_supports("popcnt")
>          && __builtin_cpu_supports("ssse3");
> }
>
> f:
>         movl    __cpu_model+12(%rip), %edx
>         movl    %edx, %eax
>         shrl    $6, %eax
>         andl    $1, %eax
>         andl    $4, %edx
>         movl    $0, %edx
>         cmove   %edx, %eax
>         ret
>
> The problem was caused by the fact that internally every invocation of
> __builtin_cpu_supports built a new variable __cpu_model and a new type
> __processor_model.  Because of this, GIMPLE level optimizers weren't able
> to CSE the loads of __cpu_model and optimize bit-operations properly.
>
> Improve GCC -O2 code generation by caching __cpu_model and__cpu_features2
> variables as well as their types:
>
> f:
>         movl    __cpu_model+12(%rip), %eax
>         andl    $68, %eax
>         cmpl    $68, %eax
>         sete    %al
>         ret
>
> gcc/ChangeLog:
>
> 2021-05-05  Ivan Sorokin <vanyacpp@gmail.com>
>             H.J. Lu <hjl.tools@gmail.com>
>
>         PR target/91400
>         * config/i386/i386-builtins.c (ix86_cpu_model_type_node): New.
>         (ix86_cpu_model_var): Likewise.
>         (ix86_cpu_features2_type_node): Likewise.
>         (ix86_cpu_features2_var): Likewise.
>         (fold_builtin_cpu): Cache __cpu_model and __cpu_features2 with
>         their types.
>
> gcc/testsuite/Changelog:
>
> 2021-05-05  Ivan Sorokin <vanyacpp@gmail.com>
>             H.J. Lu <hjl.tools@gmail.com>
>
>         PR target/91400
>         * gcc.target/i386/pr91400-1.c: New test.
>         * gcc.target/i386/pr91400-2.c: Likewise.
> ---
>  gcc/config/i386/i386-builtins.c           | 52 +++++++++++++++--------
>  gcc/testsuite/gcc.target/i386/pr91400-1.c | 14 ++++++
>  gcc/testsuite/gcc.target/i386/pr91400-2.c | 14 ++++++
>  3 files changed, 63 insertions(+), 17 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91400-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr91400-2.c
>
> diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
> index b66911082ab..8036aedebac 100644
> --- a/gcc/config/i386/i386-builtins.c
> +++ b/gcc/config/i386/i386-builtins.c
> @@ -2103,6 +2103,11 @@ make_var_decl (tree type, const char *name)
>    return new_decl;
>  }
>
> +static GTY(()) tree ix86_cpu_model_type_node;
> +static GTY(()) tree ix86_cpu_model_var;
> +static GTY(()) tree ix86_cpu_features2_type_node;
> +static GTY(()) tree ix86_cpu_features2_var;
> +
>  /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
>     into an integer defined in libgcc/config/i386/cpuinfo.c */
>
> @@ -2114,12 +2119,16 @@ fold_builtin_cpu (tree fndecl, tree *args)
>      = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
>    tree param_string_cst = NULL;
>
> -  tree __processor_model_type = build_processor_model_struct ();
> -  tree __cpu_model_var = make_var_decl (__processor_model_type,
> -                                       "__cpu_model");
> -
> -
> -  varpool_node::add (__cpu_model_var);
> +  if (ix86_cpu_model_var == nullptr)
> +    {
> +      /* Build a single __cpu_model variable for all references to
> +        __cpu_model so that GIMPLE level optimizers can CSE the loads
> +        of __cpu_model and optimize bit-operations properly.  */
> +      ix86_cpu_model_type_node = build_processor_model_struct ();
> +      ix86_cpu_model_var = make_var_decl (ix86_cpu_model_type_node,
> +                                         "__cpu_model");
> +      varpool_node::add (ix86_cpu_model_var);
> +    }
>
>    gcc_assert ((args != NULL) && (*args != NULL));
>
> @@ -2160,7 +2169,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
>           return integer_zero_node;
>         }
>
> -      field = TYPE_FIELDS (__processor_model_type);
> +      field = TYPE_FIELDS (ix86_cpu_model_type_node);
>        field_val = processor_alias_table[i].model;
>
>        /* CPU types are stored in the next field.  */
> @@ -2179,7 +2188,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
>         }
>
>        /* Get the appropriate field in __cpu_model.  */
> -      ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
> +      ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var,
>                     field, NULL_TREE);
>
>        /* Check the value.  */
> @@ -2212,13 +2221,22 @@ fold_builtin_cpu (tree fndecl, tree *args)
>
>        if (isa_names_table[i].feature >= 32)
>         {
> -         tree index_type
> -           = build_index_type (size_int (SIZE_OF_CPU_FEATURES));
> -         tree type = build_array_type (unsigned_type_node, index_type);
> -         tree __cpu_features2_var = make_var_decl (type,
> -                                                   "__cpu_features2");
> +         if (ix86_cpu_features2_var == nullptr)
> +           {
> +             /* Build a single __cpu_features2 variable for all
> +                references to __cpu_features2 so that GIMPLE level
> +                optimizers can CSE the loads of __cpu_features2 and
> +                optimize bit-operations properly.  */
> +             tree index_type
> +               = build_index_type (size_int (SIZE_OF_CPU_FEATURES));
> +             ix86_cpu_features2_type_node
> +               = build_array_type (unsigned_type_node, index_type);
> +             ix86_cpu_features2_var
> +               = make_var_decl (ix86_cpu_features2_type_node,
> +                                "__cpu_features2");
> +             varpool_node::add (ix86_cpu_features2_var);
> +           }
>
> -         varpool_node::add (__cpu_features2_var);
>           for (unsigned int j = 0; j < SIZE_OF_CPU_FEATURES; j++)
>             if (isa_names_table[i].feature < (32 + 32 + j * 32))
>               {
> @@ -2226,7 +2244,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
>                                     - (32 + j * 32)));
>                 tree index = size_int (j);
>                 array_elt = build4 (ARRAY_REF, unsigned_type_node,
> -                                   __cpu_features2_var,
> +                                   ix86_cpu_features2_var,
>                                     index, NULL_TREE, NULL_TREE);
>                 /* Return __cpu_features2[index] & field_val  */
>                 final = build2 (BIT_AND_EXPR, unsigned_type_node,
> @@ -2237,13 +2255,13 @@ fold_builtin_cpu (tree fndecl, tree *args)
>               }
>         }
>
> -      field = TYPE_FIELDS (__processor_model_type);
> +      field = TYPE_FIELDS (ix86_cpu_model_type_node);
>        /* Get the last field, which is __cpu_features.  */
>        while (DECL_CHAIN (field))
>          field = DECL_CHAIN (field);
>
>        /* Get the appropriate field: __cpu_model.__cpu_features  */
> -      ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
> +      ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var,
>                     field, NULL_TREE);
>
>        /* Access the 0th element of __cpu_features array.  */
> diff --git a/gcc/testsuite/gcc.target/i386/pr91400-1.c b/gcc/testsuite/gcc.target/i386/pr91400-1.c
> new file mode 100644
> index 00000000000..6124058dc1e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91400-1.c
> @@ -0,0 +1,14 @@
> +/* PR target/91400 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "andl" 1 } } */
> +/* { dg-final { scan-assembler-times "cmpl" 1 } } */
> +/* { dg-final { scan-assembler-times "sete" 1 } } */
> +/* { dg-final { scan-assembler-not "cmove" } } */
> +
> +_Bool
> +f (void)
> +{
> +  return __builtin_cpu_supports("popcnt")
> +        && __builtin_cpu_supports("ssse3");
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr91400-2.c b/gcc/testsuite/gcc.target/i386/pr91400-2.c
> new file mode 100644
> index 00000000000..1af5a2f41cd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91400-2.c
> @@ -0,0 +1,14 @@
> +/* PR target/91400 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "andl" 1 } } */
> +/* { dg-final { scan-assembler-times "cmpl" 1 } } */
> +/* { dg-final { scan-assembler-times "sete" 1 } } */
> +/* { dg-final { scan-assembler-not "cmove" } } */
> +
> +_Bool
> +f (void)
> +{
> +  return __builtin_cpu_supports("avx512vnni")
> +        && __builtin_cpu_supports("3dnow");
> +}
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-05-05 17:59 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-03  8:39 [PATCH] [PR91400] build only one __cpu_model variable Ivan Sorokin
2021-05-03  9:08 ` Ivan Sorokin
2021-05-05  7:36 ` Richard Biener
2021-05-05 14:50   ` [PATCH v2] x86: Build only one __cpu_model/__cpu_features2 variables H.J. Lu
2021-05-05 17:59     ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).