public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc/devel/omp/gcc-11] openmp: Optimize for OpenMP atomics 2x__builtin_clear_padding+__builtin_memcmp if possible
@ 2021-10-06 11:01 Tobias Burnus
  0 siblings, 0 replies; only message in thread
From: Tobias Burnus @ 2021-10-06 11:01 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:015380702773cf8f19708e4eda2709e345c0c35d

commit 015380702773cf8f19708e4eda2709e345c0c35d
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Wed Oct 6 12:08:20 2021 +0200

    openmp: Optimize for OpenMP atomics 2x__builtin_clear_padding+__builtin_memcmp if possible
    
    For the few long double types that do have padding bits, e.g. on x86
    the clear_type_padding_in_mask computed mask is
    ff ff ff ff ff ff ff ff ff ff 00 00 for 32-bit and
    ff ff ff ff ff ff ff ff ff ff 00 00 00 00 00 00 for 64-bit.
    Instead of doing __builtin_clear_padding on both operands that will clear the
    last 2 or 6 bytes and then memcmp on the whole 12/16 bytes, we can just
    memcmp 10 bytes.  The code also handles padding at the start, or at both
    the start and the end, but only when everything falls on byte boundaries
    and the non-padding bits are contiguous.
    This works around a tree-ssa-dse.c bug (but we need to fix that bug anyway,
    because libstdc++ won't do this and, since it has to deal with arbitrary
    types, it can't even do that in general).
    
    2021-10-06  Jakub Jelinek  <jakub@redhat.com>
    
            PR tree-optimization/102571
            * c-omp.c (c_finish_omp_atomic): Optimize the case where type has
            padding, but the non-padding bits are a contiguous set of bytes
            by adjusting the memcmp call arguments instead of emitting
            __builtin_clear_padding and then comparing all the type's bytes.
    
    (cherry picked from commit ba837323dbda2bca5a1c8a4c78092a88241dcfa3)

Diff:
---
 gcc/c-family/c-omp.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/gcc/c-family/c-omp.c b/gcc/c-family/c-omp.c
index fa07738b745..72552a6062d 100644
--- a/gcc/c-family/c-omp.c
+++ b/gcc/c-family/c-omp.c
@@ -380,6 +380,8 @@ c_finish_omp_atomic (location_t loc, enum tree_code code,
       if (SCALAR_FLOAT_TYPE_P (cmptype) && !test)
 	{
 	  bool clear_padding = false;
+	  HOST_WIDE_INT non_padding_start = 0;
+	  HOST_WIDE_INT non_padding_end = 0;
 	  if (BITS_PER_UNIT == 8 && CHAR_BIT == 8)
 	    {
 	      HOST_WIDE_INT sz = int_size_in_bytes (cmptype), i;
@@ -393,6 +395,40 @@ c_finish_omp_atomic (location_t loc, enum tree_code code,
 		    clear_padding = true;
 		    break;
 		  }
+	      if (clear_padding && buf[i] == 0)
+		{
+		  /* Try to optimize.  In the common case where
+		     non-padding bits are all continuous and start
+		     and end at a byte boundary, we can just adjust
+		     the memcmp call arguments and don't need to
+		     emit __builtin_clear_padding calls.  */
+		  if (i == 0)
+		    {
+		      for (i = 0; i < sz; i++)
+			if (buf[i] != 0)
+			  break;
+		      if (i < sz && buf[i] == (unsigned char) ~0)
+			{
+			  non_padding_start = i;
+			  for (; i < sz; i++)
+			    if (buf[i] != (unsigned char) ~0)
+			      break;
+			}
+		      else
+			i = 0;
+		    }
+		  if (i != 0)
+		    {
+		      non_padding_end = i;
+		      for (; i < sz; i++)
+			if (buf[i] != 0)
+			  {
+			    non_padding_start = 0;
+			    non_padding_end = 0;
+			    break;
+			  }
+		    }
+		}
 	    }
 	  tree inttype = NULL_TREE;
 	  if (!clear_padding && tree_fits_uhwi_p (TYPE_SIZE (cmptype)))
@@ -429,12 +465,22 @@ c_finish_omp_atomic (location_t loc, enum tree_code code,
 	      tmp2 = build4 (TARGET_EXPR, cmptype, tmp2,
 			     TREE_OPERAND (rhs1, 1), NULL, NULL);
 	      tmp2 = build1 (ADDR_EXPR, pcmptype, tmp2);
+	      if (non_padding_start)
+		{
+		  tmp1 = build2 (POINTER_PLUS_EXPR, pcmptype, tmp1,
+				 size_int (non_padding_start));
+		  tmp2 = build2 (POINTER_PLUS_EXPR, pcmptype, tmp2,
+				 size_int (non_padding_start));
+		}
 	      tree fndecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
 	      rhs1 = build_call_expr_loc (loc, fndecl, 3, tmp1, tmp2,
-					  TYPE_SIZE_UNIT (cmptype));
+					  non_padding_end
+					  ? size_int (non_padding_end
+						      - non_padding_start)
+					  : TYPE_SIZE_UNIT (cmptype));
 	      rhs1 = build2 (EQ_EXPR, boolean_type_node, rhs1,
 			     integer_zero_node);
-	      if (clear_padding)
+	      if (clear_padding && non_padding_end == 0)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_CLEAR_PADDING);
 		  tree cp1 = build_call_expr_loc (loc, fndecl, 1, tmp1);


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-10-06 11:01 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-06 11:01 [gcc/devel/omp/gcc-11] openmp: Optimize for OpenMP atomics 2x__builtin_clear_padding+__builtin_memcmp if possible Tobias Burnus

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).