public inbox for gcc-patches@gcc.gnu.org
* Thoughts on memcmp expansion (PR43052)
@ 2016-01-15 16:58 Bernd Schmidt
  2016-01-18  9:22 ` Richard Biener
                   ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Bernd Schmidt @ 2016-01-15 16:58 UTC (permalink / raw)
  To: GCC Patches, Nick Clifton

[-- Attachment #1: Type: text/plain, Size: 1963 bytes --]

PR43052 is a PR complaining that the rep cmpsb expansion gcc uses for 
memcmp is slower than the library function. As is so often the case, a 
bit of investigation turns up a lot of issues with the current 
situation in the compiler.

This PR was accidentally fixed by a patch from Nick which disabled the 
use of cmpstrnsi for memcmp expansion, on the grounds that cmpstrnsi 
may stop looking after seeing a null byte, which would be invalid for 
memcmp; only cmpmemsi should be used for that. The fix was made for an 
out-of-tree target.

I believe the rep cmpsb sequence used by i386 would actually be valid, 
so we could duplicate the cmpstrn pattern to also match cmpmem and be 
done - but that would then again cause the performance problem described 
in the PR, so it's probably not a good idea.

One question Richard posed in the comments: why aren't we optimizing 
small constant-size memcmps other than size 1 to *s == *q? The reason 
is the return value of memcmp, whose sign has to reflect the first 
differing byte, which implies byte-sized operation (incidentally, the 
use of SImode in the cmpmem/cmpstr patterns is really odd). It's 
possible to work around this, but expansion becomes a little more 
tricky (byte-swap the loaded words so that the first byte in memory is 
most significant, then subtract or compare, maybe). Still, the current 
code generation is lame.
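
To make the byte-order problem concrete, here's a hand-written sketch 
(mine, not from the attached patch, assuming a 32-bit int on a 
little-endian target) of what a 4-byte expansion returning the full 
memcmp result would have to do; without the byte swaps, the unsigned 
comparison would effectively rank the last byte in memory highest:

  int memcmp4 (const void *p, const void *q)
  {
    unsigned int a, b;
    __builtin_memcpy (&a, p, 4);   /* unaligned-safe 4-byte loads */
    __builtin_memcpy (&b, q, 4);
    a = __builtin_bswap32 (a);     /* first byte in memory becomes MSB */
    b = __builtin_bswap32 (b);
    return a < b ? -1 : a > b;     /* -1 / 0 / 1, as memcmp allows */
  }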

So, for gcc-6, I think we shouldn't do anything. The PR is fixed, and 
there's no easy bug-fix that can be done to improve matters. Not sure 
whether to keep the PR open or create a new one for the remaining 
issues. For the next stage1, I'm attaching a proof-of-concept patch 
that does the following (a rough source-level sketch of the idea is 
shown after the list):
  * Notice if memcmp results are only used for equality comparison
    against zero.
  * If so, replace the call with a different builtin, __memcmp_eq.
  * Expand __memcmp_eq for small constant sizes with loads and a
    comparison; fall back to a memcmp call otherwise.
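
At the source level, the transformation amounts to roughly this; it's 
a sketch of the intent only, the actual work happens in gimple-fold 
and at RTL expansion time:

  /* Before folding: the result is only ever compared against zero.  */
  if (memcmp (p, q, 8) == 0)
    ...

  /* After folding: an internal builtin that only promises a correct
     zero/nonzero answer.  */
  if (__memcmp_eq (p, q, 8) == 0)
    ...

  /* At expansion time, with a known size and sufficient alignment,
     that can become a single word-sized load and compare (LP64 shown)
     instead of a library call:  */
  unsigned long a, b;
  __builtin_memcpy (&a, p, 8);
  __builtin_memcpy (&b, q, 8);
  if (a == b)
    ...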

The whole thing could be extended to work for sizes larger than an int, 
along the lines of memcpy expansion controlled by move ratio etc. Thoughts?
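
For illustration only (hand-written, and going beyond what the 
attached patch does), a 32-byte __memcmp_eq on a 64-bit LP64 target 
might expand along these lines, with a move-ratio-style cap deciding 
when it's worthwhile:

  int memcmp_eq_32 (const void *p, const void *q)
  {
    unsigned long acc = 0;
    /* Four 8-byte chunks; the expander would emit these unrolled.  */
    for (int i = 0; i < 32; i += 8)
      {
        unsigned long a, b;
        __builtin_memcpy (&a, (const char *) p + i, 8);
        __builtin_memcpy (&b, (const char *) q + i, 8);
        acc |= a ^ b;
      }
    return acc != 0;   /* zero iff all 32 bytes are equal */
  }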


Bernd

[-- Attachment #2: memcmp-eq.diff --]
[-- Type: text/x-patch, Size: 7989 bytes --]

Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 232359)
+++ gcc/builtins.c	(working copy)
@@ -3699,25 +3699,22 @@ expand_cmpstrn_or_cmpmem (insn_code icod
 
 /* Expand expression EXP, which is a call to the memcmp built-in function.
    Return NULL_RTX if we failed and the caller should emit a normal call,
-   otherwise try to get the result in TARGET, if convenient.  */
+   otherwise try to get the result in TARGET, if convenient.
+   RESULT_EQ is true if we can relax the returned value to be either zero
+   or nonzero, without caring about the sign.  */
 
 static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
 {
   if (!validate_arglist (exp,
  			 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
     return NULL_RTX;
 
-  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
-     implementing memcmp because it will stop if it encounters two
-     zero bytes.  */
-  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
-  if (icode == CODE_FOR_nothing)
-    return NULL_RTX;
-
   tree arg1 = CALL_EXPR_ARG (exp, 0);
   tree arg2 = CALL_EXPR_ARG (exp, 1);
   tree len = CALL_EXPR_ARG (exp, 2);
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  location_t loc = EXPR_LOCATION (exp);
 
   unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
   unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3725,12 +3722,27 @@ expand_builtin_memcmp (tree exp, rtx tar
   /* If we don't have POINTER_TYPE, call the function.  */
   if (arg1_align == 0 || arg2_align == 0)
     return NULL_RTX;
+  unsigned int min_align = MIN (arg1_align, arg2_align);
+
+  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
+     implementing memcmp because it will stop if it encounters two
+     zero bytes.  */
+  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
+
+  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+  machine_mode direct_mode = VOIDmode;
+  if (CONST_INT_P (arg3_rtx))
+    direct_mode = mode_for_size (INTVAL (arg3_rtx) * BITS_PER_UNIT,
+				 MODE_INT, 1);
+  if (icode == CODE_FOR_nothing
+      && (!result_eq
+	  || direct_mode == VOIDmode
+	  || direct_mode == BLKmode
+	  || GET_MODE_ALIGNMENT (direct_mode) / BITS_PER_UNIT > min_align))
+    return NULL_RTX;
 
-  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
-  location_t loc = EXPR_LOCATION (exp);
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
-  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
 
   /* Set MEM_SIZE as appropriate.  */
   if (CONST_INT_P (arg3_rtx))
@@ -3739,6 +3751,27 @@ expand_builtin_memcmp (tree exp, rtx tar
       set_mem_size (arg2_rtx, INTVAL (arg3_rtx));
     }
 
+  if (icode == CODE_FOR_nothing)
+    {
+      arg1_rtx = change_address (arg1_rtx, direct_mode, XEXP (arg1_rtx, 0));
+      arg2_rtx = change_address (arg2_rtx, direct_mode, XEXP (arg2_rtx, 0));
+      start_sequence ();
+      rtx op0 = force_reg (direct_mode, arg1_rtx);
+      rtx op1 = force_reg (direct_mode, arg2_rtx);
+      rtx tem = emit_store_flag (target, NE, op0, op1,
+				 direct_mode, true, false);
+      rtx seq = get_insns ();
+      end_sequence ();
+      if (tem != 0)
+	{
+	  emit_insn (seq);
+	  return target;
+	}
+    }
+
+  if (icode == CODE_FOR_nothing)
+    return NULL_RTX;
+
   rtx result = expand_cmpstrn_or_cmpmem (icode, target, arg1_rtx, arg2_rtx,
 					 TREE_TYPE (len), arg3_rtx,
 					 MIN (arg1_align, arg2_align));
@@ -5997,9 +6030,16 @@ expand_builtin (tree exp, rtx target, rt
 
     case BUILT_IN_BCMP:
     case BUILT_IN_MEMCMP:
-      target = expand_builtin_memcmp (exp, target);
+    case BUILT_IN_MEMCMP_EQ:
+      target = expand_builtin_memcmp (exp, target, fcode == BUILT_IN_MEMCMP_EQ);
       if (target)
 	return target;
+      if (fcode == BUILT_IN_MEMCMP_EQ)
+	{
+	  exp = copy_node (exp);
+	  tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+	  TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
+	}
       break;
 
     case BUILT_IN_SETJMP:
Index: gcc/builtins.def
===================================================================
--- gcc/builtins.def	(revision 232359)
+++ gcc/builtins.def	(working copy)
@@ -617,6 +617,7 @@ DEF_EXT_LIB_BUILTIN    (BUILT_IN_BZERO,
 DEF_EXT_LIB_BUILTIN    (BUILT_IN_INDEX, "index", BT_FN_STRING_CONST_STRING_INT, ATTR_PURE_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN        (BUILT_IN_MEMCHR, "memchr", BT_FN_PTR_CONST_PTR_INT_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN        (BUILT_IN_MEMCMP, "memcmp", BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
+DEF_GCC_BUILTIN        (BUILT_IN_MEMCMP_EQ, "__memcmp_eq", BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN_CHKP   (BUILT_IN_MEMCPY, "memcpy", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN_CHKP   (BUILT_IN_MEMMOVE, "memmove", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
 DEF_EXT_LIB_BUILTIN_CHKP (BUILT_IN_MEMPCPY, "mempcpy", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
Index: gcc/gimple-fold.c
===================================================================
--- gcc/gimple-fold.c	(revision 232359)
+++ gcc/gimple-fold.c	(working copy)
@@ -2539,13 +2539,52 @@ gimple_fold_builtin_fprintf (gimple_stmt
   return false;
 }
 
+/* Fold a call to the memcmp builtin.  ARG0, ARG1 and ARG2 are the
+   arguments to the call.
+
+   Return false if no simplification was possible, otherwise return
+   true and replace the call with a simplified form.  */
+
+static bool
+gimple_fold_builtin_memcmp (gimple_stmt_iterator *gsi, tree arg0, tree arg1, tree arg2)
+{
+  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
+
+  /* If the return value is used, don't do the transformation.  */
+  tree res = gimple_call_lhs (stmt);
+  if (res == NULL_TREE || TREE_CODE (res) != SSA_NAME)
+    return false;
+
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, res)
+    {
+      gimple *ustmt = USE_STMT (use_p);
+
+      if (gimple_code (ustmt) != GIMPLE_ASSIGN)
+	return false;
+      gassign *asgn = as_a <gassign *> (ustmt);
+      tree_code code = gimple_assign_rhs_code (asgn);
+      if ((code != EQ_EXPR && code != NE_EXPR)
+	  || !integer_zerop (gimple_assign_rhs2 (asgn)))
+	return false;
+    }
+
+  tree newfn = builtin_decl_explicit (BUILT_IN_MEMCMP_EQ);
+  gcall *repl = gimple_build_call (newfn, 3, arg0, arg1, arg2);
+  gimple_call_set_lhs (repl, res);
+  gimple_set_vuse (repl, gimple_vuse (stmt));
+  replace_call_with_call_and_fold (gsi, repl);
+  return true;
+}
+
 /* Fold a call to the {,v}printf{,_unlocked} and __{,v}printf_chk builtins.
    FMT and ARG are the arguments to the call; we don't fold cases with
    more than 2 arguments, and ARG may be null if this is a 1-argument case.
+   FCODE is the BUILT_IN_* code of the function to be simplified.
 
-   Return NULL_TREE if no simplification was possible, otherwise return the
-   simplified form of the call as a tree.  FCODE is the BUILT_IN_*
-   code of the function to be simplified.  */
+   Return false if no simplification was possible, otherwise return
+   true and replace the call with a simplified form.    */
 
 static bool
 gimple_fold_builtin_printf (gimple_stmt_iterator *gsi, tree fmt,
@@ -2796,6 +2835,10 @@ gimple_fold_builtin (gimple_stmt_iterato
     case BUILT_IN_MEMMOVE:
       return gimple_fold_builtin_memory_op (gsi, gimple_call_arg (stmt, 0),
 					    gimple_call_arg (stmt, 1), 3);
+    case BUILT_IN_MEMCMP:
+      return gimple_fold_builtin_memcmp (gsi, gimple_call_arg (stmt, 0),
+					 gimple_call_arg (stmt, 1),
+					 gimple_call_arg (stmt, 2));
     case BUILT_IN_SPRINTF_CHK:
     case BUILT_IN_VSPRINTF_CHK:
       return gimple_fold_builtin_sprintf_chk (gsi, fcode);


Thread overview: 19+ messages
2016-01-15 16:58 Thoughts on memcmp expansion (PR43052) Bernd Schmidt
2016-01-18  9:22 ` Richard Biener
2016-04-28 18:36   ` Bernd Schmidt
2016-05-02 12:52     ` Richard Biener
2016-05-02 12:57       ` Bernd Schmidt
2016-05-02 13:14         ` Richard Biener
2016-05-12 17:14           ` Bernd Schmidt
2016-05-13 10:20             ` Richard Biener
2016-05-13 13:05               ` Bernd Schmidt
2016-05-13 13:07                 ` Richard Biener
2016-05-13 13:13                   ` Bernd Schmidt
2016-05-13 13:53                     ` Joseph Myers
2016-05-13 14:00                       ` Bernd Schmidt
2016-05-13 20:41                         ` Joseph Myers
2016-05-31 23:50                           ` Bernd Schmidt
2016-01-18 12:22 ` Nick Clifton
2016-01-19 21:36 ` Jeff Law
2016-05-30 11:29 ` Florian Weimer
2016-05-31 15:05   ` Bernd Schmidt
