public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* [PR43721] Failure to optimise (a/b) and (a%b) into single  call
@ 2013-06-16 15:33 Kugan
  2013-06-17  9:37 ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Kugan @ 2013-06-16 15:33 UTC (permalink / raw)
  To: gcc; +Cc: Richard Biener, Richard Earnshaw, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 1193 bytes --]

Hi,

I am attempting to fix Bug 43721 - Failure to optimise (a/b) and (a%b) 
into single __aeabi_idivmod call 
(http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721)

execute_cse_sincos tree level pass does similar cse so I attempted to 
use similar approach here. Div/mod cse is not really using built-in 
functions though at this level.

For the case of div and mod operations, after CSE is performed, there 
isnt a way to represent the resulting stament in gimple. We will endup 
with divmod  taking two arguments and returning double the size of one 
arguments in the three address format (divmod will return reminder and 
quotient so the return type is double the size of argument type).

Since GIMPLE_ASSIGN will result in type checking failure in this case, I 
atempted use built-in functions (GIMPLE_CALL to represent the runtime 
library call). Name for the function here  is target specific and can be 
obtained from sdivmod_optab so the builtin function name defined in tree 
level is not used. I am not entirelt sure this is the right approach so 
I am attaching the first cut of the patch to get your feedback and 
understand the right approach to this problem.



Thank you,
Kugan


[-- Attachment #2: divmod.diff --]
[-- Type: text/x-patch, Size: 12289 bytes --]

diff --git a/gcc/builtin-types.def b/gcc/builtin-types.def
index 2634ecc..21c483a 100644
--- a/gcc/builtin-types.def
+++ b/gcc/builtin-types.def
@@ -250,6 +250,10 @@ DEF_FUNCTION_TYPE_2 (BT_FN_INT_CONST_STRING_FILEPTR,
 		     BT_INT, BT_CONST_STRING, BT_FILEPTR)
 DEF_FUNCTION_TYPE_2 (BT_FN_INT_INT_FILEPTR,
 		     BT_INT, BT_INT, BT_FILEPTR)
+DEF_FUNCTION_TYPE_2 (BT_FN_LONGLONG_INT_INT,
+		     BT_LONGLONG, BT_INT, BT_INT)
+DEF_FUNCTION_TYPE_2 (BT_FN_ULONGLONG_UINT_UINT,
+		     BT_ULONGLONG, BT_UINT, BT_UINT)
 DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRMODE_PTR,
 		     BT_VOID, BT_PTRMODE, BT_PTR)
 DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTR_PTRMODE,
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 402bb1f..1cae2bb 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -1876,7 +1876,9 @@ mathfn_built_in_1 (tree type, enum built_in_function fn, bool implicit_p)
       CASE_MATHFN (BUILT_IN_Y0)
       CASE_MATHFN (BUILT_IN_Y1)
       CASE_MATHFN (BUILT_IN_YN)
-
+      case BUILT_IN_DIVMOD:
+      case BUILT_IN_UDIVMOD:
+        return builtin_decl_explicit (fn);
       default:
 	return NULL_TREE;
       }
@@ -2449,6 +2451,57 @@ expand_builtin_interclass_mathfn (tree exp, rtx target)
   return NULL_RTX;
 }
 
+/* Expand a call to the builtin divmod function to
+   library call.  */
+static rtx
+expand_builtin_divmod (tree exp, rtx target)
+{
+  rtx op0, op1;
+  enum machine_mode mode;
+  tree arg0, arg1;
+  rtx libval;
+  rtx libfunc;
+  rtx insns;
+  bool is_unsigned;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+
+  mode = TYPE_MODE (TREE_TYPE (arg0));
+  is_unsigned = TYPE_UNSIGNED (TREE_TYPE (arg0));
+
+  /* Get the libcall.  */
+  libfunc = optab_libfunc (is_unsigned ? udivmod_optab : sdivmod_optab, mode);
+  gcc_assert (libfunc);
+
+  op0 = expand_normal (arg0);
+  op1 = expand_normal (arg1);
+
+  if (MEM_P (op0))
+    op0 = force_reg (mode, op0);
+  if (MEM_P (op1))
+    op1 = force_reg (mode, op1);
+
+  /* The value returned by the library function will have twice as
+     many bits as the nominal MODE.  */
+  machine_mode libval_mode
+    = smallest_mode_for_size (2 * GET_MODE_BITSIZE (mode),
+                              MODE_INT);
+  start_sequence ();
+  libval = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST,
+                                    libval_mode, 2,
+                                    op0, mode,
+                                    op1, mode);
+  insns = get_insns ();
+  end_sequence ();
+  /* Move into the desired location.  */
+  if (target != const0_rtx)
+    emit_libcall_block (insns, target, libval,
+                        gen_rtx_fmt_ee (is_unsigned ?  UMOD : MOD, mode, op0, op1));
+
+  return target;
+}
+
 /* Expand a call to the builtin sincos math function.
    Return NULL_RTX if a normal call should be emitted rather than expanding the
    function in-line.  EXP is the expression that is a call to the builtin
@@ -5977,6 +6030,13 @@ expand_builtin (tree exp, rtx target, rtx subtarget, enum machine_mode mode,
 	return target;
       break;
 
+    case BUILT_IN_DIVMOD:
+    case BUILT_IN_UDIVMOD:
+      target = expand_builtin_divmod (exp, target);
+      if (target)
+        return target;
+      break;
+
     CASE_FLT_FN (BUILT_IN_SINCOS):
       if (! flag_unsafe_math_optimizations)
 	break;
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 91879a6..7664700 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -599,6 +599,8 @@ DEF_C99_BUILTIN        (BUILT_IN_VSCANF, "vscanf", BT_FN_INT_CONST_STRING_VALIST
 DEF_C99_BUILTIN        (BUILT_IN_VSNPRINTF, "vsnprintf", BT_FN_INT_STRING_SIZE_CONST_STRING_VALIST_ARG, ATTR_FORMAT_PRINTF_NOTHROW_3_0)
 DEF_LIB_BUILTIN        (BUILT_IN_VSPRINTF, "vsprintf", BT_FN_INT_STRING_CONST_STRING_VALIST_ARG, ATTR_FORMAT_PRINTF_NOTHROW_2_0)
 DEF_C99_BUILTIN        (BUILT_IN_VSSCANF, "vsscanf", BT_FN_INT_CONST_STRING_CONST_STRING_VALIST_ARG, ATTR_FORMAT_SCANF_NOTHROW_2_0)
+DEF_EXT_LIB_BUILTIN    (BUILT_IN_DIVMOD, "__$_idivmod", BT_FN_LONGLONG_INT_INT, ATTR_NULL)
+DEF_EXT_LIB_BUILTIN    (BUILT_IN_UDIVMOD, "__$_uidivmod", BT_FN_ULONGLONG_UINT_UINT, ATTR_NULL)
 
 /* Category: ctype builtins.  */
 DEF_LIB_BUILTIN        (BUILT_IN_ISALNUM, "isalnum", BT_FN_INT_INT, ATTR_PURE_NOTHROW_LEAF_LIST)
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index b4de411..3af01ac 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -664,7 +664,7 @@ struct gimple_opt_pass pass_cse_reciprocals =
    statements in the vector.  */
 
 static bool
-maybe_record_sincos (vec<gimple> *stmts,
+maybe_record_stmt (vec<gimple> *stmts,
 		     basic_block *top_bb, gimple use_stmt)
 {
   basic_block use_bb = gimple_bb (use_stmt);
@@ -684,6 +684,176 @@ maybe_record_sincos (vec<gimple> *stmts,
   return true;
 }
 
+/* Look for div and mod statements with the same operands and
+   create a library call if needed.  We first walk over all
+   immediate uses of one of the operand looking for matched statement
+   that we can combine.  In a second pass replace the statement with
+   a library calls and statements with the results from library call.  */
+
+static bool
+execute_cse_div_mod_1 (gimple s)
+{
+  gimple_stmt_iterator gsi;
+  imm_use_iterator use_iter;
+  gimple use_stmt;
+  bool found = false;
+  vec<gimple> stmts = vNULL;
+  basic_block top_bb = NULL;
+  bool cfg_changed = false;
+  int i;
+  tree type, op1, op2;
+
+  op1 = gimple_assign_rhs1 (s);
+  op2 = gimple_assign_rhs2 (s);
+  type = TREE_TYPE (op1);
+
+  /* if op2 is constant, It will be optimized without lib calls.  */
+  if (TREE_CODE (op2) != SSA_NAME)
+    return false;
+ 
+  if (HAVE_divsi3
+       || (GET_MODE_BITSIZE (TYPE_MODE (type)) != 32)
+       || !optab_libfunc (TYPE_UNSIGNED (type)? udivmod_optab : sdivmod_optab,
+                            TYPE_MODE (type)))
+    return false;
+
+  stmts.safe_push (s);
+  top_bb = gimple_bb (s);
+
+  /* look fpr a TRUNC_MOD_EXPR coresponding to stmt
+     TRUNC_DIV_EXPR s to combine.  */
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op2)
+    {
+      if (is_gimple_assign (use_stmt)
+          && (gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR))
+        {
+          tree rhs1 = gimple_assign_rhs1 (use_stmt);
+          tree rhs2 = gimple_assign_rhs2 (use_stmt);
+
+          if ((rhs1 == op1 && rhs2 == op2)
+              ||(rhs1 == op1 && rhs2 == op2))
+            found |= maybe_record_stmt (&stmts, &top_bb, use_stmt);
+        }
+    }
+
+  /* If we have matched instructions to combine, try creatiing Library
+     call and use the results.  */
+  if (found)
+    {
+      gimple def_stmt1 = NULL, def_stmt2;
+      bool insert_after_op1_def, insert_after_op2_def;
+      tree fndecl, res, rhs, temp;
+      gimple stmt, shift_stmt, call_stmt;
+
+      /* See if the library call is needed and available.  */
+      fndecl = mathfn_built_in (type, BUILT_IN_DIVMOD);
+      if (!fndecl)
+      {
+        stmts.release ();
+        return false;
+      }
+
+      if (TREE_CODE (op1) == SSA_NAME)
+        def_stmt1 = SSA_NAME_DEF_STMT (op1);
+      def_stmt2 = SSA_NAME_DEF_STMT (op2);
+
+      /* Is the call to be insterted adter op1 define stmt.  */
+      insert_after_op1_def = TREE_CODE (op1) == SSA_NAME
+                            && !SSA_NAME_IS_DEFAULT_DEF (op1)
+                            && gimple_code (def_stmt1) != GIMPLE_PHI
+                            && gimple_bb (def_stmt1) == top_bb;
+
+      /* Is the call to be insterted adter op2 define stmt.  */
+      insert_after_op2_def = !SSA_NAME_IS_DEFAULT_DEF (op2)
+                            && gimple_code (def_stmt2) != GIMPLE_PHI
+                            && gimple_bb (def_stmt2) == top_bb;
+
+      /* Create a library call and instert it at the place idenified.  */
+      call_stmt = gimple_build_call (fndecl, 2, op1, op2);
+      res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)),
+                                call_stmt, "divmod");
+      gimple_call_set_lhs (call_stmt, res);
+
+      /* Insert call at the beginning of top_bb but not earlier
+         than the name def statement.  */
+      if (insert_after_op1_def || insert_after_op2_def)
+        {
+          if (insert_after_op1_def && insert_after_op2_def)
+            {
+              for (gsi = gsi_start_bb (top_bb);
+                   !gsi_end_p (gsi); gsi_next (&gsi))
+                {
+                  gimple g = gsi_stmt (gsi);
+                  if (g == def_stmt1)
+                    {
+                      gsi = gsi_for_stmt (def_stmt2);
+                      break;
+                    }
+                  else if (g == def_stmt2)
+                    {
+                      gsi = gsi_for_stmt (def_stmt1);
+                      break;
+                    }
+                }
+            }
+
+          /* Only one of the definition is in the basic block, place after
+             that.  */
+          else if (insert_after_op1_def)
+            gsi = gsi_for_stmt (def_stmt1);
+          else
+            gsi = gsi_for_stmt (def_stmt2);
+
+          gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
+        }
+      else
+        {
+          /* op1 and op2 are defined before the top_bb.  Insert as first
+             non label instruction.  */
+          gsi = gsi_after_labels (top_bb);
+          gsi_insert_before (&gsi, call_stmt, GSI_SAME_STMT);
+        }
+
+      /* Adjust the recorded old statements to use the results.  */
+      for (i = 0; stmts.iterate (i, &use_stmt); ++i)
+        {
+          switch (gimple_assign_rhs_code (use_stmt))
+            {
+            case TRUNC_MOD_EXPR:
+              /* Get the top 32 bit from result and assign it.  */
+              gsi = gsi_for_stmt (use_stmt);
+              shift_stmt = gimple_build_assign_with_ops (RSHIFT_EXPR,
+                          NULL_TREE, res, build_int_cst (TREE_TYPE (res), 32));
+              temp = make_temp_ssa_name (long_long_integer_type_node,
+                                         shift_stmt, "divmod");
+              gimple_assign_set_lhs (shift_stmt, temp);
+              gsi_insert_before (&gsi, shift_stmt, GSI_SAME_STMT);
+              rhs = fold_build1 (CONVERT_EXPR, type, temp);
+              break;
+
+            case TRUNC_DIV_EXPR:
+              /* Get the bottom 32 bit from result and assign it.  */
+              rhs = fold_build1 (CONVERT_EXPR, type, res);
+              gsi = gsi_for_stmt (use_stmt);
+              break;
+
+            default:
+              gcc_unreachable ();
+            }
+
+          /* Replace the statement with a copy.  */
+          stmt = gimple_build_assign (gimple_assign_lhs (use_stmt), rhs);
+          gsi_replace (&gsi, stmt, true);
+
+          if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
+            cfg_changed = true;
+        }
+    }
+
+  stmts.release ();
+  return cfg_changed;
+}
+
 /* Look for sin, cos and cexpi calls with the same argument NAME and
    create a single call to cexpi CSEing the result in this case.
    We first walk over all immediate uses of the argument collecting
@@ -717,15 +887,15 @@ execute_cse_sincos_1 (tree name)
       switch (DECL_FUNCTION_CODE (fndecl))
 	{
 	CASE_FLT_FN (BUILT_IN_COS):
-	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+	  seen_cos |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	CASE_FLT_FN (BUILT_IN_SIN):
-	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+	  seen_sin |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	CASE_FLT_FN (BUILT_IN_CEXPI):
-	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+	  seen_cexpi |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	default:;
@@ -1398,6 +1568,12 @@ execute_cse_sincos (void)
 	     gimple_purge_dead_eh_edges if we change something in the middle
 	     of a basic block.  */
 	  cleanup_eh = false;
+          if (stmt && is_gimple_assign (stmt)
+              && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_binary)
+              && (gimple_assign_rhs_code (stmt) ==  TRUNC_DIV_EXPR))
+            {
+	      cfg_changed |= execute_cse_div_mod_1 (stmt);
+            }
 
 	  if (is_gimple_call (stmt)
 	      && gimple_call_lhs (stmt)


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PR43721] Failure to optimise (a/b) and (a%b) into single  call
  2013-06-16 15:33 [PR43721] Failure to optimise (a/b) and (a%b) into single call Kugan
@ 2013-06-17  9:37 ` Richard Biener
  2013-06-21  2:13   ` Kugan
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2013-06-17  9:37 UTC (permalink / raw)
  To: Kugan; +Cc: gcc, Richard Earnshaw, Ramana Radhakrishnan

On Mon, 17 Jun 2013, Kugan wrote:

> Hi,
> 
> I am attempting to fix Bug 43721 - Failure to optimise (a/b) and (a%b) into
> single __aeabi_idivmod call
> (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721)
> 
> execute_cse_sincos tree level pass does similar cse so I attempted to use
> similar approach here. Div/mod cse is not really using built-in functions
> though at this level.

The issue with performing the transform at the same time as we
transform SINCOS is that the vectorizer will now no longer be able
to vectorize these loops.  It would need to be taught how to
handle the builtin calls (basically undo the transformation, I don't
know of any ISA that can do vectorized combined div/mod).  Which
means it should rather be done at the point we CSE reciprocals
(which also replaces computes with builtin target function calls).

> For the case of div and mod operations, after CSE is performed, there isnt a
> way to represent the resulting stament in gimple. We will endup with divmod
> taking two arguments and returning double the size of one arguments in the
> three address format (divmod will return reminder and quotient so the return
> type is double the size of argument type).
> 
> Since GIMPLE_ASSIGN will result in type checking failure in this case, I
> atempted use built-in functions (GIMPLE_CALL to represent the runtime library
> call). Name for the function here  is target specific and can be obtained from
> sdivmod_optab so the builtin function name defined in tree level is not used.
> I am not entirelt sure this is the right approach so I am attaching the first
> cut of the patch to get your feedback and understand the right approach to
> this problem.

If we don't want to expose new builtins to the user (I'm not sure we
want that), then using "internal functions" is an easier way to
avoid these issues (see gimple.h and internal-fn.(def|h)).

Generally the transform looks useful to me as it moves forward with
the general idea of moving pattern recognition done during RTL expansion
to an earlier place.

For the use of a larger integer type and shifts to represent
the modulo and division result I don't think that's the very best
idea.  Instead resorting to a complex integer type as return
value looks more appealing (similar to sincos using cexpi here).
That way you also avoid the ugly hard-coding of bit-sizes.

+  if (HAVE_divsi3
+       || (GET_MODE_BITSIZE (TYPE_MODE (type)) != 32)

watch out for types whose TYPE_PRECISION is not the bitsize
of their mode.  Also it should be GET_MODE_PRECISION here.

+       || !optab_libfunc (TYPE_UNSIGNED (type)? udivmod_optab : 
sdivmod_optab,
+                            TYPE_MODE (type)))

targets that use a libfunc should also get this optimization, as
it always removes computations.  I think the proper test is
for whether the target can do division and/or modulus without
using a libfunc, not whether there is a divmod optab/libfunc.

Others knowing this piece of the compiler better may want to comment
here, of course.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PR43721] Failure to optimise (a/b) and (a%b) into single call
  2013-06-17  9:37 ` Richard Biener
@ 2013-06-21  2:13   ` Kugan
  2013-09-03 10:57     ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Kugan @ 2013-06-21  2:13 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc, Richard Earnshaw, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 3625 bytes --]

On 17/06/13 19:07, Richard Biener wrote:
> On Mon, 17 Jun 2013, Kugan wrote:
>
>> Hi,
>>
>> I am attempting to fix Bug 43721 - Failure to optimise (a/b) and (a%b) into
>> single __aeabi_idivmod call
>> (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721)
>>
>> execute_cse_sincos tree level pass does similar cse so I attempted to use
>> similar approach here. Div/mod cse is not really using built-in functions
>> though at this level.
>
> The issue with performing the transform at the same time as we
> transform SINCOS is that the vectorizer will now no longer be able
> to vectorize these loops.  It would need to be taught how to
> handle the builtin calls (basically undo the transformation, I don't
> know of any ISA that can do vectorized combined div/mod).  Which
> means it should rather be done at the point we CSE reciprocals
> (which also replaces computes with builtin target function calls).
>
Thanks Richard. Since execute_cse_reciprocals is handling reciprocals 
only, I added another pass to do divmod. Is that OK?

>> For the case of div and mod operations, after CSE is performed, there isnt a
>> way to represent the resulting stament in gimple. We will endup with divmod
>> taking two arguments and returning double the size of one arguments in the
>> three address format (divmod will return reminder and quotient so the return
>> type is double the size of argument type).
>>
>> Since GIMPLE_ASSIGN will result in type checking failure in this case, I
>> atempted use built-in functions (GIMPLE_CALL to represent the runtime library
>> call). Name for the function here  is target specific and can be obtained from
>> sdivmod_optab so the builtin function name defined in tree level is not used.
>> I am not entirelt sure this is the right approach so I am attaching the first
>> cut of the patch to get your feedback and understand the right approach to
>> this problem.
>
> If we don't want to expose new builtins to the user (I'm not sure we
> want that), then using "internal functions" is an easier way to
> avoid these issues (see gimple.h and internal-fn.(def|h)).
>
I have now changed to use internal functions. Thanks for that.

> Generally the transform looks useful to me as it moves forward with
> the general idea of moving pattern recognition done during RTL expansion
> to an earlier place.
>
> For the use of a larger integer type and shifts to represent
> the modulo and division result I don't think that's the very best
> idea.  Instead resorting to a complex integer type as return
> value looks more appealing (similar to sincos using cexpi here).
> That way you also avoid the ugly hard-coding of bit-sizes.
>

I have changed it to use complex integers now.

> +  if (HAVE_divsi3
> +       || (GET_MODE_BITSIZE (TYPE_MODE (type)) != 32)
>
> watch out for types whose TYPE_PRECISION is not the bitsize
> of their mode.  Also it should be GET_MODE_PRECISION here.
>
> +       || !optab_libfunc (TYPE_UNSIGNED (type)? udivmod_optab :
> sdivmod_optab,
> +                            TYPE_MODE (type)))
>
> targets that use a libfunc should also get this optimization, as
> it always removes computations.  I think the proper test is
> for whether the target can do division and/or modulus without
> using a libfunc, not whether there is a divmod optab/libfunc.
>
I guess best way to do is by defining a target hook and let the target 
define the required behaviour. Is that what you had in mind?

I have attached a modified patch with these changes.

> Others knowing this piece of the compiler better may want to comment
> here, of course.
>
> Thanks,
> Richard.
>



Thanks,
Kugan

[-- Attachment #2: divmod_r1.diff --]
[-- Type: text/x-patch, Size: 15779 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index f030b56..3fae80e 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11375,3 +11375,8 @@ It returns true if the target supports GNU indirect functions.
 The support includes the assembler, linker and dynamic linker.
 The default value of this hook is based on target's libc.
 @end deftypefn
+
+@deftypefn {Target Hook} bool TARGET_COMBINE_DIVMOD (enum machine_mode @var{mode})
+This target hook returns @code{true} if the target provides divmod libcall operation for the machine mode @var{mode} and must be used to combine integer division and modulus operations. Return @code{false} otherwise.
+@end deftypefn
+
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index cc25fec..12974b1 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -11198,3 +11198,6 @@ memory model bits are allowed.
 @hook TARGET_ATOMIC_TEST_AND_SET_TRUEVAL
 
 @hook TARGET_HAS_IFUNC_P
+
+@hook TARGET_COMBINE_DIVMOD
+
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index b841abd..0db06f1 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -61,6 +61,62 @@ get_multi_vector_move (tree array_type, convert_optab optab)
   return icode;
 }
 
+/* Expand DIVMOD call STMT.  */
+static void
+expand_DIVMOD (gimple stmt)
+{
+  tree type, lhs, arg0, arg1;
+  rtx op0, op1, res0, res1, target;
+  enum machine_mode mode, compute_mode;
+  rtx libval;
+  rtx libfunc = NULL_RTX;
+  bool is_unsigned;
+
+  lhs = gimple_call_lhs (stmt);
+  arg0 = gimple_call_arg (stmt, 0);
+  arg1 = gimple_call_arg (stmt, 1);
+  mode = TYPE_MODE (TREE_TYPE (arg0));
+  is_unsigned = TYPE_UNSIGNED (TREE_TYPE (arg0));
+
+  op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  op1 = expand_expr (arg1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+
+  /* Now convert to the best mode to use.  */
+  for (compute_mode = mode; compute_mode != VOIDmode;
+       compute_mode = GET_MODE_WIDER_MODE (compute_mode))
+    if (optab_libfunc (is_unsigned ? udivmod_optab : sdivmod_optab, compute_mode))
+      break;
+
+  if (compute_mode != mode)
+    {
+      op0 = convert_modes (compute_mode, mode, op0, is_unsigned);
+      op1 = convert_modes (compute_mode, mode, op1, is_unsigned);
+    }
+
+  /* Get the libcall.  */
+  libfunc = optab_libfunc (is_unsigned ? udivmod_optab : sdivmod_optab, compute_mode);
+  gcc_assert (libfunc);
+  machine_mode libval_mode
+    = smallest_mode_for_size (2 * GET_MODE_BITSIZE (compute_mode),
+                              MODE_INT);
+  libval = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST,
+                                    libval_mode, 2,
+                                    op0, compute_mode,
+                                    op1, compute_mode);
+
+  /* Get quotient and remainder into two registers.  */
+  res0 = simplify_gen_subreg (mode, libval, libval_mode, 0);
+  res1 = simplify_gen_subreg (mode, libval, libval_mode, GET_MODE_SIZE (compute_mode));
+
+  /* Now build the complex integer target.  */
+  expand_expr (build2 (COMPLEX_EXPR, TREE_TYPE (lhs),
+                       make_tree (TREE_TYPE (arg0), res0),
+                       make_tree (TREE_TYPE (arg1), res1)),
+               target, VOIDmode, EXPAND_NORMAL);
+}
+
+
 /* Expand LOAD_LANES call STMT.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 8900d90..4c5d4e0 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -40,3 +40,4 @@ along with GCC; see the file COPYING3.  If not see
 
 DEF_INTERNAL_FN (LOAD_LANES, ECF_CONST | ECF_LEAF)
 DEF_INTERNAL_FN (STORE_LANES, ECF_CONST | ECF_LEAF)
+DEF_INTERNAL_FN (DIVMOD, ECF_CONST | ECF_LEAF)
diff --git a/gcc/passes.c b/gcc/passes.c
index c8b03ee..03cea4c 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -1492,6 +1492,7 @@ init_optimization_passes (void)
 	}
       NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
+      NEXT_PASS (pass_cse_divmod);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_strength_reduction);
       NEXT_PASS (pass_dominator);
diff --git a/gcc/target.def b/gcc/target.def
index 3ba3e0a..8036a97 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2500,6 +2500,15 @@ DEFHOOK
  bool, (const_tree field, enum machine_mode mode),
  default_member_type_forces_blk)
 
+/* True if div and mod operations for MODE should be combined.  */
+DEFHOOK
+(combine_divmod,
+ "This target hook returns @code{true} if the target provides divmod libcall\
+ operation for the machine mode @var{mode} and must be used to combine\
+ integer division and modulus operations. Return @code{false} otherwise.",
+ bool, (enum machine_mode mode),
+ default_combine_divmod)
+
 /* Return the class for a secondary reload, and fill in extra information.  */
 DEFHOOK
 (secondary_reload,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index d3a3f5f..afa8076 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1560,6 +1560,14 @@ default_member_type_forces_blk (const_tree, enum machine_mode)
   return false;
 }
 
+/* Default version of combine_divmod.  */
+
+bool
+default_combine_divmod (enum machine_mode)
+{
+  return false;
+}
+
 /* Default version of canonicalize_comparison.  */
 
 void
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 2da6fb8..97dd036 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -198,3 +198,5 @@ extern void default_asm_output_ident_directive (const char*);
 
 extern enum machine_mode default_cstore_mode (enum insn_code);
 extern bool default_member_type_forces_blk (const_tree, enum machine_mode);
+
+extern bool default_combine_divmod (enum machine_mode);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index b8c59a7..4fe7d24 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -329,6 +329,7 @@ extern struct gimple_opt_pass pass_stdarg;
 extern struct gimple_opt_pass pass_early_warn_uninitialized;
 extern struct gimple_opt_pass pass_late_warn_uninitialized;
 extern struct gimple_opt_pass pass_cse_reciprocals;
+extern struct gimple_opt_pass pass_cse_divmod;
 extern struct gimple_opt_pass pass_cse_sincos;
 extern struct gimple_opt_pass pass_optimize_bswap;
 extern struct gimple_opt_pass pass_optimize_widening_mul;
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index e9c32b3..8e09fe9 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -147,6 +147,15 @@ static struct
 
 static struct
 {
+  /* Number of combined dovmod calls inserted.  */
+  int divmod_calls_inserted;
+
+  /* Number of instructions uses divmod result.  */
+  int divmod_result_used;
+} divmod_stats;
+
+static struct
+{
   /* Number of cexpi calls inserted.  */
   int inserted;
 } sincos_stats;
@@ -664,7 +673,7 @@ struct gimple_opt_pass pass_cse_reciprocals =
    statements in the vector.  */
 
 static bool
-maybe_record_sincos (vec<gimple> *stmts,
+maybe_record_stmt (vec<gimple> *stmts,
 		     basic_block *top_bb, gimple use_stmt)
 {
   basic_block use_bb = gimple_bb (use_stmt);
@@ -684,6 +693,229 @@ maybe_record_sincos (vec<gimple> *stmts,
   return true;
 }
 
+/* Look for div and mod statements with the same operands and
+   create a library call if needed.  We first walk over all
+   immediate uses of one of the operand looking for matched statement
+   that we can combine.  In a second pass replace the statement with
+   a library calls and statements with the results from library call.  */
+
+static bool
+execute_cse_div_mod_1 (gimple stmt)
+{
+  gimple_stmt_iterator gsi;
+  imm_use_iterator use_iter;
+  gimple use_stmt;
+  bool found = false;
+  vec<gimple> stmts = vNULL;
+  basic_block top_bb = NULL;
+  bool cfg_changed = false;
+  int i;
+  tree type, op1, op2;
+
+  op1 = gimple_assign_rhs1 (stmt);
+  op2 = gimple_assign_rhs2 (stmt);
+  type = TREE_TYPE (op1);
+
+  /* Skip if both the operands and constant. */
+  if (TREE_CODE (op1) == INTEGER_CST && TREE_CODE (op2) == INTEGER_CST)
+    return false;
+
+  /* Skip if the target doesnt support or require it.  */
+  if (!targetm.combine_divmod (TYPE_MODE (type)))
+    return false;
+
+  stmts.safe_push (stmt);
+  top_bb = gimple_bb (stmt);
+
+  /* look fpr a TRUNC_MOD_EXPR coresponding to stmt
+     TRUNC_DIV_EXPR s to combine.  */
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, (TREE_CODE (op2) == SSA_NAME) ? op2 : op1)
+    {
+      if (is_gimple_assign (use_stmt)
+          && (gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR))
+        {
+          tree rhs1 = gimple_assign_rhs1 (use_stmt);
+          tree rhs2 = gimple_assign_rhs2 (use_stmt);
+
+          if ((rhs1 == op1 && rhs2 == op2)
+              ||(rhs1 == op1 && rhs2 == op2))
+            found |= maybe_record_stmt (&stmts, &top_bb, use_stmt);
+        }
+    }
+
+  /* If we have matched instructions to combine, try creatiing Library
+     call and use the results.  */
+  if (found)
+    {
+      gimple def_stmt1 = NULL, def_stmt2 = NULL;
+      bool insert_after_op1_def, insert_after_op2_def;
+      tree res, rhs;
+      gimple assign_stmt, call_stmt;
+      tree return_type = build_complex_type (type);
+
+      if (TREE_CODE (op1) == SSA_NAME)
+        def_stmt1 = SSA_NAME_DEF_STMT (op1);
+      if (TREE_CODE (op2) == SSA_NAME)
+        def_stmt2 = SSA_NAME_DEF_STMT (op2);
+
+      /* Is the call to be insterted adter op1 define stmt.  */
+      insert_after_op1_def = TREE_CODE (op1) == SSA_NAME
+                            && !SSA_NAME_IS_DEFAULT_DEF (op1)
+                            && gimple_code (def_stmt1) != GIMPLE_PHI
+                            && gimple_bb (def_stmt1) == top_bb;
+
+      /* Is the call to be insterted adter op2 define stmt.  */
+      insert_after_op2_def = TREE_CODE (op2) == SSA_NAME
+                            && !SSA_NAME_IS_DEFAULT_DEF (op2)
+                            && gimple_code (def_stmt2) != GIMPLE_PHI
+                            && gimple_bb (def_stmt2) == top_bb;
+
+      /* Create a library call and instert it at the place idenified.  */
+      call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
+      res = make_temp_ssa_name (return_type,
+                                call_stmt, "divmod");
+      gimple_call_set_lhs (call_stmt, res);
+      divmod_stats.divmod_calls_inserted++;
+
+      /* Insert call at the beginning of top_bb but not earlier
+         than the name def statement.  */
+      if (insert_after_op1_def || insert_after_op2_def)
+        {
+          if (insert_after_op1_def && insert_after_op2_def)
+            {
+              for (gsi = gsi_start_bb (top_bb);
+                   !gsi_end_p (gsi); gsi_next (&gsi))
+                {
+                  gimple g = gsi_stmt (gsi);
+                  if (g == def_stmt1)
+                    {
+                      gsi = gsi_for_stmt (def_stmt2);
+                      break;
+                    }
+                  else if (g == def_stmt2)
+                    {
+                      gsi = gsi_for_stmt (def_stmt1);
+                      break;
+                    }
+                }
+            }
+
+          /* Only one of the definition is in the basic block, place after
+             that.  */
+          else if (insert_after_op1_def)
+            gsi = gsi_for_stmt (def_stmt1);
+          else
+            gsi = gsi_for_stmt (def_stmt2);
+
+          gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
+        }
+      else
+        {
+          /* op1 and op2 are defined before the top_bb.  Insert as first
+             non label instruction.  */
+          gsi = gsi_after_labels (top_bb);
+          gsi_insert_before (&gsi, call_stmt, GSI_SAME_STMT);
+        }
+
+      /* Adjust the recorded old statements to use the results.  */
+      for (i = 0; stmts.iterate (i, &use_stmt); ++i)
+        {
+          gsi = gsi_for_stmt (use_stmt);
+          divmod_stats.divmod_result_used++;
+
+          switch (gimple_assign_rhs_code (use_stmt))
+            {
+            case TRUNC_MOD_EXPR:
+              /* Get the top 32 bit from result and assign it.  */
+              rhs = fold_build1 (IMAGPART_EXPR, type, res);
+              break;
+
+            case TRUNC_DIV_EXPR:
+              /* Get the bottom 32 bit from result and assign it.  */
+              rhs = fold_build1 (REALPART_EXPR, type, res);
+              break;
+
+            default:
+              gcc_unreachable ();
+            }
+
+          /* Replace the statement with a copy.  */
+          assign_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt), rhs);
+          gsi_replace (&gsi, assign_stmt, true);
+
+          if (gimple_purge_dead_eh_edges (gimple_bb (assign_stmt)))
+            cfg_changed = true;
+        }
+    }
+
+  stmts.release ();
+  return cfg_changed;
+}
+
+/* Go through all the floating-point SSA_NAMEs, and call
+   execute_cse_reciprocals_1 on each of them.  */
+static unsigned int
+execute_cse_divmod (void)
+{
+  basic_block bb;
+
+  memset (&divmod_stats, 0, sizeof (divmod_stats));
+  calculate_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_POST_DOMINATORS);
+  FOR_EACH_BB (bb)
+    {
+      gimple_stmt_iterator gsi;
+      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+        {
+          gimple stmt = gsi_stmt (gsi);
+
+          if (is_gimple_assign (stmt)
+              && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_binary)
+              && (gimple_assign_rhs_code (stmt) ==  TRUNC_DIV_EXPR))
+            {
+              execute_cse_div_mod_1 (stmt);
+            }
+        }
+    }
+
+  statistics_counter_event (cfun, "number of combined divmod calls inserted",
+                            divmod_stats.divmod_calls_inserted);
+  statistics_counter_event (cfun, "number of instructions uses divmod result",
+                            divmod_stats.divmod_result_used);
+
+  free_dominance_info (CDI_DOMINATORS);
+  free_dominance_info (CDI_POST_DOMINATORS);
+  return 0;
+}
+
+
+static bool
+gate_cse_divmod (void)
+{
+  return optimize;
+}
+
+struct gimple_opt_pass pass_cse_divmod =
+{
+ {
+  GIMPLE_PASS,
+  "divmod",                             /* name */
+  OPTGROUP_NONE,                        /* optinfo_flags */
+  gate_cse_divmod,                      /* gate */
+  execute_cse_divmod,                   /* execute */
+  NULL,                                 /* sub */
+  NULL,                                 /* next */
+  0,                                    /* static_pass_number */
+  TV_NONE,                              /* tv_id */
+  PROP_ssa,                             /* properties_required */
+  0,                                    /* properties_provided */
+  0,                                    /* properties_destroyed */
+  0,                                    /* todo_flags_start */
+  TODO_update_ssa | TODO_verify_ssa
+    | TODO_verify_stmts                /* todo_flags_finish */
+ }
+};
+
 /* Look for sin, cos and cexpi calls with the same argument NAME and
    create a single call to cexpi CSEing the result in this case.
    We first walk over all immediate uses of the argument collecting
@@ -717,15 +949,15 @@ execute_cse_sincos_1 (tree name)
       switch (DECL_FUNCTION_CODE (fndecl))
 	{
 	CASE_FLT_FN (BUILT_IN_COS):
-	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+          seen_cos |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	CASE_FLT_FN (BUILT_IN_SIN):
-	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+          seen_sin |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	CASE_FLT_FN (BUILT_IN_CEXPI):
-	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+          seen_cexpi |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	default:;

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PR43721] Failure to optimise (a/b) and (a%b) into single call
  2013-06-21  2:13   ` Kugan
@ 2013-09-03 10:57     ` Richard Biener
  2013-09-03 15:53       ` David Malcolm
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2013-09-03 10:57 UTC (permalink / raw)
  To: Kugan
  Cc: Richard Biener, GCC Development, Richard Earnshaw, Ramana Radhakrishnan

On Fri, Jun 21, 2013 at 4:12 AM, Kugan
<kugan.vivekanandarajah@linaro.org> wrote:
> On 17/06/13 19:07, Richard Biener wrote:
>>
>> On Mon, 17 Jun 2013, Kugan wrote:
>>
>>> Hi,
>>>
>>> I am attempting to fix Bug 43721 - Failure to optimise (a/b) and (a%b)
>>> into
>>> single __aeabi_idivmod call
>>> (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721)
>>>
>>> execute_cse_sincos tree level pass does similar cse so I attempted to use
>>> similar approach here. Div/mod cse is not really using built-in functions
>>> though at this level.
>>
>>
>> The issue with performing the transform at the same time as we
>> transform SINCOS is that the vectorizer will now no longer be able
>> to vectorize these loops.  It would need to be taught how to
>> handle the builtin calls (basically undo the transformation, I don't
>> know of any ISA that can do vectorized combined div/mod).  Which
>> means it should rather be done at the point we CSE reciprocals
>> (which also replaces computes with builtin target function calls).
>>
> Thanks Richard. Since execute_cse_reciprocals is handling reciprocals only,
> I added another pass to do divmod. Is that OK?
>
>
>>> For the case of div and mod operations, after CSE is performed, there
>>> isnt a
>>> way to represent the resulting stament in gimple. We will endup with
>>> divmod
>>> taking two arguments and returning double the size of one arguments in
>>> the
>>> three address format (divmod will return reminder and quotient so the
>>> return
>>> type is double the size of argument type).
>>>
>>> Since GIMPLE_ASSIGN will result in type checking failure in this case, I
>>> atempted use built-in functions (GIMPLE_CALL to represent the runtime
>>> library
>>> call). Name for the function here  is target specific and can be obtained
>>> from
>>> sdivmod_optab so the builtin function name defined in tree level is not
>>> used.
>>> I am not entirelt sure this is the right approach so I am attaching the
>>> first
>>> cut of the patch to get your feedback and understand the right approach
>>> to
>>> this problem.
>>
>>
>> If we don't want to expose new builtins to the user (I'm not sure we
>> want that), then using "internal functions" is an easier way to
>> avoid these issues (see gimple.h and internal-fn.(def|h)).
>>
> I have now changed to use internal functions. Thanks for that.
>
>
>> Generally the transform looks useful to me as it moves forward with
>> the general idea of moving pattern recognition done during RTL expansion
>> to an earlier place.
>>
>> For the use of a larger integer type and shifts to represent
>> the modulo and division result I don't think that's the very best
>> idea.  Instead resorting to a complex integer type as return
>> value looks more appealing (similar to sincos using cexpi here).
>> That way you also avoid the ugly hard-coding of bit-sizes.
>>
>
> I have changed it to use complex integers now.
>
>
>> +  if (HAVE_divsi3
>> +       || (GET_MODE_BITSIZE (TYPE_MODE (type)) != 32)
>>
>> watch out for types whose TYPE_PRECISION is not the bitsize
>> of their mode.  Also it should be GET_MODE_PRECISION here.
>>
>> +       || !optab_libfunc (TYPE_UNSIGNED (type)? udivmod_optab :
>> sdivmod_optab,
>> +                            TYPE_MODE (type)))
>>
>> targets that use a libfunc should also get this optimization, as
>> it always removes computations.  I think the proper test is
>> for whether the target can do division and/or modulus without
>> using a libfunc, not whether there is a divmod optab/libfunc.
>>
> I guess best way to do is by defining a target hook and let the target
> define the required behaviour. Is that what you had in mind?
>
> I have attached a modified patch with these changes.

+@deftypefn {Target Hook} bool TARGET_COMBINE_DIVMOD (enum
machine_mode @var{mode})

rather than talking about libcalls I'd say the target should indicate whether
it has a way to optimize division and modulo with the same operands for given
mode.  As we have sdivmod_optab and udivmod_optab already you should
check whether for the given mode it is != CODE_for_nothing.  Or even better,
follow some of the vectorizer expansion stuff and separate out a
predicate from expand_divmod () that tells you whether it would do something
interesting.

       NEXT_PASS (pass_cse_reciprocals);
+      NEXT_PASS (pass_cse_divmod);

of course I've meant that you perform divmod detection at the same
time reciprocals are optimized - thus in the very same pass.

Sorry for taking so long to review this - the patch now needs updating
for the pass-manager C++-ification, too.

Thanks,
Richard.

>
>> Others knowing this piece of the compiler better may want to comment
>> here, of course.
>>
>> Thanks,
>> Richard.
>>
>
>
>
> Thanks,
> Kugan

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PR43721] Failure to optimise (a/b) and (a%b) into single call
  2013-09-03 10:57     ` Richard Biener
@ 2013-09-03 15:53       ` David Malcolm
  0 siblings, 0 replies; 5+ messages in thread
From: David Malcolm @ 2013-09-03 15:53 UTC (permalink / raw)
  To: Kugan
  Cc: Richard Biener, Richard Biener, GCC Development,
	Richard Earnshaw, Ramana Radhakrishnan

On Tue, 2013-09-03 at 12:57 +0200, Richard Biener wrote:
> On Fri, Jun 21, 2013 at 4:12 AM, Kugan
> <kugan.vivekanandarajah@linaro.org> wrote:
> > On 17/06/13 19:07, Richard Biener wrote:
> >>
> >> On Mon, 17 Jun 2013, Kugan wrote:
> >>
> >>> Hi,
> >>>
> >>> I am attempting to fix Bug 43721 - Failure to optimise (a/b) and (a%b)
> >>> into
> >>> single __aeabi_idivmod call
> >>> (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721)
> >>>
> >>> execute_cse_sincos tree level pass does similar cse so I attempted to use
> >>> similar approach here. Div/mod cse is not really using built-in functions
> >>> though at this level.
> >>
> >>
> >> The issue with performing the transform at the same time as we
> >> transform SINCOS is that the vectorizer will now no longer be able
> >> to vectorize these loops.  It would need to be taught how to
> >> handle the builtin calls (basically undo the transformation, I don't
> >> know of any ISA that can do vectorized combined div/mod).  Which
> >> means it should rather be done at the point we CSE reciprocals
> >> (which also replaces computes with builtin target function calls).
> >>
> > Thanks Richard. Since execute_cse_reciprocals is handling reciprocals only,
> > I added another pass to do divmod. Is that OK?
> >
> >
> >>> For the case of div and mod operations, after CSE is performed, there
> >>> isnt a
> >>> way to represent the resulting stament in gimple. We will endup with
> >>> divmod
> >>> taking two arguments and returning double the size of one arguments in
> >>> the
> >>> three address format (divmod will return reminder and quotient so the
> >>> return
> >>> type is double the size of argument type).
> >>>
> >>> Since GIMPLE_ASSIGN will result in type checking failure in this case, I
> >>> atempted use built-in functions (GIMPLE_CALL to represent the runtime
> >>> library
> >>> call). Name for the function here  is target specific and can be obtained
> >>> from
> >>> sdivmod_optab so the builtin function name defined in tree level is not
> >>> used.
> >>> I am not entirelt sure this is the right approach so I am attaching the
> >>> first
> >>> cut of the patch to get your feedback and understand the right approach
> >>> to
> >>> this problem.
> >>
> >>
> >> If we don't want to expose new builtins to the user (I'm not sure we
> >> want that), then using "internal functions" is an easier way to
> >> avoid these issues (see gimple.h and internal-fn.(def|h)).
> >>
> > I have now changed to use internal functions. Thanks for that.
> >
> >
> >> Generally the transform looks useful to me as it moves forward with
> >> the general idea of moving pattern recognition done during RTL expansion
> >> to an earlier place.
> >>
> >> For the use of a larger integer type and shifts to represent
> >> the modulo and division result I don't think that's the very best
> >> idea.  Instead resorting to a complex integer type as return
> >> value looks more appealing (similar to sincos using cexpi here).
> >> That way you also avoid the ugly hard-coding of bit-sizes.
> >>
> >
> > I have changed it to use complex integers now.
> >
> >
> >> +  if (HAVE_divsi3
> >> +       || (GET_MODE_BITSIZE (TYPE_MODE (type)) != 32)
> >>
> >> watch out for types whose TYPE_PRECISION is not the bitsize
> >> of their mode.  Also it should be GET_MODE_PRECISION here.
> >>
> >> +       || !optab_libfunc (TYPE_UNSIGNED (type)? udivmod_optab :
> >> sdivmod_optab,
> >> +                            TYPE_MODE (type)))
> >>
> >> targets that use a libfunc should also get this optimization, as
> >> it always removes computations.  I think the proper test is
> >> for whether the target can do division and/or modulus without
> >> using a libfunc, not whether there is a divmod optab/libfunc.
> >>
> > I guess best way to do is by defining a target hook and let the target
> > define the required behaviour. Is that what you had in mind?
> >
> > I have attached a modified patch with these changes.
> 
> +@deftypefn {Target Hook} bool TARGET_COMBINE_DIVMOD (enum
> machine_mode @var{mode})
> 
> rather than talking about libcalls I'd say the target should indicate whether
> it has a way to optimize division and modulo with the same operands for given
> mode.  As we have sdivmod_optab and udivmod_optab already you should
> check whether for the given mode it is != CODE_for_nothing.  Or even better,
> follow some of the vectorizer expansion stuff and separate out a
> predicate from expand_divmod () that tells you whether it would do something
> interesting.
> 
>        NEXT_PASS (pass_cse_reciprocals);
> +      NEXT_PASS (pass_cse_divmod);
> 
> of course I've meant that you perform divmod detection at the same
> time reciprocals are optimized - thus in the very same pass.
> 
> Sorry for taking so long to review this - the patch now needs updating
> for the pass-manager C++-ification, too.

The NEXT_PASS addition now needs to happen to passes.def, rather than
passes.c

The opt_pass structs have become an inheritance hierarchy, with their
const data moving to a pass_data struct.  Hopefully the pattern is
obvious from looking at other passes.

FWIW, I wrote a script to make these latter changes, which may work for
your new pass, by running refactor_passes.py from
https://github.com/davidmalcolm/gcc-refactoring-scripts
It expects a source tree in "../src" relative to the checkout of the
refactoring scripts, but you can pass in a path to a specific files to
have it run on them.   Tested with Python 2.7; may run with 2.6.  It
edits files in-place, including the relevant ChangeLog file.

I wouldn't run the script *until* you've merged/rebased the other
changes from trunk back into your tree (assuming you're using git; from
your patch it looks like you are).

Hope this is helpful
Dave

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2013-09-03 15:53 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-16 15:33 [PR43721] Failure to optimise (a/b) and (a%b) into single call Kugan
2013-06-17  9:37 ` Richard Biener
2013-06-21  2:13   ` Kugan
2013-09-03 10:57     ` Richard Biener
2013-09-03 15:53       ` David Malcolm

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).