public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] widening_mul: Do cost check when propagating mult into plus/minus expressions
@ 2011-07-13 13:26 Andreas Krebbel
  2011-07-13 14:49 ` Richard Guenther
  2011-07-13 17:08 ` Richard Henderson
  0 siblings, 2 replies; 11+ messages in thread
From: Andreas Krebbel @ 2011-07-13 13:26 UTC (permalink / raw)
  To: gcc-patches

Hi,

the widening_mul pass might increase the number of multiplications in
the code by transforming

a = b * c
d = a + 2
e = a + 3

into:

d = b * c + 2
e = b * c + 3

under the assumption that an FMA instruction is not more expensive
than a simple add.  This certainly isn't always true.  While e.g. on
s390 an fma is indeed not slower than an add execution-wise it has
disadvantages regarding instruction grouping.  It doesn't group with
any other instruction what has a major impact on the instruction
dispatch bandwidth.

The following patch tries to figure out the costs for adds, mults and
fmas by building an RTX and asking the backends cost function in order
to estimate whether it is whorthwhile doing the transformation.

With that patch the 436.cactus hotloop contains 28 less
multiplications than before increasing performance slightly (~2%).

Bootstrapped and regtested on x86_64 and s390x.

Bye,

-Andreas-

2011-07-13  Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>

	* tree-ssa-math-opts.c (compute_costs): New function.
	(convert_mult_to_fma): Take costs into account when propagating
	multiplications into several additions.
	* config/s390/s390.c (z196_costs): Adjust costs for madbr and
	maebr.


Index: gcc/tree-ssa-math-opts.c
===================================================================
*** gcc/tree-ssa-math-opts.c.orig
--- gcc/tree-ssa-math-opts.c
*************** convert_plusminus_to_widen (gimple_stmt_
*** 2185,2190 ****
--- 2185,2252 ----
    return true;
  }
  
+ /* Computing the costs for calculating RTX with CODE in MODE.  */
+ 
+ static unsigned
+ compute_costs (enum machine_mode mode, enum rtx_code code, bool speed)
+ {
+   rtx seq;
+   rtx set;
+   unsigned cost = 0;
+ 
+   start_sequence ();
+ 
+   switch (GET_RTX_LENGTH (code))
+     {
+     case 2:
+       force_operand (gen_rtx_fmt_ee (code, mode,
+ 		       gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1),
+ 		       gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2)),
+ 		     NULL_RTX);
+       break;
+     case 3:
+       /* FMA expressions are not handled by force_operand.  */
+       expand_ternary_op (mode, fma_optab,
+ 			 gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1),
+ 			 gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2),
+ 			 gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 3),
+ 			 NULL_RTX, false);
+       break;
+     default:
+       gcc_unreachable ();
+     }
+ 
+   seq = get_insns ();
+   end_sequence ();
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     {
+       fprintf (dump_file, "Calculating costs of %s in %s mode.  Sequence is:\n",
+ 	       GET_RTX_NAME (code), GET_MODE_NAME (mode));
+       print_rtl (dump_file, seq);
+     }
+ 
+   for (; seq; seq = NEXT_INSN (seq))
+     {
+       set = single_set (seq);
+       if (set)
+ 	cost += rtx_cost (set, SET, speed);
+       else
+ 	cost++;
+     }
+ 
+   /* If the backend returns a cost of zero it is most certainly lying.
+      Set this to one in order to notice that we already calculated it
+      once.  */
+   cost = cost ? cost : 1;
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     fprintf (dump_file, "%s in %s costs %d\n\n",
+ 	       GET_RTX_NAME (code), GET_MODE_NAME (mode), cost);
+ 
+   return cost;
+ }
+ 
  /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
     with uses in additions and subtractions to form fused multiply-add
     operations.  Returns true if successful and MUL_STMT should be removed.  */
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2197,2202 ****
--- 2259,2270 ----
    gimple use_stmt, neguse_stmt, fma_stmt;
    use_operand_p use_p;
    imm_use_iterator imm_iter;
+   enum machine_mode mode;
+   int uses = 0;
+   bool speed = optimize_bb_for_speed_p (gimple_bb (mul_stmt));
+   static unsigned mul_cost[NUM_MACHINE_MODES];
+   static unsigned add_cost[NUM_MACHINE_MODES];
+   static unsigned fma_cost[NUM_MACHINE_MODES];
  
    if (FLOAT_TYPE_P (type)
        && flag_fp_contract_mode == FP_CONTRACT_OFF)
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2213,2222 ****
    if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
      return false;
  
    /* Make sure that the multiplication statement becomes dead after
!      the transformation, thus that all uses are transformed to FMAs.
!      This means we assume that an FMA operation has the same cost
!      as an addition.  */
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
      {
        enum tree_code use_code;
--- 2281,2297 ----
    if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
      return false;
  
+   mode = TYPE_MODE (type);
+ 
+   if (!fma_cost[mode])
+     {
+       fma_cost[mode] = compute_costs (mode, FMA, speed);
+       add_cost[mode] = compute_costs (mode, PLUS, speed);
+       mul_cost[mode] = compute_costs (mode, MULT, speed);
+     }
+ 
    /* Make sure that the multiplication statement becomes dead after
!      the transformation, thus that all uses are transformed to FMAs.  */
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
      {
        enum tree_code use_code;
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2292,2297 ****
--- 2367,2373 ----
        if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
  	return false;
  
+       uses++;
        /* While it is possible to validate whether or not the exact form
  	 that we've recognized is available in the backend, the assumption
  	 is that the transformation is never a loss.  For instance, suppose
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2302,2307 ****
--- 2378,2389 ----
  	 independant and could be run in parallel.  */
      }
  
+   /* Calculate the costs of moving the multiplication into all the
+      minus/plus expressions.  */
+ 
+   if (uses * fma_cost[mode] > uses * add_cost[mode] + mul_cost[mode])
+     return false;
+ 
    FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
      {
        gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
Index: gcc/config/s390/s390.c
===================================================================
*** gcc/config/s390/s390.c.orig
--- gcc/config/s390/s390.c
*************** struct processor_costs z196_cost =
*** 242,249 ****
    COSTS_N_INSNS (100),   /* SQXBR B+100 */
    COSTS_N_INSNS (42),    /* SQDBR B+42 */
    COSTS_N_INSNS (28),    /* SQEBR B+28 */
!   COSTS_N_INSNS (1),     /* MADBR B */
!   COSTS_N_INSNS (1),     /* MAEBR B */
    COSTS_N_INSNS (101),   /* DXBR B+101 */
    COSTS_N_INSNS (29),    /* DDBR */
    COSTS_N_INSNS (22),    /* DEBR */
--- 242,250 ----
    COSTS_N_INSNS (100),   /* SQXBR B+100 */
    COSTS_N_INSNS (42),    /* SQDBR B+42 */
    COSTS_N_INSNS (28),    /* SQEBR B+28 */
!   /* Cheaper than a mul+add but more expensive then a single mul/add.  */
!   COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MADBR B */
!   COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MAEBR B */
    COSTS_N_INSNS (101),   /* DXBR B+101 */
    COSTS_N_INSNS (29),    /* DDBR */
    COSTS_N_INSNS (22),    /* DEBR */

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2011-07-15  7:59 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-13 13:26 [PATCH] widening_mul: Do cost check when propagating mult into plus/minus expressions Andreas Krebbel
2011-07-13 14:49 ` Richard Guenther
2011-07-13 15:27   ` Georg-Johann Lay
2011-07-13 23:29   ` Steven Bosscher
2011-07-14  9:43     ` Richard Guenther
2011-07-14 16:16       ` Steven Bosscher
2011-07-15  9:06       ` Andreas Krebbel
2011-07-14  7:45   ` Andreas Krebbel
2011-07-13 17:08 ` Richard Henderson
2011-07-14  9:28   ` Andreas Krebbel
2011-07-14 13:32   ` Andreas Krebbel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).