public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* Re: [PATCH][modulo-sched] Remove profitability check
       [not found] <OF2D1B96D8.9BD526E2-ONC225732C.005EA0B6-C2257330.004378F1@LocalDomain>
@ 2007-08-07 19:55 ` Ayal Zaks
  2007-08-08  6:29   ` Revital1 Eres
  0 siblings, 1 reply; 3+ messages in thread
From: Ayal Zaks @ 2007-08-07 19:55 UTC (permalink / raw)
  To: Revital1 Eres; +Cc: abel, gcc-patches, Kenneth.Zadeck, volodyan

Revital1 Eres/Haifa/IBM wrote on 07/08/2007 15:16:58:

> Hello,
>
> This patch is the third one in the series of patches originated from
> patch 1 of 2 (http://gcc.gnu.org/ml/gcc-patches/2007-01/msg01468.html).
>
> A reminder - here is the list of issues patch 1 of 2 addresses:
>
> 1.1 Avoid SMS when the loop contains inc instruction. (commited)
> 1.2 Fix removal of anti-deps. (commited)
> 1.3 Add -fsms-allow-reg-moves flag. (commited)
> 1.4 Fix order of instructions within one cycle.
> 1.5 Remove profitability checks.
>
> The attached patch addresses items 1.4 and 1.5 above.
> It also adds dump information.
>
> A new reduced Fortran testcase for forall_10.f90 test (sms-2.f90) was
> added; this testcase failed with -fmodulo-sched flag when the current
> patch is not applied. It caused by applying wrong order of instructions
> within one cycle which is fixed in the current patch.
>
> :ADDPATCH middle-end (modulo-sched):
>
> This patch was bootstrapped and tested on PPC and x86_64 (also with
> --enable-checking=assert), with and without -fmodulo-sched-allow-regmoves
> flag; with no new regressions.
>
> OK for mainline?
>

OK.
Please include your name in ChangeLog, as this is not entirely Vladimir's
original patch, and see minor correction below.

Ayal.

> Thanks,
> Revital
>
> 2007-08-07  Vladimir Yanovsky  <yanov@il.ibm.com>
>
>         * ddg.c (print_ddg): Add dump information.
>         * modulo-sched.c (print_node_sched_params): Add parameter and
>         verbosity.
>         (calculate_maxii): Remove function.
>         (undo_generate_reg_moves): Likewise.
>         (undo_permute_partial_schedule): Likewise.
>         (kernel_number_of_cycles): Likewise.
>         (MAXII_FACTOR): New definition to calculate the upper bound of
II.
>         (sms_schedule): Use it.  Remove profitability checks.
>         (sms_schedule_by_order): Fix order of nodes within the cycle.
>
>         * gfortran.dg/sms-1.f90: Add comment.
>         * gfortran.dg/sms-2.f90: New.

>

----- Forwarded by Ayal Zaks/Haifa/IBM on 07/08/2007 17:42 -----

Ayal Zaks/Haifa/IBM wrote on 07/08/2007 17:42:04:

> Index: ddg.c
> ===================================================================
> --- ddg.c (revision 127223)
> +++ ddg.c (working copy)
> @@ -568,6 +568,7 @@
>      {
>        ddg_edge_ptr e;
>
> +      fprintf (file, "Node num: %d\n", g->nodes[i].cuid);
>        print_rtl_single (file, g->nodes[i].insn);
>        fprintf (file, "OUT ARCS: ");
>        for (e = g->nodes[i].out; e; e = e->next_out)
> Index: testsuite/gfortran.dg/sms-1.f90
> ===================================================================
> --- testsuite/gfortran.dg/sms-1.f90 (revision 127223)
> +++ testsuite/gfortran.dg/sms-1.f90 (working copy)
> @@ -1,5 +1,7 @@
>  ! { dg-do run }
> -! { dg-options "-O2 -fmodulo-sched" }
> +! { dg-options "-O2 -fmodulo-sched" }
> +! This testcase related to INC instruction which is
> +! currently not supported in SMS.
>  program main
>    integer (kind = 8) :: i, l8, u8, step8
>    integer (kind = 4) :: l4, step4
> Index: testsuite/gfortran.dg/sms-2.f90
> ===================================================================
> --- testsuite/gfortran.dg/sms-2.f90 (revision 0)
> +++ testsuite/gfortran.dg/sms-2.f90 (revision 0)
> @@ -0,0 +1,19 @@
> +! { dg-do run }
> +! { dg-options "-O2 -fmodulo-sched" }
> +! This testcase related to wrong order within a cycle fix.
> +!
> +program foo
> +  real, dimension (5, 5, 5, 5) :: a
> +
> +  a (:, :, :,  :)  = 4
> +  a (:, 2, :, 4) = 10
> +  a (:, 2, :, 1) = 0
> +
> +  forall (i = 1:5, i == 3)
> +     a(i, i, i, i) = -5
> +   end forall
> +
> +  if (sum (a) .ne. 2541.0) call abort ()
> +end
> +
> +
> Index: modulo-sched.c
> ===================================================================
> --- modulo-sched.c (revision 127223)
> +++ modulo-sched.c (working copy)
> @@ -159,7 +159,6 @@
>  static void free_partial_schedule (partial_schedule_ptr);
>  static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
>  void print_partial_schedule (partial_schedule_ptr, FILE *);
> -static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
>  static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
>        ddg_node_ptr node, int cycle,
>        sbitmap must_precede,
> @@ -365,7 +364,7 @@
>  }
>
>  static void
> -print_node_sched_params (FILE * file, int num_nodes)
> +print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
>  {
>    int i;
>
> @@ -377,7 +376,8 @@
>        rtx reg_move = nsp->first_reg_move;
>        int j;
>
> -      fprintf (file, "Node %d:\n", i);
> +      fprintf (dump_file, "Node = %d; INSN = %d\n", i,
                  ^^^^^^^^^
should be 'file'


> +        (INSN_UID (g->nodes[i].insn)));
>        fprintf (file, " asap = %d:\n", nsp->asap);
>        fprintf (file, " time = %d:\n", nsp->time);
>        fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
> @@ -390,29 +390,6 @@
>      }
>  }
>
> -/* Calculate an upper bound for II.  SMS should not schedule the loop if
it
> -   requires more cycles than this bound.  Currently set to the sum of
the
> -   longest latency edge for each node.  Reset based on experiments.  */
> -static int
> -calculate_maxii (ddg_ptr g)
> -{
> -  int i;
> -  int maxii = 0;
> -
> -  for (i = 0; i < g->num_nodes; i++)
> -    {
> -      ddg_node_ptr u = &g->nodes[i];
> -      ddg_edge_ptr e;
> -      int max_edge_latency = 0;
> -
> -      for (e = u->out; e; e = e->next_out)
> - max_edge_latency = MAX (max_edge_latency, e->latency);
> -
> -      maxii += max_edge_latency;
> -    }
> -  return maxii;
> -}
> -
>  /*
>     Breaking intra-loop register anti-dependences:
>     Each intra-loop register anti-dependence implies a cross-iteration
true
> @@ -533,40 +510,6 @@
>    return reg_move_replaces;
>  }
>
> -/* We call this when we want to undo the SMS schedule for a given loop.
> -   One of the things that we do is to delete the register moves
generated
> -   for the sake of SMS; this function deletes the register move
instructions
> -   recorded in the undo buffer.  */
> -static void
> -undo_generate_reg_moves (partial_schedule_ptr ps,
> -    struct undo_replace_buff_elem *reg_move_replaces)
> -{
> -  int i,j;
> -
> -  for (i = 0; i < ps->g->num_nodes; i++)
> -    {
> -      ddg_node_ptr u = &ps->g->nodes[i];
> -      rtx prev;
> -      rtx crr = SCHED_FIRST_REG_MOVE (u);
> -
> -      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
> - {
> -   prev = PREV_INSN (crr);
> -   delete_insn (crr);
> -   crr = prev;
> - }
> -      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
> -    }
> -
> -  while (reg_move_replaces)
> -    {
> -      struct undo_replace_buff_elem *rep = reg_move_replaces;
> -
> -      reg_move_replaces = reg_move_replaces->next;
> -      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
> -    }
> -}
> -
>  /* Free memory allocated for the undo buffer.  */
>  static void
>  free_undo_replace_buff (struct undo_replace_buff_elem
*reg_move_replaces)
> @@ -638,29 +581,7 @@
>         PREV_INSN (last));
>  }
>
> -/* As part of undoing SMS we return to the original ordering of the
> -   instructions inside the loop kernel.  Given the partial schedule PS,
this
> -   function returns the ordering of the instruction according to their
CUID
> -   in the DDG (PS->G), which is the original order of the instruction
before
> -   performing SMS.  */
>  static void
> -undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
> -{
> -  int i;
> -
> -  for (i = 0 ; i < ps->g->num_nodes; i++)
> -    if (last == ps->g->nodes[i].insn
> - || last == ps->g->nodes[i].first_note)
> -      break;
> -    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
> -      reorder_insns_nobb (ps->g->nodes[i].first_note,
ps->g->nodes[i].insn,
> -     PREV_INSN (last));
> -}
> -
> -/* Used to generate the prologue & epilogue.  Duplicate the subset of
> -   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
> -   of both), together with a prefix/suffix of their reg_moves.  */
> -static void
>  duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
>        int to_stage, int for_prolog)
>  {
> @@ -869,6 +790,9 @@
>     version may be entered.  Just a guess.  */
>  #define PROB_SMS_ENOUGH_ITERATIONS 80
>
> +/* Used to calculate the upper bound of ii.  */
> +#define MAXII_FACTOR 2
> +
>  /* Main entry point, perform SMS scheduling on the loops of the function
>     that consist of single basic blocks.  */
>  static void
> @@ -1097,7 +1021,7 @@
>        mii = 1; /* Need to pass some estimate of mii.  */
>        rec_mii = sms_order_nodes (g, mii, node_order);
>        mii = MAX (res_MII (g), rec_mii);
> -      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
> +      maxii = MAXII_FACTOR * mii;
>
>        if (dump_file)
>   fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
> @@ -1131,8 +1055,6 @@
>   }
>        else
>   {
> -   int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END
(g->bb));
> -   int new_cycles;
>     struct undo_replace_buff_elem *reg_move_replaces;
>
>     if (dump_file)
> @@ -1154,68 +1076,46 @@
>     normalize_sched_times (ps);
>     rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
>     set_columns_for_ps (ps);
> +
> +   canon_loop (loop);
>
> -   /* Generate the kernel just to be able to measure its cycles.  */
> -   permute_partial_schedule (ps, g->closing_branch->first_note);
> -   reg_move_replaces = generate_reg_moves (ps, false);
> +          /* case the BCT count is not known , Do loop-versioning */
> +   if (count_reg && ! count_init)
> +            {
> +       rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
> +            GEN_INT(stage_count));
> +       unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
> +          * REG_BR_PROB_BASE) / 100;
>
> -   /* Get the number of cycles the new kernel expect to execute in.  */
> -   new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END
(g->bb));
> +       loop_version (loop, comp_rtx, &condition_bb,
> +         prob, prob, REG_BR_PROB_BASE - prob,
> +       true);
> +      }
>
> -   /* Get back to the original loop so we can do loop versioning.  */
> -   undo_permute_partial_schedule (ps, g->closing_branch->first_note);
> -   if (reg_move_replaces)
> -     undo_generate_reg_moves (ps, reg_move_replaces);
> +   /* Set new iteration count of loop kernel.  */
> +          if (count_reg && count_init)
> +     SET_SRC (single_set (count_init)) = GEN_INT (loop_count
> +           - stage_count + 1);
>
> -   if ( new_cycles >= orig_cycles)
> -     {
> -       /* SMS is not profitable so undo the permutation and reg move
generation
> -          and return the kernel to its original state.  */
> -       if (dump_file)
> -  fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
> +   /* Now apply the scheduled kernel to the RTL of the loop.  */
> +   permute_partial_schedule (ps, g->closing_branch->first_note);
>
> -     }
> -   else
> -     {
> -       canon_loop (loop);
> +          /* Mark this loop as software pipelined so the later
> +      scheduling passes doesn't touch it.  */
> +   if (! flag_resched_modulo_sched)
> +     g->bb->flags |= BB_DISABLE_SCHEDULE;
> +   /* The life-info is not valid any more.  */
> +   df_set_bb_dirty (g->bb);
>
> -              /* case the BCT count is not known , Do loop-versioning */
> -       if (count_reg && ! count_init)
> -  {
> -    rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
> -       GEN_INT(stage_count));
> -    unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
> -       * REG_BR_PROB_BASE) / 100;
> -
> -    loop_version (loop, comp_rtx, &condition_bb,
> -    prob, prob, REG_BR_PROB_BASE - prob,
> -    true);
> -  }
> -
> -       /* Set new iteration count of loop kernel.  */
> -              if (count_reg && count_init)
> -  SET_SRC (single_set (count_init)) = GEN_INT (loop_count
> -            - stage_count + 1);
> -
> -       /* Now apply the scheduled kernel to the RTL of the loop.  */
> -       permute_partial_schedule (ps, g->closing_branch->first_note);
> -
> -              /* Mark this loop as software pipelined so the later
> -       scheduling passes doesn't touch it.  */
> -       if (! flag_resched_modulo_sched)
> -  g->bb->flags |= BB_DISABLE_SCHEDULE;
> -       /* The life-info is not valid any more.  */
> -       df_set_bb_dirty (g->bb);
> -
> -       reg_move_replaces = generate_reg_moves (ps, true);
> -       if (dump_file)
> -  print_node_sched_params (dump_file, g->num_nodes);
> -       /* Generate prolog and epilog.  */
> -       if (count_reg && !count_init)
> -  generate_prolog_epilog (ps, loop, count_reg);
> -       else
> -   generate_prolog_epilog (ps, loop, NULL_RTX);
> -     }
> +   reg_move_replaces = generate_reg_moves (ps, true);
> +   if (dump_file)
> +     print_node_sched_params (dump_file, g->num_nodes, g);
> +   /* Generate prolog and epilog.  */
> +   if (count_reg && !count_init)
> +     generate_prolog_epilog (ps, loop, count_reg);
> +   else
> +     generate_prolog_epilog (ps, loop, NULL_RTX);
> +
>     free_undo_replace_buff (reg_move_replaces);
>   }
>
> @@ -1529,17 +1429,21 @@
>        of nodes within the cycle.  */
>            sbitmap_zero (must_precede);
>            sbitmap_zero (must_follow);
> -         for (e = u_node->in; e != 0; e = e->next_in)
> +          /* TODO: We can add an insn to the must_precede or must_follow
> +             bitmaps only if it has tight dependence to U and they
> +             both scheduled in the same row.  The current check is less
> +             conservative and content with the fact that both U and the
> +             insn are scheduled in the same row.  */
> +          for (e = u_node->in; e != 0; e = e->next_in)
>              if (TEST_BIT (sched_nodes, e->src->cuid)
> -         && e->latency == (ii * e->distance)
> -  && start == SCHED_TIME (e->src))
> -             SET_BIT (must_precede, e->src->cuid);
> +                && (SMODULO (SCHED_TIME (e->src), ii) == SMODULO (start,
ii)))
> +              SET_BIT (must_precede, e->src->cuid);
>
> -   for (e = u_node->out; e != 0; e = e->next_out)
> +          for (e = u_node->out; e != 0; e = e->next_out)
>              if (TEST_BIT (sched_nodes, e->dest->cuid)
> -         && e->latency == (ii * e->distance)
> -  && end == SCHED_TIME (e->dest))
> -             SET_BIT (must_follow, e->dest->cuid);
> +                && (SMODULO (SCHED_TIME (e->dest), ii) ==
> +                    SMODULO ((end - step), ii)))
> +              SET_BIT (must_follow, e->dest->cuid);
>
>     success = 0;
>     if ((step > 0 && start < end) ||  (step < 0 && start > end))
> @@ -2259,58 +2163,8 @@
>          targetm.sched.dfa_post_cycle_insn ());
>  }
>
> -/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
> -   the number of cycles according to DFA that the kernel fits in,
> -   we use this to check if we done well with SMS after we add
> -   register moves.  In some cases register moves overhead makes
> -   it even worse than the original loop.  We want SMS to be performed
> -   when it gives less cycles after register moves are added.  */
> -static int
> -kernel_number_of_cycles (rtx first_insn, rtx last_insn)
> -{
> -  int cycles = 0;
> -  rtx insn;
> -  int can_issue_more = issue_rate;
>
> -  state_reset (curr_state);
>
> -  for (insn = first_insn;
> -       insn != NULL_RTX && insn != last_insn;
> -       insn = NEXT_INSN (insn))
> -    {
> -      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
> - continue;
> -
> -      /* Check if there is room for the current insn.  */
> -      if (!can_issue_more || state_dead_lock_p (curr_state))
> - {
> -   cycles ++;
> -   advance_one_cycle ();
> -   can_issue_more = issue_rate;
> - }
> -
> - /* Update the DFA state and return with failure if the DFA found
> -    recource conflicts.  */
> -      if (state_transition (curr_state, insn) >= 0)
> - {
> -   cycles ++;
> -   advance_one_cycle ();
> -   can_issue_more = issue_rate;
> - }
> -
> -      if (targetm.sched.variable_issue)
> - can_issue_more =
> -   targetm.sched.variable_issue (sched_dump, sched_verbose,
> -     insn, can_issue_more);
> -      /* A naked CLOBBER or USE generates no instruction, so don't
> -  let them consume issue slots.  */
> -      else if (GET_CODE (PATTERN (insn)) != USE
> -        && GET_CODE (PATTERN (insn)) != CLOBBER)
> - can_issue_more--;
> -    }
> -  return cycles;
> -}
> -
>  /* Checks if PS has resource conflicts according to DFA, starting from
>     FROM cycle to TO cycle; returns true if there are conflicts and false
>     if there are no conflicts.  Assumes DFA is being used.  */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH][modulo-sched] Remove profitability check
  2007-08-07 19:55 ` [PATCH][modulo-sched] Remove profitability check Ayal Zaks
@ 2007-08-08  6:29   ` Revital1 Eres
  0 siblings, 0 replies; 3+ messages in thread
From: Revital1 Eres @ 2007-08-08  6:29 UTC (permalink / raw)
  To: Ayal Zaks; +Cc: abel, gcc-patches, Kenneth.Zadeck, volodyan



gcc-patches-owner@gcc.gnu.org wrote on 07/08/2007 22:55:16:

> Revital1 Eres/Haifa/IBM wrote on 07/08/2007 15:16:58:
>
> > Hello,
> >
> > This patch is the third one in the series of patches originated from
> > patch 1 of 2 (http://gcc.gnu.org/ml/gcc-patches/2007-01/msg01468.html).
> >
> > A reminder - here is the list of issues patch 1 of 2 addresses:
> >
> > 1.1 Avoid SMS when the loop contains inc instruction. (commited)
> > 1.2 Fix removal of anti-deps. (commited)
> > 1.3 Add -fsms-allow-reg-moves flag. (commited)
> > 1.4 Fix order of instructions within one cycle.
> > 1.5 Remove profitability checks.
> >
> > The attached patch addresses items 1.4 and 1.5 above.
> > It also adds dump information.
> >
> > A new reduced Fortran testcase for forall_10.f90 test (sms-2.f90) was
> > added; this testcase failed with -fmodulo-sched flag when the current
> > patch is not applied. It caused by applying wrong order of instructions
> > within one cycle which is fixed in the current patch.
> >
> > :ADDPATCH middle-end (modulo-sched):
> >
> > This patch was bootstrapped and tested on PPC and x86_64 (also with
> > --enable-checking=assert), with and without
-fmodulo-sched-allow-regmoves
> > flag; with no new regressions.
> >
> > OK for mainline?
> >
>
> OK.
> Please include your name in ChangeLog, as this is not entirely Vladimir's
> original patch, and see minor correction below.

Sure, I'll commit the patch with this change.

Thanks,
Revital

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH][modulo-sched] Remove profitability check
@ 2007-08-07 12:18 Revital1 Eres
  0 siblings, 0 replies; 3+ messages in thread
From: Revital1 Eres @ 2007-08-07 12:18 UTC (permalink / raw)
  To: Ayal Zaks; +Cc: Kenneth.Zadeck, volodyan, abel, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1759 bytes --]


Hello,

This patch is the third one in the series of patches originated from
patch 1 of 2 (http://gcc.gnu.org/ml/gcc-patches/2007-01/msg01468.html).

A reminder - here is the list of issues patch 1 of 2 addresses:

1.1 Avoid SMS when the loop contains inc instruction. (commited)
1.2 Fix removal of anti-deps. (commited)
1.3 Add -fsms-allow-reg-moves flag. (commited)
1.4 Fix order of instructions within one cycle.
1.5 Remove profitability checks.

The attached patch addresses items 1.4 and 1.5 above.
It also adds dump information.

A new reduced Fortran testcase for forall_10.f90 test (sms-2.f90) was
added; this testcase failed with -fmodulo-sched flag when the current
patch is not applied. It caused by applying wrong order of instructions
within one cycle which is fixed in the current patch.

:ADDPATCH middle-end (modulo-sched):

This patch was bootstrapped and tested on PPC and x86_64 (also with
--enable-checking=assert), with and without -fmodulo-sched-allow-regmoves
flag; with no new regressions.

OK for mainline?

Thanks,
Revital


2007-08-07  Vladimir Yanovsky  <yanov@il.ibm.com>

        * ddg.c (print_ddg): Add dump information.
        * modulo-sched.c (print_node_sched_params): Add parameter and
        verbosity.
        (calculate_maxii): Remove function.
        (undo_generate_reg_moves): Likewise.
        (undo_permute_partial_schedule): Likewise.
        (kernel_number_of_cycles): Likewise.
        (MAXII_FACTOR): New definition to calculate the upper bound of II.
        (sms_schedule): Use it.  Remove profitability checks.
        (sms_schedule_by_order): Fix order of nodes within the cycle.

        * gfortran.dg/sms-1.f90: Add comment.
        * gfortran.dg/sms-2.f90: New.


(See attached file: patch_1_of_2.txt)

[-- Attachment #2: patch_1_of_2.txt --]
[-- Type: text/plain, Size: 14118 bytes --]

Index: ddg.c
===================================================================
--- ddg.c	(revision 127223)
+++ ddg.c	(working copy)
@@ -568,6 +568,7 @@
     {
       ddg_edge_ptr e;
 
+      fprintf (file, "Node num: %d\n", g->nodes[i].cuid);
       print_rtl_single (file, g->nodes[i].insn);
       fprintf (file, "OUT ARCS: ");
       for (e = g->nodes[i].out; e; e = e->next_out)
Index: testsuite/gfortran.dg/sms-1.f90
===================================================================
--- testsuite/gfortran.dg/sms-1.f90	(revision 127223)
+++ testsuite/gfortran.dg/sms-1.f90	(working copy)
@@ -1,5 +1,7 @@
 ! { dg-do run }
-! { dg-options "-O2 -fmodulo-sched" } 
+! { dg-options "-O2 -fmodulo-sched" }
+! This testcase related to INC instruction which is
+! currently not supported in SMS. 
 program main
   integer (kind = 8) :: i, l8, u8, step8
   integer (kind = 4) :: l4, step4
Index: testsuite/gfortran.dg/sms-2.f90
===================================================================
--- testsuite/gfortran.dg/sms-2.f90	(revision 0)
+++ testsuite/gfortran.dg/sms-2.f90	(revision 0)
@@ -0,0 +1,19 @@
+! { dg-do run }
+! { dg-options "-O2 -fmodulo-sched" }
+! This testcase related to wrong order within a cycle fix.
+!
+program foo 
+  real, dimension (5, 5, 5, 5) :: a
+
+  a (:, :, :,  :)  = 4
+  a (:, 2, :, 4) = 10
+  a (:, 2, :, 1) = 0
+
+  forall (i = 1:5, i == 3) 
+     a(i, i, i, i) = -5
+   end forall
+
+  if (sum (a) .ne. 2541.0) call abort ()
+end
+
+
Index: modulo-sched.c
===================================================================
--- modulo-sched.c	(revision 127223)
+++ modulo-sched.c	(working copy)
@@ -159,7 +159,6 @@
 static void free_partial_schedule (partial_schedule_ptr);
 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
 void print_partial_schedule (partial_schedule_ptr, FILE *);
-static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
 						ddg_node_ptr node, int cycle,
 						sbitmap must_precede,
@@ -365,7 +364,7 @@
 }
 
 static void
-print_node_sched_params (FILE * file, int num_nodes)
+print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
 {
   int i;
 
@@ -377,7 +376,8 @@
       rtx reg_move = nsp->first_reg_move;
       int j;
 
-      fprintf (file, "Node %d:\n", i);
+      fprintf (dump_file, "Node = %d; INSN = %d\n", i,
+	       (INSN_UID (g->nodes[i].insn)));
       fprintf (file, " asap = %d:\n", nsp->asap);
       fprintf (file, " time = %d:\n", nsp->time);
       fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
@@ -390,29 +390,6 @@
     }
 }
 
-/* Calculate an upper bound for II.  SMS should not schedule the loop if it
-   requires more cycles than this bound.  Currently set to the sum of the
-   longest latency edge for each node.  Reset based on experiments.  */
-static int
-calculate_maxii (ddg_ptr g)
-{
-  int i;
-  int maxii = 0;
-
-  for (i = 0; i < g->num_nodes; i++)
-    {
-      ddg_node_ptr u = &g->nodes[i];
-      ddg_edge_ptr e;
-      int max_edge_latency = 0;
-
-      for (e = u->out; e; e = e->next_out)
-	max_edge_latency = MAX (max_edge_latency, e->latency);
-
-      maxii += max_edge_latency;
-    }
-  return maxii;
-}
-
 /*
    Breaking intra-loop register anti-dependences:
    Each intra-loop register anti-dependence implies a cross-iteration true
@@ -533,40 +510,6 @@
   return reg_move_replaces;
 }
 
-/* We call this when we want to undo the SMS schedule for a given loop.
-   One of the things that we do is to delete the register moves generated
-   for the sake of SMS; this function deletes the register move instructions
-   recorded in the undo buffer.  */
-static void
-undo_generate_reg_moves (partial_schedule_ptr ps,
-			 struct undo_replace_buff_elem *reg_move_replaces)
-{
-  int i,j;
-
-  for (i = 0; i < ps->g->num_nodes; i++)
-    {
-      ddg_node_ptr u = &ps->g->nodes[i];
-      rtx prev;
-      rtx crr = SCHED_FIRST_REG_MOVE (u);
-
-      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
-	{
-	  prev = PREV_INSN (crr);
-	  delete_insn (crr);
-	  crr = prev;
-	}
-      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
-    }
-
-  while (reg_move_replaces)
-    {
-      struct undo_replace_buff_elem *rep = reg_move_replaces;
-
-      reg_move_replaces = reg_move_replaces->next;
-      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
-    }
-}
-
 /* Free memory allocated for the undo buffer.  */
 static void
 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
@@ -638,29 +581,7 @@
 			    PREV_INSN (last));
 }
 
-/* As part of undoing SMS we return to the original ordering of the
-   instructions inside the loop kernel.  Given the partial schedule PS, this
-   function returns the ordering of the instruction according to their CUID
-   in the DDG (PS->G), which is the original order of the instruction before
-   performing SMS.  */
 static void
-undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
-{
-  int i;
-
-  for (i = 0 ; i < ps->g->num_nodes; i++)
-    if (last == ps->g->nodes[i].insn
-	|| last == ps->g->nodes[i].first_note)
-      break;
-    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
-      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
-			  PREV_INSN (last));
-}
-
-/* Used to generate the prologue & epilogue.  Duplicate the subset of
-   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
-   of both), together with a prefix/suffix of their reg_moves.  */
-static void
 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
 			   int to_stage, int for_prolog)
 {
@@ -869,6 +790,9 @@
    version may be entered.  Just a guess.  */
 #define PROB_SMS_ENOUGH_ITERATIONS 80
 
+/* Used to calculate the upper bound of ii.  */
+#define MAXII_FACTOR 2
+
 /* Main entry point, perform SMS scheduling on the loops of the function
    that consist of single basic blocks.  */
 static void
@@ -1097,7 +1021,7 @@
       mii = 1; /* Need to pass some estimate of mii.  */
       rec_mii = sms_order_nodes (g, mii, node_order);
       mii = MAX (res_MII (g), rec_mii);
-      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
+      maxii = MAXII_FACTOR * mii;
 
       if (dump_file)
 	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
@@ -1131,8 +1055,6 @@
 	}
       else
 	{
-	  int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
-	  int new_cycles;
 	  struct undo_replace_buff_elem *reg_move_replaces;
 
 	  if (dump_file)
@@ -1154,68 +1076,46 @@
 	  normalize_sched_times (ps);
 	  rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
 	  set_columns_for_ps (ps);
+	  
+	  canon_loop (loop);
 
-	  /* Generate the kernel just to be able to measure its cycles.  */
-	  permute_partial_schedule (ps, g->closing_branch->first_note);
-	  reg_move_replaces = generate_reg_moves (ps, false);
+          /* case the BCT count is not known , Do loop-versioning */
+	  if (count_reg && ! count_init)
+            {
+	      rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
+	  				     GEN_INT(stage_count));
+	      unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
+			       * REG_BR_PROB_BASE) / 100;
 
-	  /* Get the number of cycles the new kernel expect to execute in.  */
-	  new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
+	      loop_version (loop, comp_rtx, &condition_bb,
+	  		    prob, prob, REG_BR_PROB_BASE - prob,
+			    true);
+	     }
 
-	  /* Get back to the original loop so we can do loop versioning.  */
-	  undo_permute_partial_schedule (ps, g->closing_branch->first_note);
-	  if (reg_move_replaces)
-	    undo_generate_reg_moves (ps, reg_move_replaces);
+	  /* Set new iteration count of loop kernel.  */
+          if (count_reg && count_init)
+	    SET_SRC (single_set (count_init)) = GEN_INT (loop_count
+						     - stage_count + 1);
 
-	  if ( new_cycles >= orig_cycles)
-	    {
-	      /* SMS is not profitable so undo the permutation and reg move generation
-	         and return the kernel to its original state.  */
-	      if (dump_file)
-		fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
+	  /* Now apply the scheduled kernel to the RTL of the loop.  */
+	  permute_partial_schedule (ps, g->closing_branch->first_note);
 
-	    }
-	  else
-	    {
-	      canon_loop (loop);
+          /* Mark this loop as software pipelined so the later
+	     scheduling passes doesn't touch it.  */
+	  if (! flag_resched_modulo_sched)
+	    g->bb->flags |= BB_DISABLE_SCHEDULE;
+	  /* The life-info is not valid any more.  */
+	  df_set_bb_dirty (g->bb);
 
-              /* case the BCT count is not known , Do loop-versioning */
-	      if (count_reg && ! count_init)
-		{
-		  rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
-						 GEN_INT(stage_count));
-		  unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
-				   * REG_BR_PROB_BASE) / 100;
-
-		  loop_version (loop, comp_rtx, &condition_bb,
-				prob, prob, REG_BR_PROB_BASE - prob,
-				true);
-		}
-
-	      /* Set new iteration count of loop kernel.  */
-              if (count_reg && count_init)
-		SET_SRC (single_set (count_init)) = GEN_INT (loop_count
-							     - stage_count + 1);
-
-	      /* Now apply the scheduled kernel to the RTL of the loop.  */
-	      permute_partial_schedule (ps, g->closing_branch->first_note);
-
-              /* Mark this loop as software pipelined so the later
-	      scheduling passes doesn't touch it.  */
-	      if (! flag_resched_modulo_sched)
-		g->bb->flags |= BB_DISABLE_SCHEDULE;
-	      /* The life-info is not valid any more.  */
-	      df_set_bb_dirty (g->bb);
-
-	      reg_move_replaces = generate_reg_moves (ps, true);
-	      if (dump_file)
-		print_node_sched_params (dump_file, g->num_nodes);
-	      /* Generate prolog and epilog.  */
-	      if (count_reg && !count_init)
-		generate_prolog_epilog (ps, loop, count_reg);
-	      else
-	 	generate_prolog_epilog (ps, loop, NULL_RTX);
-	    }
+	  reg_move_replaces = generate_reg_moves (ps, true);
+	  if (dump_file)
+	    print_node_sched_params (dump_file, g->num_nodes, g);
+	  /* Generate prolog and epilog.  */
+	  if (count_reg && !count_init)
+	    generate_prolog_epilog (ps, loop, count_reg);
+	  else
+	    generate_prolog_epilog (ps, loop, NULL_RTX);
+	    
 	  free_undo_replace_buff (reg_move_replaces);
 	}
 
@@ -1529,17 +1429,21 @@
 	     of nodes within the cycle.  */
           sbitmap_zero (must_precede);
           sbitmap_zero (must_follow);
-      	  for (e = u_node->in; e != 0; e = e->next_in)
+          /* TODO: We can add an insn to the must_precede or must_follow
+             bitmaps only if it has tight dependence to U and they
+             both scheduled in the same row.  The current check is less
+             conservative and content with the fact that both U and the
+             insn are scheduled in the same row.  */
+          for (e = u_node->in; e != 0; e = e->next_in)
             if (TEST_BIT (sched_nodes, e->src->cuid)
-	        && e->latency == (ii * e->distance)
-		&& start == SCHED_TIME (e->src))
-             SET_BIT (must_precede, e->src->cuid);
+                && (SMODULO (SCHED_TIME (e->src), ii) == SMODULO (start, ii)))
+              SET_BIT (must_precede, e->src->cuid);
 
-	  for (e = u_node->out; e != 0; e = e->next_out)
+          for (e = u_node->out; e != 0; e = e->next_out)
             if (TEST_BIT (sched_nodes, e->dest->cuid)
-	        && e->latency == (ii * e->distance)
-		&& end == SCHED_TIME (e->dest))
-             SET_BIT (must_follow, e->dest->cuid);
+                && (SMODULO (SCHED_TIME (e->dest), ii) ==
+                    SMODULO ((end - step), ii)))
+              SET_BIT (must_follow, e->dest->cuid);
 
 	  success = 0;
 	  if ((step > 0 && start < end) ||  (step < 0 && start > end))
@@ -2259,58 +2163,8 @@
 		      targetm.sched.dfa_post_cycle_insn ());
 }
 
-/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
-   the number of cycles according to DFA that the kernel fits in,
-   we use this to check if we done well with SMS after we add
-   register moves.  In some cases register moves overhead makes
-   it even worse than the original loop.  We want SMS to be performed
-   when it gives less cycles after register moves are added.  */
-static int
-kernel_number_of_cycles (rtx first_insn, rtx last_insn)
-{
-  int cycles = 0;
-  rtx insn;
-  int can_issue_more = issue_rate;
 
-  state_reset (curr_state);
 
-  for (insn = first_insn;
-       insn != NULL_RTX && insn != last_insn;
-       insn = NEXT_INSN (insn))
-    {
-      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
-	continue;
-
-      /* Check if there is room for the current insn.  */
-      if (!can_issue_more || state_dead_lock_p (curr_state))
-	{
-	  cycles ++;
-	  advance_one_cycle ();
-	  can_issue_more = issue_rate;
-	}
-
-	/* Update the DFA state and return with failure if the DFA found
-	   recource conflicts.  */
-      if (state_transition (curr_state, insn) >= 0)
-	{
-	  cycles ++;
-	  advance_one_cycle ();
-	  can_issue_more = issue_rate;
-	}
-
-      if (targetm.sched.variable_issue)
-	can_issue_more =
-	  targetm.sched.variable_issue (sched_dump, sched_verbose,
-					insn, can_issue_more);
-      /* A naked CLOBBER or USE generates no instruction, so don't
-	 let them consume issue slots.  */
-      else if (GET_CODE (PATTERN (insn)) != USE
-	       && GET_CODE (PATTERN (insn)) != CLOBBER)
-	can_issue_more--;
-    }
-  return cycles;
-}
-
 /* Checks if PS has resource conflicts according to DFA, starting from
    FROM cycle to TO cycle; returns true if there are conflicts and false
    if there are no conflicts.  Assumes DFA is being used.  */

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2007-08-08  6:29 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <OF2D1B96D8.9BD526E2-ONC225732C.005EA0B6-C2257330.004378F1@LocalDomain>
2007-08-07 19:55 ` [PATCH][modulo-sched] Remove profitability check Ayal Zaks
2007-08-08  6:29   ` Revital1 Eres
2007-08-07 12:18 Revital1 Eres

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).