public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [RS6000] reload_vsx_from_gprsf splitter
@ 2016-02-11 14:04 Alan Modra
  2016-02-11 17:34 ` David Edelsohn
  0 siblings, 1 reply; 14+ messages in thread
From: Alan Modra @ 2016-02-11 14:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: David Edelsohn, Michael Meissner

This is PR68973 part 2, the failure of a boost test, caused by a
splitter emitting an invalid move in reload_vsx_from_gprsf:
  emit_move_insn (op0_di, op2);

op0 can be any vsx reg, but the mtvsrd destination constraint in
movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
we have that constraint so left movdi_internal64 alone and used a
special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
restricted to fprs there too so fixed that as well.  (We can't use %L
in asm operands that must accept vsx.)

Bootstrapped and regression tested powerpc64le-linux.  powerpc64-linux
biarch -mcpu=power7 bootstrap still in progress.  OK to apply assuming
no regressions found?

	PR target/68973
	* config/rs6000/vsx.md (vsx_xscvspdpn_directmove): Delete.
	* config/rs6000/rs6000.md (reload_vsx_from_gprsf): Rewrite splitter.
	(p8_mtvsrd): New.
	(p8_mtvsrd_1, p8_mtvsrd_2): Delete.
	(reload_vsx_from_gpr<mode>): Adjust to use p8_mtvsrd.

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index cdbf873..745293b 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7543,29 +7543,22 @@
    (set_attr "type" "three")])
 
 ;; Move 128 bit values from GPRs to VSX registers in 64-bit mode
-(define_insn "p8_mtvsrd_1"
-  [(set (match_operand:TF 0 "register_operand" "=ws")
-	(unspec:TF [(match_operand:DI 1 "register_operand" "r")]
+(define_insn "p8_mtvsrd"
+  [(set (match_operand:DF 0 "register_operand" "=ws")
+	(unspec:DF [(match_operand:DI 1 "register_operand" "r")]
 		   UNSPEC_P8V_MTVSRD))]
   "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mtvsrd %0,%1"
-  [(set_attr "type" "mftgpr")])
-
-(define_insn "p8_mtvsrd_2"
-  [(set (match_operand:TF 0 "register_operand" "+ws")
-	(unspec:TF [(match_dup 0)
-		    (match_operand:DI 1 "register_operand" "r")]
-		   UNSPEC_P8V_MTVSRD))]
-  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mtvsrd %L0,%1"
+  "mtvsrd %x0,%1"
   [(set_attr "type" "mftgpr")])
 
 (define_insn "p8_xxpermdi_<mode>"
   [(set (match_operand:FMOVE128_GPR 0 "register_operand" "=wa")
-	(unspec:FMOVE128_GPR [(match_operand:TF 1 "register_operand" "ws")]
-			     UNSPEC_P8V_XXPERMDI))]
+	(unspec:FMOVE128_GPR [
+		(match_operand:DF 1 "register_operand" "ws")
+		(match_operand:DF 2 "register_operand" "ws")]
+		UNSPEC_P8V_XXPERMDI))]
   "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "xxpermdi %x0,%1,%L1,0"
+  "xxpermdi %x0,%x1,%x2,0"
   [(set_attr "type" "vecperm")])
 
 (define_insn_and_split "reload_vsx_from_gpr<mode>"
@@ -7581,13 +7574,18 @@
 {
   rtx dest = operands[0];
   rtx src = operands[1];
-  rtx tmp = operands[2];
+  /* You might think that we could use op0 as one temp and a DF clobber
+     as the other, but you'd be wrong.  These secondary_reload patterns
+     don't check that the clobber is different to the destination, which
+     is probably a reload bug.  */
+  rtx tmp_hi = gen_rtx_REG (DFmode, REGNO (operands[2]));
+  rtx tmp_lo = gen_rtx_REG (DFmode, REGNO (operands[2]) + 1);
   rtx gpr_hi_reg = gen_highpart (DImode, src);
   rtx gpr_lo_reg = gen_lowpart (DImode, src);
 
-  emit_insn (gen_p8_mtvsrd_1 (tmp, gpr_hi_reg));
-  emit_insn (gen_p8_mtvsrd_2 (tmp, gpr_lo_reg));
-  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp));
+  emit_insn (gen_p8_mtvsrd (tmp_hi, gpr_hi_reg));
+  emit_insn (gen_p8_mtvsrd (tmp_lo, gpr_lo_reg));
+  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp_hi, tmp_lo));
   DONE;
 }
   [(set_attr "length" "12")
@@ -7622,16 +7620,18 @@
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
+
   /* Also use the destination register to hold the unconverted DImode value.
      This is conceptually a separate value from OP0, so we use gen_rtx_REG
      rather than simplify_gen_subreg.  */
-  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
+  rtx op0_df = gen_rtx_REG (DFmode, REGNO (op0));
+  rtx op0_v4sf = gen_rtx_REG (V4SFmode, REGNO (op0));
   rtx op1_di = simplify_gen_subreg (DImode, op1, SFmode, 0);
 
   /* Move SF value to upper 32-bits for xscvspdpn.  */
   emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
-  emit_move_insn (op0_di, op2);
-  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
+  emit_insn (gen_p8_mtvsrd (op0_df, op2));
+  emit_insn (gen_vsx_xscvspdpn (op0_df, op0_v4sf));
   DONE;
 }
   [(set_attr "length" "8")
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 997ff31..2d2f137 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1518,15 +1518,6 @@
   "xscvdpspn %x0,%x1"
   [(set_attr "type" "fp")])
 
-;; Used by direct move to move a SFmode value from GPR to VSX register
-(define_insn "vsx_xscvspdpn_directmove"
-  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
-	(unspec:SF [(match_operand:DI 1 "vsx_register_operand" "wa")]
-		   UNSPEC_VSX_CVSPDPN))]
-  "TARGET_XSCVSPDPN"
-  "xscvspdpn %x0,%x1"
-  [(set_attr "type" "fp")])
-
 ;; Convert and scale (used by vec_ctf, vec_cts, vec_ctu for double/long long)
 
 (define_expand "vsx_xvcvsxddp_scale"

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 14:04 [RS6000] reload_vsx_from_gprsf splitter Alan Modra
@ 2016-02-11 17:34 ` David Edelsohn
  2016-02-11 18:38   ` Ulrich Weigand
  2016-02-11 21:53   ` Michael Meissner
  0 siblings, 2 replies; 14+ messages in thread
From: David Edelsohn @ 2016-02-11 17:34 UTC (permalink / raw)
  To: Alan Modra, Ulrich Weigand; +Cc: GCC Patches, Michael R Meissner

On Thu, Feb 11, 2016 at 6:04 AM, Alan Modra <amodra@gmail.com> wrote:
> This is PR68973 part 2, the failure of a boost test, caused by a
> splitter emitting an invalid move in reload_vsx_from_gprsf:
>   emit_move_insn (op0_di, op2);
>
> op0 can be any vsx reg, but the mtvsrd destination constraint in
> movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
> we have that constraint so left movdi_internal64 alone and used a
> special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
> by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
> restricted to fprs there too so fixed that as well.  (We can't use %L
> in asm operands that must accept vsx.)

Michael, please review the "wj" constraint in these patterns.

Alan, the explanation says that it uses a special p8_mtvsrd similar to
p8_mtvsrd_[12], but does not explain why the two similar patterns are
removed.  The incorrect use of %L implies those patterns, but the
change is to p8_xxpermdi_<mode> that is not mentioned in the
ChangeLog.

I also would appreciate Uli's comments on this direction because of
his reload expertise.

Thanks, David

>
> Bootstrapped and regression tested powerpc64le-linux.  powerpc64-linux
> biarch -mcpu=power7 bootstrap still in progress.  OK to apply assuming
> no regressions found?
>
>         PR target/68973
>         * config/rs6000/vsx.md (vsx_xscvspdpn_directmove): Delete.
>         * config/rs6000/rs6000.md (reload_vsx_from_gprsf): Rewrite splitter.
>         (p8_mtvsrd): New.
>         (p8_mtvsrd_1, p8_mtvsrd_2): Delete.
>         (reload_vsx_from_gpr<mode>): Adjust to use p8_mtvsrd.
>
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index cdbf873..745293b 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -7543,29 +7543,22 @@
>     (set_attr "type" "three")])
>
>  ;; Move 128 bit values from GPRs to VSX registers in 64-bit mode
> -(define_insn "p8_mtvsrd_1"
> -  [(set (match_operand:TF 0 "register_operand" "=ws")
> -       (unspec:TF [(match_operand:DI 1 "register_operand" "r")]
> +(define_insn "p8_mtvsrd"
> +  [(set (match_operand:DF 0 "register_operand" "=ws")
> +       (unspec:DF [(match_operand:DI 1 "register_operand" "r")]
>                    UNSPEC_P8V_MTVSRD))]
>    "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> -  "mtvsrd %0,%1"
> -  [(set_attr "type" "mftgpr")])
> -
> -(define_insn "p8_mtvsrd_2"
> -  [(set (match_operand:TF 0 "register_operand" "+ws")
> -       (unspec:TF [(match_dup 0)
> -                   (match_operand:DI 1 "register_operand" "r")]
> -                  UNSPEC_P8V_MTVSRD))]
> -  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> -  "mtvsrd %L0,%1"
> +  "mtvsrd %x0,%1"
>    [(set_attr "type" "mftgpr")])
>
>  (define_insn "p8_xxpermdi_<mode>"
>    [(set (match_operand:FMOVE128_GPR 0 "register_operand" "=wa")
> -       (unspec:FMOVE128_GPR [(match_operand:TF 1 "register_operand" "ws")]
> -                            UNSPEC_P8V_XXPERMDI))]
> +       (unspec:FMOVE128_GPR [
> +               (match_operand:DF 1 "register_operand" "ws")
> +               (match_operand:DF 2 "register_operand" "ws")]
> +               UNSPEC_P8V_XXPERMDI))]
>    "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> -  "xxpermdi %x0,%1,%L1,0"
> +  "xxpermdi %x0,%x1,%x2,0"
>    [(set_attr "type" "vecperm")])
>
>  (define_insn_and_split "reload_vsx_from_gpr<mode>"
> @@ -7581,13 +7574,18 @@
>  {
>    rtx dest = operands[0];
>    rtx src = operands[1];
> -  rtx tmp = operands[2];
> +  /* You might think that we could use op0 as one temp and a DF clobber
> +     as the other, but you'd be wrong.  These secondary_reload patterns
> +     don't check that the clobber is different to the destination, which
> +     is probably a reload bug.  */
> +  rtx tmp_hi = gen_rtx_REG (DFmode, REGNO (operands[2]));
> +  rtx tmp_lo = gen_rtx_REG (DFmode, REGNO (operands[2]) + 1);
>    rtx gpr_hi_reg = gen_highpart (DImode, src);
>    rtx gpr_lo_reg = gen_lowpart (DImode, src);
>
> -  emit_insn (gen_p8_mtvsrd_1 (tmp, gpr_hi_reg));
> -  emit_insn (gen_p8_mtvsrd_2 (tmp, gpr_lo_reg));
> -  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp));
> +  emit_insn (gen_p8_mtvsrd (tmp_hi, gpr_hi_reg));
> +  emit_insn (gen_p8_mtvsrd (tmp_lo, gpr_lo_reg));
> +  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp_hi, tmp_lo));
>    DONE;
>  }
>    [(set_attr "length" "12")
> @@ -7622,16 +7620,18 @@
>    rtx op0 = operands[0];
>    rtx op1 = operands[1];
>    rtx op2 = operands[2];
> +
>    /* Also use the destination register to hold the unconverted DImode value.
>       This is conceptually a separate value from OP0, so we use gen_rtx_REG
>       rather than simplify_gen_subreg.  */
> -  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
> +  rtx op0_df = gen_rtx_REG (DFmode, REGNO (op0));
> +  rtx op0_v4sf = gen_rtx_REG (V4SFmode, REGNO (op0));
>    rtx op1_di = simplify_gen_subreg (DImode, op1, SFmode, 0);
>
>    /* Move SF value to upper 32-bits for xscvspdpn.  */
>    emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
> -  emit_move_insn (op0_di, op2);
> -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
> +  emit_insn (gen_p8_mtvsrd (op0_df, op2));
> +  emit_insn (gen_vsx_xscvspdpn (op0_df, op0_v4sf));
>    DONE;
>  }
>    [(set_attr "length" "8")
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 997ff31..2d2f137 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -1518,15 +1518,6 @@
>    "xscvdpspn %x0,%x1"
>    [(set_attr "type" "fp")])
>
> -;; Used by direct move to move a SFmode value from GPR to VSX register
> -(define_insn "vsx_xscvspdpn_directmove"
> -  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
> -       (unspec:SF [(match_operand:DI 1 "vsx_register_operand" "wa")]
> -                  UNSPEC_VSX_CVSPDPN))]
> -  "TARGET_XSCVSPDPN"
> -  "xscvspdpn %x0,%x1"
> -  [(set_attr "type" "fp")])
> -
>  ;; Convert and scale (used by vec_ctf, vec_cts, vec_ctu for double/long long)
>
>  (define_expand "vsx_xvcvsxddp_scale"
>
> --
> Alan Modra
> Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 17:34 ` David Edelsohn
@ 2016-02-11 18:38   ` Ulrich Weigand
  2016-02-11 21:56     ` Michael Meissner
  2016-02-12  0:21     ` David Edelsohn
  2016-02-11 21:53   ` Michael Meissner
  1 sibling, 2 replies; 14+ messages in thread
From: Ulrich Weigand @ 2016-02-11 18:38 UTC (permalink / raw)
  To: David Edelsohn; +Cc: Alan Modra, GCC Patches, Michael R Meissner

David Edelsohn wrote:
> On Thu, Feb 11, 2016 at 6:04 AM, Alan Modra <amodra@gmail.com> wrote:
> > This is PR68973 part 2, the failure of a boost test, caused by a
> > splitter emitting an invalid move in reload_vsx_from_gprsf:
> >   emit_move_insn (op0_di, op2);
> >
> > op0 can be any vsx reg, but the mtvsrd destination constraint in
> > movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
> > we have that constraint so left movdi_internal64 alone and used a
> > special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
> > by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
> > restricted to fprs there too so fixed that as well.  (We can't use %L
> > in asm operands that must accept vsx.)
> 
> Michael, please review the "wj" constraint in these patterns.
> 
> Alan, the explanation says that it uses a special p8_mtvsrd similar to
> p8_mtvsrd_[12], but does not explain why the two similar patterns are
> removed.  The incorrect use of %L implies those patterns, but the
> change is to p8_xxpermdi_<mode> that is not mentioned in the
> ChangeLog.
> 
> I also would appreciate Uli's comments on this direction because of
> his reload expertise.

For the most part, this patch doesn't really change anything in the
interaction with reload as far as I can see.  The changes introduced
by the patch do make sense to me.  In particular, replacing the two
patterns p8_mtvsrd_1 and p8_mtvsrd_2 used to fill high and low parts
of a TFmode register pair with a single pattern p8_mtvsrd that just
works on any DFmode register (leaving the split into high/low to the
caller if necessary) seems to simplify things.

> > +  /* You might think that we could use op0 as one temp and a DF clobber
> > +     as the other, but you'd be wrong.  These secondary_reload patterns
> > +     don't check that the clobber is different to the destination, which
> > +     is probably a reload bug.  */

It's not a bug, it's deliberate behavior.  The reload registers allocated
for secondary reload clobbers may overlap the destination, since in many
cases you simply move the input to the reload register, and then the
reload register to the destination (where the latter move can be a no-op
if it is possible to allocate the reload register and the destination
into the same physical register).  If you need two separate intermediate
values, you need to allocate a secondary reload register of a larger
mode (as is already done in the pattern).

> >    /* Also use the destination register to hold the unconverted DImode value.
> >       This is conceptually a separate value from OP0, so we use gen_rtx_REG
> >       rather than simplify_gen_subreg.  */
> > -  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
> > +  rtx op0_df = gen_rtx_REG (DFmode, REGNO (op0));
> > +  rtx op0_v4sf = gen_rtx_REG (V4SFmode, REGNO (op0));

While this was not introduced by this patch, I'm a little bit concerned
about the hard-coded use of REGNO here, which will crash if op0 at this
point happens to be a SUBREG instead of a REG.  This is extremely unlikely
at this point in reload, but not 100% impossible, e.g. if op0 for some
reason is one of the "magic" registers like the stack pointer.

That's why it is in general better to use simplify_gen_subreg or one of
gen_highpart/gen_lowpart, which will handle SUBREG correctly as well.
I'm not sure why it matters whether this is "conceptually a separate
value" as the comment argues ...

> >    /* Move SF value to upper 32-bits for xscvspdpn.  */
> >    emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
> > -  emit_move_insn (op0_di, op2);
> > -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
> > +  emit_insn (gen_p8_mtvsrd (op0_df, op2));
> > +  emit_insn (gen_vsx_xscvspdpn (op0_df, op0_v4sf));

The sequence of modes used for op0 here is a bit weird.  First,
op0 is loaded as DFmode by mtvsrd, then it is silently re-
interpreted as V4SFmode when used as input to xscvspdpn, which
gets a DFmode output that is again silently re-interpreted as
SFmode.

This isn't really wrong as such, just maybe a bit confusing.
Maybe instead have p8_mtvsrd use DImode as output (instead of
DFmode), which shouldn't be any harder to use in the
reload_vsx_from_gpr<mode> splitter, and keep using the
vsx_xscvspdpn_directmove pattern?

[ This of course reinforces the question why we have p8_mtvsrd
in the first place, instead of just allowing this use directly
in movdi_internal64 itself.  ]

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  Ulrich.Weigand@de.ibm.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 17:34 ` David Edelsohn
  2016-02-11 18:38   ` Ulrich Weigand
@ 2016-02-11 21:53   ` Michael Meissner
  2016-02-12 13:57     ` Ulrich Weigand
  1 sibling, 1 reply; 14+ messages in thread
From: Michael Meissner @ 2016-02-11 21:53 UTC (permalink / raw)
  To: David Edelsohn
  Cc: Alan Modra, Ulrich Weigand, GCC Patches, Michael R Meissner

On Thu, Feb 11, 2016 at 09:34:29AM -0800, David Edelsohn wrote:
> On Thu, Feb 11, 2016 at 6:04 AM, Alan Modra <amodra@gmail.com> wrote:
> > This is PR68973 part 2, the failure of a boost test, caused by a
> > splitter emitting an invalid move in reload_vsx_from_gprsf:
> >   emit_move_insn (op0_di, op2);
> >
> > op0 can be any vsx reg, but the mtvsrd destination constraint in
> > movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
> > we have that constraint so left movdi_internal64 alone and used a
> > special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
> > by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
> > restricted to fprs there too so fixed that as well.  (We can't use %L
> > in asm operands that must accept vsx.)
> 
> Michael, please review the "wj" constraint in these patterns.
> 
> Alan, the explanation says that it uses a special p8_mtvsrd similar to
> p8_mtvsrd_[12], but does not explain why the two similar patterns are
> removed.  The incorrect use of %L implies those patterns, but the
> change is to p8_xxpermdi_<mode> that is not mentioned in the
> ChangeLog.
> 
> I also would appreciate Uli's comments on this direction because of
> his reload expertise.

Since the mail was sent to my Lotus Notes account (mrmeissn@us.ibm.com), I
originally tried to reply through that system, but I got bounce errors.

At the present time, we do not allow DImode to go into Altivec
registers. Mostly it was due to reload complications in the power8 time frame,
where we didn't have d-form addressing for the Altivec registers and
interference with the other DImode reload patterns, but also I felt I would
need to go through the main rs6000.md to look for the various DImode patterns
to see if they needed to be updated.  At some I hoped to extend this, so I
added the "wi" and "wj" constraints to be used whenever the mode is DImode.
"wi" is the constraint for the VSX register class DImode can go in
(i.e. currently FLOAT=5FREGS), and "wj" is the VSX register class DImode can go
in if we have 64-bit direct move.

The TFmode thing was a hack.  It was the only way I could see getting two
registers without a lot of hair.  Since TFmode is also restricted to FPR_REGS,
you could use %L on it.  When I was writing it, I really wished we had a reload
pattern to get more than one temporary, but we didn't, so I went for TFmode to
get 2 FPR registers.  Given the replacement pattern no longer uses a TFmode
temporary, it can go in any appropriate register.

If you are using DFmode, the right constraint is "wk" (appropriate register to
use with the double type when direct moves are available) or "ws" (appropriate
register to use for DFtype with VSX instructions).

We don't have a constraint for appropriate register class to use with SFmode
when direct moves are available, so I would suggest using "wy" for a register
class you can use ISA 2.07 (power8) ops (the float scalar ops in the upper
registers).  The "ww" constraint would be the appropriate register to use ISA
2.06 (power7) operations on SFmode.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 18:38   ` Ulrich Weigand
@ 2016-02-11 21:56     ` Michael Meissner
  2016-02-11 22:24       ` Alan Modra
  2016-02-12  0:21     ` David Edelsohn
  1 sibling, 1 reply; 14+ messages in thread
From: Michael Meissner @ 2016-02-11 21:56 UTC (permalink / raw)
  To: Ulrich Weigand
  Cc: David Edelsohn, Alan Modra, GCC Patches, Michael R Meissner

On Thu, Feb 11, 2016 at 07:38:15PM +0100, Ulrich Weigand wrote:
> For the most part, this patch doesn't really change anything in the
> interaction with reload as far as I can see.  The changes introduced
> by the patch do make sense to me.  In particular, replacing the two
> patterns p8_mtvsrd_1 and p8_mtvsrd_2 used to fill high and low parts
> of a TFmode register pair with a single pattern p8_mtvsrd that just
> works on any DFmode register (leaving the split into high/low to the
> caller if necessary) seems to simplify things.
> 
> > > +  /* You might think that we could use op0 as one temp and a DF clobber
> > > +     as the other, but you'd be wrong.  These secondary_reload patterns
> > > +     don't check that the clobber is different to the destination, which
> > > +     is probably a reload bug.  */
> 
> It's not a bug, it's deliberate behavior.  The reload registers allocated
> for secondary reload clobbers may overlap the destination, since in many
> cases you simply move the input to the reload register, and then the
> reload register to the destination (where the latter move can be a no-op
> if it is possible to allocate the reload register and the destination
> into the same physical register).  If you need two separate intermediate
> values, you need to allocate a secondary reload register of a larger
> mode (as is already done in the pattern).

This is one of the cases I wished the reload support had the ability to
allocate 2 scratch temporaries instead of 1.  As I said in my other message,
TFmode was a hack to get two registers to use.

> While this was not introduced by this patch, I'm a little bit concerned
> about the hard-coded use of REGNO here, which will crash if op0 at this
> point happens to be a SUBREG instead of a REG.  This is extremely unlikely
> at this point in reload, but not 100% impossible, e.g. if op0 for some
> reason is one of the "magic" registers like the stack pointer.
> 
> That's why it is in general better to use simplify_gen_subreg or one of
> gen_highpart/gen_lowpart, which will handle SUBREG correctly as well.
> I'm not sure why it matters whether this is "conceptually a separate
> value" as the comment argues ...

Good catch.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 21:56     ` Michael Meissner
@ 2016-02-11 22:24       ` Alan Modra
  2016-02-11 22:42         ` Michael Meissner
  0 siblings, 1 reply; 14+ messages in thread
From: Alan Modra @ 2016-02-11 22:24 UTC (permalink / raw)
  To: Michael Meissner, Ulrich Weigand, David Edelsohn, GCC Patches,
	Michael R Meissner

On Thu, Feb 11, 2016 at 04:55:58PM -0500, Michael Meissner wrote:
> This is one of the cases I wished the reload support had the ability to
> allocate 2 scratch temporaries instead of 1.  As I said in my other message,
> TFmode was a hack to get two registers to use.

Another concern I had about this, besides using %L in asm output (what
forces TFmode to use just fprs?), is what happens when we're using
IEEE 128-bit floats?  In that case it looks like we'd get just one reg.

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 22:24       ` Alan Modra
@ 2016-02-11 22:42         ` Michael Meissner
  0 siblings, 0 replies; 14+ messages in thread
From: Michael Meissner @ 2016-02-11 22:42 UTC (permalink / raw)
  To: Alan Modra
  Cc: Michael Meissner, Ulrich Weigand, David Edelsohn, GCC Patches,
	Michael R Meissner

On Fri, Feb 12, 2016 at 08:54:19AM +1030, Alan Modra wrote:
> On Thu, Feb 11, 2016 at 04:55:58PM -0500, Michael Meissner wrote:
> > This is one of the cases I wished the reload support had the ability to
> > allocate 2 scratch temporaries instead of 1.  As I said in my other message,
> > TFmode was a hack to get two registers to use.
> 
> Another concern I had about this, besides using %L in asm output (what
> forces TFmode to use just fprs?), is what happens when we're using
> IEEE 128-bit floats?  In that case it looks like we'd get just one reg.

The code in rs6000_hard_regno_mode_ok only allows TFmode (IBM extended double)
in GPRs and FPRs.  In theory, TFmode could go in VSX registers, just it hasn't
been done.

Good point that it breaks if the default long double (TFmode) type is IEEE
128-bit floating point.  We would need to have two patterns, one that uses
TFmode and one that uses IFmode.  I wrote the power8 direct move stuff before
going down the road of IEEE 128-bit floating point.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 18:38   ` Ulrich Weigand
  2016-02-11 21:56     ` Michael Meissner
@ 2016-02-12  0:21     ` David Edelsohn
  1 sibling, 0 replies; 14+ messages in thread
From: David Edelsohn @ 2016-02-12  0:21 UTC (permalink / raw)
  To: Ulrich Weigand, Michael Meissner, Alan Modra; +Cc: GCC Patches

On Thu, Feb 11, 2016 at 10:38 AM, Ulrich Weigand <uweigand@de.ibm.com> wrote:
> David Edelsohn wrote:
>> On Thu, Feb 11, 2016 at 6:04 AM, Alan Modra <amodra@gmail.com> wrote:
>> > This is PR68973 part 2, the failure of a boost test, caused by a
>> > splitter emitting an invalid move in reload_vsx_from_gprsf:
>> >   emit_move_insn (op0_di, op2);
>> >
>> > op0 can be any vsx reg, but the mtvsrd destination constraint in
>> > movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
>> > we have that constraint so left movdi_internal64 alone and used a
>> > special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
>> > by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
>> > restricted to fprs there too so fixed that as well.  (We can't use %L
>> > in asm operands that must accept vsx.)
>>
>> Michael, please review the "wj" constraint in these patterns.
>>
>> Alan, the explanation says that it uses a special p8_mtvsrd similar to
>> p8_mtvsrd_[12], but does not explain why the two similar patterns are
>> removed.  The incorrect use of %L implies those patterns, but the
>> change is to p8_xxpermdi_<mode> that is not mentioned in the
>> ChangeLog.
>>
>> I also would appreciate Uli's comments on this direction because of
>> his reload expertise.
>
> For the most part, this patch doesn't really change anything in the
> interaction with reload as far as I can see.  The changes introduced
> by the patch do make sense to me.  In particular, replacing the two
> patterns p8_mtvsrd_1 and p8_mtvsrd_2 used to fill high and low parts
> of a TFmode register pair with a single pattern p8_mtvsrd that just
> works on any DFmode register (leaving the split into high/low to the
> caller if necessary) seems to simplify things.
>
>> > +  /* You might think that we could use op0 as one temp and a DF clobber
>> > +     as the other, but you'd be wrong.  These secondary_reload patterns
>> > +     don't check that the clobber is different to the destination, which
>> > +     is probably a reload bug.  */
>
> It's not a bug, it's deliberate behavior.  The reload registers allocated
> for secondary reload clobbers may overlap the destination, since in many
> cases you simply move the input to the reload register, and then the
> reload register to the destination (where the latter move can be a no-op
> if it is possible to allocate the reload register and the destination
> into the same physical register).  If you need two separate intermediate
> values, you need to allocate a secondary reload register of a larger
> mode (as is already done in the pattern).
>
>> >    /* Also use the destination register to hold the unconverted DImode value.
>> >       This is conceptually a separate value from OP0, so we use gen_rtx_REG
>> >       rather than simplify_gen_subreg.  */
>> > -  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
>> > +  rtx op0_df = gen_rtx_REG (DFmode, REGNO (op0));
>> > +  rtx op0_v4sf = gen_rtx_REG (V4SFmode, REGNO (op0));
>
> While this was not introduced by this patch, I'm a little bit concerned
> about the hard-coded use of REGNO here, which will crash if op0 at this
> point happens to be a SUBREG instead of a REG.  This is extremely unlikely
> at this point in reload, but not 100% impossible, e.g. if op0 for some
> reason is one of the "magic" registers like the stack pointer.
>
> That's why it is in general better to use simplify_gen_subreg or one of
> gen_highpart/gen_lowpart, which will handle SUBREG correctly as well.
> I'm not sure why it matters whether this is "conceptually a separate
> value" as the comment argues ...
>
>> >    /* Move SF value to upper 32-bits for xscvspdpn.  */
>> >    emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
>> > -  emit_move_insn (op0_di, op2);
>> > -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
>> > +  emit_insn (gen_p8_mtvsrd (op0_df, op2));
>> > +  emit_insn (gen_vsx_xscvspdpn (op0_df, op0_v4sf));
>
> The sequence of modes used for op0 here is a bit weird.  First,
> op0 is loaded as DFmode by mtvsrd, then it is silently re-
> interpreted as V4SFmode when used as input to xscvspdpn, which
> gets a DFmode output that is again silently re-interpreted as
> SFmode.
>
> This isn't really wrong as such, just maybe a bit confusing.
> Maybe instead have p8_mtvsrd use DImode as output (instead of
> DFmode), which shouldn't be any harder to use in the
> reload_vsx_from_gpr<mode> splitter, and keep using the
> vsx_xscvspdpn_directmove pattern?
>
> [ This of course reinforces the question why we have p8_mtvsrd
> in the first place, instead of just allowing this use directly
> in movdi_internal64 itself.  ]

Good question: is p8_mtvsrd really necessary if the movdi_internal64
is updated to use the correct constraints?

The patch definitely is going in the right direction.  Can we remove
more unnecessary bits?

Thanks, David

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-11 21:53   ` Michael Meissner
@ 2016-02-12 13:57     ` Ulrich Weigand
  2016-02-15 12:37       ` Alan Modra
  0 siblings, 1 reply; 14+ messages in thread
From: Ulrich Weigand @ 2016-02-12 13:57 UTC (permalink / raw)
  To: Michael Meissner
  Cc: David Edelsohn, Alan Modra, GCC Patches, Michael R Meissner

(Replying to multiple messages at once ...)

Michael Meissner wrote:

> At the present time, we do not allow DImode to go into Altivec
> registers. Mostly it was due to reload complications in the power8 time frame,
> where we didn't have d-form addressing for the Altivec registers and
> interference with the other DImode reload patterns, but also I felt I would
> need to go through the main rs6000.md to look for the various DImode patterns
> to see if they needed to be updated.  At some I hoped to extend this, so I
> added the "wi" and "wj" constraints to be used whenever the mode is DImode.
> "wi" is the constraint for the VSX register class DImode can go in
> (i.e. currently FLOAT=5FREGS), and "wj" is the VSX register class DImode can go
> in if we have 64-bit direct move.

I see.  I hadn't noticed the restriction on DImode, sorry.  I think that in
general, it would be a good idea to allow DImode wherever DFmode is allowed,
since the same instructions should be able to handle both.  But given the
unexpected problems that seem to turn up whenever we want to allow more modes
in more registers, this is probably better for stage 1 than right now ...
 
> The TFmode thing was a hack.  It was the only way I could see getting two
> registers without a lot of hair.  Since TFmode is also restricted to FPR_REGS,
> you could use %L on it.  When I was writing it, I really wished we had a reload
> pattern to get more than one temporary, but we didn't, so I went for TFmode to
> get 2 FPR registers.  Given the replacement pattern no longer uses a TFmode
> temporary, it can go in any appropriate register.

Again, it would probably make sense to allow TFmode (when it is double double)
in any pair of registers where DFmode is allowed, since the type *is* just a
pair of DFmode values.  Again, really more a stage 1 matter ...

> On Thu, Feb 11, 2016 at 07:38:15PM +0100, Ulrich Weigand wrote:
> > It's not a bug, it's deliberate behavior.  The reload registers allocated
> > for secondary reload clobbers may overlap the destination, since in many
> > cases you simply move the input to the reload register, and then the
> > reload register to the destination (where the latter move can be a no-op
> > if it is possible to allocate the reload register and the destination
> > into the same physical register).  If you need two separate intermediate
> > values, you need to allocate a secondary reload register of a larger
> > mode (as is already done in the pattern).
> 
> This is one of the cases I wished the reload support had the ability to
> allocate 2 scratch temporaries instead of 1.  As I said in my other message,
> TFmode was a hack to get two registers to use.

Yes, it would be nice if we could specify multiple scratch registers.  That's
a long-standing FIXME in the reload code:

      /* ??? It would be useful to be able to handle only two, or more than
         three, operands, but for now we can only handle the case of having
         exactly three: output, input and one temp/scratch.  */

Unfortunately, given the structure of reload, that is difficult to fix.

> On Fri, Feb 12, 2016 at 08:54:19AM +1030, Alan Modra wrote:
> > Another concern I had about this, besides using %L in asm output (what
> > forces TFmode to use just fprs?), is what happens when we're using
> > IEEE 128-bit floats?  In that case it looks like we'd get just one reg.
> 
> Good point that it breaks if the default long double (TFmode) type is IEEE
> 128-bit floating point.  We would need to have two patterns, one that uses
> TFmode and one that uses IFmode.  I wrote the power8 direct move stuff before
> going down the road of IEEE 128-bit floating point.

Right.  It's a bit unfortunate that we can't just use IFmode unconditionally,
but it seems rs6000_scalar_mode_supported_p (IFmode) may return false, and
then we probably shouldn't be using it.

Another option might be to use TDmode to allocate a scratch register pair.


David Edelsohn wrote:

> Good question: is p8_mtvsrd really necessary if the movdi_internal64
> is updated to use the correct constraints?
> 
> The patch definitely is going in the right direction.  Can we remove
> more unnecessary bits?

Given Mike's reply above, I don't think it is simply a matter of updating
constraints in the one pattern.  Rather, enabling DImode in all vector
registers would require a change to rs6000_hard_regno_mode_ok and then
reviewing all DImode patterns to make sure they're still compatible.

As I said above, that's probably more of a stage 1 effort.

As long as that is not done, my original suggestion here:

> Maybe instead have p8_mtvsrd use DImode as output (instead of
> DFmode), which shouldn't be any harder to use in the
> reload_vsx_from_gpr<mode> splitter, and keep using the
> vsx_xscvspdpn_directmove pattern?

isn't actually possible.  Instead, one other option might be to
simply do the partial moves all in *DFmode* instead of DImode.
That way, we can make use of the fact that DFmode is already
supported in all vector registers, and the "*mov<mode>_hardfloat64"
pattern already uses mtvsrd to move from a GPR to a vector
register.  This way, we could still simply use emit_move_insn
without requiring new patterns, but still fix the original bug.

Note that we could still get rid of the existing p8_mtvsrd_...
patterns (and likewise just use emit_move_insn) if we do those
other moves also in DFmode.  But that wouldn't fix any bug at
this point, just maybe simplify code a bit.

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  Ulrich.Weigand@de.ibm.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-12 13:57     ` Ulrich Weigand
@ 2016-02-15 12:37       ` Alan Modra
  2016-02-15 13:46         ` Ulrich Weigand
  2016-02-15 14:42         ` David Edelsohn
  0 siblings, 2 replies; 14+ messages in thread
From: Alan Modra @ 2016-02-15 12:37 UTC (permalink / raw)
  To: Ulrich Weigand
  Cc: Michael Meissner, David Edelsohn, GCC Patches, Michael R Meissner

On Fri, Feb 12, 2016 at 02:57:22PM +0100, Ulrich Weigand wrote:
> > On Fri, Feb 12, 2016 at 08:54:19AM +1030, Alan Modra wrote:
> > > Another concern I had about this, besides using %L in asm output (what
> > > forces TFmode to use just fprs?), is what happens when we're using
> > > IEEE 128-bit floats?  In that case it looks like we'd get just one reg.
> > 
> > Good point that it breaks if the default long double (TFmode) type is IEEE
> > 128-bit floating point.  We would need to have two patterns, one that uses
> > TFmode and one that uses IFmode.  I wrote the power8 direct move stuff before
> > going down the road of IEEE 128-bit floating point.
> 
> Right.  It's a bit unfortunate that we can't just use IFmode unconditionally,
> but it seems rs6000_scalar_mode_supported_p (IFmode) may return false, and
> then we probably shouldn't be using it.

Actually, we can use IFmode unconditionally.  scalar_mode_supported_p
is relevant only up to and including expand.  Nothing prevents the
backend from using IFmode.

> Another option might be to use TDmode to allocate a scratch register pair.

That won't work, at least if we want to extract the two component regs
with simplify_gen_subreg, due to rs6000_cannot_change_mode_class.  In
my original patch I just extracted the regs by using gen_rtx_REG but I
changed that, based on your criticism of using gen_rtx_REG in
reload_vsx_from_gprsf, and because rs6000.md avoids gen_rtx_REG using
operand regnos in other places.  That particular change is of course
entirely cosmetic.  I also changed reload_vsx_from_gprsf to avoid mode
punning regs, instead duplicating insn patterns as done elsewhere in
the vsx support.  I don't believe we will see subregs of vsx or fp
regs after reload, but I'm quite willing to concede the point for a
stage4 fix.

Here's the revised patch.  To recap, the main bug fixes here are:
- stop reload_vsx_from_gprsf splitter from emitting a move not
handled by movdi_internal64
- don't use TFmode, which cannot now be assumed to be IBM
double-double.
Secondary to that, not using or passing around TFmode means the %L
restriction no longer matters, and constraints on the reload temp reg
can be relaxed.

Bootstrapped and regression tested powerpc64-linux biarch and
powerpc64le-linux.  OK David?

	PR target/68973
	* config/rs6000/rs6000.md (reload_vsx_from_gprsf): Use p8_mtvsrd_sf
	rather than attempting to use movdi_internal64.  Remove op0_di.
	(p8_mtvsrd_df, p8_mtvsrd_sf): New.
	(p8_mtvsrd_1, p8_mtvsrd_2): Delete.
	(p8_mtvsrwz): New.
	(p8_mtvsrwz_1, p8_mtvsrwz_2): Delete.
	(p8_xxpermdi_<mode>): Take two DF inputs rather than one TF.
	(p8_fmrgow_<mode>): Likewise.
	(reload_vsx_from_gpr<mode>): Make clobber IF.  Adjust for above
	changes.
	(reload_fpr_from_gpr<mode>): Similarly. Use "d" for op0 constraint.
	* config/rs6000/vsx.md (vsx_xscvspdpn_directmove): Make op1 SFmode.

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index cdbf873..ec356cb 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7488,41 +7488,31 @@
 ;; value, since it is allocated in reload and not all of the flow information
 ;; is setup for it.  We have two patterns to do the two moves between gprs and
 ;; fprs.  There isn't a dependancy between the two, but we could potentially
-;; schedule other instructions between the two instructions.  TFmode is
-;; currently limited to traditional FPR registers.  If/when this is changed, we
-;; will need to revist %L to make sure it works with VSX registers, or add an
-;; %x version of %L.
+;; schedule other instructions between the two instructions.
 
 (define_insn "p8_fmrgow_<mode>"
   [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
-	(unspec:FMOVE64X [(match_operand:TF 1 "register_operand" "d")]
+	(unspec:FMOVE64X [
+		(match_operand:DF 1 "register_operand" "d")
+		(match_operand:DF 2 "register_operand" "d")]
 			 UNSPEC_P8V_FMRGOW))]
   "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "fmrgow %0,%1,%L1"
+  "fmrgow %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
-(define_insn "p8_mtvsrwz_1"
-  [(set (match_operand:TF 0 "register_operand" "=d")
-	(unspec:TF [(match_operand:SI 1 "register_operand" "r")]
+(define_insn "p8_mtvsrwz"
+  [(set (match_operand:DF 0 "register_operand" "=d")
+	(unspec:DF [(match_operand:SI 1 "register_operand" "r")]
 		   UNSPEC_P8V_MTVSRWZ))]
   "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
   "mtvsrwz %x0,%1"
   [(set_attr "type" "mftgpr")])
 
-(define_insn "p8_mtvsrwz_2"
-  [(set (match_operand:TF 0 "register_operand" "+d")
-	(unspec:TF [(match_dup 0)
-		    (match_operand:SI 1 "register_operand" "r")]
-		   UNSPEC_P8V_MTVSRWZ))]
-  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mtvsrwz %L0,%1"
-  [(set_attr "type" "mftgpr")])
-
 (define_insn_and_split "reload_fpr_from_gpr<mode>"
-  [(set (match_operand:FMOVE64X 0 "register_operand" "=ws")
+  [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
 	(unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
 			 UNSPEC_P8V_RELOAD_FROM_GPR))
-   (clobber (match_operand:TF 2 "register_operand" "=d"))]
+   (clobber (match_operand:IF 2 "register_operand" "=d"))]
   "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
   "#"
   "&& reload_completed"
@@ -7530,42 +7520,36 @@
 {
   rtx dest = operands[0];
   rtx src = operands[1];
-  rtx tmp = operands[2];
+  rtx tmp_hi = simplify_gen_subreg (DFmode, operands[2], IFmode, 0);
+  rtx tmp_lo = simplify_gen_subreg (DFmode, operands[2], IFmode, 8);
   rtx gpr_hi_reg = gen_highpart (SImode, src);
   rtx gpr_lo_reg = gen_lowpart (SImode, src);
 
-  emit_insn (gen_p8_mtvsrwz_1 (tmp, gpr_hi_reg));
-  emit_insn (gen_p8_mtvsrwz_2 (tmp, gpr_lo_reg));
-  emit_insn (gen_p8_fmrgow_<mode> (dest, tmp));
+  emit_insn (gen_p8_mtvsrwz (tmp_hi, gpr_hi_reg));
+  emit_insn (gen_p8_mtvsrwz (tmp_lo, gpr_lo_reg));
+  emit_insn (gen_p8_fmrgow_<mode> (dest, tmp_hi, tmp_lo));
   DONE;
 }
   [(set_attr "length" "12")
    (set_attr "type" "three")])
 
 ;; Move 128 bit values from GPRs to VSX registers in 64-bit mode
-(define_insn "p8_mtvsrd_1"
-  [(set (match_operand:TF 0 "register_operand" "=ws")
-	(unspec:TF [(match_operand:DI 1 "register_operand" "r")]
-		   UNSPEC_P8V_MTVSRD))]
-  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mtvsrd %0,%1"
-  [(set_attr "type" "mftgpr")])
-
-(define_insn "p8_mtvsrd_2"
-  [(set (match_operand:TF 0 "register_operand" "+ws")
-	(unspec:TF [(match_dup 0)
-		    (match_operand:DI 1 "register_operand" "r")]
+(define_insn "p8_mtvsrd_df"
+  [(set (match_operand:DF 0 "register_operand" "=wa")
+	(unspec:DF [(match_operand:DI 1 "register_operand" "r")]
 		   UNSPEC_P8V_MTVSRD))]
   "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mtvsrd %L0,%1"
+  "mtvsrd %x0,%1"
   [(set_attr "type" "mftgpr")])
 
 (define_insn "p8_xxpermdi_<mode>"
   [(set (match_operand:FMOVE128_GPR 0 "register_operand" "=wa")
-	(unspec:FMOVE128_GPR [(match_operand:TF 1 "register_operand" "ws")]
-			     UNSPEC_P8V_XXPERMDI))]
+	(unspec:FMOVE128_GPR [
+		(match_operand:DF 1 "register_operand" "wa")
+		(match_operand:DF 2 "register_operand" "wa")]
+		UNSPEC_P8V_XXPERMDI))]
   "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "xxpermdi %x0,%1,%L1,0"
+  "xxpermdi %x0,%x1,%x2,0"
   [(set_attr "type" "vecperm")])
 
 (define_insn_and_split "reload_vsx_from_gpr<mode>"
@@ -7573,7 +7557,7 @@
 	(unspec:FMOVE128_GPR
 	 [(match_operand:FMOVE128_GPR 1 "register_operand" "r")]
 	 UNSPEC_P8V_RELOAD_FROM_GPR))
-   (clobber (match_operand:TF 2 "register_operand" "=ws"))]
+   (clobber (match_operand:IF 2 "register_operand" "=wa"))]
   "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
   "#"
   "&& reload_completed"
@@ -7581,13 +7565,17 @@
 {
   rtx dest = operands[0];
   rtx src = operands[1];
-  rtx tmp = operands[2];
+  /* You might think that we could use op0 as one temp and a DF clobber
+     as op2, but you'd be wrong.  Secondary reload move patterns don't
+     check for overlap of the clobber and the destination.  */
+  rtx tmp_hi = simplify_gen_subreg (DFmode, operands[2], IFmode, 0);
+  rtx tmp_lo = simplify_gen_subreg (DFmode, operands[2], IFmode, 8);
   rtx gpr_hi_reg = gen_highpart (DImode, src);
   rtx gpr_lo_reg = gen_lowpart (DImode, src);
 
-  emit_insn (gen_p8_mtvsrd_1 (tmp, gpr_hi_reg));
-  emit_insn (gen_p8_mtvsrd_2 (tmp, gpr_lo_reg));
-  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp));
+  emit_insn (gen_p8_mtvsrd_df (tmp_hi, gpr_hi_reg));
+  emit_insn (gen_p8_mtvsrd_df (tmp_lo, gpr_lo_reg));
+  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp_hi, tmp_lo));
   DONE;
 }
   [(set_attr "length" "12")
@@ -7608,6 +7596,13 @@
 ;; Move SFmode to a VSX from a GPR register.  Because scalar floating point
 ;; type is stored internally as double precision in the VSX registers, we have
 ;; to convert it from the vector format.
+(define_insn "p8_mtvsrd_sf"
+  [(set (match_operand:SF 0 "register_operand" "=wa")
+	(unspec:SF [(match_operand:DI 1 "register_operand" "r")]
+		   UNSPEC_P8V_MTVSRD))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrd %x0,%1"
+  [(set_attr "type" "mftgpr")])
 
 (define_insn_and_split "reload_vsx_from_gprsf"
   [(set (match_operand:SF 0 "register_operand" "=wa")
@@ -7622,16 +7617,12 @@
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
-  /* Also use the destination register to hold the unconverted DImode value.
-     This is conceptually a separate value from OP0, so we use gen_rtx_REG
-     rather than simplify_gen_subreg.  */
-  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
   rtx op1_di = simplify_gen_subreg (DImode, op1, SFmode, 0);
 
   /* Move SF value to upper 32-bits for xscvspdpn.  */
   emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
-  emit_move_insn (op0_di, op2);
-  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
+  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
+  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
   DONE;
 }
   [(set_attr "length" "8")
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 997ff31..45af233 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1521,7 +1521,7 @@
 ;; Used by direct move to move a SFmode value from GPR to VSX register
 (define_insn "vsx_xscvspdpn_directmove"
   [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
-	(unspec:SF [(match_operand:DI 1 "vsx_register_operand" "wa")]
+	(unspec:SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
 		   UNSPEC_VSX_CVSPDPN))]
   "TARGET_XSCVSPDPN"
   "xscvspdpn %x0,%x1"


-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-15 12:37       ` Alan Modra
@ 2016-02-15 13:46         ` Ulrich Weigand
  2016-02-15 14:42         ` David Edelsohn
  1 sibling, 0 replies; 14+ messages in thread
From: Ulrich Weigand @ 2016-02-15 13:46 UTC (permalink / raw)
  To: Alan Modra
  Cc: Michael Meissner, David Edelsohn, GCC Patches, Michael R Meissner

Alan Modra wrote:
> On Fri, Feb 12, 2016 at 02:57:22PM +0100, Ulrich Weigand wrote:
> > Right.  It's a bit unfortunate that we can't just use IFmode unconditionally,
> > but it seems rs6000_scalar_mode_supported_p (IFmode) may return false, and
> > then we probably shouldn't be using it.
> 
> Actually, we can use IFmode unconditionally.  scalar_mode_supported_p
> is relevant only up to and including expand.  Nothing prevents the
> backend from using IFmode.

Hmm, OK.  That does make things simpler.

> > Another option might be to use TDmode to allocate a scratch register pair.
> 
> That won't work, at least if we want to extract the two component regs
> with simplify_gen_subreg, due to rs6000_cannot_change_mode_class.  In
> my original patch I just extracted the regs by using gen_rtx_REG but I
> changed that, based on your criticism of using gen_rtx_REG in
> reload_vsx_from_gprsf, and because rs6000.md avoids gen_rtx_REG using
> operand regnos in other places.  That particular change is of course
> entirely cosmetic.  I also changed reload_vsx_from_gprsf to avoid mode
> punning regs, instead duplicating insn patterns as done elsewhere in
> the vsx support.  I don't believe we will see subregs of vsx or fp
> regs after reload, but I'm quite willing to concede the point for a
> stage4 fix.

I was thinking here that in the special case of the *reload scratch
register*, which reload allocates for us, we will always get a full
register.  This is different from some other operand that may originate
from pre-existing RTX that may require a SUBREG even after reload.

But I certainly agree that your current patch looks like a good choice
for a stage4 bugfix change.  Further cleanup can always happen later.

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  Ulrich.Weigand@de.ibm.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-15 12:37       ` Alan Modra
  2016-02-15 13:46         ` Ulrich Weigand
@ 2016-02-15 14:42         ` David Edelsohn
  2016-02-16  0:24           ` Alan Modra
  1 sibling, 1 reply; 14+ messages in thread
From: David Edelsohn @ 2016-02-15 14:42 UTC (permalink / raw)
  To: Alan Modra; +Cc: Ulrich Weigand, Michael Meissner, GCC Patches

On Mon, Feb 15, 2016 at 4:36 AM, Alan Modra <amodra@gmail.com> wrote:
> On Fri, Feb 12, 2016 at 02:57:22PM +0100, Ulrich Weigand wrote:
>> > On Fri, Feb 12, 2016 at 08:54:19AM +1030, Alan Modra wrote:
>> > > Another concern I had about this, besides using %L in asm output (what
>> > > forces TFmode to use just fprs?), is what happens when we're using
>> > > IEEE 128-bit floats?  In that case it looks like we'd get just one reg.
>> >
>> > Good point that it breaks if the default long double (TFmode) type is IEEE
>> > 128-bit floating point.  We would need to have two patterns, one that uses
>> > TFmode and one that uses IFmode.  I wrote the power8 direct move stuff before
>> > going down the road of IEEE 128-bit floating point.
>>
>> Right.  It's a bit unfortunate that we can't just use IFmode unconditionally,
>> but it seems rs6000_scalar_mode_supported_p (IFmode) may return false, and
>> then we probably shouldn't be using it.
>
> Actually, we can use IFmode unconditionally.  scalar_mode_supported_p
> is relevant only up to and including expand.  Nothing prevents the
> backend from using IFmode.
>
>> Another option might be to use TDmode to allocate a scratch register pair.
>
> That won't work, at least if we want to extract the two component regs
> with simplify_gen_subreg, due to rs6000_cannot_change_mode_class.  In
> my original patch I just extracted the regs by using gen_rtx_REG but I
> changed that, based on your criticism of using gen_rtx_REG in
> reload_vsx_from_gprsf, and because rs6000.md avoids gen_rtx_REG using
> operand regnos in other places.  That particular change is of course
> entirely cosmetic.  I also changed reload_vsx_from_gprsf to avoid mode
> punning regs, instead duplicating insn patterns as done elsewhere in
> the vsx support.  I don't believe we will see subregs of vsx or fp
> regs after reload, but I'm quite willing to concede the point for a
> stage4 fix.
>
> Here's the revised patch.  To recap, the main bug fixes here are:
> - stop reload_vsx_from_gprsf splitter from emitting a move not
> handled by movdi_internal64
> - don't use TFmode, which cannot now be assumed to be IBM
> double-double.
> Secondary to that, not using or passing around TFmode means the %L
> restriction no longer matters, and constraints on the reload temp reg
> can be relaxed.
>
> Bootstrapped and regression tested powerpc64-linux biarch and
> powerpc64le-linux.  OK David?
>
>         PR target/68973
>         * config/rs6000/rs6000.md (reload_vsx_from_gprsf): Use p8_mtvsrd_sf
>         rather than attempting to use movdi_internal64.  Remove op0_di.
>         (p8_mtvsrd_df, p8_mtvsrd_sf): New.
>         (p8_mtvsrd_1, p8_mtvsrd_2): Delete.
>         (p8_mtvsrwz): New.
>         (p8_mtvsrwz_1, p8_mtvsrwz_2): Delete.
>         (p8_xxpermdi_<mode>): Take two DF inputs rather than one TF.
>         (p8_fmrgow_<mode>): Likewise.
>         (reload_vsx_from_gpr<mode>): Make clobber IF.  Adjust for above
>         changes.
>         (reload_fpr_from_gpr<mode>): Similarly. Use "d" for op0 constraint.
>         * config/rs6000/vsx.md (vsx_xscvspdpn_directmove): Make op1 SFmode.
>

Okay.

Is there still an issue with the constraints used for movdi_internal64?

Thanks, David

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-15 14:42         ` David Edelsohn
@ 2016-02-16  0:24           ` Alan Modra
  2016-02-16  1:29             ` David Edelsohn
  0 siblings, 1 reply; 14+ messages in thread
From: Alan Modra @ 2016-02-16  0:24 UTC (permalink / raw)
  To: David Edelsohn; +Cc: Ulrich Weigand, Michael Meissner, GCC Patches

On Mon, Feb 15, 2016 at 06:42:35AM -0800, David Edelsohn wrote:
> Is there still an issue with the constraints used for movdi_internal64?

Yes and no.  No because we shouldn't be attempting DI moves between vsx
regs and gprs.  Yes because we ought to allow DImode in vsx regs, but
fixing that is likely not trivial.

Do we want to backport the PR68973 fixes to gcc-5 and gcc-4.9?  We are
exposed to the reload_vsx_from_gprsf bug there, I think, but TFmode
won't be IEEE.

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RS6000] reload_vsx_from_gprsf splitter
  2016-02-16  0:24           ` Alan Modra
@ 2016-02-16  1:29             ` David Edelsohn
  0 siblings, 0 replies; 14+ messages in thread
From: David Edelsohn @ 2016-02-16  1:29 UTC (permalink / raw)
  To: Alan Modra; +Cc: Ulrich Weigand, Michael Meissner, GCC Patches

On Mon, Feb 15, 2016 at 4:24 PM, Alan Modra <amodra@gmail.com> wrote:
> On Mon, Feb 15, 2016 at 06:42:35AM -0800, David Edelsohn wrote:
>> Is there still an issue with the constraints used for movdi_internal64?
>
> Yes and no.  No because we shouldn't be attempting DI moves between vsx
> regs and gprs.  Yes because we ought to allow DImode in vsx regs, but
> fixing that is likely not trivial.
>
> Do we want to backport the PR68973 fixes to gcc-5 and gcc-4.9?  We are
> exposed to the reload_vsx_from_gprsf bug there, I think, but TFmode
> won't be IEEE.

Backporting to 5 and 4.9 branches is okay with me.

Thanks, David

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-02-16  1:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-11 14:04 [RS6000] reload_vsx_from_gprsf splitter Alan Modra
2016-02-11 17:34 ` David Edelsohn
2016-02-11 18:38   ` Ulrich Weigand
2016-02-11 21:56     ` Michael Meissner
2016-02-11 22:24       ` Alan Modra
2016-02-11 22:42         ` Michael Meissner
2016-02-12  0:21     ` David Edelsohn
2016-02-11 21:53   ` Michael Meissner
2016-02-12 13:57     ` Ulrich Weigand
2016-02-15 12:37       ` Alan Modra
2016-02-15 13:46         ` Ulrich Weigand
2016-02-15 14:42         ` David Edelsohn
2016-02-16  0:24           ` Alan Modra
2016-02-16  1:29             ` David Edelsohn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).