From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-patches-return-508059-listarch-gcc-patches=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 17416 invoked by alias); 30 Aug 2019 16:35:16 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Received: (qmail 17407 invoked by uid 89); 30 Aug 2019 16:35:16 -0000
Authentication-Results: sourceware.org; auth=none
X-Spam-SWARE-Status: No, score=-7.2 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_2,GIT_PATCH_3,RCVD_IN_DNSWL_NONE autolearn=ham version=3.3.1 spammy=Back, Guess, formatted, CONST
X-HELO: gate.crashing.org
Received: from gate.crashing.org (HELO gate.crashing.org) (63.228.1.57) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 30 Aug 2019 16:35:14 +0000
Received: from gate.crashing.org (localhost.localdomain [127.0.0.1])	by gate.crashing.org (8.14.1/8.14.1) with ESMTP id x7UGZB3o025437;	Fri, 30 Aug 2019 11:35:11 -0500
Received: (from segher@localhost)	by gate.crashing.org (8.14.1/8.14.1/Submit) id x7UGZBKr025436;	Fri, 30 Aug 2019 11:35:11 -0500
Date: Fri, 30 Aug 2019 19:22:00 -0000
From: Segher Boessenkool <segher@kernel.crashing.org>
To: Michael Meissner <meissner@linux.ibm.com>, gcc-patches@gcc.gnu.org,        dje.gcc@gmail.com
Subject: Re: [PATCH, V3, #4 of 10], Add general prefixed/pcrel support
Message-ID: <20190830163511.GP31406@gate.crashing.org>
References: <20190826173320.GA7958@ibm-toto.the-meissners.org> <20190826204337.GD11790@ibm-toto.the-meissners.org>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20190826204337.GD11790@ibm-toto.the-meissners.org>
User-Agent: Mutt/1.4.2.3i
X-IsSubscribed: yes
X-SW-Source: 2019-08/txt/msg02104.txt.bz2

Hi!

(Please split off paddi to a separate patch?)

On Mon, Aug 26, 2019 at 04:43:37PM -0400, Michael Meissner wrote:
> 	(prefixed_paddi_p): Fix thinkos in last patch.

Do that separately please.  Don't hide this in another patch like this.

Hrm, this is not in this patch at all?  Fix the changelog, then :-)

> --- gcc/config/rs6000/predicates.md	(revision 274870)
> +++ gcc/config/rs6000/predicates.md	(working copy)
> @@ -839,7 +839,8 @@ (define_special_predicate "indexed_addre
>  (define_predicate "add_operand"
>    (if_then_else (match_code "const_int")
>      (match_test "satisfies_constraint_I (op)
> -		 || satisfies_constraint_L (op)")
> +		 || satisfies_constraint_L (op)
> +		 || satisfies_constraint_eI (op)")
>      (match_operand 0 "gpc_reg_operand")))
>  
>  ;; Return 1 if the operand is either a non-special register, or 0, or -1.
> @@ -852,7 +853,8 @@ (define_predicate "adde_operand"
>  (define_predicate "non_add_cint_operand"
>    (and (match_code "const_int")
>         (match_test "!satisfies_constraint_I (op)
> -		    && !satisfies_constraint_L (op)")))
> +		    && !satisfies_constraint_L (op)
> +		    && !satisfies_constraint_eI (op)")))

(define_predicate "non_add_cint_operand"
  (and (match_code "const_int")
       (not (match_operand 0 "add_operand"))))

?  You can do that *now*, and it is pre-approved.  (This could use a better
name btw., I always have to look up what it means; a longer name is fine as
well of course, it is used only once or so).

> @@ -933,6 +935,13 @@ (define_predicate "lwa_operand"
>      return false;
>  
>    addr = XEXP (inner, 0);
> +
> +  /* The LWA instruction uses the DS-form format where the bottom two bits of
> +     the offset must be 0.  The prefixed PLWA does not have this
> +     restriction.  */
> +  if (prefixed_local_addr_p (addr, mode, TRAD_INSN_DS))
> +    return true;

Why does the decision whether something is a valid prefixed lwa_operand
need to know the non-prefixed lwa is a DS-form instruction?

And "local" is a head-scratcher for this condition, too.

> +;; Return 1 if op is a memory operand that is not prefixed.
> +(define_predicate "non_prefixed_mem_operand"
> +  (match_code "mem")
> +{
> +  if (!memory_operand (op, mode))
> +    return false;
> +
> +  return !prefixed_local_addr_p (XEXP (op, 0), GET_MODE (op),
> +				 TRAD_INSN_DEFAULT);
> +})

Use match_operand for the first condition please (and then match_test for
the second?)

This does make it seem like we need a prefixed_local_mem_p as well?  So
that we need neither that XEXP nor that GET_MODE.

> @@ -5735,6 +5735,10 @@ num_insns_constant_gpr (HOST_WIDE_INT va
>  	   && (value >> 31 == -1 || value >> 31 == 0))
>      return 1;
>  
> +  /* PADDI can support up to 34 bit signed integers.  */
> +  else if (TARGET_PREFIXED_ADDR && SIGNED_34BIT_OFFSET_P (value))
> +    return 1;

Write this earlier, together with the 16BIT one?

> @@ -6905,6 +6909,7 @@ rs6000_adjust_vec_address (rtx scalar_re
>    rtx element_offset;
>    rtx new_addr;
>    bool valid_addr_p;
> +  bool pcrel_p = TARGET_PCREL && pcrel_local_address (addr, Pmode);

This is used 159 lines later.  Please refactor things.  That would make
a separate patch *before* this one.

> +  /* Optimize pc-relative addresses.  */
> +  else if (pcrel_p)
> +    {
> +      if (CONST_INT_P (element_offset))
> +	{
> +	  rtx addr2 = addr;

This var needs a better name and/or comments.  Or maybe just factoring.

> @@ -7007,9 +7050,8 @@ rs6000_adjust_vec_address (rtx scalar_re
>  
>    /* If we have a PLUS, we need to see whether the particular register class
>       allows for D-FORM or X-FORM addressing.  */
> -  if (GET_CODE (new_addr) == PLUS)
> +  if (GET_CODE (new_addr) == PLUS || pcrel_p)

That second condition needs a comment.

> @@ -7609,7 +7675,7 @@ mem_operand_ds_form (rtx op, machine_mod
>         causes a wrap, so test only the low 16 bits.  */
>      offset = ((offset & 0xffff) ^ 0x8000) - 0x8000;
>  
> -  return offset + 0x8000 < 0x10000u - extra;
> +  return SIGNED_16BIT_OFFSET_EXTRA_P (offset, extra);

Please do all these things first too, as a separate patch.

> -  offset += 0x8000;
> -  return offset < 0x10000 - extra;
> +  if (TARGET_PREFIXED_ADDR)
> +    return SIGNED_34BIT_OFFSET_EXTRA_P (offset, extra);
> +  else
> +    return SIGNED_16BIT_OFFSET_EXTRA_P (offset, extra);

So this you could just do the 16BIT first, and then *this* patch will add
the 34BIT thing, in an easy-to-read patch.

> +    {
> +      /* There is no prefixed version of the load/store with update.  */
> +      return !prefixed_local_addr_p (XEXP (x, 1), mode, TRAD_INSN_DEFAULT);
> +    }

If you pass the actual MEM, the prefixed_local_mem_p function can return
false, itself.

> -	  unsigned HOST_WIDE_INT val = INTVAL (XEXP (addr, 1));
> -	  return val + 0x8000 >= 0x10000 - (TARGET_POWERPC64 ? 8 : 12);
> +	  HOST_WIDE_INT val = INTVAL (XEXP (addr, 1));
> +	  HOST_WIDE_INT extra = TARGET_POWERPC64 ? 8 : 12;

The 8 vs. 12 could use a comment (yes, I know it was there already).  Do you
know what this is about, why it is 8 and 12?

> +/* Make a memory address non-prefixed if it is prefixed.  */

"Return an RTX that is like MEM but does not need prefixed instructions
to access."?

> +rtx
> +make_memory_non_prefixed (rtx mem)
> +{
> +  gcc_assert (MEM_P (mem));
> +  if (prefixed_local_addr_p (XEXP (mem, 0), GET_MODE (mem), TRAD_INSN_DEFAULT))

Swap the condition and do an early-out, please.

> +    {
> +      rtx old_addr = XEXP (mem, 0);
> +      rtx new_addr;

You you also need to strip CONST from around the address here?

> @@ -21060,7 +21168,8 @@ rs6000_rtx_costs (rtx x, machine_mode mo
>  	    || outer_code == PLUS
>  	    || outer_code == MINUS)
>  	   && (satisfies_constraint_I (x)
> -	       || satisfies_constraint_L (x)))
> +	       || satisfies_constraint_L (x)
> +	       || satisfies_constraint_eI (x)))

Just use add_operand here, maybe?

OTOH, do we want to count prefixed insns as not more expensive than
prefixed ones?

> +/* How many real instructions are generated for this insn?  This is slightly

"How many machine instructions are generated for INSN".

> +static int
> +rs6000_num_insns (rtx_insn *insn)
> +{
> +  /* Try to figure it out based on the length and whether there are prefixed
> +     instructions.  While prefixed instructions are only 8 bytes, we have to
> +     use 12 as the size of the first prefixed instruction in case the
> +     instruction needs to be aligned.  Back to back prefixed instructions would
> +     only take 20 bytes, since it is guaranteed that one of the prefixed
> +     instructions does not need the alignment.  */
> +  int length = get_attr_length (insn);
> +
> +  if (length >= 12 && TARGET_PREFIXED_ADDR
> +      && get_attr_prefixed (insn) == PREFIXED_YES)
> +    {
> +      /* Single prefixed instruction.  */
> +      if (length == 12)
> +	return 1;
> +
> +      /* A normal instruction and a prefixed instruction (16) or two back
> +	 to back prefixed instructions (20).  */
> +      if (length == 16 || length == 20)
> +	return 2;
> +
> +      /* Guess for larger instruction sizes.  */
> +      return 2 + (length - 20) / 4;
> +    }
> +
> +  return length / 4;
> +}

Yuck.  It only needs an approximate answer, but why then handle all kinds
of cases that you cannot test (because they do not currently happen)?

Instead, handle prefixed insns one step up, in insn_cost itself?  It
knows more, it can make better estimates.  It can do it per instruction
type, importantly.

>  (define_insn "*add<mode>3"
> -  [(set (match_operand:GPR 0 "gpc_reg_operand" "=r,r,r")
> -	(plus:GPR (match_operand:GPR 1 "gpc_reg_operand" "%r,b,b")
> -		  (match_operand:GPR 2 "add_operand" "r,I,L")))]
> +  [(set (match_operand:GPR 0 "gpc_reg_operand" "=r,r,r,r")
> +	(plus:GPR (match_operand:GPR 1 "gpc_reg_operand" "%r,b,b,b")
> +		  (match_operand:GPR 2 "add_operand" "r,I,L,eI")))]
>    ""
>    "@
>     add %0,%1,%2
>     addi %0,%1,%2
> -   addis %0,%1,%v2"
> -  [(set_attr "type" "add")])
> +   addis %0,%1,%v2
> +   addi %0,%1,%2"
> +  [(set_attr "type" "add")
> +   (set_attr "isa" "*,*,*,fut")])

Okay.

> @@ -6909,22 +6911,22 @@ (define_insn "movsi_low"
>  
>  ;;		MR           LA           LWZ          LFIWZX       LXSIWZX
>  ;;		STW          STFIWX       STXSIWX      LI           LIS
> -;;		#            XXLOR        XXSPLTIB 0   XXSPLTIB -1  VSPLTISW
> -;;		XXLXOR 0     XXLORC -1    P9 const     MTVSRWZ      MFVSRWZ
> -;;		MF%1         MT%0         NOP
> +;;		PLI          #            XXLOR        XXSPLTIB 0   XXSPLTIB -1
> +;;		VSPLTISW     XXLXOR 0     XXLORC -1    P9 const     MTVSRWZ
> +;;		MFVSRWZ      MF%1         MT%0         NOP

So this is adding just the PLI?  Put it on a line of its own then?  And
don't reformat the existing stuff.

It would be nice if this all can be formatted a bit nicer eventually, in
more logical groups, not strictly five per line, which isn't need for
anything and not helpful at all either.  So for this maybe

  mr la
  lwz lfiwzx lxsiwzx
  stw stfiwx stxsiwx
  li lis pli #

etc.  You also have the restriction that the order matters somewhat, but
there still is a lot of room to make it easier to read.

> -;; Split a load of a large constant into the appropriate two-insn
> -;; sequence.
> +;; Split a load of a large constant into the appropriate two-insn sequence.  On
> +;; systems that support PADDI (PLI), we can use PLI to load any 32-bit constant
> +;; in one instruction.
>  
>  (define_split
>    [(set (match_operand:SI 0 "gpc_reg_operand")
>  	(match_operand:SI 1 "const_int_operand"))]
>    "(unsigned HOST_WIDE_INT) (INTVAL (operands[1]) + 0x8000) >= 0x10000

Use the 16BIT thing here?

> @@ -7769,9 +7782,13 @@ (define_insn_and_split "*mov<mode>_64bit
>    "#"
>    "&& reload_completed"
>    [(pc)]
> -{ rs6000_split_multireg_move (operands[0], operands[1]); DONE; }
> -  [(set_attr "length" "8,8,8,8,12,12,8,8,8")
> -   (set_attr "isa" "*,*,*,*,*,*,*,p8v,p8v")])
> +{
> +  rs6000_split_multireg_move (operands[0], operands[1]);
> +  DONE;
> +}
> +  [(set_attr "isa" "*,*,*,*,*,*,*,*,p8v,p8v")
> +   (set_attr "non_prefixed_length" "8")
> +   (set_attr "prefixed_length" "20")])

Should this have a separate alternative for prefixed addressing?

What happened to the 12's?

> @@ -1149,10 +1149,30 @@ (define_insn "vsx_mov<mode>_64bit"
>                 "vecstore,  vecload,   vecsimple, mffgpr,    mftgpr,    load,
>                  store,     load,      store,     *,         vecsimple, vecsimple,
>                  vecsimple, *,         *,         vecstore,  vecload")
> -   (set_attr "length"
> -               "*,         *,         *,         8,         *,         8,
> -                8,         8,         8,         8,         *,         *,
> -                *,         20,        8,         *,         *")
> +   (set (attr "non_prefixed_length")
> +	(cond [(and (eq_attr "alternative" "4")		;; MTVSRDD
> +		    (match_test "TARGET_P9_VECTOR"))
> +	       (const_string "4")
> +
> +	       (eq_attr "alternative" "3,4")		;; GPR <-> VSX
> +	       (const_string "8")
> +
> +	       (eq_attr "alternative" "5,6,7,8")	;; GPR load/store
> +	       (const_string "8")]
> +	      (const_string "*")))

Why handle alternative 4 separately like this?  Shouldn't there just be
a separate alternative for the p9 version?


Segher