Re: [PATCH] Support APX NF

public inbox for binutils@sourceware.org
 help / color / mirror / Atom feed

From: Jan Beulich <jbeulich@suse.com>
To: "Cui, Lili" <lili.cui@intel.com>
Cc: hongjiu.lu@intel.com, binutils@sourceware.org
Subject: Re: [PATCH] Support APX NF
Date: Thu, 29 Feb 2024 12:21:16 +0100	[thread overview]
Message-ID: <e908fddd-6fd2-4880-bcd7-7c49d245db18@suse.com> (raw)
In-Reply-To: <20240227090106.200134-1-lili.cui@intel.com>

On 27.02.2024 10:01, Cui, Lili wrote:
> @@ -415,6 +416,9 @@ struct _i386_insn
>      /* Compressed disp8*N attribute.  */
>      unsigned int memshift;
>  
> +    /* No CSPAZO flags update.  */
> +    bool has_nf;
> +
>      /* Prefer load or store in encoding.  */
>      enum
>        {

There's a group of booleans further up and another one further down. Is there
any reason not to leverage an available padding slot there?

> @@ -6627,6 +6635,9 @@ md_assemble (char *line)
>  	case unsupported_EGPR_for_addressing:
>  	  err_msg = _("extended GPR cannot be used as base/index");
>  	  break;
> +	case unsupported_nf:
> +	  err_msg = _("unsupported NF");
> +	  break;

No tests showing this new error message in action? I'm once again a
little worried about the resulting overall wording of the diagnostic.

> @@ -7187,6 +7198,10 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
>  		  /* {rex2} */
>  		  i.rex2_encoding = true;
>  		  break;
> +		case Prefix_NF:
> +		  /* {NF} */
> +		  i.has_nf = true;
> +		  break;
>  		case Prefix_NoOptimize:
>  		  /* {nooptimize} */
>  		  i.no_optimize = true;

Nit: Preferably {nf} in the comment, matching comments in context.

> @@ -8860,6 +8880,9 @@ match_template (char mnem_suffix)
>  		  goto check_operands_345;
>  		}
>  	      else if (t->opcode_space != SPACE_BASE
> +		       /* Map0 and map1 are promoted to MAP4 when NF is enabled.
> +			*/
> +		       && !t->opcode_modifier.nf
>  		       && (t->opcode_space != SPACE_0F
>  			   /* MOV to/from CR/DR/TR, as an exception, follow
>  			      the base opcode space encoding model.  */

I don't understand this: How does a template permitting NF matter here?
I could see the immediately preceding "else if" become something along
the lines of

	      else if (is_cpu (t, CpuAPX_F) && (i.operands == 3 || i.has_nf))

But I admit I didn't fully think this through. It's just that the change
as is looks wrong to me.

> --- /dev/null
> +++ b/gas/testsuite/gas/i386/x86-64-apx-nf.s
>[...]
> +	{nf}	ror	%cl, 291(%r8, %rax, 4), %r9
> +	{nf}	sar	$1, %bl
> +	{nf}	sar	$1, %bl, %dl
> +	{nf}	sar	$1, %dx
> +	{nf}	sar	$1, %dx, %ax
> +	{nf}	sar	$1, %ecx
> +	{nf}	sar	$1, %ecx, %edx
> +	{nf}	sar	$1, %r9
> +	{nf}	sar	$1, %r9, %r31
> +	{nf}	sarb	$1, 291(%r8, %rax, 4)
> +	{nf}	sar	$1, 291(%r8, %rax, 4), %bl
> +	{nf}	sarw	$1, 291(%r8, %rax, 4)
> +	{nf}	sar	$1, 291(%r8, %rax, 4), %dx
> +	{nf}	sarl	$1, 291(%r8, %rax, 4)
> +	{nf}	sar	$1, 291(%r8, %rax, 4), %ecx
> +	{nf}	sarq	$1, 291(%r8, %rax, 4)
> +	{nf}	sar	$1, 291(%r8, %rax, 4), %r9
> +	{nf}	sar	$123, %bl
> +	{nf}	sar	$123, %bl, %dl
> +	{nf}	sar	$123, %dx
> +	{nf}	sar	$123, %dx, %ax
> +	{nf}	sar	$123, %ecx
> +	{nf}	sar	$123, %ecx, %edx
> +	{nf}	sar	$123, %r9
> +	{nf}	sar	$123, %r9, %r31
> +	{nf}	sarb	$123, 291(%r8, %rax, 4)
> +	{nf}	sar	$123, 291(%r8, %rax, 4), %bl
> +	{nf}	sarw	$123, 291(%r8, %rax, 4)
> +	{nf}	sar	$123, 291(%r8, %rax, 4), %dx
> +	{nf}	sarl	$123, 291(%r8, %rax, 4)
> +	{nf}	sar	$123, 291(%r8, %rax, 4), %ecx
> +	{nf}	sarq	$123, 291(%r8, %rax, 4)
> +	{nf}	sar	$123, 291(%r8, %rax, 4), %r9
> +	{nf}	sar	%cl, %bl
> +	{nf}	sar	%cl, %bl, %dl
> +	{nf}	sar	%cl, %dx
> +	{nf}	sar	%cl, %dx, %ax
> +	{nf}	sar	%cl, %ecx
> +	{nf}	sar	%cl, %ecx, %edx
> +	{nf}	sar	%cl, %r9
> +	{nf}	sar	%cl, %r9, %r31
> +	{nf}	sarb	%cl, 291(%r8, %rax, 4)
> +	{nf}	sar	%cl, 291(%r8, %rax, 4), %bl
> +	{nf}	sarw	%cl, 291(%r8, %rax, 4)
> +	{nf}	sar	%cl, 291(%r8, %rax, 4), %dx
> +	{nf}	sarl	%cl, 291(%r8, %rax, 4)
> +	{nf}	sar	%cl, 291(%r8, %rax, 4), %ecx
> +	{nf}	sarq	%cl, 291(%r8, %rax, 4)
> +	{nf}	sar	%cl, 291(%r8, %rax, 4), %r9
> +	{nf}	shl	$1, %bl
> +	{nf}	shl	$1, %bl, %dl
> +	{nf}	shl	$1, %dx
> +	{nf}	shl	$1, %dx, %ax
> +	{nf}	shl	$1, %ecx
> +	{nf}	shl	$1, %ecx, %edx
> +	{nf}	shl	$1, %r9
> +	{nf}	shl	$1, %r9, %r31
> +	{nf}	shlb	$1, 291(%r8, %rax, 4)
> +	{nf}	shl	$1, 291(%r8, %rax, 4), %bl
> +	{nf}	shlw	$1, 291(%r8, %rax, 4)
> +	{nf}	shl	$1, 291(%r8, %rax, 4), %dx
> +	{nf}	shll	$1, 291(%r8, %rax, 4)
> +	{nf}	shl	$1, 291(%r8, %rax, 4), %ecx
> +	{nf}	shlq	$1, 291(%r8, %rax, 4)
> +	{nf}	shl	$1, 291(%r8, %rax, 4), %r9
> +	{nf}	shl	$123, %bl
> +	{nf}	shl	$123, %bl, %dl
> +	{nf}	shl	$123, %dx
> +	{nf}	shl	$123, %dx, %ax
> +	{nf}	shl	$123, %ecx
> +	{nf}	shl	$123, %ecx, %edx
> +	{nf}	shl	$123, %r9
> +	{nf}	shl	$123, %r9, %r31
> +	{nf}	shlb	$123, 291(%r8, %rax, 4)
> +	{nf}	shl	$123, 291(%r8, %rax, 4), %bl
> +	{nf}	shlw	$123, 291(%r8, %rax, 4)
> +	{nf}	shl	$123, 291(%r8, %rax, 4), %dx
> +	{nf}	shll	$123, 291(%r8, %rax, 4)
> +	{nf}	shl	$123, 291(%r8, %rax, 4), %ecx
> +	{nf}	shlq	$123, 291(%r8, %rax, 4)
> +	{nf}	shl	$123, 291(%r8, %rax, 4), %r9
> +	{nf}	shl	%cl, %bl
> +	{nf}	shl	%cl, %bl, %dl
> +	{nf}	shl	%cl, %dx
> +	{nf}	shl	%cl, %dx, %ax
> +	{nf}	shl	%cl, %ecx
> +	{nf}	shl	%cl, %ecx, %edx
> +	{nf}	shl	%cl, %r9
> +	{nf}	shl	%cl, %r9, %r31
> +	{nf}	shlb	%cl, 291(%r8, %rax, 4)
> +	{nf}	shl	%cl, 291(%r8, %rax, 4), %bl
> +	{nf}	shlw	%cl, 291(%r8, %rax, 4)
> +	{nf}	shl	%cl, 291(%r8, %rax, 4), %dx
> +	{nf}	shll	%cl, 291(%r8, %rax, 4)
> +	{nf}	shl	%cl, 291(%r8, %rax, 4), %ecx
> +	{nf}	shlq	%cl, 291(%r8, %rax, 4)
> +	{nf}	shl	%cl, 291(%r8, %rax, 4), %r9
> +	{nf}	sal	$1, %bl
> +	{nf}	sal	$1, %bl, %dl
> +	{nf}	sal	$1, %dx
> +	{nf}	sal	$1, %dx, %ax
> +	{nf}	sal	$1, %ecx
> +	{nf}	sal	$1, %ecx, %edx
> +	{nf}	sal	$1, %r9
> +	{nf}	sal	$1, %r9, %r31
> +	{nf}	salb	$1, 291(%r8, %rax, 4)
> +	{nf}	sal	$1, 291(%r8, %rax, 4), %bl
> +	{nf}	salw	$1, 291(%r8, %rax, 4)
> +	{nf}	sal	$1, 291(%r8, %rax, 4), %dx
> +	{nf}	sall	$1, 291(%r8, %rax, 4)
> +	{nf}	sal	$1, 291(%r8, %rax, 4), %ecx
> +	{nf}	salq	$1, 291(%r8, %rax, 4)
> +	{nf}	sal	$1, 291(%r8, %rax, 4), %r9
> +	{nf}	sal	$123, %bl
> +	{nf}	sal	$123, %bl, %dl
> +	{nf}	sal	$123, %dx
> +	{nf}	sal	$123, %dx, %ax
> +	{nf}	sal	$123, %ecx
> +	{nf}	sal	$123, %ecx, %edx
> +	{nf}	sal	$123, %r9
> +	{nf}	sal	$123, %r9, %r31
> +	{nf}	salb	$123, 291(%r8, %rax, 4)
> +	{nf}	sal	$123, 291(%r8, %rax, 4), %bl
> +	{nf}	salw	$123, 291(%r8, %rax, 4)
> +	{nf}	sal	$123, 291(%r8, %rax, 4), %dx
> +	{nf}	sall	$123, 291(%r8, %rax, 4)
> +	{nf}	sal	$123, 291(%r8, %rax, 4), %ecx
> +	{nf}	salq	$123, 291(%r8, %rax, 4)
> +	{nf}	sal	$123, 291(%r8, %rax, 4), %r9
> +	{nf}	sal	%cl, %bl
> +	{nf}	sal	%cl, %bl, %dl
> +	{nf}	sal	%cl, %dx
> +	{nf}	sal	%cl, %dx, %ax
> +	{nf}	sal	%cl, %ecx
> +	{nf}	sal	%cl, %ecx, %edx
> +	{nf}	sal	%cl, %r9
> +	{nf}	sal	%cl, %r9, %r31
> +	{nf}	salb	%cl, 291(%r8, %rax, 4)
> +	{nf}	sal	%cl, 291(%r8, %rax, 4), %bl
> +	{nf}	salw	%cl, 291(%r8, %rax, 4)
> +	{nf}	sal	%cl, 291(%r8, %rax, 4), %dx
> +	{nf}	sall	%cl, 291(%r8, %rax, 4)
> +	{nf}	sal	%cl, 291(%r8, %rax, 4), %ecx
> +	{nf}	salq	%cl, 291(%r8, %rax, 4)
> +	{nf}	sal	%cl, 291(%r8, %rax, 4), %r9

Hmm, I think sorting in the source file is more relevant than in the output
(expectations), so I think this SAL block wants moving up. Would of course
be yet more natural if we actually encoded SAL with ModR/M.reg=6 rather
than the same encoding as SHL ...

> --- a/opcodes/i386-dis-evex-reg.h
> +++ b/opcodes/i386-dis-evex-reg.h
> @@ -51,33 +51,33 @@
>    },
>    /* REG_EVEX_MAP4_80 */
>    {
> -    { "addA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> -    { "orA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "%XNaddA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> +    { "%XNorA",	{ VexGb, Eb, Ib }, NO_PREFIX },

Since there are quite a number of entries which are affected (and more to
come), did you consider using a single-character macro here? I realize
the three we presently have free don't fit overly well letter-wise, but it
ought to be possible to e.g. free up F (rarely used, could become a two-
letter one) for use here.

Seeing that you need to fiddle with the "case 'N'" code anyway, did you
further consider giving 'N' a second purpose? Present and projected uses
are easy to tell apart by being non-EVEX / EVEX respectively.

If we really wanted to stick to a two-letter one, I think it would further
want considering to use %NF, such that its purpose is immediately clear
from the letters used.

> @@ -9147,6 +9150,10 @@ get_valid_dis386 (const struct dis386 *dp, instr_info *ins)
>        ins->vex.v = *ins->codep & 0x8;
>        ins->vex.mask_register_specifier = *ins->codep & 0x7;
>        ins->vex.zeroing = *ins->codep & 0x80;
> +      /* Set the NF bit for the EVEX instruction extended from the legacy or
> +	 vex instruction, this bit will be cleared when it can be confirmed
> +	 that its defaut type is evex.  */
> +      ins->vex.nf = *ins->codep & 0x4;
>  
>        if (ins->address_mode != mode_64bit)
>  	{
> @@ -9600,6 +9607,15 @@ print_insn (bfd_vma pc, disassemble_info *info, int intel_syntax)
>  	  && ins.vex.prefix == DATA_PREFIX_OPCODE)
>  	sizeflag ^= DFLAG;
>  
> +      if(ins.evex_type == evex_default)
> +	ins.vex.nf = false;

Up to here I think I agree.

> +      else
> +	/* For EVEX-promoted formats, we need to clear EVEX.NF (For ccmp and
> +	   ctest, they will be cleared separately.) in mask_register_specifier
> +	   and keep the low 2 bits of mask_register_specifier to report errors
> +	   for invalid cases.*/
> +	ins.vex.mask_register_specifier &= 0x3;

But this I have trouble with: How would you recognize (and accordingly print)
insns with NF wrongly set? (By implication there's also a respective testcase
[addition] missing.)

> --- a/opcodes/i386-opc.h
> +++ b/opcodes/i386-opc.h
> @@ -1017,7 +1017,8 @@ typedef struct insn_template
>  #define Prefix_EVEX		7	/* {evex} */
>  #define Prefix_REX		8	/* {rex} */
>  #define Prefix_REX2		9	/* {rex2} */
> -#define Prefix_NoOptimize	10	/* {nooptimize} */
> +#define Prefix_NF		10	/* {nf} */
> +#define Prefix_NoOptimize	11	/* {nooptimize} */

I find it increasingly puzzling that nooptimize is pushed ever further down,
for no real reason.

> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -310,32 +310,42 @@ sti, 0xfb, 0, NoSuf, {}
>  // Arithmetic.
>  add, 0x0, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
>  add, 0x0, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +add, 0x0, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
>  add, 0x83/0, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
>  add, 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +add, 0x83/0, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
>  add, 0x4, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
>  add, 0x80/0, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
>  add, 0x80/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +add, 0x80/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }

Adding these templates has a 2nd effect, for which no testcase is being added:
They now allow (taking the example here) "{evex} add $1, %eax". Such a new
test (which could be less extensive than the -nf one you already add) should
then also cover ADCX and ADOX, for which the 2-operand EVEX templates were
added prematurely.

>  inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
>  inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|NF, {Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
>  inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
>  
>  sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64, }
>  sub, 0x28, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|Optimize|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }

What's the purpose of Optimize here? Just to repeat my earlier request:
Please don't blindly copy all attributes when you clone templates. See
how the existing APX template already doesn't have this attribute.
Apparently you re-cloned the legacy one, not the APX one.

Specifically here, this template will only be chosen if either {nf} or
{evex} is present. Both of which preclude the NDD->REX2 transformation,
in turn making further optimization impossible.

As to {nf} precluding optimization: can_convert_NDD_to_legacy() checks
i.tm.opcode_modifier.nf rather than i.has_nf. That's entirely dead code,
as i.tm is populated only by install_template(). This check wants
dropping in a prereq patch, I suppose, and then the patch here should
add the correct check. I recall saying back then that a respective check
needs adding here, not already in the patch introducing the
transformation.

Jan

next prev parent reply	other threads:[~2024-02-29 11:21 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-27  9:01 Cui, Lili
2024-02-28 16:11 ` H.J. Lu
2024-02-29  1:12   ` Cui, Lili
2024-02-29  6:53   ` Jan Beulich
2024-02-29  8:39     ` Cui, Lili
2024-02-29  9:06       ` Jan Beulich
2024-02-29 10:22         ` Cui, Lili
2024-02-29 12:23           ` H.J. Lu
2024-02-29 12:26             ` Cui, Lili
2024-02-29 11:21 ` Jan Beulich [this message]
2024-02-29 12:00   ` Cui, Lili
2024-02-29 12:04     ` Jan Beulich
2024-02-29 12:41       ` Cui, Lili
2024-02-29 13:17         ` Jan Beulich
2024-02-29 13:47           ` Cui, Lili
2024-02-29 14:12             ` Jan Beulich
2024-03-01  3:23               ` Cui, Lili
2024-03-01  6:56                 ` Jan Beulich
2024-03-01  8:01                   ` Cui, Lili
2024-03-01 11:36   ` Cui, Lili
2024-03-01 11:49     ` Jan Beulich
2024-03-01  7:04 ` Jan Beulich
2024-03-01 11:50   ` Cui, Lili
2024-03-19  6:41 Cui, Lili
2024-03-21 14:26 ` Jan Beulich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e908fddd-6fd2-4880-bcd7-7c49d245db18@suse.com \
    --to=jbeulich@suse.com \
    --cc=binutils@sourceware.org \
    --cc=hongjiu.lu@intel.com \
    --cc=lili.cui@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).