From: Jan Beulich <jbeulich@suse.com>
To: "Cui, Lili" <lili.cui@intel.com>
Cc: hongjiu.lu@intel.com, binutils@sourceware.org
Subject: Re: [PATCH] Support APX NF
Date: Thu, 29 Feb 2024 12:21:16 +0100 [thread overview]
Message-ID: <e908fddd-6fd2-4880-bcd7-7c49d245db18@suse.com> (raw)
In-Reply-To: <20240227090106.200134-1-lili.cui@intel.com>
On 27.02.2024 10:01, Cui, Lili wrote:
> @@ -415,6 +416,9 @@ struct _i386_insn
> /* Compressed disp8*N attribute. */
> unsigned int memshift;
>
> + /* No CSPAZO flags update. */
> + bool has_nf;
> +
> /* Prefer load or store in encoding. */
> enum
> {
There's a group of booleans further up and another one further down. Is there
any reason not to leverage an available padding slot there?
> @@ -6627,6 +6635,9 @@ md_assemble (char *line)
> case unsupported_EGPR_for_addressing:
> err_msg = _("extended GPR cannot be used as base/index");
> break;
> + case unsupported_nf:
> + err_msg = _("unsupported NF");
> + break;
No tests showing this new error message in action? I'm once again a
little worried about the resulting overall wording of the diagnostic.
> @@ -7187,6 +7198,10 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
> /* {rex2} */
> i.rex2_encoding = true;
> break;
> + case Prefix_NF:
> + /* {NF} */
> + i.has_nf = true;
> + break;
> case Prefix_NoOptimize:
> /* {nooptimize} */
> i.no_optimize = true;
Nit: Preferably {nf} in the comment, matching comments in context.
> @@ -8860,6 +8880,9 @@ match_template (char mnem_suffix)
> goto check_operands_345;
> }
> else if (t->opcode_space != SPACE_BASE
> + /* Map0 and map1 are promoted to MAP4 when NF is enabled.
> + */
> + && !t->opcode_modifier.nf
> && (t->opcode_space != SPACE_0F
> /* MOV to/from CR/DR/TR, as an exception, follow
> the base opcode space encoding model. */
I don't understand this: How does a template permitting NF matter here?
I could see the immediately preceding "else if" become something along
the lines of
else if (is_cpu (t, CpuAPX_F) && (i.operands == 3 || i.has_nf))
But I admit I didn't fully think this through. It's just that the change
as is looks wrong to me.
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/x86-64-apx-nf.s
>[...]
> + {nf} ror %cl, 291(%r8, %rax, 4), %r9
> + {nf} sar $1, %bl
> + {nf} sar $1, %bl, %dl
> + {nf} sar $1, %dx
> + {nf} sar $1, %dx, %ax
> + {nf} sar $1, %ecx
> + {nf} sar $1, %ecx, %edx
> + {nf} sar $1, %r9
> + {nf} sar $1, %r9, %r31
> + {nf} sarb $1, 291(%r8, %rax, 4)
> + {nf} sar $1, 291(%r8, %rax, 4), %bl
> + {nf} sarw $1, 291(%r8, %rax, 4)
> + {nf} sar $1, 291(%r8, %rax, 4), %dx
> + {nf} sarl $1, 291(%r8, %rax, 4)
> + {nf} sar $1, 291(%r8, %rax, 4), %ecx
> + {nf} sarq $1, 291(%r8, %rax, 4)
> + {nf} sar $1, 291(%r8, %rax, 4), %r9
> + {nf} sar $123, %bl
> + {nf} sar $123, %bl, %dl
> + {nf} sar $123, %dx
> + {nf} sar $123, %dx, %ax
> + {nf} sar $123, %ecx
> + {nf} sar $123, %ecx, %edx
> + {nf} sar $123, %r9
> + {nf} sar $123, %r9, %r31
> + {nf} sarb $123, 291(%r8, %rax, 4)
> + {nf} sar $123, 291(%r8, %rax, 4), %bl
> + {nf} sarw $123, 291(%r8, %rax, 4)
> + {nf} sar $123, 291(%r8, %rax, 4), %dx
> + {nf} sarl $123, 291(%r8, %rax, 4)
> + {nf} sar $123, 291(%r8, %rax, 4), %ecx
> + {nf} sarq $123, 291(%r8, %rax, 4)
> + {nf} sar $123, 291(%r8, %rax, 4), %r9
> + {nf} sar %cl, %bl
> + {nf} sar %cl, %bl, %dl
> + {nf} sar %cl, %dx
> + {nf} sar %cl, %dx, %ax
> + {nf} sar %cl, %ecx
> + {nf} sar %cl, %ecx, %edx
> + {nf} sar %cl, %r9
> + {nf} sar %cl, %r9, %r31
> + {nf} sarb %cl, 291(%r8, %rax, 4)
> + {nf} sar %cl, 291(%r8, %rax, 4), %bl
> + {nf} sarw %cl, 291(%r8, %rax, 4)
> + {nf} sar %cl, 291(%r8, %rax, 4), %dx
> + {nf} sarl %cl, 291(%r8, %rax, 4)
> + {nf} sar %cl, 291(%r8, %rax, 4), %ecx
> + {nf} sarq %cl, 291(%r8, %rax, 4)
> + {nf} sar %cl, 291(%r8, %rax, 4), %r9
> + {nf} shl $1, %bl
> + {nf} shl $1, %bl, %dl
> + {nf} shl $1, %dx
> + {nf} shl $1, %dx, %ax
> + {nf} shl $1, %ecx
> + {nf} shl $1, %ecx, %edx
> + {nf} shl $1, %r9
> + {nf} shl $1, %r9, %r31
> + {nf} shlb $1, 291(%r8, %rax, 4)
> + {nf} shl $1, 291(%r8, %rax, 4), %bl
> + {nf} shlw $1, 291(%r8, %rax, 4)
> + {nf} shl $1, 291(%r8, %rax, 4), %dx
> + {nf} shll $1, 291(%r8, %rax, 4)
> + {nf} shl $1, 291(%r8, %rax, 4), %ecx
> + {nf} shlq $1, 291(%r8, %rax, 4)
> + {nf} shl $1, 291(%r8, %rax, 4), %r9
> + {nf} shl $123, %bl
> + {nf} shl $123, %bl, %dl
> + {nf} shl $123, %dx
> + {nf} shl $123, %dx, %ax
> + {nf} shl $123, %ecx
> + {nf} shl $123, %ecx, %edx
> + {nf} shl $123, %r9
> + {nf} shl $123, %r9, %r31
> + {nf} shlb $123, 291(%r8, %rax, 4)
> + {nf} shl $123, 291(%r8, %rax, 4), %bl
> + {nf} shlw $123, 291(%r8, %rax, 4)
> + {nf} shl $123, 291(%r8, %rax, 4), %dx
> + {nf} shll $123, 291(%r8, %rax, 4)
> + {nf} shl $123, 291(%r8, %rax, 4), %ecx
> + {nf} shlq $123, 291(%r8, %rax, 4)
> + {nf} shl $123, 291(%r8, %rax, 4), %r9
> + {nf} shl %cl, %bl
> + {nf} shl %cl, %bl, %dl
> + {nf} shl %cl, %dx
> + {nf} shl %cl, %dx, %ax
> + {nf} shl %cl, %ecx
> + {nf} shl %cl, %ecx, %edx
> + {nf} shl %cl, %r9
> + {nf} shl %cl, %r9, %r31
> + {nf} shlb %cl, 291(%r8, %rax, 4)
> + {nf} shl %cl, 291(%r8, %rax, 4), %bl
> + {nf} shlw %cl, 291(%r8, %rax, 4)
> + {nf} shl %cl, 291(%r8, %rax, 4), %dx
> + {nf} shll %cl, 291(%r8, %rax, 4)
> + {nf} shl %cl, 291(%r8, %rax, 4), %ecx
> + {nf} shlq %cl, 291(%r8, %rax, 4)
> + {nf} shl %cl, 291(%r8, %rax, 4), %r9
> + {nf} sal $1, %bl
> + {nf} sal $1, %bl, %dl
> + {nf} sal $1, %dx
> + {nf} sal $1, %dx, %ax
> + {nf} sal $1, %ecx
> + {nf} sal $1, %ecx, %edx
> + {nf} sal $1, %r9
> + {nf} sal $1, %r9, %r31
> + {nf} salb $1, 291(%r8, %rax, 4)
> + {nf} sal $1, 291(%r8, %rax, 4), %bl
> + {nf} salw $1, 291(%r8, %rax, 4)
> + {nf} sal $1, 291(%r8, %rax, 4), %dx
> + {nf} sall $1, 291(%r8, %rax, 4)
> + {nf} sal $1, 291(%r8, %rax, 4), %ecx
> + {nf} salq $1, 291(%r8, %rax, 4)
> + {nf} sal $1, 291(%r8, %rax, 4), %r9
> + {nf} sal $123, %bl
> + {nf} sal $123, %bl, %dl
> + {nf} sal $123, %dx
> + {nf} sal $123, %dx, %ax
> + {nf} sal $123, %ecx
> + {nf} sal $123, %ecx, %edx
> + {nf} sal $123, %r9
> + {nf} sal $123, %r9, %r31
> + {nf} salb $123, 291(%r8, %rax, 4)
> + {nf} sal $123, 291(%r8, %rax, 4), %bl
> + {nf} salw $123, 291(%r8, %rax, 4)
> + {nf} sal $123, 291(%r8, %rax, 4), %dx
> + {nf} sall $123, 291(%r8, %rax, 4)
> + {nf} sal $123, 291(%r8, %rax, 4), %ecx
> + {nf} salq $123, 291(%r8, %rax, 4)
> + {nf} sal $123, 291(%r8, %rax, 4), %r9
> + {nf} sal %cl, %bl
> + {nf} sal %cl, %bl, %dl
> + {nf} sal %cl, %dx
> + {nf} sal %cl, %dx, %ax
> + {nf} sal %cl, %ecx
> + {nf} sal %cl, %ecx, %edx
> + {nf} sal %cl, %r9
> + {nf} sal %cl, %r9, %r31
> + {nf} salb %cl, 291(%r8, %rax, 4)
> + {nf} sal %cl, 291(%r8, %rax, 4), %bl
> + {nf} salw %cl, 291(%r8, %rax, 4)
> + {nf} sal %cl, 291(%r8, %rax, 4), %dx
> + {nf} sall %cl, 291(%r8, %rax, 4)
> + {nf} sal %cl, 291(%r8, %rax, 4), %ecx
> + {nf} salq %cl, 291(%r8, %rax, 4)
> + {nf} sal %cl, 291(%r8, %rax, 4), %r9
Hmm, I think sorting in the source file is more relevant than in the output
(expectations), so I think this SAL block wants moving up. Would of course
be yet more natural if we actually encoded SAL with ModR/M.reg=6 rather
than the same encoding as SHL ...
> --- a/opcodes/i386-dis-evex-reg.h
> +++ b/opcodes/i386-dis-evex-reg.h
> @@ -51,33 +51,33 @@
> },
> /* REG_EVEX_MAP4_80 */
> {
> - { "addA", { VexGb, Eb, Ib }, NO_PREFIX },
> - { "orA", { VexGb, Eb, Ib }, NO_PREFIX },
> + { "%XNaddA", { VexGb, Eb, Ib }, NO_PREFIX },
> + { "%XNorA", { VexGb, Eb, Ib }, NO_PREFIX },
Since there are quite a number of entries which are affected (and more to
come), did you consider using a single-character macro here? I realize
the three we presently have free don't fit overly well letter-wise, but it
ought to be possible to e.g. free up F (rarely used, could become a two-
letter one) for use here.
Seeing that you need to fiddle with the "case 'N'" code anyway, did you
further consider giving 'N' a second purpose? Present and projected uses
are easy to tell apart by being non-EVEX / EVEX respectively.
If we really wanted to stick to a two-letter one, I think it would further
want considering to use %NF, such that its purpose is immediately clear
from the letters used.
> @@ -9147,6 +9150,10 @@ get_valid_dis386 (const struct dis386 *dp, instr_info *ins)
> ins->vex.v = *ins->codep & 0x8;
> ins->vex.mask_register_specifier = *ins->codep & 0x7;
> ins->vex.zeroing = *ins->codep & 0x80;
> + /* Set the NF bit for the EVEX instruction extended from the legacy or
> + vex instruction, this bit will be cleared when it can be confirmed
> + that its defaut type is evex. */
> + ins->vex.nf = *ins->codep & 0x4;
>
> if (ins->address_mode != mode_64bit)
> {
> @@ -9600,6 +9607,15 @@ print_insn (bfd_vma pc, disassemble_info *info, int intel_syntax)
> && ins.vex.prefix == DATA_PREFIX_OPCODE)
> sizeflag ^= DFLAG;
>
> + if(ins.evex_type == evex_default)
> + ins.vex.nf = false;
Up to here I think I agree.
> + else
> + /* For EVEX-promoted formats, we need to clear EVEX.NF (For ccmp and
> + ctest, they will be cleared separately.) in mask_register_specifier
> + and keep the low 2 bits of mask_register_specifier to report errors
> + for invalid cases.*/
> + ins.vex.mask_register_specifier &= 0x3;
But this is where I have trouble: How would you recognize (and accordingly
print) insns with NF wrongly set? (By implication a respective testcase
addition is also missing.)
> --- a/opcodes/i386-opc.h
> +++ b/opcodes/i386-opc.h
> @@ -1017,7 +1017,8 @@ typedef struct insn_template
> #define Prefix_EVEX 7 /* {evex} */
> #define Prefix_REX 8 /* {rex} */
> #define Prefix_REX2 9 /* {rex2} */
> -#define Prefix_NoOptimize 10 /* {nooptimize} */
> +#define Prefix_NF 10 /* {nf} */
> +#define Prefix_NoOptimize 11 /* {nooptimize} */
I find it increasingly puzzling that nooptimize is pushed ever further down,
for no real reason.
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -310,32 +310,42 @@ sti, 0xfb, 0, NoSuf, {}
> // Arithmetic.
> add, 0x0, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
> add, 0x0, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +add, 0x0, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> add, 0x83/0, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
> add, 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +add, 0x83/0, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> add, 0x4, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> add, 0x80/0, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
> add, 0x80/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +add, 0x80/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
Adding these templates has a 2nd effect, for which no testcase is being added:
They now allow (taking the example here) "{evex} add $1, %eax". Such a new
test (which could be less extensive than the -nf one you already add) should
then also cover ADCX and ADOX, for which the 2-operand EVEX templates were
added prematurely.
> inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }
> inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|NF, {Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
> inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
>
> sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64, }
> sub, 0x28, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> +sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|Optimize|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
What's the purpose of Optimize here? Just to repeat my earlier request:
Please don't blindly copy all attributes when you clone templates. See
how the existing APX template already doesn't have this attribute.
Apparently you re-cloned the legacy one, not the APX one.
Specifically here, this template will only be chosen if either {nf} or
{evex} is present. Both of which preclude the NDD->REX2 transformation,
in turn making further optimization impossible.
As to {nf} precluding optimization: can_convert_NDD_to_legacy() checks
i.tm.opcode_modifier.nf rather than i.has_nf. That's entirely dead code,
as i.tm is populated only by install_template(). This check wants
dropping in a prereq patch, I suppose, and then the patch here should
add the correct check. I recall saying back then that a respective check
needs adding here, not already in the patch introducing the
transformation.
Jan
next prev parent reply other threads:[~2024-02-29 11:21 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-27 9:01 Cui, Lili
2024-02-28 16:11 ` H.J. Lu
2024-02-29 1:12 ` Cui, Lili
2024-02-29 6:53 ` Jan Beulich
2024-02-29 8:39 ` Cui, Lili
2024-02-29 9:06 ` Jan Beulich
2024-02-29 10:22 ` Cui, Lili
2024-02-29 12:23 ` H.J. Lu
2024-02-29 12:26 ` Cui, Lili
2024-02-29 11:21 ` Jan Beulich [this message]
2024-02-29 12:00 ` Cui, Lili
2024-02-29 12:04 ` Jan Beulich
2024-02-29 12:41 ` Cui, Lili
2024-02-29 13:17 ` Jan Beulich
2024-02-29 13:47 ` Cui, Lili
2024-02-29 14:12 ` Jan Beulich
2024-03-01 3:23 ` Cui, Lili
2024-03-01 6:56 ` Jan Beulich
2024-03-01 8:01 ` Cui, Lili
2024-03-01 11:36 ` Cui, Lili
2024-03-01 11:49 ` Jan Beulich
2024-03-01 7:04 ` Jan Beulich
2024-03-01 11:50 ` Cui, Lili
2024-03-19 6:41 Cui, Lili
2024-03-21 14:26 ` Jan Beulich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=e908fddd-6fd2-4880-bcd7-7c49d245db18@suse.com \
--to=jbeulich@suse.com \
--cc=binutils@sourceware.org \
--cc=hongjiu.lu@intel.com \
--cc=lili.cui@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).