* [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
@ 2022-01-12 8:11 Haochen Jiang
2022-01-12 19:22 ` Uros Bizjak
0 siblings, 1 reply; 5+ messages in thread
From: Haochen Jiang @ 2022-01-12 8:11 UTC (permalink / raw)
To: gcc-patches; +Cc: hongtao.liu, ubizjak
Hi all,
This patch targets PR94790; it changes the instruction selection in the following circumstance.
Regtested on x86_64-pc-linux-gnu. Ok for trunk?
BRs,
Haochen
From the perspective of the pipeline, the `andn + and + ior` version takes
2 cycles (AND and ANDN do not depend on each other), but the xor + and + xor
version takes 3 cycles.
- xorl %edi, %esi
andl %edx, %esi
- movl %esi, %eax
- xorl %edi, %eax
+ andn %edi, %edx, %eax
+ orl %esi, %eax
gcc/ChangeLog:
PR target/94790
* config/i386/i386.md (*xor2andn): New define_insn_and_split.
gcc/testsuite/ChangeLog:
PR target/94790
* gcc.target/i386/pr94790-1.c: New test.
* gcc.target/i386/pr94790-2.c: Ditto.
---
gcc/config/i386/i386.md | 39 +++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr94790-1.c | 14 ++++++++
gcc/testsuite/gcc.target/i386/pr94790-2.c | 9 ++++++
3 files changed, 62 insertions(+)
create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-1.c
create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-2.c
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9b424a3935b..38efc6d5837 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -10452,6 +10452,45 @@
(set_attr "znver1_decode" "double")
(set_attr "mode" "DI")])
+;; PR target/94790: Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask)
+(define_insn_and_split "*xor2andn"
+ [(set (match_operand:SWI248 0 "nonimmediate_operand")
+ (xor:SWI248
+ (and:SWI248
+ (xor:SWI248
+ (match_operand:SWI248 1 "nonimmediate_operand")
+ (match_operand:SWI248 2 "nonimmediate_operand"))
+ (match_operand:SWI248 3 "nonimmediate_operand"))
+ (match_dup 1)))
+ (clobber (reg:CC FLAGS_REG))]
+ "(TARGET_BMI || TARGET_AVX512BW)
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(parallel [(set (match_dup 4)
+ (and:SWI248
+ (not:SWI248
+ (match_dup 3))
+ (match_dup 1)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel [(set (match_dup 5)
+ (and:SWI248
+ (match_dup 2)
+ (match_dup 3)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel [(set (match_dup 0)
+ (ior:SWI248
+ (match_dup 4)
+ (match_dup 5)))
+ (clobber (reg:CC FLAGS_REG))])]
+ {
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+ operands[3] = force_reg (<MODE>mode, operands[3]);
+ operands[4] = gen_reg_rtx (<MODE>mode);
+ operands[5] = gen_reg_rtx (<MODE>mode);
+ }
+)
+
;; See comment for addsi_1_zext why we do use nonimmediate_operand
(define_insn "*<code>si_1_zext"
[(set (match_operand:DI 0 "register_operand" "=r")
diff --git a/gcc/testsuite/gcc.target/i386/pr94790-1.c b/gcc/testsuite/gcc.target/i386/pr94790-1.c
new file mode 100755
index 00000000000..6ebbec15cfd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94790-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi" } */
+/* { dg-final { scan-assembler-times "andn\[ \\t\]" 2 } } */
+/* { dg-final { scan-assembler-not "xorl\[ \\t\]" } } */
+
+unsigned r1(unsigned a, unsigned b, unsigned mask)
+{
+ return a ^ ((a ^ b) & mask);
+}
+
+unsigned r2(unsigned a, unsigned b, unsigned mask)
+{
+ return (~mask & a) | (b & mask);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr94790-2.c b/gcc/testsuite/gcc.target/i386/pr94790-2.c
new file mode 100755
index 00000000000..d7b0eec5bef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94790-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi" } */
+/* { dg-final { scan-assembler-not "andn\[ \\t\]" } } */
+/* { dg-final { scan-assembler-times "xorl\[ \\t\]" 2 } } */
+
+unsigned r1(unsigned a, unsigned b, unsigned mask)
+{
+ return a ^ ((a ^ b) & mask) + (a ^ b);
+}
--
2.18.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
2022-01-12 8:11 [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask) Haochen Jiang
@ 2022-01-12 19:22 ` Uros Bizjak
2022-01-13 1:21 ` Jiang, Haochen
2022-01-13 1:53 ` Jiang, Haochen
0 siblings, 2 replies; 5+ messages in thread
From: Uros Bizjak @ 2022-01-12 19:22 UTC (permalink / raw)
To: Haochen Jiang; +Cc: gcc-patches, Hongtao Liu
On Wed, Jan 12, 2022 at 9:11 AM Haochen Jiang <haochen.jiang@intel.com> wrote:
>
> Hi all,
>
> This patch targets PR94790, which change the instruction selection under the following circumstance.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
Please also test with -m32, e.g.:
make -j 12 -k check RUNTESTFLAGS="--target_board=unix\{,-m32\}"
OK (with a nit below), if the new testcases do not FAIL with -m32.
Thanks,
Uros.
>
> BRs,
> Haochen
>
> From the perspective of the pipeline, `andn + and + ior` version take
> 2 cycles(AND and ANDN doesn't have dependence), but xor + and + xor
> will take 3 cycles.
>
> - xorl %edi, %esi
> andl %edx, %esi
> - movl %esi, %eax
> - xorl %edi, %eax
> + andn %edi, %edx, %eax
> + orl %esi, %eax
>
> gcc/ChangeLog:
>
> PR taeget/94790
> * config/i386/i386.md (*xor2andn): New define_insn_and_split.
>
> gcc/testsuite/ChangeLog:
>
> PR taeget/94790
> * gcc.target/i386/pr94790-1.c: New test.
> * gcc.target/i386/pr94790-2.c: Ditto.
> ---
> gcc/config/i386/i386.md | 39 +++++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr94790-1.c | 14 ++++++++
> gcc/testsuite/gcc.target/i386/pr94790-2.c | 9 ++++++
> 3 files changed, 62 insertions(+)
> create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-1.c
> create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-2.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 9b424a3935b..38efc6d5837 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -10452,6 +10452,45 @@
> (set_attr "znver1_decode" "double")
> (set_attr "mode" "DI")])
>
> +;; PR target/94790: Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask)
> +(define_insn_and_split "*xor2andn"
> + [(set (match_operand:SWI248 0 "nonimmediate_operand")
> + (xor:SWI248
> + (and:SWI248
> + (xor:SWI248
> + (match_operand:SWI248 1 "nonimmediate_operand")
> + (match_operand:SWI248 2 "nonimmediate_operand"))
> + (match_operand:SWI248 3 "nonimmediate_operand"))
> + (match_dup 1)))
> + (clobber (reg:CC FLAGS_REG))]
> + "(TARGET_BMI || TARGET_AVX512BW)
> + && ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(parallel [(set (match_dup 4)
> + (and:SWI248
> + (not:SWI248
> + (match_dup 3))
> + (match_dup 1)))
> + (clobber (reg:CC FLAGS_REG))])
> + (parallel [(set (match_dup 5)
> + (and:SWI248
> + (match_dup 2)
> + (match_dup 3)))
> + (clobber (reg:CC FLAGS_REG))])
> + (parallel [(set (match_dup 0)
> + (ior:SWI248
> + (match_dup 4)
> + (match_dup 5)))
> + (clobber (reg:CC FLAGS_REG))])]
> + {
> + operands[1] = force_reg (<MODE>mode, operands[1]);
> + operands[3] = force_reg (<MODE>mode, operands[3]);
> + operands[4] = gen_reg_rtx (<MODE>mode);
> + operands[5] = gen_reg_rtx (<MODE>mode);
> + }
> +)
Please put the closing parenthesis right after the closing curly brace; see
the numerous examples in .md files.
> +
> ;; See comment for addsi_1_zext why we do use nonimmediate_operand
> (define_insn "*<code>si_1_zext"
> [(set (match_operand:DI 0 "register_operand" "=r")
> diff --git a/gcc/testsuite/gcc.target/i386/pr94790-1.c b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> new file mode 100755
> index 00000000000..6ebbec15cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi" } */
> +/* { dg-final { scan-assembler-times "andn\[ \\t\]" 2 } } */
> +/* { dg-final { scan-assembler-not "xorl\[ \\t\]" } } */
> +
> +unsigned r1(unsigned a, unsigned b, unsigned mask)
> +{
> + return a ^ ((a ^ b) & mask);
> +}
> +
> +unsigned r2(unsigned a, unsigned b, unsigned mask)
> +{
> + return (~mask & a) | (b & mask);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94790-2.c b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> new file mode 100755
> index 00000000000..d7b0eec5bef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi" } */
> +/* { dg-final { scan-assembler-not "andn\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-times "xorl\[ \\t\]" 2 } } */
> +
> +unsigned r1(unsigned a, unsigned b, unsigned mask)
> +{
> + return a ^ ((a ^ b) & mask) + (a ^ b);
> +}
> --
> 2.18.1
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
2022-01-12 19:22 ` Uros Bizjak
@ 2022-01-13 1:21 ` Jiang, Haochen
2022-01-13 1:53 ` Jiang, Haochen
1 sibling, 0 replies; 5+ messages in thread
From: Jiang, Haochen @ 2022-01-13 1:21 UTC (permalink / raw)
To: Uros Bizjak; +Cc: gcc-patches, Liu, Hongtao
Hi Uros,
I have also tested with -m32. The new testcases do not fail.
Thx,
Haochen
-----Original Message-----
From: Uros Bizjak <ubizjak@gmail.com>
Sent: Thursday, January 13, 2022 3:22 AM
To: Jiang, Haochen <haochen.jiang@intel.com>
Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>
Subject: Re: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
On Wed, Jan 12, 2022 at 9:11 AM Haochen Jiang <haochen.jiang@intel.com> wrote:
>
> Hi all,
>
> This patch targets PR94790, which change the instruction selection under the following circumstance.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
Please also test with -m32, e.g.:
make -j 12 -k check RUNTESTFLAGS="--target_board=unix\{,-m32\}"
OK (with an it below), if new testcases do not FAIL with -m32.
Thanks,
Uros.
>
> BRs,
> Haochen
>
> From the perspective of the pipeline, `andn + and + ior` version take
> 2 cycles(AND and ANDN doesn't have dependence), but xor + and + xor
> will take 3 cycles.
>
> - xorl %edi, %esi
> andl %edx, %esi
> - movl %esi, %eax
> - xorl %edi, %eax
> + andn %edi, %edx, %eax
> + orl %esi, %eax
>
> gcc/ChangeLog:
>
> PR taeget/94790
> * config/i386/i386.md (*xor2andn): New define_insn_and_split.
>
> gcc/testsuite/ChangeLog:
>
> PR taeget/94790
> * gcc.target/i386/pr94790-1.c: New test.
> * gcc.target/i386/pr94790-2.c: Ditto.
> ---
> gcc/config/i386/i386.md | 39 +++++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr94790-1.c | 14 ++++++++
> gcc/testsuite/gcc.target/i386/pr94790-2.c | 9 ++++++
> 3 files changed, 62 insertions(+)
> create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-1.c
> create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-2.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
> 9b424a3935b..38efc6d5837 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -10452,6 +10452,45 @@
> (set_attr "znver1_decode" "double")
> (set_attr "mode" "DI")])
>
> +;; PR target/94790: Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b
> +& mask) (define_insn_and_split "*xor2andn"
> + [(set (match_operand:SWI248 0 "nonimmediate_operand")
> + (xor:SWI248
> + (and:SWI248
> + (xor:SWI248
> + (match_operand:SWI248 1 "nonimmediate_operand")
> + (match_operand:SWI248 2 "nonimmediate_operand"))
> + (match_operand:SWI248 3 "nonimmediate_operand"))
> + (match_dup 1)))
> + (clobber (reg:CC FLAGS_REG))]
> + "(TARGET_BMI || TARGET_AVX512BW)
> + && ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(parallel [(set (match_dup 4)
> + (and:SWI248
> + (not:SWI248
> + (match_dup 3))
> + (match_dup 1)))
> + (clobber (reg:CC FLAGS_REG))])
> + (parallel [(set (match_dup 5)
> + (and:SWI248
> + (match_dup 2)
> + (match_dup 3)))
> + (clobber (reg:CC FLAGS_REG))])
> + (parallel [(set (match_dup 0)
> + (ior:SWI248
> + (match_dup 4)
> + (match_dup 5)))
> + (clobber (reg:CC FLAGS_REG))])]
> + {
> + operands[1] = force_reg (<MODE>mode, operands[1]);
> + operands[3] = force_reg (<MODE>mode, operands[3]);
> + operands[4] = gen_reg_rtx (<MODE>mode);
> + operands[5] = gen_reg_rtx (<MODE>mode);
> + }
> +)
Please put brace just after the curved brace, see numerous examples in .md files.
> +
> ;; See comment for addsi_1_zext why we do use nonimmediate_operand
> (define_insn "*<code>si_1_zext"
> [(set (match_operand:DI 0 "register_operand" "=r") diff --git
> a/gcc/testsuite/gcc.target/i386/pr94790-1.c
> b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> new file mode 100755
> index 00000000000..6ebbec15cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi" } */
> +/* { dg-final { scan-assembler-times "andn\[ \\t\]" 2 } } */
> +/* { dg-final { scan-assembler-not "xorl\[ \\t\]" } } */
> +
> +unsigned r1(unsigned a, unsigned b, unsigned mask) {
> + return a ^ ((a ^ b) & mask);
> +}
> +
> +unsigned r2(unsigned a, unsigned b, unsigned mask) {
> + return (~mask & a) | (b & mask);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94790-2.c
> b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> new file mode 100755
> index 00000000000..d7b0eec5bef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi" } */
> +/* { dg-final { scan-assembler-not "andn\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-times "xorl\[ \\t\]" 2 } } */
> +
> +unsigned r1(unsigned a, unsigned b, unsigned mask) {
> + return a ^ ((a ^ b) & mask) + (a ^ b); }
> --
> 2.18.1
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
2022-01-12 19:22 ` Uros Bizjak
2022-01-13 1:21 ` Jiang, Haochen
@ 2022-01-13 1:53 ` Jiang, Haochen
2022-01-13 7:44 ` Uros Bizjak
1 sibling, 1 reply; 5+ messages in thread
From: Jiang, Haochen @ 2022-01-13 1:53 UTC (permalink / raw)
To: Uros Bizjak; +Cc: gcc-patches, Liu, Hongtao
[-- Attachment #1: Type: text/plain, Size: 5108 bytes --]
Hi Uros,
I have fixed that formatting issue in this new patch. Ok for trunk?
Thx,
Haochen
-----Original Message-----
From: Uros Bizjak <ubizjak@gmail.com>
Sent: Thursday, January 13, 2022 3:22 AM
To: Jiang, Haochen <haochen.jiang@intel.com>
Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>
Subject: Re: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
On Wed, Jan 12, 2022 at 9:11 AM Haochen Jiang <haochen.jiang@intel.com> wrote:
>
> Hi all,
>
> This patch targets PR94790, which change the instruction selection under the following circumstance.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
Please also test with -m32, e.g.:
make -j 12 -k check RUNTESTFLAGS="--target_board=unix\{,-m32\}"
OK (with an it below), if new testcases do not FAIL with -m32.
Thanks,
Uros.
>
> BRs,
> Haochen
>
> From the perspective of the pipeline, `andn + and + ior` version take
> 2 cycles(AND and ANDN doesn't have dependence), but xor + and + xor
> will take 3 cycles.
>
> - xorl %edi, %esi
> andl %edx, %esi
> - movl %esi, %eax
> - xorl %edi, %eax
> + andn %edi, %edx, %eax
> + orl %esi, %eax
>
> gcc/ChangeLog:
>
> PR taeget/94790
> * config/i386/i386.md (*xor2andn): New define_insn_and_split.
>
> gcc/testsuite/ChangeLog:
>
> PR taeget/94790
> * gcc.target/i386/pr94790-1.c: New test.
> * gcc.target/i386/pr94790-2.c: Ditto.
> ---
> gcc/config/i386/i386.md | 39 +++++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr94790-1.c | 14 ++++++++
> gcc/testsuite/gcc.target/i386/pr94790-2.c | 9 ++++++
> 3 files changed, 62 insertions(+)
> create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-1.c
> create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-2.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
> 9b424a3935b..38efc6d5837 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -10452,6 +10452,45 @@
> (set_attr "znver1_decode" "double")
> (set_attr "mode" "DI")])
>
> +;; PR target/94790: Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b
> +& mask) (define_insn_and_split "*xor2andn"
> + [(set (match_operand:SWI248 0 "nonimmediate_operand")
> + (xor:SWI248
> + (and:SWI248
> + (xor:SWI248
> + (match_operand:SWI248 1 "nonimmediate_operand")
> + (match_operand:SWI248 2 "nonimmediate_operand"))
> + (match_operand:SWI248 3 "nonimmediate_operand"))
> + (match_dup 1)))
> + (clobber (reg:CC FLAGS_REG))]
> + "(TARGET_BMI || TARGET_AVX512BW)
> + && ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(parallel [(set (match_dup 4)
> + (and:SWI248
> + (not:SWI248
> + (match_dup 3))
> + (match_dup 1)))
> + (clobber (reg:CC FLAGS_REG))])
> + (parallel [(set (match_dup 5)
> + (and:SWI248
> + (match_dup 2)
> + (match_dup 3)))
> + (clobber (reg:CC FLAGS_REG))])
> + (parallel [(set (match_dup 0)
> + (ior:SWI248
> + (match_dup 4)
> + (match_dup 5)))
> + (clobber (reg:CC FLAGS_REG))])]
> + {
> + operands[1] = force_reg (<MODE>mode, operands[1]);
> + operands[3] = force_reg (<MODE>mode, operands[3]);
> + operands[4] = gen_reg_rtx (<MODE>mode);
> + operands[5] = gen_reg_rtx (<MODE>mode);
> + }
> +)
Please put brace just after the curved brace, see numerous examples in .md files.
> +
> ;; See comment for addsi_1_zext why we do use nonimmediate_operand
> (define_insn "*<code>si_1_zext"
> [(set (match_operand:DI 0 "register_operand" "=r") diff --git
> a/gcc/testsuite/gcc.target/i386/pr94790-1.c
> b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> new file mode 100755
> index 00000000000..6ebbec15cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi" } */
> +/* { dg-final { scan-assembler-times "andn\[ \\t\]" 2 } } */
> +/* { dg-final { scan-assembler-not "xorl\[ \\t\]" } } */
> +
> +unsigned r1(unsigned a, unsigned b, unsigned mask) {
> + return a ^ ((a ^ b) & mask);
> +}
> +
> +unsigned r2(unsigned a, unsigned b, unsigned mask) {
> + return (~mask & a) | (b & mask);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94790-2.c
> b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> new file mode 100755
> index 00000000000..d7b0eec5bef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi" } */
> +/* { dg-final { scan-assembler-not "andn\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-times "xorl\[ \\t\]" 2 } } */
> +
> +unsigned r1(unsigned a, unsigned b, unsigned mask) {
> + return a ^ ((a ^ b) & mask) + (a ^ b); }
> --
> 2.18.1
>
[-- Attachment #2: 0001-i386-Optimize-a-a-b-mask-to-mask-a-b-mask.patch --]
[-- Type: application/octet-stream, Size: 3829 bytes --]
From 19bf72443e49339846216fc07888e1244472b9b3 Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Wed, 12 Jan 2022 10:01:21 +0800
Subject: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b &
mask).
From the perspective of the pipeline, the `andn + and + ior` version takes
2 cycles (AND and ANDN do not depend on each other), but the xor + and + xor
version takes 3 cycles.
- xorl %edi, %esi
andl %edx, %esi
- movl %esi, %eax
- xorl %edi, %eax
+ andn %edi, %edx, %eax
+ orl %esi, %eax
gcc/ChangeLog:
PR target/94790
* config/i386/i386.md (*xor2andn): New define_insn_and_split.
gcc/testsuite/ChangeLog:
PR target/94790
* gcc.target/i386/pr94790-1.c: New test.
* gcc.target/i386/pr94790-2.c: Ditto.
---
gcc/config/i386/i386.md | 38 +++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr94790-1.c | 14 +++++++++
gcc/testsuite/gcc.target/i386/pr94790-2.c | 9 ++++++
3 files changed, 61 insertions(+)
create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-1.c
create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-2.c
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9b424a3935b..60c440fcfb1 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -10452,6 +10452,44 @@
(set_attr "znver1_decode" "double")
(set_attr "mode" "DI")])
+;; PR target/94790: Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask)
+(define_insn_and_split "*xor2andn"
+ [(set (match_operand:SWI248 0 "nonimmediate_operand")
+ (xor:SWI248
+ (and:SWI248
+ (xor:SWI248
+ (match_operand:SWI248 1 "nonimmediate_operand")
+ (match_operand:SWI248 2 "nonimmediate_operand"))
+ (match_operand:SWI248 3 "nonimmediate_operand"))
+ (match_dup 1)))
+ (clobber (reg:CC FLAGS_REG))]
+ "(TARGET_BMI || TARGET_AVX512BW)
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(parallel [(set (match_dup 4)
+ (and:SWI248
+ (not:SWI248
+ (match_dup 3))
+ (match_dup 1)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel [(set (match_dup 5)
+ (and:SWI248
+ (match_dup 2)
+ (match_dup 3)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel [(set (match_dup 0)
+ (ior:SWI248
+ (match_dup 4)
+ (match_dup 5)))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+ operands[3] = force_reg (<MODE>mode, operands[3]);
+ operands[4] = gen_reg_rtx (<MODE>mode);
+ operands[5] = gen_reg_rtx (<MODE>mode);
+})
+
;; See comment for addsi_1_zext why we do use nonimmediate_operand
(define_insn "*<code>si_1_zext"
[(set (match_operand:DI 0 "register_operand" "=r")
diff --git a/gcc/testsuite/gcc.target/i386/pr94790-1.c b/gcc/testsuite/gcc.target/i386/pr94790-1.c
new file mode 100755
index 00000000000..6ebbec15cfd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94790-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi" } */
+/* { dg-final { scan-assembler-times "andn\[ \\t\]" 2 } } */
+/* { dg-final { scan-assembler-not "xorl\[ \\t\]" } } */
+
+unsigned r1(unsigned a, unsigned b, unsigned mask)
+{
+ return a ^ ((a ^ b) & mask);
+}
+
+unsigned r2(unsigned a, unsigned b, unsigned mask)
+{
+ return (~mask & a) | (b & mask);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr94790-2.c b/gcc/testsuite/gcc.target/i386/pr94790-2.c
new file mode 100755
index 00000000000..d7b0eec5bef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94790-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi" } */
+/* { dg-final { scan-assembler-not "andn\[ \\t\]" } } */
+/* { dg-final { scan-assembler-times "xorl\[ \\t\]" 2 } } */
+
+unsigned r1(unsigned a, unsigned b, unsigned mask)
+{
+ return a ^ ((a ^ b) & mask) + (a ^ b);
+}
--
2.18.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
2022-01-13 1:53 ` Jiang, Haochen
@ 2022-01-13 7:44 ` Uros Bizjak
0 siblings, 0 replies; 5+ messages in thread
From: Uros Bizjak @ 2022-01-13 7:44 UTC (permalink / raw)
To: Jiang, Haochen; +Cc: gcc-patches, Liu, Hongtao
On Thu, Jan 13, 2022 at 2:53 AM Jiang, Haochen <haochen.jiang@intel.com> wrote:
>
> Hi Uros,
>
> Has fixed that format issue with this new patch. Ok for trunk?
The patch was already approved in my previous message, so no need to
re-approve it. I'm sure you are able to move one brace to a new
position without another review. ;)
Uros.
> Thx,
> Haochen
>
> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: Thursday, January 13, 2022 3:22 AM
> To: Jiang, Haochen <haochen.jiang@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>
> Subject: Re: [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask).
>
> On Wed, Jan 12, 2022 at 9:11 AM Haochen Jiang <haochen.jiang@intel.com> wrote:
> >
> > Hi all,
> >
> > This patch targets PR94790, which change the instruction selection under the following circumstance.
> >
> > Regtested on x86_64-pc-linux-gnu. Ok for trunk?
>
> Please also test with -m32, e.g.:
>
> make -j 12 -k check RUNTESTFLAGS="--target_board=unix\{,-m32\}"
>
> OK (with an it below), if new testcases do not FAIL with -m32.
>
> Thanks,
> Uros.
>
> >
> > BRs,
> > Haochen
> >
> > From the perspective of the pipeline, `andn + and + ior` version take
> > 2 cycles(AND and ANDN doesn't have dependence), but xor + and + xor
> > will take 3 cycles.
> >
> > - xorl %edi, %esi
> > andl %edx, %esi
> > - movl %esi, %eax
> > - xorl %edi, %eax
> > + andn %edi, %edx, %eax
> > + orl %esi, %eax
> >
> > gcc/ChangeLog:
> >
> > PR taeget/94790
> > * config/i386/i386.md (*xor2andn): New define_insn_and_split.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR taeget/94790
> > * gcc.target/i386/pr94790-1.c: New test.
> > * gcc.target/i386/pr94790-2.c: Ditto.
> > ---
> > gcc/config/i386/i386.md | 39 +++++++++++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr94790-1.c | 14 ++++++++
> > gcc/testsuite/gcc.target/i386/pr94790-2.c | 9 ++++++
> > 3 files changed, 62 insertions(+)
> > create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-1.c
> > create mode 100755 gcc/testsuite/gcc.target/i386/pr94790-2.c
> >
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
> > 9b424a3935b..38efc6d5837 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -10452,6 +10452,45 @@
> > (set_attr "znver1_decode" "double")
> > (set_attr "mode" "DI")])
> >
> > +;; PR target/94790: Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b
> > +& mask) (define_insn_and_split "*xor2andn"
> > + [(set (match_operand:SWI248 0 "nonimmediate_operand")
> > + (xor:SWI248
> > + (and:SWI248
> > + (xor:SWI248
> > + (match_operand:SWI248 1 "nonimmediate_operand")
> > + (match_operand:SWI248 2 "nonimmediate_operand"))
> > + (match_operand:SWI248 3 "nonimmediate_operand"))
> > + (match_dup 1)))
> > + (clobber (reg:CC FLAGS_REG))]
> > + "(TARGET_BMI || TARGET_AVX512BW)
> > + && ix86_pre_reload_split ()"
> > + "#"
> > + "&& 1"
> > + [(parallel [(set (match_dup 4)
> > + (and:SWI248
> > + (not:SWI248
> > + (match_dup 3))
> > + (match_dup 1)))
> > + (clobber (reg:CC FLAGS_REG))])
> > + (parallel [(set (match_dup 5)
> > + (and:SWI248
> > + (match_dup 2)
> > + (match_dup 3)))
> > + (clobber (reg:CC FLAGS_REG))])
> > + (parallel [(set (match_dup 0)
> > + (ior:SWI248
> > + (match_dup 4)
> > + (match_dup 5)))
> > + (clobber (reg:CC FLAGS_REG))])]
> > + {
> > + operands[1] = force_reg (<MODE>mode, operands[1]);
> > + operands[3] = force_reg (<MODE>mode, operands[3]);
> > + operands[4] = gen_reg_rtx (<MODE>mode);
> > + operands[5] = gen_reg_rtx (<MODE>mode);
> > + }
> > +)
>
> Please put brace just after the curved brace, see numerous examples in .md files.
>
> > +
> > ;; See comment for addsi_1_zext why we do use nonimmediate_operand
> > (define_insn "*<code>si_1_zext"
> > [(set (match_operand:DI 0 "register_operand" "=r") diff --git
> > a/gcc/testsuite/gcc.target/i386/pr94790-1.c
> > b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> > new file mode 100755
> > index 00000000000..6ebbec15cfd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr94790-1.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mbmi" } */
> > +/* { dg-final { scan-assembler-times "andn\[ \\t\]" 2 } } */
> > +/* { dg-final { scan-assembler-not "xorl\[ \\t\]" } } */
> > +
> > +unsigned r1(unsigned a, unsigned b, unsigned mask) {
> > + return a ^ ((a ^ b) & mask);
> > +}
> > +
> > +unsigned r2(unsigned a, unsigned b, unsigned mask) {
> > + return (~mask & a) | (b & mask);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr94790-2.c
> > b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> > new file mode 100755
> > index 00000000000..d7b0eec5bef
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr94790-2.c
> > @@ -0,0 +1,9 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mbmi" } */
> > +/* { dg-final { scan-assembler-not "andn\[ \\t\]" } } */
> > +/* { dg-final { scan-assembler-times "xorl\[ \\t\]" 2 } } */
> > +
> > +unsigned r1(unsigned a, unsigned b, unsigned mask) {
> > + return a ^ ((a ^ b) & mask) + (a ^ b); }
> > --
> > 2.18.1
> >
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2022-01-13 7:44 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-12 8:11 [PATCH] [i386] Optimize a ^ ((a ^ b) & mask) to (~mask & a) | (b & mask) Haochen Jiang
2022-01-12 19:22 ` Uros Bizjak
2022-01-13 1:21 ` Jiang, Haochen
2022-01-13 1:53 ` Jiang, Haochen
2022-01-13 7:44 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).