public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-3586] Enable more optimization for 32-bit/64-bit shrd/shld with imm shift count.
@ 2022-11-01 3:24 hongtao Liu
0 siblings, 0 replies; only message in thread
From: hongtao Liu @ 2022-11-01 3:24 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:5c5ef2f9ab545b680cd4bb6c540a9dadb12ead86
commit r13-3586-g5c5ef2f9ab545b680cd4bb6c540a9dadb12ead86
Author: liuhongt <hongtao.liu@intel.com>
Date: Thu Oct 27 18:48:41 2022 +0800
Enable more optimization for 32-bit/64-bit shrd/shld with imm shift count.
This patch doens't handle variable count since it require 5 insns to
be combined to get wanted pattern, but current pass_combine only
supports at most 4.
This patch doesn't handle 16-bit shrd/shld either.
gcc/ChangeLog:
PR target/55583
* config/i386/i386.md (*x86_64_shld_1): Rename to ..
(x86_64_shld_1): .. this.
(*x86_shld_1): Rename to ..
(x86_shld_1): .. this.
(*x86_64_shrd_1): Rename to ..
(x86_64_shrd_1): .. this.
(*x86_shrd_1): Rename to ..
(x86_shrd_1): .. this.
(*x86_64_shld_shrd_1_nozext): New pre_reload splitter.
(*x86_shld_shrd_1_nozext): Ditto.
(*x86_64_shrd_shld_1_nozext): Ditto.
(*x86_shrd_shld_1_nozext): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr55583.c: New test.
Diff:
---
gcc/config/i386/i386.md | 150 +++++++++++++++++++++++++++++++-
gcc/testsuite/gcc.target/i386/pr55583.c | 27 ++++++
2 files changed, 173 insertions(+), 4 deletions(-)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index f28c2a171d7..85567980aa3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12479,7 +12479,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
-(define_insn "*x86_64_shld_1"
+(define_insn "x86_64_shld_1"
[(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
(ior:DI (ashift:DI (match_dup 0)
(match_operand:QI 2 "const_0_to_63_operand"))
@@ -12500,6 +12500,42 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext"
+ [(set (match_operand:DI 0 "nonimmediate_operand")
+ (ior:DI (ashift:DI (match_operand:DI 4 "nonimmediate_operand")
+ (match_operand:QI 2 "const_0_to_63_operand"))
+ (lshiftrt:DI
+ (match_operand:DI 1 "nonimmediate_operand")
+ (match_operand:QI 3 "const_0_to_63_operand"))))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_64BIT
+ && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ if (rtx_equal_p (operands[4], operands[0]))
+ {
+ operands[1] = force_reg (DImode, operands[1]);
+ emit_insn (gen_x86_64_shld_1 (operands[0], operands[1], operands[2], operands[3]));
+ }
+ else if (rtx_equal_p (operands[1], operands[0]))
+ {
+ operands[4] = force_reg (DImode, operands[4]);
+ emit_insn (gen_x86_64_shrd_1 (operands[0], operands[4], operands[3], operands[2]));
+ }
+ else
+ {
+ operands[1] = force_reg (DImode, operands[1]);
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_move_insn (tmp, operands[4]);
+ emit_insn (gen_x86_64_shld_1 (tmp, operands[1], operands[2], operands[3]));
+ emit_move_insn (operands[0], tmp);
+ }
+ DONE;
+})
+
(define_insn_and_split "*x86_64_shld_2"
[(set (match_operand:DI 0 "nonimmediate_operand")
(ior:DI (ashift:DI (match_dup 0)
@@ -12543,7 +12579,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
-(define_insn "*x86_shld_1"
+(define_insn "x86_shld_1"
[(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
(ior:SI (ashift:SI (match_dup 0)
(match_operand:QI 2 "const_0_to_31_operand"))
@@ -12564,6 +12600,41 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
+(define_insn_and_split "*x86_shld_shrd_1_nozext"
+ [(set (match_operand:SI 0 "nonimmediate_operand")
+ (ior:SI (ashift:SI (match_operand:SI 4 "nonimmediate_operand")
+ (match_operand:QI 2 "const_0_to_31_operand"))
+ (lshiftrt:SI
+ (match_operand:SI 1 "nonimmediate_operand")
+ (match_operand:QI 3 "const_0_to_31_operand"))))
+ (clobber (reg:CC FLAGS_REG))]
+ "INTVAL (operands[3]) == 32 - INTVAL (operands[2])
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ if (rtx_equal_p (operands[4], operands[0]))
+ {
+ operands[1] = force_reg (SImode, operands[1]);
+ emit_insn (gen_x86_shld_1 (operands[0], operands[1], operands[2], operands[3]));
+ }
+ else if (rtx_equal_p (operands[1], operands[0]))
+ {
+ operands[4] = force_reg (SImode, operands[4]);
+ emit_insn (gen_x86_shrd_1 (operands[0], operands[4], operands[3], operands[2]));
+ }
+ else
+ {
+ operands[1] = force_reg (SImode, operands[1]);
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_move_insn (tmp, operands[4]);
+ emit_insn (gen_x86_shld_1 (tmp, operands[1], operands[2], operands[3]));
+ emit_move_insn (operands[0], tmp);
+ }
+ DONE;
+})
+
(define_insn_and_split "*x86_shld_2"
[(set (match_operand:SI 0 "nonimmediate_operand")
(ior:SI (ashift:SI (match_dup 0)
@@ -13442,7 +13513,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
-(define_insn "*x86_64_shrd_1"
+(define_insn "x86_64_shrd_1"
[(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
(ior:DI (lshiftrt:DI (match_dup 0)
(match_operand:QI 2 "const_0_to_63_operand"))
@@ -13463,6 +13534,42 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
+(define_insn_and_split "*x86_64_shrd_shld_1_nozext"
+ [(set (match_operand:DI 0 "nonimmediate_operand")
+ (ior:DI (lshiftrt:DI (match_operand:DI 4 "nonimmediate_operand")
+ (match_operand:QI 2 "const_0_to_63_operand"))
+ (ashift:DI
+ (match_operand:DI 1 "nonimmediate_operand")
+ (match_operand:QI 3 "const_0_to_63_operand"))))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_64BIT
+ && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ if (rtx_equal_p (operands[4], operands[0]))
+ {
+ operands[1] = force_reg (DImode, operands[1]);
+ emit_insn (gen_x86_64_shrd_1 (operands[0], operands[1], operands[2], operands[3]));
+ }
+ else if (rtx_equal_p (operands[1], operands[0]))
+ {
+ operands[4] = force_reg (DImode, operands[4]);
+ emit_insn (gen_x86_64_shld_1 (operands[0], operands[4], operands[3], operands[2]));
+ }
+ else
+ {
+ operands[1] = force_reg (DImode, operands[1]);
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_move_insn (tmp, operands[4]);
+ emit_insn (gen_x86_64_shrd_1 (tmp, operands[1], operands[2], operands[3]));
+ emit_move_insn (operands[0], tmp);
+ }
+ DONE;
+})
+
(define_insn_and_split "*x86_64_shrd_2"
[(set (match_operand:DI 0 "nonimmediate_operand")
(ior:DI (lshiftrt:DI (match_dup 0)
@@ -13506,7 +13613,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
-(define_insn "*x86_shrd_1"
+(define_insn "x86_shrd_1"
[(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
(ior:SI (lshiftrt:SI (match_dup 0)
(match_operand:QI 2 "const_0_to_31_operand"))
@@ -13527,6 +13634,41 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
+(define_insn_and_split "*x86_shrd_shld_1_nozext"
+ [(set (match_operand:SI 0 "nonimmediate_operand")
+ (ior:SI (lshiftrt:SI (match_operand:SI 4 "nonimmediate_operand")
+ (match_operand:QI 2 "const_0_to_31_operand"))
+ (ashift:SI
+ (match_operand:SI 1 "nonimmediate_operand")
+ (match_operand:QI 3 "const_0_to_31_operand"))))
+ (clobber (reg:CC FLAGS_REG))]
+ "INTVAL (operands[3]) == 32 - INTVAL (operands[2])
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ if (rtx_equal_p (operands[4], operands[0]))
+ {
+ operands[1] = force_reg (SImode, operands[1]);
+ emit_insn (gen_x86_shrd_1 (operands[0], operands[1], operands[2], operands[3]));
+ }
+ else if (rtx_equal_p (operands[1], operands[0]))
+ {
+ operands[4] = force_reg (SImode, operands[4]);
+ emit_insn (gen_x86_shld_1 (operands[0], operands[4], operands[3], operands[2]));
+ }
+ else
+ {
+ operands[1] = force_reg (SImode, operands[1]);
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_move_insn (tmp, operands[4]);
+ emit_insn (gen_x86_shrd_1 (tmp, operands[1], operands[2], operands[3]));
+ emit_move_insn (operands[0], tmp);
+ }
+ DONE;
+})
+
(define_insn_and_split "*x86_shrd_2"
[(set (match_operand:SI 0 "nonimmediate_operand")
(ior:SI (lshiftrt:SI (match_dup 0)
diff --git a/gcc/testsuite/gcc.target/i386/pr55583.c b/gcc/testsuite/gcc.target/i386/pr55583.c
new file mode 100644
index 00000000000..1c128b5d929
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr55583.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -Wno-shift-count-overflow" } */
+/* { dg-final { scan-assembler-times {(?n)shrd[ql]?[\t ]*\$2} 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times {(?n)shrdl?[\t ]*\$2} 2 { target ia32 } } } */
+/* { dg-final { scan-assembler-times {(?n)shldl?[\t ]*\$2} 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times {(?n)shld[ql]?[\t ]*\$2} 2 { target { ! ia32 } } } } */
+
+typedef unsigned long u64;
+typedef unsigned int u32;
+typedef unsigned short u16;
+
+long a, b;
+int c, d;
+short e, f;
+const int n = 2;
+
+void test64r () { b = ((u64)b >> n) | (a << (64 - n)); }
+void test32r () { d = ((u32)d >> n) | (c << (32 - n)); }
+
+unsigned long ua, ub;
+unsigned int uc, ud;
+unsigned short ue, uf;
+
+void testu64l () { ub = (ub << n) | (ua >> (64 - n)); }
+void testu64r () { ub = (ub >> n) | (ua << (64 - n)); }
+void testu32l () { ud = (ud << n) | (uc >> (32 - n)); }
+void testu32r () { ud = (ud >> n) | (uc << (32 - n)); }
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2022-11-01 3:24 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-01 3:24 [gcc r13-3586] Enable more optimization for 32-bit/64-bit shrd/shld with imm shift count hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).