public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.
@ 2023-12-05 13:59 Roger Sayle
  2023-12-07 14:47 ` Jeff Law
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Roger Sayle @ 2023-12-05 13:59 UTC (permalink / raw)
  To: gcc-patches; +Cc: 'Claudiu Zissulescu'

[-- Attachment #1: Type: text/plain, Size: 1892 bytes --]


This patch improves the code generated for bitfield sign extensions on
ARC cpus without a barrel shifter.


Compiling the following test case:

int foo(int x) { return (x<<27)>>27; }

with -O2 -mcpu=em, generates two loops:

foo:    mov     lp_count,27
        lp      2f
        add     r0,r0,r0
        nop
2:      # end single insn loop
        mov     lp_count,27
        lp      2f
        asr     r0,r0
        nop
2:      # end single insn loop
        j_s     [blink]


and the closely related test case:

struct S { int a : 5; };
int bar (struct S *p) { return p->a; }

generates the slightly better:

bar:    ldb_s   r0,[r0]
        mov_s   r2,0    ;3
        add3    r0,r2,r0
        sexb_s  r0,r0
        asr_s   r0,r0
        asr_s   r0,r0
        j_s.d   [blink]
        asr_s   r0,r0

which uses 6 instructions to perform this particular sign extension.
It turns out that sign extensions can always be implemented using at
most three instructions on ARC (without a barrel shifter) using the
idiom ((x&mask)^msb)-msb [as described in section "2-5 Sign Extension"
of Henry Warren's book "Hacker's Delight"].  Using this, the sign
extensions above on ARC's EM both become:

        bmsk_s  r0,r0,4
        xor     r0,r0,32
        sub     r0,r0,32

which takes about 3 cycles, compared to the ~112 cycles for the loops
in foo.
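
In C, the idiom amounts to something like the sketch below (sext_field is
just an illustrative name, not part of the patch; mask covers the low
WIDTH bits and msb is the field's sign bit):

static inline int
sext_field (unsigned int x, unsigned int width)
{
  unsigned int mask = (1u << width) - 1;   /* keep the low WIDTH bits */
  unsigned int msb = 1u << (width - 1);    /* sign bit of the field */
  return (int) (((x & mask) ^ msb) - msb); /* flip, then subtract, the sign bit */
}

foo above is effectively sext_field (x, 5).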


Tested with a cross-compiler to arc-linux hosted on x86_64,
with no new (compile-only) regressions from make -k check.
Ok for mainline if this passes Claudiu's nightly testing?


2023-12-05  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/arc/arc.md (*extvsi_n_0): New define_insn_and_split to
        implement SImode sign extract using an AND, XOR and MINUS sequence.

gcc/testsuite/ChangeLog
        * gcc.target/arc/extvsi-1.c: New test case.
        * gcc.target/arc/extvsi-2.c: Likewise.


Thanks in advance,
Roger
--


[-- Attachment #2: patchar.txt --]
[-- Type: text/plain, Size: 2463 bytes --]

diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index bf9f88eff047..5ebaf2e20ab0 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -6127,6 +6127,26 @@ archs4x, archs4xd"
   ""
   [(set_attr "length" "8")])
 
+(define_insn_and_split "*extvsi_n_0"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(sign_extract:SI (match_operand:SI 1 "register_operand" "0")
+			 (match_operand:QI 2 "const_int_operand")
+			 (const_int 0)))]
+  "!TARGET_BARREL_SHIFTER
+   && IN_RANGE (INTVAL (operands[2]), 2,
+		(optimize_insn_for_size_p () ? 28 : 30))"
+  "#"
+  "&& 1"
+[(set (match_dup 0) (and:SI (match_dup 0) (match_dup 3)))
+ (set (match_dup 0) (xor:SI (match_dup 0) (match_dup 4)))
+ (set (match_dup 0) (minus:SI (match_dup 0) (match_dup 4)))]
+{
+  int tmp = INTVAL (operands[2]);
+  operands[3] = GEN_INT (~(HOST_WIDE_INT_M1U << tmp));
+  operands[4] = GEN_INT (HOST_WIDE_INT_1U << tmp);
+}
+  [(set_attr "length" "14")])
+
 (define_insn_and_split "rotlsi3_cnt1"
   [(set (match_operand:SI 0 "dest_reg_operand"            "=r")
 	(rotate:SI (match_operand:SI 1 "register_operand" "r")
diff --git a/gcc/testsuite/gcc.target/arc/extvsi-1.c b/gcc/testsuite/gcc.target/arc/extvsi-1.c
new file mode 100644
index 000000000000..eb53c78b4e6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/extvsi-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=em" } */
+struct S { int a : 5; };
+
+int foo (struct S *p)
+{
+  return p->a;
+}
+
+/* { dg-final { scan-assembler "msk_s\\s+r0,r0,4" } } */
+/* { dg-final { scan-assembler "xor\\s+r0,r0,32" } } */
+/* { dg-final { scan-assembler "sub\\s+r0,r0,32" } } */
+/* { dg-final { scan-assembler-not "add3\\s+r0,r2,r0" } } */
+/* { dg-final { scan-assembler-not "sext_s\\s+r0,r0" } } */
+/* { dg-final { scan-assembler-not "asr_s\\s+r0,r0" } } */
diff --git a/gcc/testsuite/gcc.target/arc/extvsi-2.c b/gcc/testsuite/gcc.target/arc/extvsi-2.c
new file mode 100644
index 000000000000..a0c6894259d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/extvsi-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=em" } */
+
+int foo(int x)
+{
+  return (x<<27)>>27;
+}
+
+/* { dg-final { scan-assembler "msk_s\\s+r0,r0,4" } } */
+/* { dg-final { scan-assembler "xor\\s+r0,r0,32" } } */
+/* { dg-final { scan-assembler "sub\\s+r0,r0,32" } } */
+/* { dg-final { scan-assembler-not "lp\\s+2f" } } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.
  2023-12-05 13:59 [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717 Roger Sayle
@ 2023-12-07 14:47 ` Jeff Law
  2023-12-07 16:04   ` Roger Sayle
  2023-12-10 15:57 ` Jeff Law
  2023-12-13 10:28 ` Claudiu Zissulescu
  2 siblings, 1 reply; 6+ messages in thread
From: Jeff Law @ 2023-12-07 14:47 UTC (permalink / raw)
  To: Roger Sayle, gcc-patches; +Cc: 'Claudiu Zissulescu'



On 12/5/23 06:59, Roger Sayle wrote:
> This patch improves the code generated for bitfield sign extensions on
> ARC cpus without a barrel shifter.
> 
> 
> Compiling the following test case:
> 
> int foo(int x) { return (x<<27)>>27; }
> 
> with -O2 -mcpu=em, generates two loops:
> 
> foo:    mov     lp_count,27
>          lp      2f
>          add     r0,r0,r0
>          nop
> 2:      # end single insn loop
>          mov     lp_count,27
>          lp      2f
>          asr     r0,r0
>          nop
> 2:      # end single insn loop
>          j_s     [blink]
> 
> 
> and the closely related test case:
> 
> struct S { int a : 5; };
> int bar (struct S *p) { return p->a; }
> 
> generates the slightly better:
> 
> bar:    ldb_s   r0,[r0]
>          mov_s   r2,0    ;3
>          add3    r0,r2,r0
>          sexb_s  r0,r0
>          asr_s   r0,r0
>          asr_s   r0,r0
>          j_s.d   [blink]
>          asr_s   r0,r0
> 
> which uses 6 instructions to perform this particular sign extension.
> It turns out that sign extensions can always be implemented using at
> most three instructions on ARC (without a barrel shifter) using the
> idiom ((x&mask)^msb)-msb [as described in section "2-5 Sign Extension"
> of Henry Warren's book "Hacker's Delight"].  Using this, the sign
> extensions above on ARC's EM both become:
> 
>          bmsk_s  r0,r0,4
>          xor     r0,r0,32
>          sub     r0,r0,32
> 
> which takes about 3 cycles, compared to the ~112 cycles for the loops
> in foo.
> 
> 
> Tested with a cross-compiler to arc-linux hosted on x86_64,
> with no new (compile-only) regressions from make -k check.
> Ok for mainline if this passes Claudiu's nightly testing?
> 
> 
> 2023-12-05  Roger Sayle<roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
>          * config/arc/arc.md (*extvsi_n_0): New define_insn_and_split to
>          implement SImode sign extract using an AND, XOR and MINUS sequence.
> 
> gcc/testsuite/ChangeLog
>          * gcc.target/arc/extvsi-1.c: New test case.
>          * gcc.target/arc/extvsi-2.c: Likewise.
> 
> 
> Thanks in advance,
> Roger
> --
> 
> 
> patchar.txt
> 
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index bf9f88eff047..5ebaf2e20ab0 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -6127,6 +6127,26 @@ archs4x, archs4xd"
>     ""
>     [(set_attr "length" "8")])
>   
> +(define_insn_and_split "*extvsi_n_0"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +	(sign_extract:SI (match_operand:SI 1 "register_operand" "0")
> +			 (match_operand:QI 2 "const_int_operand")
> +			 (const_int 0)))]
> +  "!TARGET_BARREL_SHIFTER
> +   && IN_RANGE (INTVAL (operands[2]), 2,
> +		(optimize_insn_for_size_p () ? 28 : 30))"
> +  "#"
> +  "&& 1"
> +[(set (match_dup 0) (and:SI (match_dup 0) (match_dup 3)))
> + (set (match_dup 0) (xor:SI (match_dup 0) (match_dup 4)))
> + (set (match_dup 0) (minus:SI (match_dup 0) (match_dup 4)))]
> +{
> +  int tmp = INTVAL (operands[2]);
> +  operands[3] = GEN_INT (~(HOST_WIDE_INT_M1U << tmp));
> +  operands[4] = GEN_INT (HOST_WIDE_INT_1U << tmp);
Shouldn't operands[4] be GEN_INT ((HOST_WIDE_INT_1U << tmp) - 1)? 
Otherwise it's flipping the wrong bit AFAICT.

H8 can benefit from the same transformation which is how I found this 
little goof.  It's not as big a gain as ARC, but it does affect one of 
those builtin-overflow tests which tend to dominate testing time on the H8.

jeff

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.
  2023-12-07 14:47 ` Jeff Law
@ 2023-12-07 16:04   ` Roger Sayle
  2023-12-07 16:53     ` Jeff Law
  0 siblings, 1 reply; 6+ messages in thread
From: Roger Sayle @ 2023-12-07 16:04 UTC (permalink / raw)
  To: 'Jeff Law', gcc-patches; +Cc: 'Claudiu Zissulescu'

[-- Attachment #1: Type: text/plain, Size: 4782 bytes --]


Hi Jeff,
Doh!  Great catch.  The perils of not (yet) being able to actually
run any ARC execution tests myself.

> Shouldn't operands[4] be GEN_INT ((HOST_WIDE_INT_1U << tmp) - 1)?
Yes(-ish), operands[4] should be GEN_INT(HOST_WIDE_INT_1U << (tmp - 1)).

And the 32s in the test cases need to be 16s (the MSB of a five bit field is 16).
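
Spelling out the corrected constants for the 5-bit case (tmp == 5):

  operands[3] = ~(HOST_WIDE_INT_M1U << 5)   = 0x1f  (the field mask)
  operands[4] = HOST_WIDE_INT_1U << (5 - 1) = 0x10  (the field's sign bit)

so, for example, a field holding -1 (all five bits set) gives
((31 & 0x1f) ^ 16) - 16 = 15 - 16 = -1, as expected.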

You're probably also thinking the same thing that I am... that it might be possible
to implement this in the middle-end, but things are complicated by combine's
make_compound_operation/expand_compound_operation, and that
combine doesn't (normally) like turning two instructions into three.

Fingers-crossed the attached patch works better on the nightly testers.

Thanks in advance,
Roger
--

> -----Original Message-----
> From: Jeff Law <jeffreyalaw@gmail.com>
> Sent: 07 December 2023 14:47
> To: Roger Sayle <roger@nextmovesoftware.com>; gcc-patches@gcc.gnu.org
> Cc: 'Claudiu Zissulescu' <claziss@gmail.com>
> Subject: Re: [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.
> 
> On 12/5/23 06:59, Roger Sayle wrote:
> > This patch improves the code generated for bitfield sign extensions on
> > ARC cpus without a barrel shifter.
> >
> >
> > Compiling the following test case:
> >
> > int foo(int x) { return (x<<27)>>27; }
> >
> > with -O2 -mcpu=em, generates two loops:
> >
> > foo:    mov     lp_count,27
> >          lp      2f
> >          add     r0,r0,r0
> >          nop
> > 2:      # end single insn loop
> >          mov     lp_count,27
> >          lp      2f
> >          asr     r0,r0
> >          nop
> > 2:      # end single insn loop
> >          j_s     [blink]
> >
> >
> > and the closely related test case:
> >
> > struct S { int a : 5; };
> > int bar (struct S *p) { return p->a; }
> >
> > generates the slightly better:
> >
> > bar:    ldb_s   r0,[r0]
> >          mov_s   r2,0    ;3
> >          add3    r0,r2,r0
> >          sexb_s  r0,r0
> >          asr_s   r0,r0
> >          asr_s   r0,r0
> >          j_s.d   [blink]
> >          asr_s   r0,r0
> >
> > which uses 6 instructions to perform this particular sign extension.
> > It turns out that sign extensions can always be implemented using at
> > most three instructions on ARC (without a barrel shifter) using the
> > idiom ((x&mask)^msb)-msb [as described in section "2-5 Sign Extension"
> > of Henry Warren's book "Hacker's Delight"].  Using this, the sign
> > extensions above on ARC's EM both become:
> >
> >          bmsk_s  r0,r0,4
> >          xor     r0,r0,32
> >          sub     r0,r0,32
> >
> > which takes about 3 cycles, compared to the ~112 cycles for the loops
> > in foo.
> >
> >
> > Tested with a cross-compiler to arc-linux hosted on x86_64, with no
> > new (compile-only) regressions from make -k check.
> > Ok for mainline if this passes Claudiu's nightly testing?
> >
> >
> > 2023-12-05  Roger Sayle<roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >          * config/arc/arc.md (*extvsi_n_0): New define_insn_and_split to
> >          implement SImode sign extract using an AND, XOR and MINUS sequence.
> >
> > gcc/testsuite/ChangeLog
> >          * gcc.target/arc/extvsi-1.c: New test case.
> >          * gcc.target/arc/extvsi-2.c: Likewise.
> >
> >
> > Thanks in advance,
> > Roger
> > --
> >
> >
> > patchar.txt
> >
> > diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index
> > bf9f88eff047..5ebaf2e20ab0 100644
> > --- a/gcc/config/arc/arc.md
> > +++ b/gcc/config/arc/arc.md
> > @@ -6127,6 +6127,26 @@ archs4x, archs4xd"
> >     ""
> >     [(set_attr "length" "8")])
> >
> > +(define_insn_and_split "*extvsi_n_0"
> > +  [(set (match_operand:SI 0 "register_operand" "=r")
> > +	(sign_extract:SI (match_operand:SI 1 "register_operand" "0")
> > +			 (match_operand:QI 2 "const_int_operand")
> > +			 (const_int 0)))]
> > +  "!TARGET_BARREL_SHIFTER
> > +   && IN_RANGE (INTVAL (operands[2]), 2,
> > +		(optimize_insn_for_size_p () ? 28 : 30))"
> > +  "#"
> > +  "&& 1"
> > +[(set (match_dup 0) (and:SI (match_dup 0) (match_dup 3)))  (set
> > +(match_dup 0) (xor:SI (match_dup 0) (match_dup 4)))  (set (match_dup
> > +0) (minus:SI (match_dup 0) (match_dup 4)))] {
> > +  int tmp = INTVAL (operands[2]);
> > +  operands[3] = GEN_INT (~(HOST_WIDE_INT_M1U << tmp));
> > +  operands[4] = GEN_INT (HOST_WIDE_INT_1U << tmp);
> Shouldn't operands[4] be GEN_INT ((HOST_WIDE_INT_1U << tmp) - 1)?
> Otherwise it's flipping the wrong bit AFAICT.
> 
> H8 can benefit from the same transformation which is how I found this little goof.
> It's not as big a gain as ARC, but it does affect one of those builtin-overflow tests
> which tend to dominate testing time on the H8.
> 
> jeff

[-- Attachment #2: patcharb.txt --]
[-- Type: text/plain, Size: 2469 bytes --]

diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index bf9f88eff047..d980876eff8f 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -6127,6 +6127,26 @@ archs4x, archs4xd"
   ""
   [(set_attr "length" "8")])
 
+(define_insn_and_split "*extvsi_n_0"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(sign_extract:SI (match_operand:SI 1 "register_operand" "0")
+			 (match_operand:QI 2 "const_int_operand")
+			 (const_int 0)))]
+  "!TARGET_BARREL_SHIFTER
+   && IN_RANGE (INTVAL (operands[2]), 2,
+		(optimize_insn_for_size_p () ? 28 : 30))"
+  "#"
+  "&& 1"
+[(set (match_dup 0) (and:SI (match_dup 0) (match_dup 3)))
+ (set (match_dup 0) (xor:SI (match_dup 0) (match_dup 4)))
+ (set (match_dup 0) (minus:SI (match_dup 0) (match_dup 4)))]
+{
+  int tmp = INTVAL (operands[2]);
+  operands[3] = GEN_INT (~(HOST_WIDE_INT_M1U << tmp));
+  operands[4] = GEN_INT (HOST_WIDE_INT_1U << (tmp - 1));
+}
+  [(set_attr "length" "14")])
+
 (define_insn_and_split "rotlsi3_cnt1"
   [(set (match_operand:SI 0 "dest_reg_operand"            "=r")
 	(rotate:SI (match_operand:SI 1 "register_operand" "r")
diff --git a/gcc/testsuite/gcc.target/arc/extvsi-1.c b/gcc/testsuite/gcc.target/arc/extvsi-1.c
new file mode 100644
index 000000000000..5ac6feafae30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/extvsi-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=em" } */
+struct S { int a : 5; };
+
+int foo (struct S *p)
+{
+  return p->a;
+}
+
+/* { dg-final { scan-assembler "msk_s\\s+r0,r0,4" } } */
+/* { dg-final { scan-assembler "xor\\s+r0,r0,16" } } */
+/* { dg-final { scan-assembler "sub\\s+r0,r0,16" } } */
+/* { dg-final { scan-assembler-not "add3\\s+r0,r2,r0" } } */
+/* { dg-final { scan-assembler-not "sext_s\\s+r0,r0" } } */
+/* { dg-final { scan-assembler-not "asr_s\\s+r0,r0" } } */
diff --git a/gcc/testsuite/gcc.target/arc/extvsi-2.c b/gcc/testsuite/gcc.target/arc/extvsi-2.c
new file mode 100644
index 000000000000..953ea6a8b243
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/extvsi-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=em" } */
+
+int foo(int x)
+{
+  return (x<<27)>>27;
+}
+
+/* { dg-final { scan-assembler "msk_s\\s+r0,r0,4" } } */
+/* { dg-final { scan-assembler "xor\\s+r0,r0,16" } } */
+/* { dg-final { scan-assembler "sub\\s+r0,r0,16" } } */
+/* { dg-final { scan-assembler-not "lp\\s+2f" } } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.
  2023-12-07 16:04   ` Roger Sayle
@ 2023-12-07 16:53     ` Jeff Law
  0 siblings, 0 replies; 6+ messages in thread
From: Jeff Law @ 2023-12-07 16:53 UTC (permalink / raw)
  To: Roger Sayle, gcc-patches; +Cc: 'Claudiu Zissulescu'



On 12/7/23 09:04, Roger Sayle wrote:
> 
> Hi Jeff,
> Doh!  Great catch.  The perils of not (yet) being able to actually
> run any ARC execution tests myself.
ACK.


> 
>> Shouldn't operands[4] be GEN_INT ((HOST_WIDE_INT_1U << tmp) - 1)?
> Yes(-ish), operands[4] should be GEN_INT(HOST_WIDE_INT_1U << (tmp - 1)).
> 
> And the 32s in the test cases need to be 16s (the MSB of a five bit field is 16).
> 
> You're probably also thinking the same thing that I am... that it might be possible
> to implement this in the middle-end, but things are complicated by combine's
> make_compound_operation/expand_compound_operation, and that
> combine doesn't (normally) like turning two instructions into three.
Yea, I pondered, but didn't explore.   I was expecting problems around 
costing; make_compound_operation/expand_compound_operation didn't even 
cross my mind, but I can certainly see how they'd be problematic.


> 
> Fingers-crossed the attached patch works better on the nightly testers.
The H8 variant without the -1 failed in my tester, but passed once the 
-1 was added.  I've got one thing to re-review as the -1 changes a few 
things on the codegen side and I need to re-verify the profitability 
across the various H8 configurations.



jeff

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.
  2023-12-05 13:59 [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717 Roger Sayle
  2023-12-07 14:47 ` Jeff Law
@ 2023-12-10 15:57 ` Jeff Law
  2023-12-13 10:28 ` Claudiu Zissulescu
  2 siblings, 0 replies; 6+ messages in thread
From: Jeff Law @ 2023-12-10 15:57 UTC (permalink / raw)
  To: Roger Sayle, gcc-patches; +Cc: 'Claudiu Zissulescu'



On 12/5/23 06:59, Roger Sayle wrote:
> 
> This patch improves the code generated for bitfield sign extensions on
> ARC cpus without a barrel shifter.
> 
> 
> Compiling the following test case:
> 
> int foo(int x) { return (x<<27)>>27; }
> 
> with -O2 -mcpu=em, generates two loops:
> 
> foo:    mov     lp_count,27
>          lp      2f
>          add     r0,r0,r0
>          nop
> 2:      # end single insn loop
>          mov     lp_count,27
>          lp      2f
>          asr     r0,r0
>          nop
> 2:      # end single insn loop
>          j_s     [blink]
> 
> 
> and the closely related test case:
> 
> struct S { int a : 5; };
> int bar (struct S *p) { return p->a; }
> 
> generates the slightly better:
> 
> bar:    ldb_s   r0,[r0]
>          mov_s   r2,0    ;3
>          add3    r0,r2,r0
>          sexb_s  r0,r0
>          asr_s   r0,r0
>          asr_s   r0,r0
>          j_s.d   [blink]
>          asr_s   r0,r0
> 
> which uses 6 instructions to perform this particular sign extension.
> It turns out that sign extensions can always be implemented using at
> most three instructions on ARC (without a barrel shifter) using the
> idiom ((x&mask)^msb)-msb [as described in section "2-5 Sign Extension"
> of Henry Warren's book "Hacker's Delight"].  Using this, the sign
> extensions above on ARC's EM both become:
> 
>          bmsk_s  r0,r0,4
>          xor     r0,r0,32
>          sub     r0,r0,32
> 
> which takes about 3 cycles, compared to the ~112 cycles for the loops
> in foo.
> 
> 
> Tested with a cross-compiler to arc-linux hosted on x86_64,
> with no new (compile-only) regressions from make -k check.
> Ok for mainline if this passes Claudiu's nightly testing?
> 
> 
> 2023-12-05  Roger Sayle  <roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
>          * config/arc/arc.md (*extvsi_n_0): New define_insn_and_split to
>          implement SImode sign extract using an AND, XOR and MINUS sequence.
Note with this sequence in place (assuming it moves forward on the ARC), 
you may be able to build a better generalized signed bitfield extraction.

Rather than a shift-left followed by an arithmetic shift right, you can 
instead do a logical shift right to get the field into the LSBs, then 
use this pattern to implement the sign extension from the MSB of the 
field.
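
Roughly, in C (a sketch only; extract_signed_field, pos and width are
illustrative names, not anything in the tree):

static inline int
extract_signed_field (unsigned int x, unsigned int pos, unsigned int width)
{
  /* One logical shift right brings the field into the LSBs ...  */
  unsigned int field = (x >> pos) & ((1u << width) - 1);
  /* ... then the masking idiom sign-extends from the field's MSB.  */
  unsigned int msb = 1u << (width - 1);
  return (int) ((field ^ msb) - msb);
}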

Given it saves a potentially very expensive shift, it may be worth 
exploring for the ARC.

I've done this for the H8.  Not every sequence is better, but many are. 
There are improvements that could be made, but we're probably capturing 
the vast majority of the benefit in the patch I'm currently testing.

Anyway just thought I'd point out the natural follow-on from this effort.

Jeff

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.
  2023-12-05 13:59 [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717 Roger Sayle
  2023-12-07 14:47 ` Jeff Law
  2023-12-10 15:57 ` Jeff Law
@ 2023-12-13 10:28 ` Claudiu Zissulescu
  2 siblings, 0 replies; 6+ messages in thread
From: Claudiu Zissulescu @ 2023-12-13 10:28 UTC (permalink / raw)
  To: Roger Sayle, gcc-patches; +Cc: 'Claudiu Zissulescu'

Hi Roger,

It looks good to me.

Thank you for your contribution,
Claudiu

-----Original Message-----
From: Roger Sayle <roger@nextmovesoftware.com> 
Sent: Tuesday, December 5, 2023 4:00 PM
To: gcc-patches@gcc.gnu.org
Cc: 'Claudiu Zissulescu' <claziss@gmail.com>
Subject: [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717.


This patch improves the code generated for bitfield sign extensions on ARC cpus without a barrel shifter.


Compiling the following test case:

int foo(int x) { return (x<<27)>>27; }

with -O2 -mcpu=em, generates two loops:

foo:    mov     lp_count,27
        lp      2f
        add     r0,r0,r0
        nop
2:      # end single insn loop
        mov     lp_count,27
        lp      2f
        asr     r0,r0
        nop
2:      # end single insn loop
        j_s     [blink]


and the closely related test case:

struct S { int a : 5; };
int bar (struct S *p) { return p->a; }

generates the slightly better:

bar:    ldb_s   r0,[r0]
        mov_s   r2,0    ;3
        add3    r0,r2,r0
        sexb_s  r0,r0
        asr_s   r0,r0
        asr_s   r0,r0
        j_s.d   [blink]
        asr_s   r0,r0

which uses 6 instructions to perform this particular sign extension.
It turns out that sign extensions can always be implemented using at most three instructions on ARC (without a barrel shifter) using the idiom ((x&mask)^msb)-msb [as described in section "2-5 Sign Extension"
of Henry Warren's book "Hacker's Delight"].  Using this, the sign extensions above on ARC's EM both become:

        bmsk_s  r0,r0,4
        xor     r0,r0,32
        sub     r0,r0,32

which takes about 3 cycles, compared to the ~112 cycles for the loops in foo.


Tested with a cross-compiler to arc-linux hosted on x86_64, with no new (compile-only) regressions from make -k check.
Ok for mainline if this passes Claudiu's nightly testing?


2023-12-05  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/arc/arc.md (*extvsi_n_0): New define_insn_and_split to
        implement SImode sign extract using an AND, XOR and MINUS sequence.

gcc/testsuite/ChangeLog
        * gcc.target/arc/extvsi-1.c: New test case.
        * gcc.target/arc/extvsi-2.c: Likewise.


Thanks in advance,
Roger
--


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-12-13 10:28 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-05 13:59 [ARC PATCH] Add *extvsi_n_0 define_insn_and_split for PR 110717 Roger Sayle
2023-12-07 14:47 ` Jeff Law
2023-12-07 16:04   ` Roger Sayle
2023-12-07 16:53     ` Jeff Law
2023-12-10 15:57 ` Jeff Law
2023-12-13 10:28 ` Claudiu Zissulescu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).