[Patch/rtl-expand] Take tree range info into account to improve LSHIFT

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
@ 2015-04-16 11:04 Jiong Wang
  2015-04-24  2:23 ` Jeff Law
  0 siblings, 1 reply; 12+ messages in thread
From: Jiong Wang @ 2015-04-16 11:04 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2508 bytes --]


This is a rework of

  https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01998.html

After second thinking, I feel it's better to fix this in earlier stage
during RTL expand which is more generic, and we also avoid making the
already complex combine pass complexer.

Currently gcc expand wide mode left shift to some generic complex
instruction sequences, while if we have known the high part of wide mode
all comes from sign extension, the expand logic could be simplifed.

Given the following example,

T A = (T) B  << const_imm_shift

We know the high part of A are all comes from sign extension, if

* T is the next wider type of word_mode.

For example, for aarch64, if type T is 128int (TImode), and B is with
type SImode or DImode, then tree analyzer know that the high part of
TImode result all comes from sign extension, and kept them in range info.

 |<           T          >|
 |   high     |   low     |
              |<- sizel ->|

For above example, we could simplify the expand logic into
 1. low = low << const_imm_shift;
 2. high = low >> (sizel - const_imm_shift)  */

We can utilize the arithmetic right shift to do the sign
extension. Those reduntant instructions will be optimized out later.

For actual .s improvement,

AArch64
=======

  __int128_t
  foo (int data)
  {
    return (__int128_t) data << 50;
  }

  old:
    sxtw    x2, w0
    asr     x1, x2, 63
    lsl     x0, x2, 50
    lsl     x1, x1, 50
    orr     x1, x1, x2, lsr 14
 
  new:
    sxtw    x1, w0
    lsl     x0, x1, 50
    asr     x1, x1, 14


ARM (.fpu softvfp)
===========

  long long
  shift (int data)
  {
    return (long long) data << 20;
  }
 
  old:
    stmfd   sp!, {r4, r5}
    mov     r5, r0, asr #31
    mov     r3, r0
    mov     r0, r0, asl #20
    mov     r1, r5, asl #20
    orr     r1, r1, r3, lsr #12
    ldmfd   sp!, {r4, r5}
    bx      lr

  new:
    mov     r1, r0
    mov     r0, r0, asl #20
    mov     r1, r1, asr #12
    bx      lr

Test
====

  x86 bootstrap OK, regression test OK.
  AArch64 bootstrap OK, regression test on board OK.

Regards,
Jiong

2015-04-116  Jiong.Wang  <jiong.wang@arm.com>

gcc/
  * expr.c (expand_expr_real_2): Take tree range info into account when
  expanding LSHIFT_EXPR.

gcc/testsuite
  * gcc.dg/wide_shift_64_1.c: New testcase.
  * gcc.dg/wide_shift_128_1.c: Ditto.
  * gcc.target/aarch64/ashlti3_1.c: Ditto.
  * gcc.target/arm/ashldisi_1.c: Ditto.
  

[-- Attachment #2: range-expand.patch --]
[-- Type: text/x-diff, Size: 7360 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index 89ca129..96d64cc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -8984,23 +8984,85 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
 
     case LSHIFT_EXPR:
     case RSHIFT_EXPR:
-      /* If this is a fixed-point operation, then we cannot use the code
-	 below because "expand_shift" doesn't support sat/no-sat fixed-point
-         shifts.   */
-      if (ALL_FIXED_POINT_MODE_P (mode))
-	goto binop;
-
-      if (! safe_from_p (subtarget, treeop1, 1))
-	subtarget = 0;
-      if (modifier == EXPAND_STACK_PARM)
-	target = 0;
-      op0 = expand_expr (treeop0, subtarget,
-			 VOIDmode, EXPAND_NORMAL);
-      temp = expand_variable_shift (code, mode, op0, treeop1, target,
-				    unsignedp);
-      if (code == LSHIFT_EXPR)
-	temp = REDUCE_BIT_FIELD (temp);
-      return temp;
+      {
+	/* If this is a fixed-point operation, then we cannot use the code
+	   below because "expand_shift" doesn't support sat/no-sat fixed-point
+	   shifts.  */
+	if (ALL_FIXED_POINT_MODE_P (mode))
+	  goto binop;
+
+	if (! safe_from_p (subtarget, treeop1, 1))
+	  subtarget = 0;
+	if (modifier == EXPAND_STACK_PARM)
+	  target = 0;
+
+	op0 = expand_expr (treeop0, subtarget,
+			   VOIDmode, EXPAND_NORMAL);
+
+	/* If mode == GET_MODE_WIDER_MODE (word_mode),
+	   then normally, there will no native instructions to support
+	   this wide mode left shift.
+
+	   given below example,
+
+	   T A = (T) B  << C
+
+	   |<		T	   >|
+	   |   high     |   low     |
+
+			|<- sizel ->|
+
+	   if from range info, we could deduce that the high part are all sign
+	   bit extension, then this left shift operation could be largely
+	   simplified into.
+
+	     1. low = low << C;
+	     2. high = low >> (sizel - C)  */
+
+	int o_bits = GET_MODE_SIZE (mode) * BITS_PER_UNIT;
+	wide_int min, max;
+
+	if (code == LSHIFT_EXPR
+	    && !unsignedp
+	    && mode == GET_MODE_WIDER_MODE (word_mode)
+	    && !have_insn_for (LSHIFT_EXPR, mode)
+	    && TREE_CONSTANT (treeop1)
+	    && get_range_info (treeop0, &min, &max) == VR_RANGE
+	    && (wi::cmp (min,
+			 wide_int::from (wi::min_value
+					 ((unsigned) (BITS_PER_WORD),
+					  SIGNED), o_bits, SIGNED),
+			 SIGNED) != -1)
+	    && (wi::cmp (max,
+			 wide_int::from (wi::max_value
+					 ((unsigned)(BITS_PER_WORD),
+					  SIGNED), o_bits, SIGNED),
+			 SIGNED) != 1))
+	  {
+	    rtx low = simplify_gen_subreg (word_mode, op0, mode, 0);
+	    rtx t_low = simplify_gen_subreg (word_mode, target, mode, 0);
+	    rtx t_high = simplify_gen_subreg (word_mode, target, mode,
+					      UNITS_PER_WORD);
+	    tree high_shift =
+	      build_int_cst (TREE_TYPE (treeop1),
+			     BITS_PER_WORD -TREE_INT_CST_LOW (treeop1));
+
+	    temp = expand_variable_shift (code, word_mode, low, treeop1,
+					  t_low, unsignedp);
+
+	    temp = expand_variable_shift (RSHIFT_EXPR, word_mode, low,
+					  high_shift, t_high, unsignedp);
+
+	    gcc_assert (GET_CODE (temp) == SUBREG);
+	    temp = XEXP (temp, 0);
+	  }
+	else
+	  temp = expand_variable_shift (code, mode, op0, treeop1, target,
+					unsignedp);
+	if (code == LSHIFT_EXPR)
+	  temp = REDUCE_BIT_FIELD (temp);
+	return temp;
+      }
 
       /* Could determine the answer when only additive constants differ.  Also,
 	 the addition of one can be handled by changing the condition.  */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-128.c b/gcc/testsuite/gcc.dg/wide-shift-128.c
new file mode 100644
index 0000000..9b62715
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-128.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target aarch64*-*-* mips64*-*-* sparc64*-*-* } } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+__int128_t
+load2 (int data)
+{
+    return (__int128_t) data << 50;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-64.c b/gcc/testsuite/gcc.dg/wide-shift-64.c
new file mode 100644
index 0000000..5bc278f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-64.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target arm*-*-* mips*-*-* sparc*-*-* } } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+long long
+load1 (int data)
+{
+    return (long long) data << 12;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ashltidisi.c b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
new file mode 100644
index 0000000..aeb2a24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x, y, z)\
+__uint128_t __attribute__ ((noinline))\
+ushift_##x##_##z (unsigned y data)\
+{\
+  return (__uint128_t) data << x;\
+}\
+__int128_t __attribute__ ((noinline)) \
+shift_##x##_##z (y data) \
+{\
+  return (__int128_t) data << x;\
+}
+
+GEN_TEST_CASE (53, int, i)
+GEN_TEST_CASE (3, long long, ll)
+GEN_TEST_CASE (13, long long, ll)
+GEN_TEST_CASE (53, long long, ll)
+
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y, z, p) \
+	if (ushift_##y##_##p (x)\
+	    != ((__uint128_t) (unsigned z) x << y)) \
+	  abort ();\
+	if (shift_##y##_##p (x)\
+	    != ((__uint128_t) (signed z) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 53, int, i)
+  SHIFT_CHECK (0xcafecafe, 53, int, i)
+
+  SHIFT_CHECK (0x1234567890abcdefLL, 3, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 13, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 53, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 3, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 13, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 53, long long, ll)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/ashldisi.c b/gcc/testsuite/gcc.target/arm/ashldisi.c
new file mode 100644
index 0000000..00dc06e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/ashldisi.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x)\
+unsigned long long __attribute__ ((noinline))\
+ushift_ ## x (unsigned int data)\
+{\
+  return (unsigned long long) data << x;\
+}\
+long long __attribute__ ((noinline)) \
+shift_ ## x (int data) \
+{\
+  return (long long) data << x;\
+}
+
+GEN_TEST_CASE (3)
+GEN_TEST_CASE (23)
+GEN_TEST_CASE (30)
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y) \
+	if (ushift_ ## y (x)\
+	    != ((unsigned long long) (unsigned) x << y)) \
+	  abort (); \
+	if (shift_ ## y (x)\
+	    != ((long long) (signed) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 3)
+  SHIFT_CHECK (0xcafecafe, 3)
+  SHIFT_CHECK (0x12345678, 23)
+  SHIFT_CHECK (0xcafecafe, 23)
+  SHIFT_CHECK (0x12345678, 30)
+  SHIFT_CHECK (0xcafecafe, 30)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 3 } } */
+/* { dg-final { cleanup-saved-temps } } */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-04-16 11:04 [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding Jiong Wang
@ 2015-04-24  2:23 ` Jeff Law
  2015-04-27 20:58   ` Jiong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Law @ 2015-04-24  2:23 UTC (permalink / raw)
  To: Jiong Wang, gcc-patches

On 04/16/2015 05:04 AM, Jiong Wang wrote:
>
> This is a rework of
>
>    https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01998.html
>
> After second thinking, I feel it's better to fix this in earlier stage
> during RTL expand which is more generic, and we also avoid making the
> already complex combine pass complexer.
>
> Currently gcc expand wide mode left shift to some generic complex
> instruction sequences, while if we have known the high part of wide mode
> all comes from sign extension, the expand logic could be simplifed.
>
> Given the following example,
>
> T A = (T) B  << const_imm_shift
>
> We know the high part of A are all comes from sign extension, if
>
> * T is the next wider type of word_mode.
>
> For example, for aarch64, if type T is 128int (TImode), and B is with
> type SImode or DImode, then tree analyzer know that the high part of
> TImode result all comes from sign extension, and kept them in range info.
>
>   |<           T          >|
>   |   high     |   low     |
>                |<- sizel ->|
>
> For above example, we could simplify the expand logic into
>   1. low = low << const_imm_shift;
>   2. high = low >> (sizel - const_imm_shift)  */
>
> We can utilize the arithmetic right shift to do the sign
> extension. Those reduntant instructions will be optimized out later.
>
> For actual .s improvement,
>
> AArch64
> =======
>
>    __int128_t
>    foo (int data)
>    {
>      return (__int128_t) data << 50;
>    }
>
>    old:
>      sxtw    x2, w0
>      asr     x1, x2, 63
>      lsl     x0, x2, 50
>      lsl     x1, x1, 50
>      orr     x1, x1, x2, lsr 14
>
>    new:
>      sxtw    x1, w0
>      lsl     x0, x1, 50
>      asr     x1, x1, 14
>
>
> ARM (.fpu softvfp)
> ===========
>
>    long long
>    shift (int data)
>    {
>      return (long long) data << 20;
>    }
>
>    old:
>      stmfd   sp!, {r4, r5}
>      mov     r5, r0, asr #31
>      mov     r3, r0
>      mov     r0, r0, asl #20
>      mov     r1, r5, asl #20
>      orr     r1, r1, r3, lsr #12
>      ldmfd   sp!, {r4, r5}
>      bx      lr
>
>    new:
>      mov     r1, r0
>      mov     r0, r0, asl #20
>      mov     r1, r1, asr #12
>      bx      lr
>
> Test
> ====
>
>    x86 bootstrap OK, regression test OK.
>    AArch64 bootstrap OK, regression test on board OK.
>
> Regards,
> Jiong
>
> 2015-04-116  Jiong.Wang  <jiong.wang@arm.com>
>
> gcc/
>    * expr.c (expand_expr_real_2): Take tree range info into account when
>    expanding LSHIFT_EXPR.
>
> gcc/testsuite
>    * gcc.dg/wide_shift_64_1.c: New testcase.
>    * gcc.dg/wide_shift_128_1.c: Ditto.
>    * gcc.target/aarch64/ashlti3_1.c: Ditto.
>    * gcc.target/arm/ashldisi_1.c: Ditto.
Funny, I find myself wanting this transformation in both places :-) 
Expansion time so that we generate efficient code from the start and 
combine to catch those cases which are too complex to see at expansion, 
but due to other optimizations become visible in the combiner.

Sadly, it's been fairly common practice for targets to define 
double-word shift patterns which catch a variety of special cases. 
Ports will have to choose between using those patterns and exploiting 
your work.   I'd be tempted to generate a double-word shift by the given 
constant and check its cost vs the single word shifts.

What happens if there's an overlap between t_low and low?  Won't the 
lshift clobber low and thus we get the right value for the rshift in 
that case?

Note that expand_variable_shift may not honor your request for putting 
the result in the TARGET target parameter you pass in.

Thus:

   temp = expand_variable_shift (...)
   temp = expand_variable_shift (...)

Can't be right.    You probably need something like

   temp = expand_variable_shift (...)
   if (temp != t_low)
     emit_move_insn (t_low, temp);
   temp = expand_variable_shift (...)
   if (temp != t_high)
     emit_move_insn (t_high, temp);
   return target;


So I generally like where you're going with this, but I have concerns 
about its correctness, particularly in cases where there's an overlap or 
when expand_variable_shift returns its value in something other than the 
passed in target.

Jeff

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-04-24  2:23 ` Jeff Law
@ 2015-04-27 20:58   ` Jiong Wang
  2015-04-29  3:53     ` Jeff Law
  0 siblings, 1 reply; 12+ messages in thread
From: Jiong Wang @ 2015-04-27 20:58 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches


Jeff Law writes:

> On 04/16/2015 05:04 AM, Jiong Wang wrote:
>>
>> This is a rework of
>>
>>    https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01998.html
>>
>> After second thinking, I feel it's better to fix this in earlier stage
>> during RTL expand which is more generic, and we also avoid making the
>> already complex combine pass complexer.
>>
>> Currently gcc expand wide mode left shift to some generic complex
>> instruction sequences, while if we have known the high part of wide mode
>> all comes from sign extension, the expand logic could be simplifed.
>>
>> Given the following example,
>>
>> T A = (T) B  << const_imm_shift
>>
>> We know the high part of A are all comes from sign extension, if
>>
>> * T is the next wider type of word_mode.
>>
>> For example, for aarch64, if type T is 128int (TImode), and B is with
>> type SImode or DImode, then tree analyzer know that the high part of
>> TImode result all comes from sign extension, and kept them in range info.
>>
>>   |<           T          >|
>>   |   high     |   low     |
>>                |<- sizel ->|
>>
>> For above example, we could simplify the expand logic into
>>   1. low = low << const_imm_shift;
>>   2. high = low >> (sizel - const_imm_shift)  */
>>
>> We can utilize the arithmetic right shift to do the sign
>> extension. Those reduntant instructions will be optimized out later.
>>
>> For actual .s improvement,
>>
>> AArch64
>> =======
>>
>>    __int128_t
>>    foo (int data)
>>    {
>>      return (__int128_t) data << 50;
>>    }
>>
>>    old:
>>      sxtw    x2, w0
>>      asr     x1, x2, 63
>>      lsl     x0, x2, 50
>>      lsl     x1, x1, 50
>>      orr     x1, x1, x2, lsr 14
>>
>>    new:
>>      sxtw    x1, w0
>>      lsl     x0, x1, 50
>>      asr     x1, x1, 14
>>
>>
>> ARM (.fpu softvfp)
>> ===========
>>
>>    long long
>>    shift (int data)
>>    {
>>      return (long long) data << 20;
>>    }
>>
>>    old:
>>      stmfd   sp!, {r4, r5}
>>      mov     r5, r0, asr #31
>>      mov     r3, r0
>>      mov     r0, r0, asl #20
>>      mov     r1, r5, asl #20
>>      orr     r1, r1, r3, lsr #12
>>      ldmfd   sp!, {r4, r5}
>>      bx      lr
>>
>>    new:
>>      mov     r1, r0
>>      mov     r0, r0, asl #20
>>      mov     r1, r1, asr #12
>>      bx      lr
>>
>> Test
>> ====
>>
>>    x86 bootstrap OK, regression test OK.
>>    AArch64 bootstrap OK, regression test on board OK.
>>
>> Regards,
>> Jiong
>>
>> 2015-04-116  Jiong.Wang  <jiong.wang@arm.com>
>>
>> gcc/
>>    * expr.c (expand_expr_real_2): Take tree range info into account when
>>    expanding LSHIFT_EXPR.
>>
>> gcc/testsuite
>>    * gcc.dg/wide_shift_64_1.c: New testcase.
>>    * gcc.dg/wide_shift_128_1.c: Ditto.
>>    * gcc.target/aarch64/ashlti3_1.c: Ditto.
>>    * gcc.target/arm/ashldisi_1.c: Ditto.
> Funny, I find myself wanting this transformation in both places :-) 
> Expansion time so that we generate efficient code from the start and 
> combine to catch those cases which are too complex to see at expansion, 
> but due to other optimizations become visible in the combiner.
>
> Sadly, it's been fairly common practice for targets to define 
> double-word shift patterns which catch a variety of special cases. 
> Ports will have to choose between using those patterns and exploiting 
> your work.   I'd be tempted to generate a double-word shift by the given 
> constant and check its cost vs the single word shifts.
>
> What happens if there's an overlap between t_low and low?  Won't the 
> lshift clobber low and thus we get the right value for the rshift in 
> that case?

Jeff,

  Sorry, I can't understand the meaning of "overlap between t_low and low",
  assume "right" in "right value" means the opposite of "left" not
  "correct".

  So what you mean is t_low and low share the same pseudo regiser?

  or you mean if we are shifting a value across the word boundary? like the following.
 
   |<      double word      >|
   |   t_high  |   t_low     |
          |<- low ->|
         
  for above situation, the simplified two instruction sequence do works.
  "t_low = low << const_imm_shift ; t_high = low >> (sizel - const_imm_shift)"

  I attached the expand result for a simple testcase below. I appreicate
  if you could comment on the RTL. 

  Thanks.
  
  __int128_t
  foo (int data)
  {
    return (__int128_t) data << 50;
  }

  foo.c.188t.optimized
  ===
  foo (int data)
  {
    __int128 _2;
    __int128 _3;

  <bb 2>:
    _2 = (__int128) data_1(D);
    _3 = _2 << 50;
    return _3;
  }

  foo.c.189r.expand
  ===
  
  (insn 2 4 3 2 (set (reg/v:SI 76 [ data ])
    (reg:SI 0 x0 [ data ])) foo.c:3 -1
      (nil))
  (insn 6 3 7 2 (set (reg:DI 79)
    (sign_extend:DI (reg/v:SI 76 [ data ]))) foo.c:4 -1
      (nil))
  (insn 7 6 8 2 (set (subreg:DI (reg:TI 78 [ D.2677 ]) 0)
    (reg:DI 79)) foo.c:4 -1
      (nil))
  (insn 8 7 9 2 (set (reg:DI 80)
    (ashiftrt:DI (reg:DI 79) (const_int 63 [0x3f]))) foo.c:4 -1
      (nil))
  (insn 9 8 10 2 (set (subreg:DI (reg:TI 78 [ D.2677 ]) 8)
    (reg:DI 80)) foo.c:4 -1
      (nil))
      
  ^
   ~~~~~~~~~ sign extend SImode "data" into TImode "_2" (r78)
                    
  (insn 10 9 11 2 (set (subreg:DI (reg:TI 77 [D.2677 ]) 0)
    (ashift:DI (subreg:DI (reg:TI 78 [ D.2677 ]) 0)
               (const_int 50 [0x32]))) foo.c:4 -1
      (nil))

  ^
   ~~~~~~~~~~ t_low = low << const_imm_shift, target be r77
  
  (insn 11 10 12 2 (set (subreg:DI (reg:TI 77 [ D.2677 ]) 8)
    (ashiftrt:DI (subreg:DI (reg:TI 78 [ D.2677 ]) 0)
                 (const_int 14 [0xe]))) foo.c:4 -1
      (nil))

  ^
   ~~~~~~~~~~ t_high = low >> (sizel - const_imm_shift)
  
  (insn 12 11 16 2 (set (reg:TI 75 [ <retval> ])
    (reg:TI 77 [ D.2677 ])) foo.c:4 -1
      (nil))
  (insn 16 12 17 2 (set (reg/i:TI 0 x0)
    (reg:TI 75 [ <retval> ])) foo.c:5 -1
      (nil))

> Note that expand_variable_shift may not honor your request for putting 
> the result in the TARGET target parameter you pass in.

Thanks, agree, it's better to add those extra move.

I noticed the comments at the start of the function:

  "Store the result in the rtx TARGET, if that is convenient."

Although I still don't understand in which case it's inconveninent.

-- 
Regards,
Jiong

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-04-27 20:58   ` Jiong Wang
@ 2015-04-29  3:53     ` Jeff Law
  2015-04-29 22:14       ` Jiong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Law @ 2015-04-29  3:53 UTC (permalink / raw)
  To: Jiong Wang; +Cc: gcc-patches

On 04/27/2015 02:21 PM, Jiong Wang wrote:

>> Funny, I find myself wanting this transformation in both places :-)
>> Expansion time so that we generate efficient code from the start and
>> combine to catch those cases which are too complex to see at expansion,
>> but due to other optimizations become visible in the combiner.
>>
>> Sadly, it's been fairly common practice for targets to define
>> double-word shift patterns which catch a variety of special cases.
>> Ports will have to choose between using those patterns and exploiting
>> your work.   I'd be tempted to generate a double-word shift by the given
>> constant and check its cost vs the single word shifts.
>>
>> What happens if there's an overlap between t_low and low?  Won't the
>> lshift clobber low and thus we get the right value for the rshift in
>> that case?
>
> Jeff,
>
>    Sorry, I can't understand the meaning of "overlap between t_low and low",
>    assume "right" in "right value" means the opposite of "left" not
>    "correct".
>
>    So what you mean is t_low and low share the same pseudo regiser?
My concern is sharing the same pseudo or memory location.  But thinking 
more about it, the shifted value has to have range information, so it 
must have been an SSA_NAME, right?  If so, then it can't overlap with 
the destination, so this is a non-issue.  Sorry for the confusion.

>
>> Note that expand_variable_shift may not honor your request for putting
>> the result in the TARGET target parameter you pass in.
>
> Thanks, agree, it's better to add those extra move.
>
> I noticed the comments at the start of the function:
>
>    "Store the result in the rtx TARGET, if that is convenient."
>
> Although I still don't understand in which case it's inconveninent.
I've never liked the model of storing into TARGET when it's convenient. 
  Because storing into TARGET is totally optional, it means the callers 
have to check if the value was stored into TARGET or not.

Sadly that model has been in the expanders as long as I can remember.

So I think this can go forward once we resolve the case where 
expand_variable_shift returns its value in something other than the 
passed in target.

Jeff

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-04-29  3:53     ` Jeff Law
@ 2015-04-29 22:14       ` Jiong Wang
  2015-04-29 22:55         ` Jeff Law
  0 siblings, 1 reply; 12+ messages in thread
From: Jiong Wang @ 2015-04-29 22:14 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches


Jeff Law writes:

> On 04/27/2015 02:21 PM, Jiong Wang wrote:
>
>> Jeff,
>>
>>    Sorry, I can't understand the meaning of "overlap between t_low and low",
>>    assume "right" in "right value" means the opposite of "left" not
>>    "correct".
>>
>>    So what you mean is t_low and low share the same pseudo regiser?
> My concern is sharing the same pseudo or memory location.  But thinking 
> more about it, the shifted value has to have range information, so it 
> must have been an SSA_NAME, right?  If so, then it can't overlap with 
> the destination, so this is a non-issue.  Sorry for the confusion.

Thanks for the light. By looking at related code, looks like even it's
SSA_NAME, it's still possible to share the same pseudo given the
destination is in the same SSA map parition after ssa name coleascing?   

> I've never liked the model of storing into TARGET when it's convenient. 
>   Because storing into TARGET is totally optional, it means the callers 
> have to check if the value was stored into TARGET or not.
>
> Sadly that model has been in the expanders as long as I can remember.
>
> So I think this can go forward once we resolve the case where 
> expand_variable_shift returns its value in something other than the 
> passed in target.

OK. I will rework the patch, and I found there is a function named
"expand_doubleword_shift" which looks like a more natural place to do
this optimization, although it's hard to get range info there. I will do
further explore on this.

-- 
Regards,
Jiong

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-04-29 22:14       ` Jiong Wang
@ 2015-04-29 22:55         ` Jeff Law
  2015-08-14 17:55           ` Jiong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Law @ 2015-04-29 22:55 UTC (permalink / raw)
  To: Jiong Wang; +Cc: gcc-patches

On 04/29/2015 03:36 PM, Jiong Wang wrote:
>
> Jeff Law writes:
>
>> On 04/27/2015 02:21 PM, Jiong Wang wrote:
>>
>>> Jeff,
>>>
>>>     Sorry, I can't understand the meaning of "overlap between t_low and low",
>>>     assume "right" in "right value" means the opposite of "left" not
>>>     "correct".
>>>
>>>     So what you mean is t_low and low share the same pseudo regiser?
>> My concern is sharing the same pseudo or memory location.  But thinking
>> more about it, the shifted value has to have range information, so it
>> must have been an SSA_NAME, right?  If so, then it can't overlap with
>> the destination, so this is a non-issue.  Sorry for the confusion.
>
> Thanks for the light. By looking at related code, looks like even it's
> SSA_NAME, it's still possible to share the same pseudo given the
> destination is in the same SSA map parition after ssa name coleascing?
If they're the same size and have non-overlapping lifetimes, then yes, 
they could be the same pseudo.  That ought to be easy to check. 
Thankfully we don't have to worry about MEMs, which is a harder check.

> OK. I will rework the patch, and I found there is a function named
> "expand_doubleword_shift" which looks like a more natural place to do
> this optimization, although it's hard to get range info there. I will do
> further explore on this.
Sounds good.

jeff

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-04-29 22:55         ` Jeff Law
@ 2015-08-14 17:55           ` Jiong Wang
  2015-08-14 20:30             ` Jeff Law
  0 siblings, 1 reply; 12+ messages in thread
From: Jiong Wang @ 2015-08-14 17:55 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3532 bytes --]


Jeff Law writes:

> On 04/29/2015 03:36 PM, Jiong Wang wrote:
>>
>> Jeff Law writes:
>>
>>> On 04/27/2015 02:21 PM, Jiong Wang wrote:
>>>
>>>> Jeff,
>>>>
>>>>     Sorry, I can't understand the meaning of "overlap between t_low and low",
>>>>     assume "right" in "right value" means the opposite of "left" not
>>>>     "correct".
>>>>
>>>>     So what you mean is t_low and low share the same pseudo regiser?
>>> My concern is sharing the same pseudo or memory location.  But thinking
>>> more about it, the shifted value has to have range information, so it
>>> must have been an SSA_NAME, right?  If so, then it can't overlap with
>>> the destination, so this is a non-issue.  Sorry for the confusion.
>>
>> Thanks for the light. By looking at related code, looks like even it's
>> SSA_NAME, it's still possible to share the same pseudo given the
>> destination is in the same SSA map parition after ssa name coleascing?
> If they're the same size and have non-overlapping lifetimes, then yes, 
> they could be the same pseudo.  That ought to be easy to check. 
> Thankfully we don't have to worry about MEMs, which is a harder check.
>
>> OK. I will rework the patch, and I found there is a function named
>> "expand_doubleword_shift" which looks like a more natural place to do
>> this optimization, although it's hard to get range info there. I will do
>> further explore on this.
> Sounds good.
>
> jeff

Jeff,

  Sorry for the slow response.

  I found we can't integrate this transformation into
  "expand_doubleword_shift" as it's at quite deep layer in the call
  stack, when we reach there, we have lost some high level info.

  So I am keeping this transformaion still in expr.c, and attachment is
  the updated version of this patch. The main changes are:

  * Figuring out whether the shift source is coming from sign extension
    by checking SSA_NAME_DEF_STMT instead of deducing from tree range
    info. I fell checking the gimple statement is more reliable and
    straigtforward.

  * For the pseudo register overlaping issue, given:

      RTX target = TREE source << TREE amount

    I was trying to make sure those two SSA_NAME won't get the same
    pseudo register by comparing their assigned partition using
    var_to_partition, but I can't get the associated tree representation
    for "target" which is RTX.

    Then I just expand "source" and compare the register number with
    "target".

    But I found the simplest way maybe just reorder

      low_part (target) = low_part (source) << amount
      high_part (target) = low_part (source) >> amount1

    to:

      high_part (target) = low_part (source) >> amount1
      low_part (target) = low_part (source) << amount

    then, even target and source share the same pseudo register, we can
    still get what we want, as we are operating on their subreg.

  * Added checking on the result value of expand_variable_shift, if it's
    not the same as "target" then call emit_move_insn to do that.

  How is the re-worked patch looks to you?

  x86 bootstrap OK, regression OK. aarch64 bootstrap OK.

2015-08-14  Jiong.Wang  <jiong.wang@arm.com>

gcc/
  * expr.c (expand_expr_real_2): Check tree format to optimize
  LSHIFT_EXPR expand.

gcc/testsuite
  * gcc.dg/wide_shift_64_1.c: New testcase.
  * gcc.dg/wide_shift_128_1.c: Likewise.
  * gcc.target/aarch64/ashlti3_1.c: Likewise.
  * gcc.target/arm/ashldisi_1.c: Likewise.
--
Regards,
Jiong


[-- Attachment #2: k.patch --]
[-- Type: text/x-diff, Size: 7508 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index 31b4573..8a25e98 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -8836,23 +8836,90 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
 
     case LSHIFT_EXPR:
     case RSHIFT_EXPR:
-      /* If this is a fixed-point operation, then we cannot use the code
-	 below because "expand_shift" doesn't support sat/no-sat fixed-point
-         shifts.   */
-      if (ALL_FIXED_POINT_MODE_P (mode))
-	goto binop;
+      {
+	/* If this is a fixed-point operation, then we cannot use the code
+	   below because "expand_shift" doesn't support sat/no-sat fixed-point
+	   shifts.  */
+	if (ALL_FIXED_POINT_MODE_P (mode))
+	  goto binop;
+
+	if (! safe_from_p (subtarget, treeop1, 1))
+	  subtarget = 0;
+	if (modifier == EXPAND_STACK_PARM)
+	  target = 0;
+	op0 = expand_expr (treeop0, subtarget,
+			   VOIDmode, EXPAND_NORMAL);
+	/* Left shfit optimization:
+
+	   If mode == GET_MODE_WIDER_MODE (word_mode), then normally there isn't
+	   native instruction to support this wide mode left shift.  Given below
+	   example:
+
+	    Type A = (Type) B  << C
+
+            |<           T	    >|
+            |   high     |   low     |
+
+                         |<- size  ->|
+
+	   By checking tree shape, if operand B is coming from signed extension,
+	   then the left shift operand can be simplified into:
+
+	      1. high = low >> (size - C);
+	      2. low = low << C;
+	  */
+
+	temp = NULL_RTX;
+	if (code == LSHIFT_EXPR
+	    && target
+	    && REG_P (target)
+	    && ! unsignedp
+	    && mode == GET_MODE_WIDER_MODE (word_mode)
+	    && ! have_insn_for (ASHIFT, mode)
+	    && TREE_CONSTANT (treeop1)
+	    && TREE_CODE (treeop0) == SSA_NAME)
+	  {
+	    gimple def = SSA_NAME_DEF_STMT (treeop0);
+	    if (is_gimple_assign (def)
+		&& gimple_assign_rhs_code (def) == NOP_EXPR)
+	      {
+		machine_mode rmode = TYPE_MODE
+		  (TREE_TYPE (gimple_assign_rhs1 (def)));
 
-      if (! safe_from_p (subtarget, treeop1, 1))
-	subtarget = 0;
-      if (modifier == EXPAND_STACK_PARM)
-	target = 0;
-      op0 = expand_expr (treeop0, subtarget,
-			 VOIDmode, EXPAND_NORMAL);
-      temp = expand_variable_shift (code, mode, op0, treeop1, target,
-				    unsignedp);
-      if (code == LSHIFT_EXPR)
-	temp = REDUCE_BIT_FIELD (temp);
-      return temp;
+		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
+		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
+			>= GET_MODE_BITSIZE (word_mode)))
+		  {
+		    rtx low = simplify_gen_subreg (word_mode, op0, mode, 0);
+		    rtx tlow = simplify_gen_subreg (word_mode, target, mode, 0);
+		    rtx thigh = simplify_gen_subreg (word_mode, target, mode,
+						     UNITS_PER_WORD);
+		    HOST_WIDE_INT ramount = (BITS_PER_WORD
+					     - TREE_INT_CST_LOW (treeop1));
+		    tree rshift = build_int_cst (TREE_TYPE (treeop1), ramount);
+
+		    temp = expand_variable_shift (code, word_mode, low, treeop1,
+						  tlow, unsignedp);
+		    if (temp != tlow)
+		      emit_move_insn (tlow, temp);
+
+		    temp = expand_variable_shift (RSHIFT_EXPR, word_mode, low,
+						  rshift, thigh, unsignedp);
+		    if (temp != thigh)
+		      emit_move_insn (thigh, temp);
+
+		    temp = target ;
+		  }
+	      }
+	  }
+
+	if (temp == NULL_RTX)
+	  temp = expand_variable_shift (code, mode, op0, treeop1, target,
+					unsignedp);
+	if (code == LSHIFT_EXPR)
+	  temp = REDUCE_BIT_FIELD (temp);
+	return temp;
+      }
 
       /* Could determine the answer when only additive constants differ.  Also,
 	 the addition of one can be handled by changing the condition.  */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-128.c b/gcc/testsuite/gcc.dg/wide-shift-128.c
new file mode 100644
index 0000000..9b62715
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-128.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target aarch64*-*-* mips64*-*-* sparc64*-*-* } } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+__int128_t
+load2 (int data)
+{
+    return (__int128_t) data << 50;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-64.c b/gcc/testsuite/gcc.dg/wide-shift-64.c
new file mode 100644
index 0000000..5bc278f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-64.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target arm*-*-* mips*-*-* sparc*-*-* } } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+long long
+load1 (int data)
+{
+    return (long long) data << 12;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ashltidisi.c b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
new file mode 100644
index 0000000..aeb2a24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x, y, z)\
+__uint128_t __attribute__ ((noinline))\
+ushift_##x##_##z (unsigned y data)\
+{\
+  return (__uint128_t) data << x;\
+}\
+__int128_t __attribute__ ((noinline)) \
+shift_##x##_##z (y data) \
+{\
+  return (__int128_t) data << x;\
+}
+
+GEN_TEST_CASE (53, int, i)
+GEN_TEST_CASE (3, long long, ll)
+GEN_TEST_CASE (13, long long, ll)
+GEN_TEST_CASE (53, long long, ll)
+
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y, z, p) \
+	if (ushift_##y##_##p (x)\
+	    != ((__uint128_t) (unsigned z) x << y)) \
+	  abort ();\
+	if (shift_##y##_##p (x)\
+	    != ((__uint128_t) (signed z) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 53, int, i)
+  SHIFT_CHECK (0xcafecafe, 53, int, i)
+
+  SHIFT_CHECK (0x1234567890abcdefLL, 3, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 13, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 53, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 3, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 13, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 53, long long, ll)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 4 } } */
diff --git a/gcc/testsuite/gcc.target/arm/ashldisi.c b/gcc/testsuite/gcc.target/arm/ashldisi.c
new file mode 100644
index 0000000..00dc06e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/ashldisi.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x)\
+unsigned long long __attribute__ ((noinline))\
+ushift_ ## x (unsigned int data)\
+{\
+  return (unsigned long long) data << x;\
+}\
+long long __attribute__ ((noinline)) \
+shift_ ## x (int data) \
+{\
+  return (long long) data << x;\
+}
+
+GEN_TEST_CASE (3)
+GEN_TEST_CASE (23)
+GEN_TEST_CASE (30)
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y) \
+	if (ushift_ ## y (x)\
+	    != ((unsigned long long) (unsigned) x << y)) \
+	  abort (); \
+	if (shift_ ## y (x)\
+	    != ((long long) (signed) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 3)
+  SHIFT_CHECK (0xcafecafe, 3)
+  SHIFT_CHECK (0x12345678, 23)
+  SHIFT_CHECK (0xcafecafe, 23)
+  SHIFT_CHECK (0x12345678, 30)
+  SHIFT_CHECK (0xcafecafe, 30)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 3 } } */
+/* { dg-final { cleanup-saved-temps } } */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-08-14 17:55           ` Jiong Wang
@ 2015-08-14 20:30             ` Jeff Law
  2015-08-14 22:24               ` Jiong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Law @ 2015-08-14 20:30 UTC (permalink / raw)
  To: Jiong Wang; +Cc: gcc-patches

On 08/14/2015 11:40 AM, Jiong Wang wrote:
>
>    * Figuring out whether the shift source is coming from sign extension
>      by checking SSA_NAME_DEF_STMT instead of deducing from tree range
>      info. I fell checking the gimple statement is more reliable and
>      straigtforward.
I suspect it'll also apply more often if you're looking for the 
nop-conversion rather than looking at range information.

I keep thinking there ought to be a generalization here so that we're 
not so restrictive on the modes, but it's probably not worth doing.

In a perfect world we'd also integrate the other strategies for 
double-word shifts that exist in the various ports as special cases in 
expansion and remove the double-word shift patterns for ports that don't 
actually have them in hardware.  But that's probably a bit much to ask 
-- however, it probably couldn't hurt to see if there are any that are 
easily handled.


>
>    * For the pseudo register overlaping issue, given:
>
>        RTX target = TREE source << TREE amount
>
>      I was trying to make sure those two SSA_NAME won't get the same
>      pseudo register by comparing their assigned partition using
>      var_to_partition, but I can't get the associated tree representation
>      for "target" which is RTX.
>
>      Then I just expand "source" and compare the register number with
>      "target".
Right.  Which is what I would have suggested.  Given two pseudos, you 
can just compare them for equality.  If they're unequal, then its safe 
to assume they do not overlap.

>
>      But I found the simplest way maybe just reorder
>
>        low_part (target) = low_part (source) << amount
>        high_part (target) = low_part (source) >> amount1
>
>      to:
>
>        high_part (target) = low_part (source) >> amount1
>        low_part (target) = low_part (source) << amount
>
>      then, even target and source share the same pseudo register, we can
>      still get what we want, as we are operating on their subreg.
Yes.  This would work too and avoids the need to find source's pseudo.

>
>    * Added checking on the result value of expand_variable_shift, if it's
>      not the same as "target" then call emit_move_insn to do that.
Excellent.

>
>    How is the re-worked patch looks to you?
>
>    x86 bootstrap OK, regression OK. aarch64 bootstrap OK.
>
> 2015-08-14  Jiong.Wang<jiong.wang@arm.com>
>
> gcc/
>    * expr.c (expand_expr_real_2): Check tree format to optimize
>    LSHIFT_EXPR expand.
>
> gcc/testsuite
>    * gcc.dg/wide_shift_64_1.c: New testcase.
>    * gcc.dg/wide_shift_128_1.c: Likewise.
>    * gcc.target/aarch64/ashlti3_1.c: Likewise.
>    * gcc.target/arm/ashldisi_1.c: Likewise.



> +	/* Left shfit optimization:
s/shfit/shift/


> +
> +	   If mode == GET_MODE_WIDER_MODE (word_mode), then normally there isn't
> +	   native instruction to support this wide mode left shift.  Given below
> +	   example:
> +
> +	    Type A = (Type) B  << C
> +
> +            |<           T	    >|
> +            |   high     |   low     |
> +
> +                         |<- size  ->|
> +
> +	   By checking tree shape, if operand B is coming from signed extension,
> +	   then the left shift operand can be simplified into:
> +
> +	      1. high = low >> (size - C);
> +	      2. low = low << C;
You'll want to reorder those to match the code you generate.

Doesn't this require that C be less than the bitsize of a word?

If C is larger than the bitsize of a word, then you need some 
adjustments, something like:


	      1. high = low << (C - size)
               2. low = 0

Does this transformation require that the wider mode be exactly 2X the 
narrower mode?  If so, that needs to be verified.

> +		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
So we're assured we have a widening conversion.

> +		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
> +			>= GET_MODE_BITSIZE (word_mode)))
This test seems wrong.  I'm not sure what you're really trying to test 
here.  It seems you want to look at the shift count relative to the 
bitsize of word_mode.  If it's less, then generate the code you 
mentioned above in the comment.  If it's more, then generate the 
sequence I referenced?  Right?

I do think you need to be verifying that rmode == wordmode here.  If I 
understand everything correctly, the final value is "mode" which must be 
2X the size size of rmode/wordmode here, right?



The other question is are we endianness-safe in these transformations?




> +		  {
> +		    rtx low = simplify_gen_subreg (word_mode, op0, mode, 0);
> +		    rtx tlow = simplify_gen_subreg (word_mode, target, mode, 0);
> +		    rtx thigh = simplify_gen_subreg (word_mode, target, mode,
> +						     UNITS_PER_WORD);
> +		    HOST_WIDE_INT ramount = (BITS_PER_WORD
> +					     - TREE_INT_CST_LOW (treeop1));
> +		    tree rshift = build_int_cst (TREE_TYPE (treeop1), ramount);
> +
> +		    temp = expand_variable_shift (code, word_mode, low, treeop1,
> +						  tlow, unsignedp);
Why use "code" here right than just using LSHIFT_EXPR?  As noted earlier,

Jeff

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-08-14 20:30             ` Jeff Law
@ 2015-08-14 22:24               ` Jiong Wang
  2015-08-18 13:22                 ` Jiong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Jiong Wang @ 2015-08-14 22:24 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches

Jeff Law writes:

> On 08/14/2015 11:40 AM, Jiong Wang wrote:
>>
>>    * Figuring out whether the shift source is coming from sign extension
>>      by checking SSA_NAME_DEF_STMT instead of deducing from tree range
>>      info. I fell checking the gimple statement is more reliable and
>>      straigtforward.
> I suspect it'll also apply more often if you're looking for the 
> nop-conversion rather than looking at range information.
>
> I keep thinking there ought to be a generalization here so that we're 
> not so restrictive on the modes, but it's probably not worth doing.
>
> In a perfect world we'd also integrate the other strategies for 
> double-word shifts that exist in the various ports as special cases in 
> expansion and remove the double-word shift patterns for ports that don't 
> actually have them in hardware.  But that's probably a bit much to ask 
> -- however, it probably couldn't hurt to see if there are any that are 
> easily handled.

Agree.

Doing these in early tree/rtl expand stage is more clean & safe, and
expose more details to later RTL optimization passes as I think if you
handle double-word by defining insn pattern, then you will try to split
it in RTL split pass which happens later after some important
optimizations.

>
>> +
>> +	   If mode == GET_MODE_WIDER_MODE (word_mode), then normally there isn't
>> +	   native instruction to support this wide mode left shift.  Given below
>> +	   example:
>> +
>> +	    Type A = (Type) B  << C
>> +
>> +            |<           T	    >|
>> +            |   high     |   low     |
>> +
>> +                         |<- size  ->|
>> +
>> +	   By checking tree shape, if operand B is coming from signed extension,
>> +	   then the left shift operand can be simplified into:
>> +
>> +	      1. high = low >> (size - C);
>> +	      2. low = low << C;
> You'll want to reorder those to match the code you generate.
>
> Doesn't this require that C be less than the bitsize of a word?

Yes.

Above transformation is to handle double word left shift which shift the
original source across the word boundary.

Part of the contents shifted into the high half of destination, and the
other remain in the low half.

So if C is bigger than bitsize of a word, the the whole source will be
shifted into high half, thus should generate what you have listed below
and that's what gcc is generating, looks like the existed double word
shift logic can recognize this special cases.

So, I should check the value of C,if it's bigger than bitsize of a word,
then just fall through to default expand logic.

> If C is larger than the bitsize of a word, then you need some 
> adjustments, something like:
>
>
> 	      1. high = low << (C - size)
>                2. low = 0
>
> Does this transformation require that the wider mode be exactly 2X the 
> narrower mode?  If so, that needs to be verified.

I think so, the wider mode should be exactly 2X the word_mode, will
add the check.

>
>> +		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
> So we're assured we have a widening conversion.
>
>> +		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
>> +			>= GET_MODE_BITSIZE (word_mode)))
> This test seems wrong.  I'm not sure what you're really trying to test 
> here.  It seems you want to look at the shift count relative to the 
> bitsize of word_mode.  If it's less, then generate the code you 
> mentioned above in the comment.  If it's more, then generate the 
> sequence I referenced?  Right?

As explained above, I am trying to check whether the left shift is
causing the original source across word boundary while I should make sure
TREE_INT_CST_LOW (treeop1) < GET_MODE_BITSIZE (word_mode) at the same
time, otherwise fall through to default expand logic.

>
> I do think you need to be verifying that rmode == wordmode here.  If I 
> understand everything correctly, the final value is "mode" which must be 
> 2X the size size of rmode/wordmode here, right?

I think rmode don't need to equal wordmode. For example the
transformation is OK for the following, where rmode = SImode, and final
mode = TImode.

int b;
__128_int a = (__128_int) b;

the expand_expr (treeop0... in the start of those code will generate
necessary instruction sequences to extend whatever mode b is into TImode
register (op0 in the patch);

>
>
>
> The other question is are we endianness-safe in these transformations?

I think it is. As this transformation is done with register involved
only. I think endianness issue will only happen if there is operation on
memory object.

>> +		    temp = expand_variable_shift (code, word_mode, low, treeop1,
>> +						  tlow, unsignedp);
> Why use "code" here right than just using LSHIFT_EXPR?  As noted
> earlier,

Yes, better to use the constant LSHIFT_EXPR.

Thanks.

-- 
Regards,
Jiong

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-08-14 22:24               ` Jiong Wang
@ 2015-08-18 13:22                 ` Jiong Wang
  2015-08-18 17:47                   ` Jeff Law
  0 siblings, 1 reply; 12+ messages in thread
From: Jiong Wang @ 2015-08-18 13:22 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 6344 bytes --]


Jiong Wang writes:

> Jeff Law writes:
>
>> On 08/14/2015 11:40 AM, Jiong Wang wrote:
>>>
>>>    * Figuring out whether the shift source is coming from sign extension
>>>      by checking SSA_NAME_DEF_STMT instead of deducing from tree range
>>>      info. I fell checking the gimple statement is more reliable and
>>>      straigtforward.
>> I suspect it'll also apply more often if you're looking for the 
>> nop-conversion rather than looking at range information.
>>
>> I keep thinking there ought to be a generalization here so that we're 
>> not so restrictive on the modes, but it's probably not worth doing.
>>
>> In a perfect world we'd also integrate the other strategies for 
>> double-word shifts that exist in the various ports as special cases in 
>> expansion and remove the double-word shift patterns for ports that don't 
>> actually have them in hardware.  But that's probably a bit much to ask 
>> -- however, it probably couldn't hurt to see if there are any that are 
>> easily handled.
>
> Agree.
>
> Doing these in early tree/rtl expand stage is more clean & safe, and
> expose more details to later RTL optimization passes as I think if you
> handle double-word by defining insn pattern, then you will try to split
> it in RTL split pass which happens later after some important
> optimizations.
>
>>
>>> +
>>> +	   If mode == GET_MODE_WIDER_MODE (word_mode), then normally there isn't
>>> +	   native instruction to support this wide mode left shift.  Given below
>>> +	   example:
>>> +
>>> +	    Type A = (Type) B  << C
>>> +
>>> +            |<           T	    >|
>>> +            |   high     |   low     |
>>> +
>>> +                         |<- size  ->|
>>> +
>>> +	   By checking tree shape, if operand B is coming from signed extension,
>>> +	   then the left shift operand can be simplified into:
>>> +
>>> +	      1. high = low >> (size - C);
>>> +	      2. low = low << C;
>> You'll want to reorder those to match the code you generate.
>>
>> Doesn't this require that C be less than the bitsize of a word?
>
> Yes.
>
> Above transformation is to handle double word left shift which shift the
> original source across the word boundary.
>
> Part of the contents shifted into the high half of destination, and the
> other remain in the low half.
>
> So if C is bigger than bitsize of a word, the the whole source will be
> shifted into high half, thus should generate what you have listed below
> and that's what gcc is generating, looks like the existed double word
> shift logic can recognize this special cases.
>
> So, I should check the value of C,if it's bigger than bitsize of a word,
> then just fall through to default expand logic.
>
>> If C is larger than the bitsize of a word, then you need some 
>> adjustments, something like:
>>
>>
>> 	      1. high = low << (C - size)
>>                2. low = 0
>>
>> Does this transformation require that the wider mode be exactly 2X the 
>> narrower mode?  If so, that needs to be verified.
>
> I think so, the wider mode should be exactly 2X the word_mode, will
> add the check.
>
>>
>>> +		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
>> So we're assured we have a widening conversion.
>>
>>> +		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
>>> +			>= GET_MODE_BITSIZE (word_mode)))
>> This test seems wrong.  I'm not sure what you're really trying to test 
>> here.  It seems you want to look at the shift count relative to the 
>> bitsize of word_mode.  If it's less, then generate the code you 
>> mentioned above in the comment.  If it's more, then generate the 
>> sequence I referenced?  Right?
>
> As explained above, I am trying to check whether the left shift is
> causing the original source across word boundary while I should make sure
> TREE_INT_CST_LOW (treeop1) < GET_MODE_BITSIZE (word_mode) at the same
> time, otherwise fall through to default expand logic.
>
>>
>> I do think you need to be verifying that rmode == wordmode here.  If I 
>> understand everything correctly, the final value is "mode" which must be 
>> 2X the size size of rmode/wordmode here, right?
>
> I think rmode don't need to equal wordmode. For example the
> transformation is OK for the following, where rmode = SImode, and final
> mode = TImode.
>
> int b;
> __128_int a = (__128_int) b;
>
> the expand_expr (treeop0... in the start of those code will generate
> necessary instruction sequences to extend whatever mode b is into TImode
> register (op0 in the patch);
>
>>
>>
>>
>> The other question is are we endianness-safe in these transformations?
>
> I think it is. As this transformation is done with register involved
> only. I think endianness issue will only happen if there is operation on
> memory object.
>
>>> +		    temp = expand_variable_shift (code, word_mode, low, treeop1,
>>> +						  tlow, unsignedp);
>> Why use "code" here right than just using LSHIFT_EXPR?  As noted
>> earlier,
>
> Yes, better to use the constant LSHIFT_EXPR.
>
> Thanks.

patch updated, please review thanks.

Changes are:

  1. s/shfit/shift/
  2. Re-write the comment.
     more explanations on the left shift across word size boundary
     scenario, explan gcc's current expand steps and what this
     optimization can improve.
  3. Match the code to the comment.
     Expand high part first, then low part, so we don't need to check
     the pseudo register overlapping.
  4. Add the check to make sure if we are shifting the whole source
     operand into high part (sfhit amount >= word mode size) then just
     don't do this optimization, use the default expand logic instead
     which generate optimized rtx sequences already.
  5. Only do this optimization when
       GET_MOZE_SIZE (mode) == 2 * GET_MODE_SIZE (word_mode)

  bootstrap OK on x86-64, no regression.
  bootstrap OK on aarch64, no regression.

2015-08-18  Jiong.Wang  <jiong.wang@arm.com>

gcc/
  * expr.c (expand_expr_real_2): Check gimple statement during
  LSHIFT_EXPR expand.
  
gcc/testsuite
  * gcc.dg/wide_shift_64_1.c: New testcase.
  * gcc.dg/wide_shift_128_1.c: Likewise.
  * gcc.target/aarch64/ashlti3_1.c: Likewise.
  * gcc.target/arm/ashldisi_1.c: Likewise.
  
-- 
Regards,
Jiong


[-- Attachment #2: k-new.patch --]
[-- Type: text/x-diff, Size: 8548 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index 3202f55..eca9a20 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -8836,23 +8836,110 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
 
     case LSHIFT_EXPR:
     case RSHIFT_EXPR:
-      /* If this is a fixed-point operation, then we cannot use the code
-	 below because "expand_shift" doesn't support sat/no-sat fixed-point
-         shifts.   */
-      if (ALL_FIXED_POINT_MODE_P (mode))
-	goto binop;
+      {
+	/* If this is a fixed-point operation, then we cannot use the code
+	   below because "expand_shift" doesn't support sat/no-sat fixed-point
+	   shifts.  */
+	if (ALL_FIXED_POINT_MODE_P (mode))
+	  goto binop;
+
+	if (! safe_from_p (subtarget, treeop1, 1))
+	  subtarget = 0;
+	if (modifier == EXPAND_STACK_PARM)
+	  target = 0;
+	op0 = expand_expr (treeop0, subtarget,
+			   VOIDmode, EXPAND_NORMAL);
+	/* Left shift optimization when shifting across word_size boundary.
+
+	   If mode == GET_MODE_WIDER_MODE (word_mode), then normally there isn't
+	   native instruction to support this wide mode left shift.  Given below
+	   scenario:
+
+	    Type A = (Type) B  << C
+
+	    |<		 T	    >|
+	    | dest_high  |  dest_low |
+
+			 | word_size |
+
+	   If the shfit amount C caused we shift B to across the word size
+	   boundary, i.e part of B shifted into high half of destination
+	   register, and part of B remains in the low half, then GCC will use
+	   the following left shift expand logic:
+
+	   1. Initialize dest_low to B.
+	   2. Initialize every bit of dest_high to the sign bit of B.
+	   3. Logic left shift dest_low by C bit to finalize dest_low.
+	      The value of dest_low before this shift is kept in a temp D.
+	   4. Logic left shift dest_high by C.
+	   5. Logic right shift D by (word_size - C).
+	   6. Or the result of 4 and 5 to finalize dest_high.
+
+	   While, by checking gimple statements, if operand B is coming from
+	   signed extension, then we can simplify above expand logic into:
+
+	      1. dest_high = src_low >> (word_size - C).
+	      2. dest_low = src_low << C.
+
+	   We can use one arithmetic right shift to finish all the purpose of
+	   steps 2, 4, 5, 6, thus we reduce the steps needed from 6 into 2.  */
+
+	temp = NULL_RTX;
+	if (code == LSHIFT_EXPR
+	    && target
+	    && REG_P (target)
+	    && ! unsignedp
+	    && mode == GET_MODE_WIDER_MODE (word_mode)
+	    && GET_MODE_SIZE (mode) == 2 * GET_MODE_SIZE (word_mode)
+	    && ! have_insn_for (ASHIFT, mode)
+	    && TREE_CONSTANT (treeop1)
+	    && TREE_CODE (treeop0) == SSA_NAME)
+	  {
+	    gimple def = SSA_NAME_DEF_STMT (treeop0);
+	    if (is_gimple_assign (def)
+		&& gimple_assign_rhs_code (def) == NOP_EXPR)
+	      {
+		machine_mode rmode = TYPE_MODE
+		  (TREE_TYPE (gimple_assign_rhs1 (def)));
 
-      if (! safe_from_p (subtarget, treeop1, 1))
-	subtarget = 0;
-      if (modifier == EXPAND_STACK_PARM)
-	target = 0;
-      op0 = expand_expr (treeop0, subtarget,
-			 VOIDmode, EXPAND_NORMAL);
-      temp = expand_variable_shift (code, mode, op0, treeop1, target,
-				    unsignedp);
-      if (code == LSHIFT_EXPR)
-	temp = REDUCE_BIT_FIELD (temp);
-      return temp;
+		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
+		    && TREE_INT_CST_LOW (treeop1) < GET_MODE_BITSIZE (word_mode)
+		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
+			>= GET_MODE_BITSIZE (word_mode)))
+		  {
+		    rtx low = simplify_gen_subreg (word_mode, op0, mode, 0);
+		    rtx dest_low = simplify_gen_subreg (word_mode, target, mode,
+							0);
+		    rtx dest_high = simplify_gen_subreg (word_mode, target,
+							 mode, UNITS_PER_WORD);
+		    HOST_WIDE_INT ramount = (BITS_PER_WORD
+					     - TREE_INT_CST_LOW (treeop1));
+		    tree rshift = build_int_cst (TREE_TYPE (treeop1), ramount);
+
+		    /* dest_high = src_low >> (word_size - C).  */
+		    temp = expand_variable_shift (RSHIFT_EXPR, word_mode, low,
+						  rshift, dest_high, unsignedp);
+		    if (temp != dest_high)
+		      emit_move_insn (dest_high, temp);
+
+		    /* dest_low = src_low << C.  */
+		    temp = expand_variable_shift (LSHIFT_EXPR, word_mode, low,
+						  treeop1, dest_low, unsignedp);
+		    if (temp != dest_low)
+		      emit_move_insn (dest_low, temp);
+
+		    temp = target ;
+		  }
+	      }
+	  }
+
+	if (temp == NULL_RTX)
+	  temp = expand_variable_shift (code, mode, op0, treeop1, target,
+					unsignedp);
+	if (code == LSHIFT_EXPR)
+	  temp = REDUCE_BIT_FIELD (temp);
+	return temp;
+      }
 
       /* Could determine the answer when only additive constants differ.  Also,
 	 the addition of one can be handled by changing the condition.  */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-128.c b/gcc/testsuite/gcc.dg/wide-shift-128.c
new file mode 100644
index 0000000..9b62715
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target aarch64*-*-* mips64*-*-* sparc64*-*-* } } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+__int128_t
+load2 (int data)
+{
+    return (__int128_t) data << 50;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-64.c b/gcc/testsuite/gcc.dg/wide-shift-64.c
new file mode 100644
index 0000000..ea78708
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-64.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target arm*-*-* mips*-*-* sparc*-*-* } } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+long long
+load1 (int data)
+{
+    return (long long) data << 12;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ashltidisi.c b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
new file mode 100644
index 0000000..d7b02b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x, y, z)\
+__uint128_t __attribute__ ((noinline))\
+ushift_##x##_##z (unsigned y data)\
+{\
+  return (__uint128_t) data << x;\
+}\
+__int128_t __attribute__ ((noinline)) \
+shift_##x##_##z (y data) \
+{\
+  return (__int128_t) data << x;\
+}
+
+GEN_TEST_CASE (53, int, i)
+GEN_TEST_CASE (3, long long, ll)
+GEN_TEST_CASE (13, long long, ll)
+GEN_TEST_CASE (53, long long, ll)
+
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y, z, p) \
+	if (ushift_##y##_##p (x)\
+	    != ((__uint128_t) (unsigned z) x << y)) \
+	  abort ();\
+	if (shift_##y##_##p (x)\
+	    != ((__uint128_t) (signed z) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 53, int, i)
+  SHIFT_CHECK (0xcafecafe, 53, int, i)
+
+  SHIFT_CHECK (0x1234567890abcdefLL, 3, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 13, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 53, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 3, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 13, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 53, long long, ll)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 4 } } */
diff --git a/gcc/testsuite/gcc.target/arm/ashldisi.c b/gcc/testsuite/gcc.target/arm/ashldisi.c
new file mode 100644
index 0000000..00dc06e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/ashldisi.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x)\
+unsigned long long __attribute__ ((noinline))\
+ushift_ ## x (unsigned int data)\
+{\
+  return (unsigned long long) data << x;\
+}\
+long long __attribute__ ((noinline)) \
+shift_ ## x (int data) \
+{\
+  return (long long) data << x;\
+}
+
+GEN_TEST_CASE (3)
+GEN_TEST_CASE (23)
+GEN_TEST_CASE (30)
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y) \
+	if (ushift_ ## y (x)\
+	    != ((unsigned long long) (unsigned) x << y)) \
+	  abort (); \
+	if (shift_ ## y (x)\
+	    != ((long long) (signed) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 3)
+  SHIFT_CHECK (0xcafecafe, 3)
+  SHIFT_CHECK (0x12345678, 23)
+  SHIFT_CHECK (0xcafecafe, 23)
+  SHIFT_CHECK (0x12345678, 30)
+  SHIFT_CHECK (0xcafecafe, 30)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 3 } } */
+/* { dg-final { cleanup-saved-temps } } */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-08-18 13:22                 ` Jiong Wang
@ 2015-08-18 17:47                   ` Jeff Law
  2015-08-19 23:05                     ` Jiong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Law @ 2015-08-18 17:47 UTC (permalink / raw)
  To: Jiong Wang; +Cc: gcc-patches

On 08/18/2015 06:54 AM, Jiong Wang wrote:
>
> Changes are:
>
>    1. s/shfit/shift/
>    2. Re-write the comment.
>       more explanations on the left shift across word size boundary
>       scenario, explan gcc's current expand steps and what this
>       optimization can improve.
>    3. Match the code to the comment.
>       Expand high part first, then low part, so we don't need to check
>       the pseudo register overlapping.
>    4. Add the check to make sure if we are shifting the whole source
>       operand into high part (sfhit amount >= word mode size) then just
>       don't do this optimization, use the default expand logic instead
>       which generate optimized rtx sequences already.
>    5. Only do this optimization when
>         GET_MOZE_SIZE (mode) == 2 * GET_MODE_SIZE (word_mode)
>
>    bootstrap OK on x86-64, no regression.
>    bootstrap OK on aarch64, no regression.
>
> 2015-08-18  Jiong.Wang<jiong.wang@arm.com>
>
> gcc/
>    * expr.c (expand_expr_real_2): Check gimple statement during
>    LSHIFT_EXPR expand.
>
> gcc/testsuite
>    * gcc.dg/wide_shift_64_1.c: New testcase.
>    * gcc.dg/wide_shift_128_1.c: Likewise.
>    * gcc.target/aarch64/ashlti3_1.c: Likewise.
>    * gcc.target/arm/ashldisi_1.c: Likewise.
>
> -- Regards, Jiong
>
>
> k-new.patch
>
>
> diff --git a/gcc/expr.c b/gcc/expr.c
> index 3202f55..eca9a20 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -8836,23 +8836,110 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
>
>       case LSHIFT_EXPR:
>       case RSHIFT_EXPR:
> -      /* If this is a fixed-point operation, then we cannot use the code
> -	 below because "expand_shift" doesn't support sat/no-sat fixed-point
> -         shifts.   */
> -      if (ALL_FIXED_POINT_MODE_P (mode))
> -	goto binop;
> +      {
> +	/* If this is a fixed-point operation, then we cannot use the code
> +	   below because "expand_shift" doesn't support sat/no-sat fixed-point
> +	   shifts.  */
> +	if (ALL_FIXED_POINT_MODE_P (mode))
> +	  goto binop;
> +
> +	if (! safe_from_p (subtarget, treeop1, 1))
> +	  subtarget = 0;
> +	if (modifier == EXPAND_STACK_PARM)
> +	  target = 0;
> +	op0 = expand_expr (treeop0, subtarget,
> +			   VOIDmode, EXPAND_NORMAL);
> +	/* Left shift optimization when shifting across word_size boundary.
Please insert a newline before this comment.


> +
> +	   If mode == GET_MODE_WIDER_MODE (word_mode), then normally there isn't
> +	   native instruction to support this wide mode left shift.  Given below
> +	   scenario:
> +
> +	    Type A = (Type) B  << C
> +
> +	    |<		 T	    >|
> +	    | dest_high  |  dest_low |
> +
> +			 | word_size |
> +
> +	   If the shfit amount C caused we shift B to across the word size
s/shfit/shift/


> +
> +	   While, by checking gimple statements, if operand B is coming from
> +	   signed extension, then we can simplify above expand logic into:
> +
> +	      1. dest_high = src_low >> (word_size - C).
> +	      2. dest_low = src_low << C.
> +
> +	   We can use one arithmetic right shift to finish all the purpose of
> +	   steps 2, 4, 5, 6, thus we reduce the steps needed from 6 into 2.  */
> +
> +	temp = NULL_RTX;
> +	if (code == LSHIFT_EXPR
> +	    && target
> +	    && REG_P (target)
> +	    && ! unsignedp
> +	    && mode == GET_MODE_WIDER_MODE (word_mode)
> +	    && GET_MODE_SIZE (mode) == 2 * GET_MODE_SIZE (word_mode)
> +	    && ! have_insn_for (ASHIFT, mode)
> +	    && TREE_CONSTANT (treeop1)
> +	    && TREE_CODE (treeop0) == SSA_NAME)
> +	  {
> +	    gimple def = SSA_NAME_DEF_STMT (treeop0);
> +	    if (is_gimple_assign (def)
> +		&& gimple_assign_rhs_code (def) == NOP_EXPR)
Don't you need to check that the conversion is actually a sign 
extension.  Oh, you're relying on the signedness of ops->type.  That 
should be sufficient.

> +	      {
> +		machine_mode rmode = TYPE_MODE
> +		  (TREE_TYPE (gimple_assign_rhs1 (def)));
>
> -      if (! safe_from_p (subtarget, treeop1, 1))
> -	subtarget = 0;
> -      if (modifier == EXPAND_STACK_PARM)
> -	target = 0;
> -      op0 = expand_expr (treeop0, subtarget,
> -			 VOIDmode, EXPAND_NORMAL);
> -      temp = expand_variable_shift (code, mode, op0, treeop1, target,
> -				    unsignedp);
> -      if (code == LSHIFT_EXPR)
> -	temp = REDUCE_BIT_FIELD (temp);
> -      return temp;
> +		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
> +		    && TREE_INT_CST_LOW (treeop1) < GET_MODE_BITSIZE (word_mode)
> +		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
> +			>= GET_MODE_BITSIZE (word_mode)))
I keep having trouble with this.  I think the key to why this works is 
you don't actually use the source of the conversion, but instead the 
destination of the conversion (held in op0).

So this is OK for the trunk with the typo & whitespace nits fixed.

Thanks for your patience,
Jeff


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding
  2015-08-18 17:47                   ` Jeff Law
@ 2015-08-19 23:05                     ` Jiong Wang
  0 siblings, 0 replies; 12+ messages in thread
From: Jiong Wang @ 2015-08-19 23:05 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3729 bytes --]


Jeff Law writes:

>> +	    && ! unsignedp
> Don't you need to check that the conversion is actually a sign 
> extension.  Oh, you're relying on the signedness of ops->type.  That 
> should be sufficient.

  Exactly.

>> +		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
>> +		    && TREE_INT_CST_LOW (treeop1) < GET_MODE_BITSIZE (word_mode)
>> +		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
>> +			>= GET_MODE_BITSIZE (word_mode)))
> I keep having trouble with this.  I think the key to why this works is 
> you don't actually use the source of the conversion, but instead the 
> destination of the conversion (held in op0).

  Yes, my thought is given any narrower type, the

  op0 = expand_expr (treeop0, subtarget, VOIDmode, EXPAND_NORMAL);

  Will do necesary sign extension, then finally want we get from op0 is
  a "2 x word_mode_size" register whose high part contains duplication
  of sign bit, and low part contains the source of the conversion if the
  narrower mode is word_mode_size, or contains sign extened source, but
  either way, the low part can be used as our new shift operand. And the
  high part of op0 generated from above expand_expr is actually dead
  which will be removed by later optimization pass.

>
> So this is OK for the trunk with the typo & whitespace nits fixed.

During my final testing of the patch on arm, mips32, mips64, powerpc32,
powerpc64 I do found two new issues:

  * As you have mentioned, this patch do have endianness issue!
    Although the new testcases (wide-shift-128/64.c) under gcc.dg passed
    my manual powerpc32/powerpc64 test which check combine pass rtl
    dump, but the generated assembly looks weird to me, then I found
    simplify_gen_subreg itself is not endianness safe. We need to use
    subreg_highpart/lowpart_offset to get the offset.    

    So I done minor modifications on the patch by using lowpart_reg and
    subreg_highpart_offset I think it's endianess safe now.

    And the instruction sequences generated for powerpc, for example
    powerpc32 looks sane to me now:

    long long load1 (int data) { return (long long) data << 12;}

    load1-before-patch:
	srawi 9,3,31
	mr 4,3
	srwi 3,3,20
	slwi 4,4,12
	rlwimi 3,9,12,0,31-12
	blr

    load1-after-patch:
	mr 4,3
	srawi 3,3,20
	slwi 4,4,12
	blr

   * The other issue is still what you have noticed. For some targets,
     for example arm, there is backend define_expand for double word
     left shift, then because my patch will only do the tranformation
     when "! have_insn_for (ASHIFT, mode)", thus this transformation
     will not happen on these targets.

     While I belive this transformation do generate better instruction
     sequences for some case. As backend don't know high level type
     conversion information, those backends expand logic like
     arm_emit_coreregs_64bit_shift can only assume the shift operand
     may come from any cases thus the expand logic will be very generic
     thus not optimal.

     We should compare the cost to decide whether to go with this
     transformation or use the backend's customized expand logic.

     But anyway, this patch will not cause regression. We need another
     seperate patch to implement the cost comparision then we can let
     those targets benefit from this transformation.

     Thus I removed the arm target testcase. (I recall arm testcase is
     in this patch because my original local patch don't contain
     the "have_insn_for" check.)

 Thanks very much for the review!
 
 Committed attached patch (above minor issues I treat them as obivious)
 as 227018.
 
-- 
Regards,
Jiong


[-- Attachment #2: k.patch --]
[-- Type: text/x-diff, Size: 8416 bytes --]

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 227017)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,8 @@
+2015-08-19  Jiong Wang  <jiong.wang@arm.com>
+
+	* expr.c (expand_expr_real_2): Check gimple statement during
+	LSHIFT_EXPR expand.
+
 2015-08-19  Magnus Granberg  <zorry@gentoo.org>
 
 	* common.opt (fstack-protector): Initialize to -1.
Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 227017)
+++ gcc/expr.c	(working copy)
@@ -8836,24 +8836,113 @@
 
     case LSHIFT_EXPR:
     case RSHIFT_EXPR:
-      /* If this is a fixed-point operation, then we cannot use the code
-	 below because "expand_shift" doesn't support sat/no-sat fixed-point
-         shifts.   */
-      if (ALL_FIXED_POINT_MODE_P (mode))
-	goto binop;
+      {
+	/* If this is a fixed-point operation, then we cannot use the code
+	   below because "expand_shift" doesn't support sat/no-sat fixed-point
+	   shifts.  */
+	if (ALL_FIXED_POINT_MODE_P (mode))
+	  goto binop;
 
-      if (! safe_from_p (subtarget, treeop1, 1))
-	subtarget = 0;
-      if (modifier == EXPAND_STACK_PARM)
-	target = 0;
-      op0 = expand_expr (treeop0, subtarget,
-			 VOIDmode, EXPAND_NORMAL);
-      temp = expand_variable_shift (code, mode, op0, treeop1, target,
-				    unsignedp);
-      if (code == LSHIFT_EXPR)
-	temp = REDUCE_BIT_FIELD (temp);
-      return temp;
+	if (! safe_from_p (subtarget, treeop1, 1))
+	  subtarget = 0;
+	if (modifier == EXPAND_STACK_PARM)
+	  target = 0;
+	op0 = expand_expr (treeop0, subtarget,
+			   VOIDmode, EXPAND_NORMAL);
 
+	/* Left shift optimization when shifting across word_size boundary.
+
+	   If mode == GET_MODE_WIDER_MODE (word_mode), then normally there isn't
+	   native instruction to support this wide mode left shift.  Given below
+	   scenario:
+
+	    Type A = (Type) B  << C
+
+	    |<		 T	    >|
+	    | dest_high  |  dest_low |
+
+			 | word_size |
+
+	   If the shift amount C caused we shift B to across the word size
+	   boundary, i.e part of B shifted into high half of destination
+	   register, and part of B remains in the low half, then GCC will use
+	   the following left shift expand logic:
+
+	   1. Initialize dest_low to B.
+	   2. Initialize every bit of dest_high to the sign bit of B.
+	   3. Logic left shift dest_low by C bit to finalize dest_low.
+	      The value of dest_low before this shift is kept in a temp D.
+	   4. Logic left shift dest_high by C.
+	   5. Logic right shift D by (word_size - C).
+	   6. Or the result of 4 and 5 to finalize dest_high.
+
+	   While, by checking gimple statements, if operand B is coming from
+	   signed extension, then we can simplify above expand logic into:
+
+	      1. dest_high = src_low >> (word_size - C).
+	      2. dest_low = src_low << C.
+
+	   We can use one arithmetic right shift to finish all the purpose of
+	   steps 2, 4, 5, 6, thus we reduce the steps needed from 6 into 2.  */
+
+	temp = NULL_RTX;
+	if (code == LSHIFT_EXPR
+	    && target
+	    && REG_P (target)
+	    && ! unsignedp
+	    && mode == GET_MODE_WIDER_MODE (word_mode)
+	    && GET_MODE_SIZE (mode) == 2 * GET_MODE_SIZE (word_mode)
+	    && ! have_insn_for (ASHIFT, mode)
+	    && TREE_CONSTANT (treeop1)
+	    && TREE_CODE (treeop0) == SSA_NAME)
+	  {
+	    gimple def = SSA_NAME_DEF_STMT (treeop0);
+	    if (is_gimple_assign (def)
+		&& gimple_assign_rhs_code (def) == NOP_EXPR)
+	      {
+		machine_mode rmode = TYPE_MODE
+		  (TREE_TYPE (gimple_assign_rhs1 (def)));
+
+		if (GET_MODE_SIZE (rmode) < GET_MODE_SIZE (mode)
+		    && TREE_INT_CST_LOW (treeop1) < GET_MODE_BITSIZE (word_mode)
+		    && ((TREE_INT_CST_LOW (treeop1) + GET_MODE_BITSIZE (rmode))
+			>= GET_MODE_BITSIZE (word_mode)))
+		  {
+		    unsigned int high_off = subreg_highpart_offset (word_mode,
+								    mode);
+		    rtx low = lowpart_subreg (word_mode, op0, mode);
+		    rtx dest_low = lowpart_subreg (word_mode, target, mode);
+		    rtx dest_high = simplify_gen_subreg (word_mode, target,
+							 mode, high_off);
+		    HOST_WIDE_INT ramount = (BITS_PER_WORD
+					     - TREE_INT_CST_LOW (treeop1));
+		    tree rshift = build_int_cst (TREE_TYPE (treeop1), ramount);
+
+		    /* dest_high = src_low >> (word_size - C).  */
+		    temp = expand_variable_shift (RSHIFT_EXPR, word_mode, low,
+						  rshift, dest_high, unsignedp);
+		    if (temp != dest_high)
+		      emit_move_insn (dest_high, temp);
+
+		    /* dest_low = src_low << C.  */
+		    temp = expand_variable_shift (LSHIFT_EXPR, word_mode, low,
+						  treeop1, dest_low, unsignedp);
+		    if (temp != dest_low)
+		      emit_move_insn (dest_low, temp);
+
+		    temp = target ;
+		  }
+	      }
+	  }
+
+	if (temp == NULL_RTX)
+	  temp = expand_variable_shift (code, mode, op0, treeop1, target,
+					unsignedp);
+	if (code == LSHIFT_EXPR)
+	  temp = REDUCE_BIT_FIELD (temp);
+	return temp;
+      }
+
       /* Could determine the answer when only additive constants differ.  Also,
 	 the addition of one can be handled by changing the condition.  */
     case LT_EXPR:
Index: gcc/testsuite/ChangeLog
===================================================================
--- gcc/testsuite/ChangeLog	(revision 227017)
+++ gcc/testsuite/ChangeLog	(working copy)
@@ -1,3 +1,9 @@
+2015-08-19  Jiong Wang  <jiong.wang@arm.com>
+
+	* gcc.dg/wide_shift_64_1.c: New testcase.
+	* gcc.dg/wide_shift_128_1.c: Likewise.
+	* gcc.target/aarch64/ashlti3_1.c: Likewise.
+
 2015-08-19  Magnus Granberg  <zorry@gentoo.org>
 
 	* lib/target-supports.exp
Index: gcc/testsuite/gcc.dg/wide-shift-128.c
===================================================================
--- gcc/testsuite/gcc.dg/wide-shift-128.c	(revision 0)
+++ gcc/testsuite/gcc.dg/wide-shift-128.c	(working copy)
@@ -0,0 +1,11 @@
+/* { dg-do compile { target aarch64*-*-* mips64*-*-* sparc64*-*-* } } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+__int128_t
+load2 (int data)
+{
+    return (__int128_t) data << 50;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
Index: gcc/testsuite/gcc.dg/wide-shift-64.c
===================================================================
--- gcc/testsuite/gcc.dg/wide-shift-64.c	(revision 0)
+++ gcc/testsuite/gcc.dg/wide-shift-64.c	(working copy)
@@ -0,0 +1,10 @@
+/* { dg-do compile { target mips*-*-* sparc*-*-* } } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+long long
+load1 (int data)
+{
+    return (long long) data << 12;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
Index: gcc/testsuite/gcc.target/aarch64/ashltidisi.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/ashltidisi.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/ashltidisi.c	(working copy)
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x, y, z)\
+__uint128_t __attribute__ ((noinline))\
+ushift_##x##_##z (unsigned y data)\
+{\
+  return (__uint128_t) data << x;\
+}\
+__int128_t __attribute__ ((noinline)) \
+shift_##x##_##z (y data) \
+{\
+  return (__int128_t) data << x;\
+}
+
+GEN_TEST_CASE (53, int, i)
+GEN_TEST_CASE (3, long long, ll)
+GEN_TEST_CASE (13, long long, ll)
+GEN_TEST_CASE (53, long long, ll)
+
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y, z, p) \
+	if (ushift_##y##_##p (x)\
+	    != ((__uint128_t) (unsigned z) x << y)) \
+	  abort ();\
+	if (shift_##y##_##p (x)\
+	    != ((__uint128_t) (signed z) x << y)) \
+	  abort ();
+
+  SHIFT_CHECK (0x12345678, 53, int, i)
+  SHIFT_CHECK (0xcafecafe, 53, int, i)
+
+  SHIFT_CHECK (0x1234567890abcdefLL, 3, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 13, long long, ll)
+  SHIFT_CHECK (0x1234567890abcdefLL, 53, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 3, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 13, long long, ll)
+  SHIFT_CHECK (0xcafecafedeaddeadLL, 53, long long, ll)
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 4 } } */
+/* { dg-final { scan-assembler-not "extr\t" } } */

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2015-08-19 22:55 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-16 11:04 [Patch/rtl-expand] Take tree range info into account to improve LSHIFT_EXP expanding Jiong Wang
2015-04-24  2:23 ` Jeff Law
2015-04-27 20:58   ` Jiong Wang
2015-04-29  3:53     ` Jeff Law
2015-04-29 22:14       ` Jiong Wang
2015-04-29 22:55         ` Jeff Law
2015-08-14 17:55           ` Jiong Wang
2015-08-14 20:30             ` Jeff Law
2015-08-14 22:24               ` Jiong Wang
2015-08-18 13:22                 ` Jiong Wang
2015-08-18 17:47                   ` Jeff Law
2015-08-19 23:05                     ` Jiong Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).