public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
@ 2018-07-13 16:09 Sam Tebbs
  2018-07-16 10:55 ` Sudakshina Das
  0 siblings, 1 reply; 13+ messages in thread
From: Sam Tebbs @ 2018-07-13 16:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 1563 bytes --]


Hi all,

This patch adds an optimisation that exploits the AArch64 BFXIL instruction
when or-ing the result of two bitwise and operations with non-overlapping
bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).

Example:

unsigned long long combine(unsigned long long a, unsigned long long b) {
  return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
}

void read2(unsigned long long a, unsigned long long b, unsigned long long *c,
  unsigned long long *d) {
  *c = combine(a, b); *d = combine(b, a);
}

When compiled with -O2, read2 would result in:

read2:
  and   x5, x1, #0xffffffff
  and   x4, x0, #0xffffffff00000000
  orr   x4, x4, x5
  and   x1, x1, #0xffffffff00000000
  and   x0, x0, #0xffffffff
  str   x4, [x2]
  orr   x0, x0, x1
  str   x0, [x3]
  ret

But with this patch results in:

read2:
  mov   x4, x1
  bfxil x4, x0, 0, 32
  str   x4, [x2]
  bfxil x0, x1, 0, 32
  str   x0, [x3]
  ret
  
Bootstrapped and regtested on aarch64-none-linux-gnu and aarch64-none-elf with no regressions.


gcc/
2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>

        * config/aarch64/aarch64.md (*aarch64_bfxil, *aarch64_bfxil_alt):
        Define.
        * config/aarch64/aarch64-protos.h (aarch64_is_left_consecutive):
        Define.
        * config/aarch64/aarch64.c (aarch64_is_left_consecutive): New function.

gcc/testsuite
2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>

        * gcc.target/aarch64/combine_bfxil.c: New file.
        * gcc.target/aarch64/combine_bfxil_2.c: New file.


   

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: fix.patch --]
[-- Type: text/x-patch; name="fix.patch", Size: 4622 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 514ddc4..b025cd6 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -558,4 +558,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
+bool aarch64_is_left_consecutive (HOST_WIDE_INT);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d75d45f..884958b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
     return SImode;
 }
 
+/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits are consecutive
+   ones from the MSB.  */
+bool
+aarch64_is_left_consecutive (HOST_WIDE_INT i)
+{
+  return (i | (i - 1)) == HOST_WIDE_INT_M1;
+}
+
 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
    that strcpy from constants will be faster.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a014a01..383d699 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4844,6 +4844,42 @@
   [(set_attr "type" "rev")]
 )
 
+(define_insn "*aarch64_bfxil"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
+		    (match_operand 3 "const_int_operand"))
+	    (and:DI (match_operand:DI 2 "register_operand" "0")
+		    (match_operand 4 "const_int_operand"))))]
+  "INTVAL (operands[3]) == ~INTVAL (operands[4])
+    && aarch64_is_left_consecutive (INTVAL (operands[3]))"
+  {
+    HOST_WIDE_INT op4 = INTVAL (operands[4]);
+    operands[3] = GEN_INT (64 - ceil_log2 (op4));
+    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
+    return "";
+  }
+  [(set_attr "type" "bfx")]
+)
+
+; An alternate bfxil pattern where the second bitmask is the smallest, and so
+; the first register used is changed instead of the second
+(define_insn "*aarch64_bfxil_alt"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "0")
+		    (match_operand 3 "const_int_operand"))
+	    (and:DI (match_operand:DI 2 "register_operand" "r")
+		    (match_operand 4 "const_int_operand"))))]
+  "INTVAL (operands[3]) == ~INTVAL (operands[4])
+    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
+  {
+    HOST_WIDE_INT op3 = INTVAL (operands[3]);
+    operands[3] = GEN_INT (64 - ceil_log2 (op3));
+    output_asm_insn ("bfxil\\t%0, %2, 0, %3", operands);
+    return "";
+  }
+  [(set_attr "type" "bfx")]
+)
+
 ;; There are no canonicalisation rules for the position of the lshiftrt, ashift
 ;; operations within an IOR/AND RTX, therefore we have two patterns matching
 ;; each valid permutation.
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
new file mode 100644
index 0000000..a0c6be4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long long
+combine_balanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
+}
+
+
+unsigned long long
+combine_unbalanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffffff000000ll) | (b & 0x0000000000ffffffll);
+}
+
+void
+foo2 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_balanced(a, b);
+  *d = combine_balanced(b, a);
+}
+
+void
+foo3 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_unbalanced(a, b);
+  *d = combine_unbalanced(b, a);
+}
+
+/* { dg-final { scan-assembler-times "bfxil\\t" 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
new file mode 100644
index 0000000..8237d94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long long
+combine_non_consecutive (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xfffffff200f00000ll) | (b & 0x00001000ffffffffll);
+}
+
+void
+foo4 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d) {
+  /* { dg-final { scan-assembler-not "bfxil\\t" } } */
+  *c = combine_non_consecutive(a, b);
+  *d = combine_non_consecutive(b, a);
+}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-13 16:09 [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks Sam Tebbs
@ 2018-07-16 10:55 ` Sudakshina Das
  2018-07-16 17:11   ` Sam Tebbs
  0 siblings, 1 reply; 13+ messages in thread
From: Sudakshina Das @ 2018-07-16 10:55 UTC (permalink / raw)
  To: Sam Tebbs, gcc-patches
  Cc: nd, Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

Hi Sam

On 13/07/18 17:09, Sam Tebbs wrote:
> Hi all,
>
> This patch adds an optimisation that exploits the AArch64 BFXIL instruction
> when or-ing the result of two bitwise and operations with non-overlapping
> bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).
>
> Example:
>
> unsigned long long combine(unsigned long long a, unsigned long long b) {
>    return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
> }
>
> void read2(unsigned long long a, unsigned long long b, unsigned long long *c,
>    unsigned long long *d) {
>    *c = combine(a, b); *d = combine(b, a);
> }
>
> When compiled with -O2, read2 would result in:
>
> read2:
>    and   x5, x1, #0xffffffff
>    and   x4, x0, #0xffffffff00000000
>    orr   x4, x4, x5
>    and   x1, x1, #0xffffffff00000000
>    and   x0, x0, #0xffffffff
>    str   x4, [x2]
>    orr   x0, x0, x1
>    str   x0, [x3]
>    ret
>
> But with this patch results in:
>
> read2:
>    mov   x4, x1
>    bfxil x4, x0, 0, 32
>    str   x4, [x2]
>    bfxil x0, x1, 0, 32
>    str   x0, [x3]
>    ret
>    
> Bootstrapped and regtested on aarch64-none-linux-gnu and aarch64-none-elf with no regressions.
>
I am not a maintainer but I have a question about this patch. I may be 
missing something or reading
it wrong. So feel free to point it out:

+(define_insn "*aarch64_bfxil"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
+            (match_operand 3 "const_int_operand"))
+        (and:DI (match_operand:DI 2 "register_operand" "0")
+            (match_operand 4 "const_int_operand"))))]
+  "INTVAL (operands[3]) == ~INTVAL (operands[4])
+    && aarch64_is_left_consecutive (INTVAL (operands[3]))"
+  {
+    HOST_WIDE_INT op4 = INTVAL (operands[4]);
+    operands[3] = GEN_INT (64 - ceil_log2 (op4));
+    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);

In the BFXIL you are reading %3 LSB bits from operand 1 and putting it 
in the LSBs of %0.
This means that the pattern should be masking the 32-%3 MSB of %0 and
%3 LSB of %1. So shouldn't operand 4 be LEFT_CONSECUTIVE?

Can you please compare a simpler version of the above example you gave to
make sure the generated assembly is equivalent before and after the patch:

void read2(unsigned long long a, unsigned long long b, unsigned long long *c) {
   *c = combine(a, b);
}


 From the above text

read2:
   and   x5, x1, #0xffffffff
   and   x4, x0, #0xffffffff00000000
   orr   x4, x4, x5

read2:
   mov   x4, x1
   bfxil x4, x0, 0, 32

This does not seem equivalent to me.

Thanks
Sudi

+    return "";
+  }
+  [(set_attr "type" "bfx")]
+)
> gcc/
> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>
>          * config/aarch64/aarch64.md (*aarch64_bfxil, *aarch64_bfxil_alt):
>          Define.
>          * config/aarch64/aarch64-protos.h (aarch64_is_left_consecutive):
>          Define.
>          * config/aarch64/aarch64.c (aarch64_is_left_consecutive): New function.
>
> gcc/testsuite
> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>
>          * gcc.target/aarch64/combine_bfxil.c: New file.
>          * gcc.target/aarch64/combine_bfxil_2.c: New file.
>
>
>     

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-16 10:55 ` Sudakshina Das
@ 2018-07-16 17:11   ` Sam Tebbs
  2018-07-17  1:34     ` Richard Henderson
  0 siblings, 1 reply; 13+ messages in thread
From: Sam Tebbs @ 2018-07-16 17:11 UTC (permalink / raw)
  To: Sudakshina Das, gcc-patches
  Cc: nd, richard.earnshaw, marcus.shawcroft, james.greenhalgh,
	Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 5106 bytes --]

Hi Sudi,

Thanks for noticing that, I have attached an improved patch file that 
fixes this issue.

Below is an updated description and changelog:

This patch adds an optimisation that exploits the AArch64 BFXIL instruction
when or-ing the result of two bitwise and operations with non-overlapping
bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).

Example:

unsigned long long combine(unsigned long long a, unsigned long long b) {
   return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
}

void read(unsigned long long a, unsigned long long b, unsigned long long 
*c) {
   *c = combine(a, b);
}

When compiled with -O2, read would result in:

read:
   and   x5, x1, #0xffffffff
   and   x4, x0, #0xffffffff00000000
   orr   x4, x4, x5
   str   x4, [x2]
   ret

But with this patch results in:

read:
   mov    x4, x0
   bfxil    x4, x1, 0, 32
   str    x4, [x2]
   ret

Bootstrapped and regtested on aarch64-none-linux-gnu and 
aarch64-none-elf with no regressions.


gcc/
2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>

         * config/aarch64/aarch64.md (*aarch64_bfxil, *aarch64_bfxil_alt):
         Define.
         * config/aarch64/aarch64-protos.h (aarch64_is_left_consecutive):
         Define.
         * config/aarch64/aarch64.c (aarch64_is_left_consecutive): New 
function.

gcc/testsuite
2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>

         * gcc.target/aarch64/combine_bfxil.c: New file.
         * gcc.target/aarch64/combine_bfxil_2.c: New file.


On 07/16/2018 11:54 AM, Sudakshina Das wrote:
> Hi Sam
>
> On 13/07/18 17:09, Sam Tebbs wrote:
>> Hi all,
>>
>> This patch adds an optimisation that exploits the AArch64 BFXIL 
>> instruction
>> when or-ing the result of two bitwise and operations with 
>> non-overlapping
>> bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).
>>
>> Example:
>>
>> unsigned long long combine(unsigned long long a, unsigned long long b) {
>>    return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
>> }
>>
>> void read2(unsigned long long a, unsigned long long b, unsigned long 
>> long *c,
>>    unsigned long long *d) {
>>    *c = combine(a, b); *d = combine(b, a);
>> }
>>
>> When compiled with -O2, read2 would result in:
>>
>> read2:
>>    and   x5, x1, #0xffffffff
>>    and   x4, x0, #0xffffffff00000000
>>    orr   x4, x4, x5
>>    and   x1, x1, #0xffffffff00000000
>>    and   x0, x0, #0xffffffff
>>    str   x4, [x2]
>>    orr   x0, x0, x1
>>    str   x0, [x3]
>>    ret
>>
>> But with this patch results in:
>>
>> read2:
>>    mov   x4, x1
>>    bfxil x4, x0, 0, 32
>>    str   x4, [x2]
>>    bfxil x0, x1, 0, 32
>>    str   x0, [x3]
>>    ret
>>    Bootstrapped and regtested on aarch64-none-linux-gnu and 
>> aarch64-none-elf with no regressions.
>>
> I am not a maintainer but I have a question about this patch. I may be 
> missing something or reading
> it wrong. So feel free to point it out:
>
> +(define_insn "*aarch64_bfxil"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
> +            (match_operand 3 "const_int_operand"))
> +        (and:DI (match_operand:DI 2 "register_operand" "0")
> +            (match_operand 4 "const_int_operand"))))]
> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
> +    && aarch64_is_left_consecutive (INTVAL (operands[3]))"
> +  {
> +    HOST_WIDE_INT op4 = INTVAL (operands[4]);
> +    operands[3] = GEN_INT (64 - ceil_log2 (op4));
> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>
> In the BFXIL you are reading %3 LSB bits from operand 1 and putting it 
> in the LSBs of %0.
> This means that the pattern should be masking the 32-%3 MSB of %0 and
> %3 LSB of %1. So shouldn't operand 4 be LEFT_CONSECUTIVE?
>
> Can you please compare a simpler version of the above example you gave to
> make sure the generated assembly is equivalent before and after the 
> patch:
>
> void read2(unsigned long long a, unsigned long long b, unsigned long 
> long *c) {
>   *c = combine(a, b);
> }
>
>
> From the above text
>
> read2:
>   and   x5, x1, #0xffffffff
>   and   x4, x0, #0xffffffff00000000
>   orr   x4, x4, x5
>
> read2:
>   mov   x4, x1
>   bfxil x4, x0, 0, 32
>
> This does not seem equivalent to me.
>
> Thanks
> Sudi
>
> +    return "";
> +  }
> +  [(set_attr "type" "bfx")]
> +)
>> gcc/
>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>
>>          * config/aarch64/aarch64.md (*aarch64_bfxil, 
>> *aarch64_bfxil_alt):
>>          Define.
>>          * config/aarch64/aarch64-protos.h 
>> (aarch64_is_left_consecutive):
>>          Define.
>>          * config/aarch64/aarch64.c (aarch64_is_left_consecutive): 
>> New function.
>>
>> gcc/testsuite
>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>
>>          * gcc.target/aarch64/combine_bfxil.c: New file.
>>          * gcc.target/aarch64/combine_bfxil_2.c: New file.
>>
>>
>


[-- Attachment #2: fix.patch --]
[-- Type: text/x-patch, Size: 4982 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 514ddc4..b025cd6 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -558,4 +558,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
+bool aarch64_is_left_consecutive (HOST_WIDE_INT);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d75d45f..884958b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
     return SImode;
 }
 
+/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits are consecutive
+   ones from the MSB.  */
+bool
+aarch64_is_left_consecutive (HOST_WIDE_INT i)
+{
+  return (i | (i - 1)) == HOST_WIDE_INT_M1;
+}
+
 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
    that strcpy from constants will be faster.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a014a01..78ec4cf 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4844,6 +4844,42 @@
   [(set_attr "type" "rev")]
 )
 
+(define_insn "*aarch64_bfxil"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
+		    (match_operand 3 "const_int_operand"))
+	    (and:DI (match_operand:DI 2 "register_operand" "0")
+		    (match_operand 4 "const_int_operand"))))]
+  "INTVAL (operands[3]) == ~INTVAL (operands[4])
+    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
+  {
+    HOST_WIDE_INT op3 = INTVAL (operands[3]);
+    operands[3] = GEN_INT (ceil_log2 (op3));
+    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
+    return "";
+  }
+  [(set_attr "type" "bfx")]
+)
+
+; An alternate bfxil pattern where the second bitmask is the smallest, and so
+; the first register used is changed instead of the second
+(define_insn "*aarch64_bfxil_alt"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "0")
+		    (match_operand 3 "const_int_operand"))
+	    (and:DI (match_operand:DI 2 "register_operand" "r")
+		    (match_operand 4 "const_int_operand"))))]
+  "INTVAL (operands[3]) == ~INTVAL (operands[4])
+    && aarch64_is_left_consecutive (INTVAL (operands[3]))"
+  {
+    HOST_WIDE_INT op4 = INTVAL (operands[4]);
+    operands[3] = GEN_INT (ceil_log2 (op4));
+    output_asm_insn ("bfxil\\t%0, %2, 0, %3", operands);
+    return "";
+  }
+  [(set_attr "type" "bfx")]
+)
+
 ;; There are no canonicalisation rules for the position of the lshiftrt, ashift
 ;; operations within an IOR/AND RTX, therefore we have two patterns matching
 ;; each valid permutation.
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
new file mode 100644
index 0000000..7189b80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+extern void abort(void);
+
+unsigned long long
+combine_balanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
+}
+
+
+unsigned long long
+combine_unbalanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffffff000000ll) | (b & 0x0000000000ffffffll);
+}
+
+void
+foo2 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_balanced(a, b);
+  *d = combine_balanced(b, a);
+}
+
+void
+foo3 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_unbalanced(a, b);
+  *d = combine_unbalanced(b, a);
+}
+
+int
+main(void)
+{
+  unsigned long long a = 0x0123456789ABCDEF, b = 0xFEDCBA9876543210, c, d;
+  foo3(a, b, &c, &d);
+  if(c != 0x0123456789543210) abort();
+  if(d != 0xfedcba9876abcdef) abort();
+  foo2(a, b, &c, &d);
+  if(c != 0x0123456776543210) abort();
+  if(d != 0xfedcba9889abcdef) abort(); 
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "bfxil\\t" 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
new file mode 100644
index 0000000..8237d94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long long
+combine_non_consecutive (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xfffffff200f00000ll) | (b & 0x00001000ffffffffll);
+}
+
+void
+foo4 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d) {
+  /* { dg-final { scan-assembler-not "bfxil\\t" } } */
+  *c = combine_non_consecutive(a, b);
+  *d = combine_non_consecutive(b, a);
+}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-16 17:11   ` Sam Tebbs
@ 2018-07-17  1:34     ` Richard Henderson
  2018-07-17 13:33       ` Richard Earnshaw (lists)
  2018-07-19 13:03       ` Sam Tebbs
  0 siblings, 2 replies; 13+ messages in thread
From: Richard Henderson @ 2018-07-17  1:34 UTC (permalink / raw)
  To: Sam Tebbs, Sudakshina Das, gcc-patches
  Cc: nd, richard.earnshaw, marcus.shawcroft, james.greenhalgh

On 07/16/2018 10:10 AM, Sam Tebbs wrote:
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
>      return SImode;
>  }
>  
> +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits are consecutive
> +   ones from the MSB.  */
> +bool
> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
> +{
> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
> +}
> +
...
> +(define_insn "*aarch64_bfxil"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
> +		    (match_operand 3 "const_int_operand"))
> +	    (and:DI (match_operand:DI 2 "register_operand" "0")
> +		    (match_operand 4 "const_int_operand"))))]
> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"

Better is to use a define_predicate to merge both that second test and the
const_int_operand.

(I'm not sure about the "left_consecutive" language either.
Isn't it more descriptive to say that op3 is a power of 2 minus 1?)

(define_predicate "pow2m1_operand"
  (and (match_code "const_int")
       (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))

and use

  (match_operand:DI 3 "pow2m1_operand")

and then just the

  INTVAL (operands[3]) == ~INTVAL (operands[4])

test.

Also, don't omit the modes for the constants.
Also, there's no reason this applies only to DI mode;
use the GPI iterator and %<w> in the output template.

> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
> +    operands[3] = GEN_INT (ceil_log2 (op3));
> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
> +    return "";

You can just return the string that you passed to output_asm_insn.

> +  }
> +  [(set_attr "type" "bfx")]

The other aliases of the BFM insn use type "bfm";
"bfx" appears to be aliases of UBFM and SBFM.
Not that it appears to matter to the scheduling
descriptions, but it is inconsistent.


r~

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-17  1:34     ` Richard Henderson
@ 2018-07-17 13:33       ` Richard Earnshaw (lists)
  2018-07-17 15:46         ` Richard Henderson
  2018-07-19 13:03       ` Sam Tebbs
  1 sibling, 1 reply; 13+ messages in thread
From: Richard Earnshaw (lists) @ 2018-07-17 13:33 UTC (permalink / raw)
  To: Richard Henderson, Sam Tebbs, Sudakshina Das, gcc-patches
  Cc: nd, marcus.shawcroft, james.greenhalgh

On 17/07/18 02:33, Richard Henderson wrote:
> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
>>      return SImode;
>>  }
>>  
>> +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits are consecutive
>> +   ones from the MSB.  */
>> +bool
>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>> +{
>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>> +}
>> +
> ...
>> +(define_insn "*aarch64_bfxil"
>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>> +		    (match_operand 3 "const_int_operand"))
>> +	    (and:DI (match_operand:DI 2 "register_operand" "0")
>> +		    (match_operand 4 "const_int_operand"))))]
>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
> 
> Better is to use a define_predicate to merge both that second test and the
> const_int_operand.
> 
> (I'm not sure about the "left_consecutive" language either.
> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
> 
> (define_predicate "pow2m1_operand"
>   (and (match_code "const_int")
>        (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))

ITYM exact_log2()

R.

> 
> and use
> 
>   (match_operand:DI 3 "pow2m1_operand")
> 
> and then just the
> 
>   INTVAL (operands[3]) == ~INTVAL (operands[4])
> 
> test.
> 
> Also, don't omit the modes for the constants.
> Also, there's no reason this applies only to DI mode;
> use the GPI iterator and %<w> in the output template.
> 
>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>> +    return "";
> 
> You can just return the string that you passed to output_asm_insn.
> 
>> +  }
>> +  [(set_attr "type" "bfx")]
> 
> The other aliases of the BFM insn use type "bfm";
> "bfx" appears to be aliases of UBFM and SBFM.
> Not that it appears to matter to the scheduling
> descriptions, but it is inconsistent.
> 
> 
> r~
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-17 13:33       ` Richard Earnshaw (lists)
@ 2018-07-17 15:46         ` Richard Henderson
  0 siblings, 0 replies; 13+ messages in thread
From: Richard Henderson @ 2018-07-17 15:46 UTC (permalink / raw)
  To: Richard Earnshaw (lists), Sam Tebbs, Sudakshina Das, gcc-patches
  Cc: nd, marcus.shawcroft, james.greenhalgh

On 07/17/2018 06:33 AM, Richard Earnshaw (lists) wrote:
>> (define_predicate "pow2m1_operand"
>>   (and (match_code "const_int")
>>        (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))
> ITYM exact_log2()

Yes of course.  Thanks,


r~

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-17  1:34     ` Richard Henderson
  2018-07-17 13:33       ` Richard Earnshaw (lists)
@ 2018-07-19 13:03       ` Sam Tebbs
  2018-07-20  9:31         ` Sam Tebbs
  1 sibling, 1 reply; 13+ messages in thread
From: Sam Tebbs @ 2018-07-19 13:03 UTC (permalink / raw)
  To: Richard Henderson, Sudakshina Das, gcc-patches
  Cc: nd, richard.earnshaw, marcus.shawcroft, james.greenhalgh

Hi Richard,

Thanks for the feedback. I find that using "is_left_consecutive" is more 
descriptive than checking for it being a power of 2 - 1, since it 
describes the requirement (having consecutive ones from the MSB) more 
explicitly. I would be happy to change it though if that is the consensus.

I have addressed your point about just returning the string instead of 
using output_asm_insn and have changed it locally. I'll send an updated 
patch soon.


On 07/17/2018 02:33 AM, Richard Henderson wrote:
> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
>>       return SImode;
>>   }
>>   
>> +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits are consecutive
>> +   ones from the MSB.  */
>> +bool
>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>> +{
>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>> +}
>> +
> ...
>> +(define_insn "*aarch64_bfxil"
>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>> +		    (match_operand 3 "const_int_operand"))
>> +	    (and:DI (match_operand:DI 2 "register_operand" "0")
>> +		    (match_operand 4 "const_int_operand"))))]
>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
> Better is to use a define_predicate to merge both that second test and the
> const_int_operand.
>
> (I'm not sure about the "left_consecutive" language either.
> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
>
> (define_predicate "pow2m1_operand"
>    (and (match_code "const_int")
>         (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))
>
> and use
>
>    (match_operand:DI 3 "pow2m1_operand")
>
> and then just the
>
>    INTVAL (operands[3]) == ~INTVAL (operands[4])
>
> test.
>
> Also, don't omit the modes for the constants.
> Also, there's no reason this applies only to DI mode;
> use the GPI iterator and %<w> in the output template.
>
>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>> +    return "";
> You can just return the string that you passed to output_asm_insn.
>
>> +  }
>> +  [(set_attr "type" "bfx")]
> The other aliases of the BFM insn use type "bfm";
> "bfx" appears to be aliases of UBFM and SBFM.
> Not that it appears to matter to the scheduling
> descriptions, but it is inconsistent.
>
>
> r~

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-19 13:03       ` Sam Tebbs
@ 2018-07-20  9:31         ` Sam Tebbs
  2018-07-20  9:33           ` Sam Tebbs
  0 siblings, 1 reply; 13+ messages in thread
From: Sam Tebbs @ 2018-07-20  9:31 UTC (permalink / raw)
  To: Richard Henderson, Sudakshina Das, gcc-patches
  Cc: nd, richard.earnshaw, marcus.shawcroft, james.greenhalgh

Hi all,

Here is an updated patch that does the following:

* Adds a new constraint in config/aarch64/constraints.md to check for a 
constant integer that is left consecutive. This addresses Richard 
Henderson's suggestion about combining the aarch64_is_left_consecutive 
call and the const_int match in the pattern.

* Merges the two patterns defined into one.

* Changes the pattern's type attribute to bfm.

* Improved the comment above the aarch64_is_left_consecutive implementation.

* Makes the pattern use the GPI iterator to accept smaller integer sizes 
(an extra test is added to check for this).

* Improves the tests in combine_bfxil.c to ensure they aren't optimised 
away and that they check for the pattern's correctness.

Below is a new changelog and the example given before.

Is this OK for trunk?

This patch adds an optimisation that exploits the AArch64 BFXIL instruction
when or-ing the result of two bitwise and operations with non-overlapping
bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).

Example:

unsigned long long combine(unsigned long long a, unsigned long long b) {
   return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
}

void read(unsigned long long a, unsigned long long b, unsigned long long 
*c) {
   *c = combine(a, b);
}

When compiled with -O2, read would result in:

read:
   and   x5, x1, #0xffffffff
   and   x4, x0, #0xffffffff00000000
   orr   x4, x4, x5
   str   x4, [x2]
   ret

But with this patch results in:

read:
   mov    x4, x0
   bfxil    x4, x1, 0, 32
   str    x4, [x2]
   ret



Bootstrapped and regtested on aarch64-none-linux-gnu and 
aarch64-none-elf with no regressions.


gcc/
2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>

         PR target/85628
         * config/aarch64/aarch64.md (*aarch64_bfxil):
         Define.
        * config/aarch64/constraints.md (Ulc): Define.
         * config/aarch64/aarch64-protos.h (aarch64_is_left_consecutive):
         Define.
         * config/aarch64/aarch64.c (aarch64_is_left_consecutive): New 
function.

gcc/testsuite
2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>

         PR target/85628
         * gcc.target/aarch64/combine_bfxil.c: New file.
         * gcc.target/aarch64/combine_bfxil_2.c: New file.


On 07/19/2018 02:02 PM, Sam Tebbs wrote:
> Hi Richard,
>
> Thanks for the feedback. I find that using "is_left_consecutive" is 
> more descriptive than checking for it being a power of 2 - 1, since it 
> describes the requirement (having consecutive ones from the MSB) more 
> explicitly. I would be happy to change it though if that is the 
> consensus.
>
> I have addressed your point about just returning the string instead of 
> using output_asm_insn and have changed it locally. I'll send an 
> updated patch soon.
>
>
> On 07/17/2018 02:33 AM, Richard Henderson wrote:
>> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>>> +++ b/gcc/config/aarch64/aarch64.c
>>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned 
>>> regno, unsigned,
>>>       return SImode;
>>>   }
>>>   +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits are 
>>> consecutive
>>> +   ones from the MSB.  */
>>> +bool
>>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>>> +{
>>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>>> +}
>>> +
>> ...
>>> +(define_insn "*aarch64_bfxil"
>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>>> +            (match_operand 3 "const_int_operand"))
>>> +        (and:DI (match_operand:DI 2 "register_operand" "0")
>>> +            (match_operand 4 "const_int_operand"))))]
>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
>> Better is to use a define_predicate to merge both that second test 
>> and the
>> const_int_operand.
>>
>> (I'm not sure about the "left_consecutive" language either.
>> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
>>
>> (define_predicate "pow2m1_operand"
>>    (and (match_code "const_int")
>>         (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))
>>
>> and use
>>
>>    (match_operand:DI 3 "pow2m1_operand")
>>
>> and then just the
>>
>>    INTVAL (operands[3]) == ~INTVAL (operands[4])
>>
>> test.
>>
>> Also, don't omit the modes for the constants.
>> Also, there's no reason this applies only to DI mode;
>> use the GPI iterator and %<w> in the output template.
>>
>>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>>> +    return "";
>> You can just return the string that you passed to output_asm_insn.
>>
>>> +  }
>>> +  [(set_attr "type" "bfx")]
>> The other aliases of the BFM insn use type "bfm";
>> "bfx" appears to be aliases of UBFM and SBFM.
>> Not that it appears to matter to the scheduling
>> descriptions, but it is inconsistent.
>>
>>
>> r~
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-20  9:31         ` Sam Tebbs
@ 2018-07-20  9:33           ` Sam Tebbs
  2018-07-23 11:38             ` Renlin Li
  0 siblings, 1 reply; 13+ messages in thread
From: Sam Tebbs @ 2018-07-20  9:33 UTC (permalink / raw)
  To: Richard Henderson, Sudakshina Das, gcc-patches
  Cc: nd, richard.earnshaw, marcus.shawcroft, james.greenhalgh

[-- Attachment #1: Type: text/plain, Size: 5790 bytes --]

Please disregard the original patch and see this updated version.


On 07/20/2018 10:31 AM, Sam Tebbs wrote:
> Hi all,
>
> Here is an updated patch that does the following:
>
> * Adds a new constraint in config/aarch64/constraints.md to check for 
> a constant integer that is left consecutive. This addresses Richard 
> Henderson's suggestion about combining the aarch64_is_left_consecutive 
> call and the const_int match in the pattern.
>
> * Merges the two patterns defined into one.
>
> * Changes the pattern's type attribute to bfm.
>
> * Improved the comment above the aarch64_is_left_consecutive 
> implementation.
>
> * Makes the pattern use the GPI iterator to accept smaller integer 
> sizes (an extra test is added to check for this).
>
> * Improves the tests in combine_bfxil.c to ensure they aren't 
> optimised away and that they check for the pattern's correctness.
>
> Below is a new changelog and the example given before.
>
> Is this OK for trunk?
>
> This patch adds an optimisation that exploits the AArch64 BFXIL 
> instruction
> when or-ing the result of two bitwise and operations with non-overlapping
> bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).
>
> Example:
>
> unsigned long long combine(unsigned long long a, unsigned long long b) {
>   return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
> }
>
> void read(unsigned long long a, unsigned long long b, unsigned long 
> long *c) {
>   *c = combine(a, b);
> }
>
> When compiled with -O2, read would result in:
>
> read:
>   and   x5, x1, #0xffffffff
>   and   x4, x0, #0xffffffff00000000
>   orr   x4, x4, x5
>   str   x4, [x2]
>   ret
>
> But with this patch results in:
>
> read:
>   mov    x4, x0
>   bfxil    x4, x1, 0, 32
>   str    x4, [x2]
>   ret
>
>
>
> Bootstrapped and regtested on aarch64-none-linux-gnu and 
> aarch64-none-elf with no regressions.
>
>
> gcc/
> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>
>         PR target/85628
>         * config/aarch64/aarch64.md (*aarch64_bfxil):
>         Define.
>         * config/aarch64/constraints.md (Ulc): Define
>         * config/aarch64/aarch64-protos.h (aarch64_is_left_consecutive):
>         Define.
>         * config/aarch64/aarch64.c (aarch64_is_left_consecutive): New 
> function.
>
> gcc/testsuite
> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>
>         PR target/85628
>         * gcc.target/aarch64/combine_bfxil.c: New file.
>         * gcc.target/aarch64/combine_bfxil_2.c: New file.
>
>
> On 07/19/2018 02:02 PM, Sam Tebbs wrote:
>> Hi Richard,
>>
>> Thanks for the feedback. I find that using "is_left_consecutive" is 
>> more descriptive than checking for it being a power of 2 - 1, since 
>> it describes the requirement (having consecutive ones from the MSB) 
>> more explicitly. I would be happy to change it though if that is the 
>> consensus.
>>
>> I have addressed your point about just returning the string instead 
>> of using output_asm_insn and have changed it locally. I'll send an 
>> updated patch soon.
>>
>>
>> On 07/17/2018 02:33 AM, Richard Henderson wrote:
>>> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>>>> +++ b/gcc/config/aarch64/aarch64.c
>>>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode 
>>>> (unsigned regno, unsigned,
>>>>       return SImode;
>>>>   }
>>>>   +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits 
>>>> are consecutive
>>>> +   ones from the MSB.  */
>>>> +bool
>>>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>>>> +{
>>>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>>>> +}
>>>> +
>>> ...
>>>> +(define_insn "*aarch64_bfxil"
>>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>>>> +            (match_operand 3 "const_int_operand"))
>>>> +        (and:DI (match_operand:DI 2 "register_operand" "0")
>>>> +            (match_operand 4 "const_int_operand"))))]
>>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
>>> Better is to use a define_predicate to merge both that second test 
>>> and the
>>> const_int_operand.
>>>
>>> (I'm not sure about the "left_consecutive" language either.
>>> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
>>>
>>> (define_predicate "pow2m1_operand"
>>>    (and (match_code "const_int")
>>>         (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))
>>>
>>> and use
>>>
>>>    (match_operand:DI 3 "pow2m1_operand")
>>>
>>> and then just the
>>>
>>>    INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>
>>> test.
>>>
>>> Also, don't omit the modes for the constants.
>>> Also, there's no reason this applies only to DI mode;
>>> use the GPI iterator and %<w> in the output template.
>>>
>>>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>>>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>>>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>>>> +    return "";
>>> You can just return the string that you passed to output_asm_insn.
>>>
>>>> +  }
>>>> +  [(set_attr "type" "bfx")]
>>> The other aliases of the BFM insn use type "bfm";
>>> "bfx" appears to be aliases of UBFM and SBFM.
>>> Not that it appears to matter to the scheduling
>>> descriptions, but it is inconsistent.
>>>
>>>
>>> r~
>>
>


[-- Attachment #2: fix.patch --]
[-- Type: text/x-patch, Size: 5859 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 514ddc4..b025cd6 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -558,4 +558,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
+bool aarch64_is_left_consecutive (HOST_WIDE_INT);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d75d45f..bf084cb 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
     return SImode;
 }
 
+/* Implement IS_LEFT_CONSECUTIVE.  Check if I's bits are consecutive
+   ones from the MSB.  */
+bool
+aarch64_is_left_consecutive (HOST_WIDE_INT i)
+{
+  return (i | (i - 1)) == HOST_WIDE_INT_M1;
+}
+
 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
    that strcpy from constants will be faster.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a014a01..a841db8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4844,6 +4844,29 @@
   [(set_attr "type" "rev")]
 )
 
+(define_insn "*aarch64_bfxil<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r")
+    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
+		    (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
+	    (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
+		    (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
+  "INTVAL (operands[3]) == ~INTVAL (operands[4])"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[3])));
+	return "bfxil\\t%<w>0, %<w>1, 0, %3";
+      case 1:
+	operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[4])));
+	return "bfxil\\t%<w>0, %<w>2, 0, %3";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfm")]
+)
+
 ;; There are no canonicalisation rules for the position of the lshiftrt, ashift
 ;; operations within an IOR/AND RTX, therefore we have two patterns matching
 ;; each valid permutation.
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 32a0fa6..46a4764 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -172,6 +172,13 @@
   A constraint that matches the immediate constant -1."
   (match_test "op == constm1_rtx"))
 
+(define_constraint "Ulc"
+ "@internal
+ A constraint that matches a constant integer whose bits are consecutive ones
+ from the MSB."
+ (and (match_code "const_int")
+      (match_test "aarch64_is_left_consecutive (ival)")))
+
 (define_constraint "Usv"
   "@internal
    A constraint that matches a VG-based constant that can be loaded by
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
new file mode 100644
index 0000000..e55f62a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-options "-O2 --save-temps" } */
+
+extern void abort(void);
+
+unsigned long long
+combine_balanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
+}
+
+unsigned long long
+combine_unbalanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffffff000000ll) | (b & 0x0000000000ffffffll);
+}
+
+unsigned int
+combine_balanced_int (unsigned int a, unsigned int b)
+{
+  return (a & 0xffff0000ll) | (b & 0x0000ffffll);
+}
+
+unsigned int
+combine_unbalanced_int (unsigned int a, unsigned int b)
+{
+  return (a & 0xffffff00ll) | (b & 0x000000ffll);
+}
+
+__attribute__ ((noinline)) void
+foo2 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_balanced (a, b);
+  *d = combine_balanced (b, a);
+}
+
+__attribute__ ((noinline)) void
+foo3 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_unbalanced (a, b);
+  *d = combine_unbalanced (b, a);
+}
+
+void
+foo4 (unsigned int a, unsigned int b, unsigned int *c, unsigned int *d)
+{
+  *c = combine_balanced_int(a, b);
+  *d = combine_balanced_int(b, a);
+}
+
+void
+foo5 (unsigned int a, unsigned int b, unsigned int *c, unsigned int *d)
+{
+  *c = combine_unbalanced_int(a, b);
+  *d = combine_unbalanced_int(b, a);
+}
+
+int
+main(void)
+{
+  unsigned long long a = 0x0123456789ABCDEF, b = 0xFEDCBA9876543210, c, d;
+  foo3(a, b, &c, &d);
+  if(c != 0x0123456789543210) abort();
+  if(d != 0xfedcba9876abcdef) abort();
+  foo2(a, b, &c, &d);
+  if(c != 0x0123456776543210) abort();
+  if(d != 0xfedcba9889abcdef) abort();
+
+  unsigned int a2 = 0x01234567, b2 = 0xFEDCBA98, c2, d2;
+  foo4(a2, b2, &c2, &d2);
+  if(c2 != 0x0123ba98) abort();
+  if(d2 != 0xfedc4567) abort();
+  foo5(a2, b2, &c2, &d2);
+  if(c2 != 0x01234598) abort();
+  if(d2 != 0xfedcba67) abort();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "bfxil\\t" 8 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
new file mode 100644
index 0000000..0fc1404
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long long
+combine_non_consecutive (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xfffffff200f00000ll) | (b & 0x00001000ffffffffll);
+}
+
+void
+foo4 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d) {
+  /* { dg-final { scan-assembler-not "bfxil\\t" } } */
+  *c = combine_non_consecutive (a, b);
+  *d = combine_non_consecutive (b, a);
+}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-20  9:33           ` Sam Tebbs
@ 2018-07-23 11:38             ` Renlin Li
  2018-07-23 13:15               ` Sam Tebbs
  0 siblings, 1 reply; 13+ messages in thread
From: Renlin Li @ 2018-07-23 11:38 UTC (permalink / raw)
  To: Sam Tebbs, Richard Henderson, Sudakshina Das, gcc-patches
  Cc: nd, richard.earnshaw, marcus.shawcroft, james.greenhalgh

> +(define_insn "*aarch64_bfxil<mode>"
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r")
> +    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
> +		    (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
> +	    (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
> +		    (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])"
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[3])));
> +	return "bfxil\\t%<w>0, %<w>1, 0, %3";
> +      case 1:
> +	operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[4])));
> +	return "bfxil\\t%<w>0, %<w>2, 0, %3";
> +      default:
> +	gcc_unreachable ();
> +    }
> +  }
> +  [(set_attr "type" "bfm")]
> +)

Hi Sam,

Is it possible that operand[3] or operand[4] is 1?

ceil_log2() could return 0 if the argument is 1.
The width field of the resulting instruction will be 0. Is it still correct?

Regards,
Renlin



On 07/20/2018 10:33 AM, Sam Tebbs wrote:
> Please disregard the original patch and see this updated version.
> 
> 
> On 07/20/2018 10:31 AM, Sam Tebbs wrote:
>> Hi all,
>>
>> Here is an updated patch that does the following:
>>
>> * Adds a new constraint in config/aarch64/constraints.md to check for a constant integer that is left consecutive. This addresses Richard 
>> Henderson's suggestion about combining the aarch64_is_left_consecutive call and the const_int match in the pattern.
>>
>> * Merges the two patterns defined into one.
>>
>> * Changes the pattern's type attribute to bfm.
>>
>> * Improved the comment above the aarch64_is_left_consecutive implementation.
>>
>> * Makes the pattern use the GPI iterator to accept smaller integer sizes (an extra test is added to check for this).
>>
>> * Improves the tests in combine_bfxil.c to ensure they aren't optimised away and that they check for the pattern's correctness.
>>
>> Below is a new changelog and the example given before.
>>
>> Is this OK for trunk?
>>
>> This patch adds an optimisation that exploits the AArch64 BFXIL instruction
>> when or-ing the result of two bitwise and operations with non-overlapping
>> bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).
>>
>> Example:
>>
>> unsigned long long combine(unsigned long long a, unsigned long long b) {
>>   return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
>> }
>>
>> void read(unsigned long long a, unsigned long long b, unsigned long long *c) {
>>   *c = combine(a, b);
>> }
>>
>> When compiled with -O2, read would result in:
>>
>> read:
>>   and   x5, x1, #0xffffffff
>>   and   x4, x0, #0xffffffff00000000
>>   orr   x4, x4, x5
>>   str   x4, [x2]
>>   ret
>>
>> But with this patch results in:
>>
>> read:
>>   mov    x4, x0
>>   bfxil    x4, x1, 0, 32
>>   str    x4, [x2]
>>   ret
>>
>>
>>
>> Bootstrapped and regtested on aarch64-none-linux-gnu and aarch64-none-elf with no regressions.
>>
>>
>> gcc/
>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>
>>         PR target/85628
>>         * config/aarch64/aarch64.md (*aarch64_bfxil):
>>         Define.
>>         * config/aarch64/constraints.md (Ulc): Define
>>         * config/aarch64/aarch64-protos.h (aarch64_is_left_consecutive):
>>         Define.
>>         * config/aarch64/aarch64.c (aarch64_is_left_consecutive): New function.
>>
>> gcc/testsuite
>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>
>>         PR target/85628
>>         * gcc.target/aarch64/combine_bfxil.c: New file.
>>         * gcc.target/aarch64/combine_bfxil_2.c: New file.
>>
>>
>> On 07/19/2018 02:02 PM, Sam Tebbs wrote:
>>> Hi Richard,
>>>
>>> Thanks for the feedback. I find that using "is_left_consecutive" is more descriptive than checking for it being a power of 2 - 1, since it 
>>> describes the requirement (having consecutive ones from the MSB) more explicitly. I would be happy to change it though if that is the consensus.
>>>
>>> I have addressed your point about just returning the string instead of using output_asm_insn and have changed it locally. I'll send an updated 
>>> patch soon.
>>>
>>>
>>> On 07/17/2018 02:33 AM, Richard Henderson wrote:
>>>> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>>>>> +++ b/gcc/config/aarch64/aarch64.c
>>>>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
>>>>>       return SImode;
>>>>>   }
>>>>>   +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits are consecutive
>>>>> +   ones from the MSB.  */
>>>>> +bool
>>>>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>>>>> +{
>>>>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>>>>> +}
>>>>> +
>>>> ...
>>>>> +(define_insn "*aarch64_bfxil"
>>>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>>>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>>>>> +            (match_operand 3 "const_int_operand"))
>>>>> +        (and:DI (match_operand:DI 2 "register_operand" "0")
>>>>> +            (match_operand 4 "const_int_operand"))))]
>>>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
>>>> Better is to use a define_predicate to merge both that second test and the
>>>> const_int_operand.
>>>>
>>>> (I'm not sure about the "left_consecutive" language either.
>>>> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
>>>>
>>>> (define_predicate "pow2m1_operand"
>>>>    (and (match_code "const_int")
>>>>         (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))
>>>>
>>>> and use
>>>>
>>>>    (match_operand:DI 3 "pow2m1_operand")
>>>>
>>>> and then just the
>>>>
>>>>    INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>
>>>> test.
>>>>
>>>> Also, don't omit the modes for the constants.
>>>> Also, there's no reason this applies only to DI mode;
>>>> use the GPI iterator and %<w> in the output template.
>>>>
>>>>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>>>>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>>>>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>>>>> +    return "";
>>>> You can just return the string that you passed to output_asm_insn.
>>>>
>>>>> +  }
>>>>> +  [(set_attr "type" "bfx")]
>>>> The other aliases of the BFM insn use type "bfm";
>>>> "bfx" appears to be aliases of UBFM and SBFM.
>>>> Not that it appears to matter to the scheduling
>>>> descriptions, but it is inconsistent.
>>>>
>>>>
>>>> r~
>>>
>>
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-23 11:38             ` Renlin Li
@ 2018-07-23 13:15               ` Sam Tebbs
  2018-07-24 16:24                 ` Sam Tebbs
  0 siblings, 1 reply; 13+ messages in thread
From: Sam Tebbs @ 2018-07-23 13:15 UTC (permalink / raw)
  To: Renlin Li, Richard Henderson, Sudakshina Das, gcc-patches
  Cc: nd, richard.earnshaw, marcus.shawcroft, james.greenhalgh



On 07/23/2018 12:38 PM, Renlin Li wrote:
>> +(define_insn "*aarch64_bfxil<mode>"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r")
>> +    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
>> +            (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
>> +        (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
>> +            (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])"
>> +  {
>> +    switch (which_alternative)
>> +    {
>> +      case 0:
>> +    operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[3])));
>> +    return "bfxil\\t%<w>0, %<w>1, 0, %3";
>> +      case 1:
>> +    operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[4])));
>> +    return "bfxil\\t%<w>0, %<w>2, 0, %3";
>> +      default:
>> +    gcc_unreachable ();
>> +    }
>> +  }
>> +  [(set_attr "type" "bfm")]
>> +)
>
> Hi Sam,
>
> Is it possible that operand[3] or operand[4] is 1?
>
> ceil_log2() could return 0 if the argument is 1.
> The width field of the resulting instruction will be 0. Is it still 
> correct?
>
> Regard,
> Renlin
>
>
Hi Renlin,

I think you have found an edge-case that I didn't think of and that the 
code would fail under. I have added a check for this situation and will 
send the updated patch soon.

Thanks,
Sam
>
> On 07/20/2018 10:33 AM, Sam Tebbs wrote:
>> Please disregard the original patch and see this updated version.
>>
>>
>> On 07/20/2018 10:31 AM, Sam Tebbs wrote:
>>> Hi all,
>>>
>>> Here is an updated patch that does the following:
>>>
>>> * Adds a new constraint in config/aarch64/constraints.md to check 
>>> for a constant integer that is left consecutive. This addresses 
>>> Richard Henderson's suggestion about combining the 
>>> aarch64_is_left_consecutive call and the const_int match in the 
>>> pattern.
>>>
>>> * Merges the two patterns defined into one.
>>>
>>> * Changes the pattern's type attribute to bfm.
>>>
>>> * Improved the comment above the aarch64_is_left_consecutive 
>>> implementation.
>>>
>>> * Makes the pattern use the GPI iterator to accept smaller integer 
>>> sizes (an extra test is added to check for this).
>>>
>>> * Improves the tests in combine_bfxil.c to ensure they aren't 
>>> optimised away and that they check for the pattern's correctness.
>>>
>>> Below is a new changelog and the example given before.
>>>
>>> Is this OK for trunk?
>>>
>>> This patch adds an optimisation that exploits the AArch64 BFXIL 
>>> instruction
>>> when or-ing the result of two bitwise and operations with 
>>> non-overlapping
>>> bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).
>>>
>>> Example:
>>>
>>> unsigned long long combine(unsigned long long a, unsigned long long 
>>> b) {
>>>   return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
>>> }
>>>
>>> void read(unsigned long long a, unsigned long long b, unsigned long 
>>> long *c) {
>>>   *c = combine(a, b);
>>> }
>>>
>>> When compiled with -O2, read would result in:
>>>
>>> read:
>>>   and   x5, x1, #0xffffffff
>>>   and   x4, x0, #0xffffffff00000000
>>>   orr   x4, x4, x5
>>>   str   x4, [x2]
>>>   ret
>>>
>>> But with this patch results in:
>>>
>>> read:
>>>   mov    x4, x0
>>>   bfxil    x4, x1, 0, 32
>>>   str    x4, [x2]
>>>   ret
>>>
>>>
>>>
>>> Bootstrapped and regtested on aarch64-none-linux-gnu and 
>>> aarch64-none-elf with no regressions.
>>>
>>>
>>> gcc/
>>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>>
>>>         PR target/85628
>>>         * config/aarch64/aarch64.md (*aarch64_bfxil):
>>>         Define.
>>>         * config/aarch64/constraints.md (Ulc): Define
>>>         * config/aarch64/aarch64-protos.h 
>>> (aarch64_is_left_consecutive):
>>>         Define.
>>>         * config/aarch64/aarch64.c (aarch64_is_left_consecutive): 
>>> New function.
>>>
>>> gcc/testsuite
>>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>>
>>>         PR target/85628
>>>         * gcc.target/aarch64/combine_bfxil.c: New file.
>>>         * gcc.target/aarch64/combine_bfxil_2.c: New file.
>>>
>>>
>>> On 07/19/2018 02:02 PM, Sam Tebbs wrote:
>>>> Hi Richard,
>>>>
>>>> Thanks for the feedback. I find that using "is_left_consecutive" is 
>>>> more descriptive than checking for it being a power of 2 - 1, since 
>>>> it describes the requirement (having consecutive ones from the MSB) 
>>>> more explicitly. I would be happy to change it though if that is 
>>>> the consensus.
>>>>
>>>> I have addressed your point about just returning the string instead 
>>>> of using output_asm_insn and have changed it locally. I'll send an 
>>>> updated patch soon.
>>>>
>>>>
>>>> On 07/17/2018 02:33 AM, Richard Henderson wrote:
>>>>> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>>>>>> +++ b/gcc/config/aarch64/aarch64.c
>>>>>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode 
>>>>>> (unsigned regno, unsigned,
>>>>>>       return SImode;
>>>>>>   }
>>>>>>   +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits 
>>>>>> are consecutive
>>>>>> +   ones from the MSB.  */
>>>>>> +bool
>>>>>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>>>>>> +{
>>>>>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>>>>>> +}
>>>>>> +
>>>>> ...
>>>>>> +(define_insn "*aarch64_bfxil"
>>>>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>>>>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>>>>>> +            (match_operand 3 "const_int_operand"))
>>>>>> +        (and:DI (match_operand:DI 2 "register_operand" "0")
>>>>>> +            (match_operand 4 "const_int_operand"))))]
>>>>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
>>>>> Better is to use a define_predicate to merge both that second test 
>>>>> and the
>>>>> const_int_operand.
>>>>>
>>>>> (I'm not sure about the "left_consecutive" language either.
>>>>> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
>>>>>
>>>>> (define_predicate "pow2m1_operand"
>>>>>    (and (match_code "const_int")
>>>>>         (match_test "exact_pow2 (INTVAL(op) + 1) > 0")))
>>>>>
>>>>> and use
>>>>>
>>>>>    (match_operand:DI 3 "pow2m1_operand")
>>>>>
>>>>> and then just the
>>>>>
>>>>>    INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>>
>>>>> test.
>>>>>
>>>>> Also, don't omit the modes for the constants.
>>>>> Also, there's no reason this applies only to DI mode;
>>>>> use the GPI iterator and %<w> in the output template.
>>>>>
>>>>>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>>>>>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>>>>>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>>>>>> +    return "";
>>>>> You can just return the string that you passed to output_asm_insn.
>>>>>
>>>>>> +  }
>>>>>> +  [(set_attr "type" "bfx")]
>>>>> The other aliases of the BFM insn use type "bfm";
>>>>> "bfx" appears to be aliases of UBFM and SBFM.
>>>>> Not that it appears to matter to the scheduling
>>>>> descriptions, but it is inconsistent.
>>>>>
>>>>>
>>>>> r~
>>>>
>>>
>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-23 13:15               ` Sam Tebbs
@ 2018-07-24 16:24                 ` Sam Tebbs
  2018-07-30 11:31                   ` Sam Tebbs
  0 siblings, 1 reply; 13+ messages in thread
From: Sam Tebbs @ 2018-07-24 16:24 UTC (permalink / raw)
  To: gcc-patches
  Cc: Renlin Li, Richard Henderson, Sudakshina Das, nd,
	richard.earnshaw, marcus.shawcroft, james.greenhalgh

[-- Attachment #1: Type: text/plain, Size: 8255 bytes --]



On 07/23/2018 02:14 PM, Sam Tebbs wrote:
>
>
> On 07/23/2018 12:38 PM, Renlin Li wrote:
>>> +(define_insn "*aarch64_bfxil<mode>"
>>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r")
>>> +    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
>>> +            (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
>>> +        (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
>>> +            (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])"
>>> +  {
>>> +    switch (which_alternative)
>>> +    {
>>> +      case 0:
>>> +    operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[3])));
>>> +    return "bfxil\\t%<w>0, %<w>1, 0, %3";
>>> +      case 1:
>>> +    operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[4])));
>>> +    return "bfxil\\t%<w>0, %<w>2, 0, %3";
>>> +      default:
>>> +    gcc_unreachable ();
>>> +    }
>>> +  }
>>> +  [(set_attr "type" "bfm")]
>>> +)
>>
>> Hi Sam,
>>
>> Is it possible that operand[3] or operand[4] is 1?
>>
>> ceil_log2() could return 0 if the argument is 1.
>> The width field of the resulting instruction will be 0. Is it still 
>> correct?
>>
>> Regard,
>> Renlin
>>
>>
> Hi Renlin,
>
> I think you have found an edge-case that I didn't think of and that 
> the code would fail under. I have added a check for this situation and 
> will send the updated patch soon.
>
> Thanks,
> Sam

Here is the updated patch that fixes the problem outlined by Renlin, and 
adds another testcase to check for the same issue. The changelog and 
example from my earlier email (sent on 07/20/2018 10:31 AM) still apply.

Thanks,
Sam

>>
>> On 07/20/2018 10:33 AM, Sam Tebbs wrote:
>>> Please disregard the original patch and see this updated version.
>>>
>>>
>>> On 07/20/2018 10:31 AM, Sam Tebbs wrote:
>>>> Hi all,
>>>>
>>>> Here is an updated patch that does the following:
>>>>
>>>> * Adds a new constraint in config/aarch64/constraints.md to check 
>>>> for a constant integer that is left consecutive. This addresses 
>>>> Richard Henderson's suggestion about combining the 
>>>> aarch64_is_left_consecutive call and the const_int match in the 
>>>> pattern.
>>>>
>>>> * Merges the two patterns defined into one.
>>>>
>>>> * Changes the pattern's type attribute to bfm.
>>>>
>>>> * Improved the comment above the aarch64_is_left_consecutive 
>>>> implementation.
>>>>
>>>> * Makes the pattern use the GPI iterator to accept smaller integer 
>>>> sizes (an extra test is added to check for this).
>>>>
>>>> * Improves the tests in combine_bfxil.c to ensure they aren't 
>>>> optimised away and that they check for the pattern's correctness.
>>>>
>>>> Below is a new changelog and the example given before.
>>>>
>>>> Is this OK for trunk?
>>>>
>>>> This patch adds an optimisation that exploits the AArch64 BFXIL 
>>>> instruction
>>>> when or-ing the result of two bitwise and operations with 
>>>> non-overlapping
>>>> bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).
>>>>
>>>> Example:
>>>>
>>>> unsigned long long combine(unsigned long long a, unsigned long long 
>>>> b) {
>>>>   return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
>>>> }
>>>>
>>>> void read(unsigned long long a, unsigned long long b, unsigned long 
>>>> long *c) {
>>>>   *c = combine(a, b);
>>>> }
>>>>
>>>> When compiled with -O2, read would result in:
>>>>
>>>> read:
>>>>   and   x5, x1, #0xffffffff
>>>>   and   x4, x0, #0xffffffff00000000
>>>>   orr   x4, x4, x5
>>>>   str   x4, [x2]
>>>>   ret
>>>>
>>>> But with this patch results in:
>>>>
>>>> read:
>>>>   mov    x4, x0
>>>>   bfxil    x4, x1, 0, 32
>>>>   str    x4, [x2]
>>>>   ret
>>>>
>>>>
>>>>
>>>> Bootstrapped and regtested on aarch64-none-linux-gnu and 
>>>> aarch64-none-elf with no regressions.
>>>>
>>>>
>>>> gcc/
>>>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>>>
>>>>         PR target/85628
>>>>         * config/aarch64/aarch64.md (*aarch64_bfxil):
>>>>         Define.
>>>>         * config/aarch64/constraints.md (Ulc): Define.
>>>>         * config/aarch64/aarch64-protos.h 
>>>> (aarch64_is_left_consecutive):
>>>>         Define.
>>>>         * config/aarch64/aarch64.c (aarch64_is_left_consecutive): 
>>>> New function.
>>>>
>>>> gcc/testsuite
>>>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>>>
>>>>         PR target/85628
>>>>         * gcc.target/aarch64/combine_bfxil.c: New file.
>>>>         * gcc.target/aarch64/combine_bfxil_2.c: New file.
>>>>
>>>>
>>>> On 07/19/2018 02:02 PM, Sam Tebbs wrote:
>>>>> Hi Richard,
>>>>>
>>>>> Thanks for the feedback. I find that using "is_left_consecutive" 
>>>>> is more descriptive than checking for it being a power of 2 - 1, 
>>>>> since it describes the requirement (having consecutive ones from 
>>>>> the MSB) more explicitly. I would be happy to change it though if 
>>>>> that is the consensus.
>>>>>
>>>>> I have addressed your point about just returning the string 
>>>>> instead of using output_asm_insn and have changed it locally. I'll 
>>>>> send an updated patch soon.
>>>>>
>>>>>
>>>>> On 07/17/2018 02:33 AM, Richard Henderson wrote:
>>>>>> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>>>>>>> +++ b/gcc/config/aarch64/aarch64.c
>>>>>>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode 
>>>>>>> (unsigned regno, unsigned,
>>>>>>>       return SImode;
>>>>>>>   }
>>>>>>>   +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's bits 
>>>>>>> are consecutive
>>>>>>> +   ones from the MSB.  */
>>>>>>> +bool
>>>>>>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>>>>>>> +{
>>>>>>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>>>>>>> +}
>>>>>>> +
>>>>>> ...
>>>>>>> +(define_insn "*aarch64_bfxil"
>>>>>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>>>>>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>>>>>>> +            (match_operand 3 "const_int_operand"))
>>>>>>> +        (and:DI (match_operand:DI 2 "register_operand" "0")
>>>>>>> +            (match_operand 4 "const_int_operand"))))]
>>>>>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>>>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
>>>>>> Better is to use a define_predicate to merge both that second 
>>>>>> test and the
>>>>>> const_int_operand.
>>>>>>
>>>>>> (I'm not sure about the "left_consecutive" language either.
>>>>>> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
>>>>>>
>>>>>> (define_predicate "pow2m1_operand"
>>>>>>    (and (match_code "const_int")
>>>>>>         (match_test "exact_log2 (INTVAL (op) + 1) > 0")))
>>>>>>
>>>>>> and use
>>>>>>
>>>>>>    (match_operand:DI 3 "pow2m1_operand")
>>>>>>
>>>>>> and then just the
>>>>>>
>>>>>>    INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>>>
>>>>>> test.
>>>>>>
>>>>>> Also, don't omit the modes for the constants.
>>>>>> Also, there's no reason this applies only to DI mode;
>>>>>> use the GPI iterator and %<w> in the output template.
>>>>>>
>>>>>>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>>>>>>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>>>>>>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>>>>>>> +    return "";
>>>>>> You can just return the string that you passed to output_asm_insn.
>>>>>>
>>>>>>> +  }
>>>>>>> +  [(set_attr "type" "bfx")]
>>>>>> The other aliases of the BFM insn use type "bfm";
>>>>>> "bfx" appears to be aliases of UBFM and SBFM.
>>>>>> Not that it appears to matter to the scheduling
>>>>>> descriptions, but it is inconsistent.
>>>>>>
>>>>>>
>>>>>> r~
>>>>>
>>>>
>>>
>


[-- Attachment #2: latest.patch --]
[-- Type: text/x-patch, Size: 6711 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index af5db9c595385f7586692258f750b6aceb3ed9c8..01d9e1bd634572fcfa60208ba4dc541805af5ccd 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -574,4 +574,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
+bool aarch64_is_left_consecutive (HOST_WIDE_INT);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fa01475aa9ee579b6a3b2526295b622157120660..3cfa51b15af3e241672f1383cf881c12a44494a5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1454,6 +1454,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
     return SImode;
 }
 
+/* Implement IS_LEFT_CONSECUTIVE.  Check if I's bits are consecutive
+   ones from the MSB.  */
+bool
+aarch64_is_left_consecutive (HOST_WIDE_INT i)
+{
+  return (i | (i - 1)) == HOST_WIDE_INT_M1;
+}
+
 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
    that strcpy from constants will be faster.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e9c16f9697b766a5c56b6269a83b7276654c5668..08710168bd476799c0c22b7472766dcd41fde679 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5305,6 +5305,29 @@
   [(set_attr "type" "rev")]
 )
 
+(define_insn "*aarch64_bfxil<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r")
+    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
+		    (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
+	    (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
+		    (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
+  "INTVAL (operands[3]) == ~INTVAL (operands[4])"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[3])));
+	return "bfxil\\t%<w>0, %<w>1, 0, %3";
+      case 1:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[4])));
+	return "bfxil\\t%<w>0, %<w>2, 0, %3";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfm")]
+)
+
 ;; There are no canonicalisation rules for the position of the lshiftrt, ashift
 ;; operations within an IOR/AND RTX, therefore we have two patterns matching
 ;; each valid permutation.
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..5bae0b70bbd11013a9fb27ec19cf7467eb20135f 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -172,6 +172,13 @@
   A constraint that matches the immediate constant -1."
   (match_test "op == constm1_rtx"))
 
+(define_constraint "Ulc"
+ "@internal
+ A constraint that matches a constant integer whose bits are consecutive ones
+ from the MSB."
+ (and (match_code "const_int")
+      (match_test "aarch64_is_left_consecutive (ival)")))
+
 (define_constraint "Usv"
   "@internal
    A constraint that matches a VG-based constant that can be loaded by
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
new file mode 100644
index 0000000000000000000000000000000000000000..3bc1dd5b216477efe7494dbcdac7a5bf465af218
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -0,0 +1,98 @@
+/* { dg-do run } */
+/* { dg-options "-O2 --save-temps" } */
+
+extern void abort(void);
+
+unsigned long long
+combine_balanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
+}
+
+unsigned long long
+combine_minimal (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xfffffffffffffffe) | (b & 0x0000000000000001);
+}
+
+unsigned long long
+combine_unbalanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffffff000000ll) | (b & 0x0000000000ffffffll);
+}
+
+unsigned int
+combine_balanced_int (unsigned int a, unsigned int b)
+{
+  return (a & 0xffff0000ll) | (b & 0x0000ffffll);
+}
+
+unsigned int
+combine_unbalanced_int (unsigned int a, unsigned int b)
+{
+  return (a & 0xffffff00ll) | (b & 0x000000ffll);
+}
+
+__attribute__ ((noinline)) void
+foo (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_minimal(a, b);
+  *d = combine_minimal(b, a);
+}
+
+__attribute__ ((noinline)) void
+foo2 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_balanced (a, b);
+  *d = combine_balanced (b, a);
+}
+
+__attribute__ ((noinline)) void
+foo3 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_unbalanced (a, b);
+  *d = combine_unbalanced (b, a);
+}
+
+void
+foo4 (unsigned int a, unsigned int b, unsigned int *c, unsigned int *d)
+{
+  *c = combine_balanced_int(a, b);
+  *d = combine_balanced_int(b, a);
+}
+
+void
+foo5 (unsigned int a, unsigned int b, unsigned int *c, unsigned int *d)
+{
+  *c = combine_unbalanced_int(a, b);
+  *d = combine_unbalanced_int(b, a);
+}
+
+int
+main(void)
+{
+  unsigned long long a = 0x0123456789ABCDEF, b = 0xFEDCBA9876543210, c, d;
+  foo3(a, b, &c, &d);
+  if(c != 0x0123456789543210) abort();
+  if(d != 0xfedcba9876abcdef) abort();
+  foo2(a, b, &c, &d);
+  if(c != 0x0123456776543210) abort();
+  if(d != 0xfedcba9889abcdef) abort();
+  foo(a, b, &c, &d);
+  if(c != 0x0123456789abcdee) abort();
+  if(d != 0xfedcba9876543211) abort();
+
+  unsigned int a2 = 0x01234567, b2 = 0xFEDCBA98, c2, d2;
+  foo4(a2, b2, &c2, &d2);
+  if(c2 != 0x0123ba98) abort();
+  if(d2 != 0xfedc4567) abort();
+  foo5(a2, b2, &c2, &d2);
+  if(c2 != 0x01234598) abort();
+  if(d2 != 0xfedcba67) abort();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "bfxil\\t" 10 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..0fc140443bc67bcf12b93d72b7970e095620021e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long long
+combine_non_consecutive (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xfffffff200f00000ll) | (b & 0x00001000ffffffffll);
+}
+
+void
+foo4 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d) {
+  /* { dg-final { scan-assembler-not "bfxil\\t" } } */
+  *c = combine_non_consecutive (a, b);
+  *d = combine_non_consecutive (b, a);
+}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks
  2018-07-24 16:24                 ` Sam Tebbs
@ 2018-07-30 11:31                   ` Sam Tebbs
  0 siblings, 0 replies; 13+ messages in thread
From: Sam Tebbs @ 2018-07-30 11:31 UTC (permalink / raw)
  To: gcc-patches
  Cc: Renlin Li, Richard Henderson, Sudakshina Das, nd,
	richard.earnshaw, marcus.shawcroft, james.greenhalgh

[-- Attachment #1: Type: text/plain, Size: 8942 bytes --]

Hi all,

This update fixes an issue where the pattern's predicate would match, but
during register allocation neither alternative's constraints would, so an
internal compiler error was raised. The fix adds two left_consecutive
checks to the pattern's predicate so that it no longer matches in that
situation, avoiding the error described above. The changelog and
description from before still apply.

Thanks,
Sam


On 07/24/2018 05:23 PM, Sam Tebbs wrote:
>
>
> On 07/23/2018 02:14 PM, Sam Tebbs wrote:
>>
>>
>> On 07/23/2018 12:38 PM, Renlin Li wrote:
>>>> +(define_insn "*aarch64_bfxil<mode>"
>>>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r")
>>>> +    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
>>>> +            (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
>>>> +        (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
>>>> +            (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
>>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])"
>>>> +  {
>>>> +    switch (which_alternative)
>>>> +    {
>>>> +      case 0:
>>>> +    operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[3])));
>>>> +    return "bfxil\\t%<w>0, %<w>1, 0, %3";
>>>> +      case 1:
>>>> +    operands[3] = GEN_INT (ceil_log2 (INTVAL (operands[4])));
>>>> +    return "bfxil\\t%<w>0, %<w>2, 0, %3";
>>>> +      default:
>>>> +    gcc_unreachable ();
>>>> +    }
>>>> +  }
>>>> +  [(set_attr "type" "bfm")]
>>>> +)
>>>
>>> Hi Sam,
>>>
>>> Is it possible that operand[3] or operand[4] is 1?
>>>
>>> ceil_log2() could return 0 if the argument is 1.
>>> The width field of the resulting instruction will be 0. Is it still 
>>> correct?
>>>
>>> Regard,
>>> Renlin
>>>
>>>
>> Hi Renlin,
>>
>> I think you have found an edge-case that I didn't think of and that 
>> the code would fail under. I have added a check for this situation 
>> and will send the updated patch soon.
>>
>> Thanks,
>> Sam
>
> Here is the updated patch that fixes the problem outlined by Renlin, 
> and adds another testcase to check for the same issue. The changelog 
> and example from my earlier email (sent on 07/20/2018 10:31 AM) still 
> apply.
>
> Thanks,
> Sam
>
>>>
>>> On 07/20/2018 10:33 AM, Sam Tebbs wrote:
>>>> Please disregard the original patch and see this updated version.
>>>>
>>>>
>>>> On 07/20/2018 10:31 AM, Sam Tebbs wrote:
>>>>> Hi all,
>>>>>
>>>>> Here is an updated patch that does the following:
>>>>>
>>>>> * Adds a new constraint in config/aarch64/constraints.md to check 
>>>>> for a constant integer that is left consecutive. This addresses 
>>>>> Richard Henderson's suggestion about combining the 
>>>>> aarch64_is_left_consecutive call and the const_int match in the 
>>>>> pattern.
>>>>>
>>>>> * Merges the two patterns defined into one.
>>>>>
>>>>> * Changes the pattern's type attribute to bfm.
>>>>>
>>>>> * Improved the comment above the aarch64_is_left_consecutive 
>>>>> implementation.
>>>>>
>>>>> * Makes the pattern use the GPI iterator to accept smaller integer 
>>>>> sizes (an extra test is added to check for this).
>>>>>
>>>>> * Improves the tests in combine_bfxil.c to ensure they aren't 
>>>>> optimised away and that they check for the pattern's correctness.
>>>>>
>>>>> Below is a new changelog and the example given before.
>>>>>
>>>>> Is this OK for trunk?
>>>>>
>>>>> This patch adds an optimisation that exploits the AArch64 BFXIL 
>>>>> instruction
>>>>> when or-ing the result of two bitwise and operations with 
>>>>> non-overlapping
>>>>> bitmasks (e.g. (a & 0xFFFF0000) | (b & 0x0000FFFF)).
>>>>>
>>>>> Example:
>>>>>
>>>>> unsigned long long combine(unsigned long long a, unsigned long 
>>>>> long b) {
>>>>>   return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
>>>>> }
>>>>>
>>>>> void read(unsigned long long a, unsigned long long b, unsigned 
>>>>> long long *c) {
>>>>>   *c = combine(a, b);
>>>>> }
>>>>>
>>>>> When compiled with -O2, read would result in:
>>>>>
>>>>> read:
>>>>>   and   x5, x1, #0xffffffff
>>>>>   and   x4, x0, #0xffffffff00000000
>>>>>   orr   x4, x4, x5
>>>>>   str   x4, [x2]
>>>>>   ret
>>>>>
>>>>> But with this patch results in:
>>>>>
>>>>> read:
>>>>>   mov    x4, x0
>>>>>   bfxil    x4, x1, 0, 32
>>>>>   str    x4, [x2]
>>>>>   ret
>>>>>
>>>>>
>>>>>
>>>>> Bootstrapped and regtested on aarch64-none-linux-gnu and 
>>>>> aarch64-none-elf with no regressions.
>>>>>
>>>>>
>>>>> gcc/
>>>>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>>>>
>>>>>         PR target/85628
>>>>>         * config/aarch64/aarch64.md (*aarch64_bfxil):
>>>>>         Define.
>>>>>         * config/aarch64/constraints.md (Ulc): Define.
>>>>>         * config/aarch64/aarch64-protos.h 
>>>>> (aarch64_is_left_consecutive):
>>>>>         Define.
>>>>>         * config/aarch64/aarch64.c (aarch64_is_left_consecutive): 
>>>>> New function.
>>>>>
>>>>> gcc/testsuite
>>>>> 2018-07-11  Sam Tebbs  <sam.tebbs@arm.com>
>>>>>
>>>>>         PR target/85628
>>>>>         * gcc.target/aarch64/combine_bfxil.c: New file.
>>>>>         * gcc.target/aarch64/combine_bfxil_2.c: New file.
>>>>>
>>>>>
>>>>> On 07/19/2018 02:02 PM, Sam Tebbs wrote:
>>>>>> Hi Richard,
>>>>>>
>>>>>> Thanks for the feedback. I find that using "is_left_consecutive" 
>>>>>> is more descriptive than checking for it being a power of 2 - 1, 
>>>>>> since it describes the requirement (having consecutive ones from 
>>>>>> the MSB) more explicitly. I would be happy to change it though if 
>>>>>> that is the consensus.
>>>>>>
>>>>>> I have addressed your point about just returning the string 
>>>>>> instead of using output_asm_insn and have changed it locally. 
>>>>>> I'll send an updated patch soon.
>>>>>>
>>>>>>
>>>>>> On 07/17/2018 02:33 AM, Richard Henderson wrote:
>>>>>>> On 07/16/2018 10:10 AM, Sam Tebbs wrote:
>>>>>>>> +++ b/gcc/config/aarch64/aarch64.c
>>>>>>>> @@ -1439,6 +1439,14 @@ aarch64_hard_regno_caller_save_mode 
>>>>>>>> (unsigned regno, unsigned,
>>>>>>>>       return SImode;
>>>>>>>>   }
>>>>>>>>   +/* Implement IS_LEFT_CONSECUTIVE.  Check if an integer's 
>>>>>>>> bits are consecutive
>>>>>>>> +   ones from the MSB.  */
>>>>>>>> +bool
>>>>>>>> +aarch64_is_left_consecutive (HOST_WIDE_INT i)
>>>>>>>> +{
>>>>>>>> +  return (i | (i - 1)) == HOST_WIDE_INT_M1;
>>>>>>>> +}
>>>>>>>> +
>>>>>>> ...
>>>>>>>> +(define_insn "*aarch64_bfxil"
>>>>>>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>>>>>>> +    (ior:DI (and:DI (match_operand:DI 1 "register_operand" "r")
>>>>>>>> +            (match_operand 3 "const_int_operand"))
>>>>>>>> +        (and:DI (match_operand:DI 2 "register_operand" "0")
>>>>>>>> +            (match_operand 4 "const_int_operand"))))]
>>>>>>>> +  "INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>>>>> +    && aarch64_is_left_consecutive (INTVAL (operands[4]))"
>>>>>>> Better is to use a define_predicate to merge both that second 
>>>>>>> test and the
>>>>>>> const_int_operand.
>>>>>>>
>>>>>>> (I'm not sure about the "left_consecutive" language either.
>>>>>>> Isn't it more descriptive to say that op3 is a power of 2 minus 1?)
>>>>>>>
>>>>>>> (define_predicate "pow2m1_operand"
>>>>>>>    (and (match_code "const_int")
>>>>>>>         (match_test "exact_log2 (INTVAL (op) + 1) > 0")))
>>>>>>>
>>>>>>> and use
>>>>>>>
>>>>>>>    (match_operand:DI 3 "pow2m1_operand")
>>>>>>>
>>>>>>> and then just the
>>>>>>>
>>>>>>>    INTVAL (operands[3]) == ~INTVAL (operands[4])
>>>>>>>
>>>>>>> test.
>>>>>>>
>>>>>>> Also, don't omit the modes for the constants.
>>>>>>> Also, there's no reason this applies only to DI mode;
>>>>>>> use the GPI iterator and %<w> in the output template.
>>>>>>>
>>>>>>>> +    HOST_WIDE_INT op3 = INTVAL (operands[3]);
>>>>>>>> +    operands[3] = GEN_INT (ceil_log2 (op3));
>>>>>>>> +    output_asm_insn ("bfxil\\t%0, %1, 0, %3", operands);
>>>>>>>> +    return "";
>>>>>>> You can just return the string that you passed to output_asm_insn.
>>>>>>>
>>>>>>>> +  }
>>>>>>>> +  [(set_attr "type" "bfx")]
>>>>>>> The other aliases of the BFM insn use type "bfm";
>>>>>>> "bfx" appears to be aliases of UBFM and SBFM.
>>>>>>> Not that it appears to matter to the scheduling
>>>>>>> descriptions, but it is inconsistent.
>>>>>>>
>>>>>>>
>>>>>>> r~
>>>>>>
>>>>>
>>>>
>>
>


[-- Attachment #2: rb9634.patch --]
[-- Type: text/x-patch, Size: 6831 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index af5db9c595385f7586692258f750b6aceb3ed9c8..01d9e1bd634572fcfa60208ba4dc541805af5ccd 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -574,4 +574,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
+bool aarch64_is_left_consecutive (HOST_WIDE_INT);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fa01475aa9ee579b6a3b2526295b622157120660..3cfa51b15af3e241672f1383cf881c12a44494a5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1454,6 +1454,14 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
     return SImode;
 }
 
+/* Implement IS_LEFT_CONSECUTIVE.  Check if I's bits are consecutive
+   ones from the MSB.  */
+bool
+aarch64_is_left_consecutive (HOST_WIDE_INT i)
+{
+  return (i | (i - 1)) == HOST_WIDE_INT_M1;
+}
+
 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
    that strcpy from constants will be faster.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e9c16f9697b766a5c56b6269a83b7276654c5668..ff2db4af38e16630daeada79afc604c4696abf82 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5305,6 +5305,31 @@
   [(set_attr "type" "rev")]
 )
 
+(define_insn "*aarch64_bfxil<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r")
+    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
+		    (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
+	    (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
+		    (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
+  "(INTVAL (operands[3]) == ~INTVAL (operands[4]))
+  && (aarch64_is_left_consecutive (INTVAL (operands[3]))
+    || aarch64_is_left_consecutive (INTVAL (operands[4])))"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[3])));
+	return "bfxil\\t%<w>0, %<w>1, 0, %3";
+      case 1:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[4])));
+	return "bfxil\\t%<w>0, %<w>2, 0, %3";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfm")]
+)
+
 ;; There are no canonicalisation rules for the position of the lshiftrt, ashift
 ;; operations within an IOR/AND RTX, therefore we have two patterns matching
 ;; each valid permutation.
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..5bae0b70bbd11013a9fb27ec19cf7467eb20135f 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -172,6 +172,13 @@
   A constraint that matches the immediate constant -1."
   (match_test "op == constm1_rtx"))
 
+(define_constraint "Ulc"
+ "@internal
+ A constraint that matches a constant integer whose bits are consecutive ones
+ from the MSB."
+ (and (match_code "const_int")
+      (match_test "aarch64_is_left_consecutive (ival)")))
+
 (define_constraint "Usv"
   "@internal
    A constraint that matches a VG-based constant that can be loaded by
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
new file mode 100644
index 0000000000000000000000000000000000000000..3bc1dd5b216477efe7494dbcdac7a5bf465af218
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -0,0 +1,98 @@
+/* { dg-do run } */
+/* { dg-options "-O2 --save-temps" } */
+
+extern void abort(void);
+
+unsigned long long
+combine_balanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffff00000000ll) | (b & 0x00000000ffffffffll);
+}
+
+unsigned long long
+combine_minimal (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xfffffffffffffffe) | (b & 0x0000000000000001);
+}
+
+unsigned long long
+combine_unbalanced (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xffffffffff000000ll) | (b & 0x0000000000ffffffll);
+}
+
+unsigned int
+combine_balanced_int (unsigned int a, unsigned int b)
+{
+  return (a & 0xffff0000ll) | (b & 0x0000ffffll);
+}
+
+unsigned int
+combine_unbalanced_int (unsigned int a, unsigned int b)
+{
+  return (a & 0xffffff00ll) | (b & 0x000000ffll);
+}
+
+__attribute__ ((noinline)) void
+foo (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_minimal(a, b);
+  *d = combine_minimal(b, a);
+}
+
+__attribute__ ((noinline)) void
+foo2 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_balanced (a, b);
+  *d = combine_balanced (b, a);
+}
+
+__attribute__ ((noinline)) void
+foo3 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d)
+{
+  *c = combine_unbalanced (a, b);
+  *d = combine_unbalanced (b, a);
+}
+
+void
+foo4 (unsigned int a, unsigned int b, unsigned int *c, unsigned int *d)
+{
+  *c = combine_balanced_int(a, b);
+  *d = combine_balanced_int(b, a);
+}
+
+void
+foo5 (unsigned int a, unsigned int b, unsigned int *c, unsigned int *d)
+{
+  *c = combine_unbalanced_int(a, b);
+  *d = combine_unbalanced_int(b, a);
+}
+
+int
+main(void)
+{
+  unsigned long long a = 0x0123456789ABCDEF, b = 0xFEDCBA9876543210, c, d;
+  foo3(a, b, &c, &d);
+  if(c != 0x0123456789543210) abort();
+  if(d != 0xfedcba9876abcdef) abort();
+  foo2(a, b, &c, &d);
+  if(c != 0x0123456776543210) abort();
+  if(d != 0xfedcba9889abcdef) abort();
+  foo(a, b, &c, &d);
+  if(c != 0x0123456789abcdee) abort();
+  if(d != 0xfedcba9876543211) abort();
+
+  unsigned int a2 = 0x01234567, b2 = 0xFEDCBA98, c2, d2;
+  foo4(a2, b2, &c2, &d2);
+  if(c2 != 0x0123ba98) abort();
+  if(d2 != 0xfedc4567) abort();
+  foo5(a2, b2, &c2, &d2);
+  if(c2 != 0x01234598) abort();
+  if(d2 != 0xfedcba67) abort();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "bfxil\\t" 10 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..0fc140443bc67bcf12b93d72b7970e095620021e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil_2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long long
+combine_non_consecutive (unsigned long long a, unsigned long long b)
+{
+  return (a & 0xfffffff200f00000ll) | (b & 0x00001000ffffffffll);
+}
+
+void
+foo4 (unsigned long long a, unsigned long long b, unsigned long long *c,
+  unsigned long long *d) {
+  /* { dg-final { scan-assembler-not "bfxil\\t" } } */
+  *c = combine_non_consecutive (a, b);
+  *d = combine_non_consecutive (b, a);
+}

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-07-30 11:31 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-13 16:09 [GCC][PATCH][Aarch64] Exploiting BFXIL when OR-ing two AND-operations with appropriate bitmasks Sam Tebbs
2018-07-16 10:55 ` Sudakshina Das
2018-07-16 17:11   ` Sam Tebbs
2018-07-17  1:34     ` Richard Henderson
2018-07-17 13:33       ` Richard Earnshaw (lists)
2018-07-17 15:46         ` Richard Henderson
2018-07-19 13:03       ` Sam Tebbs
2018-07-20  9:31         ` Sam Tebbs
2018-07-20  9:33           ` Sam Tebbs
2018-07-23 11:38             ` Renlin Li
2018-07-23 13:15               ` Sam Tebbs
2018-07-24 16:24                 ` Sam Tebbs
2018-07-30 11:31                   ` Sam Tebbs

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).