* [PATCH] PR tree-optimization/90836 Missing popcount pattern matching @ 2019-09-05 15:35 Dmitrij Pochepko 2019-09-06 10:23 ` Richard Biener 2019-09-09 19:03 ` Dmitrij Pochepko 0 siblings, 2 replies; 13+ messages in thread From: Dmitrij Pochepko @ 2019-09-05 15:35 UTC (permalink / raw) To: gcc-patches [-- Attachment #1: Type: text/plain, Size: 1252 bytes --] This patch adds matching for Hamming weight (popcount) implementation. The following sources: int foo64 (unsigned long long a) { unsigned long long b = a; b -= ((b>>1) & 0x5555555555555555ULL); b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; b *= 0x0101010101010101ULL; return (int)(b >> 56); } and int foo32 (unsigned int a) { unsigned long b = a; b -= ((b>>1) & 0x55555555UL); b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); b = ((b>>4) + b) & 0x0F0F0F0FUL; b *= 0x01010101UL; return (int)(b >> 24); } and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. (I have no write access to repo) Thanks, Dmitrij gcc/ChangeLog: PR tree-optimization/90836 * gcc/match.pd (popcount): New pattern. gcc/testsuite/ChangeLog: PR tree-optimization/90836 * lib/target-supports.exp (check_effective_target_popcount) (check_effective_target_popcountll): New effective targets. * gcc.dg/tree-ssa/popcount4.c: New test. * gcc.dg/tree-ssa/popcount4l.c: New test. * gcc.dg/tree-ssa/popcount4ll.c: New test. [-- Attachment #2: popcount.diff --] [-- Type: text/x-diff, Size: 6051 bytes --] diff --git a/gcc/match.pd b/gcc/match.pd index 0317bc7..b1867bf 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -5358,6 +5358,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (cmp (popcount @0) integer_zerop) (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) +/* 64- and 32-bits branchless implementations of popcount are detected: + + int popcount64c (uint64_t x) + { + x -= (x >> 1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> 56; + } + + int popcount32c (uint32_t x) + { + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0f0f0f0f; + return (x * 0x01010101) >> 24; + } */ +(simplify + (convert + (rshift + (mult + (bit_and:c + (plus:c + (rshift @8 INTEGER_CST@5) + (plus:c@8 + (bit_and @6 INTEGER_CST@7) + (bit_and + (rshift + (minus@6 + @0 + (bit_and + (rshift @0 INTEGER_CST@4) + INTEGER_CST@11)) + INTEGER_CST@10) + INTEGER_CST@9))) + INTEGER_CST@3) + INTEGER_CST@2) + INTEGER_CST@1)) + /* Check constants and optab. */ + (with + { + tree argtype = TREE_TYPE (@0); + unsigned prec = TYPE_PRECISION (argtype); + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; + const unsigned long long c1 = 0x0101010101010101ULL >> shift, + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, + c3 = 0x3333333333333333ULL >> shift, + c4 = 0x5555555555555555ULL >> shift; + } + (if (types_match (type, integer_type_node) && tree_to_uhwi (@4) == 1 + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 + && optab_handler (popcount_optab, TYPE_MODE (argtype)) + != CODE_FOR_nothing) + (switch + (if (types_match (argtype, long_long_unsigned_type_node)) + (BUILT_IN_POPCOUNTLL @0)) + (if (types_match (argtype, long_unsigned_type_node)) + (BUILT_IN_POPCOUNTL @0)) + (if (types_match (argtype, unsigned_type_node)) + (BUILT_IN_POPCOUNT @0)))))) + /* Simplify: a = a1 op a2 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c new file mode 100644 index 0000000..9f759f8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcount } */ +/* { dg-require-effective-target int32plus } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned m1 = 0x55555555UL; +const unsigned m2 = 0x33333333UL; +const unsigned m4 = 0x0F0F0F0FUL; +const unsigned h01 = 0x01010101UL; +const int shift = 24; + +int popcount64c(unsigned x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ + + diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c new file mode 100644 index 0000000..ab33f79 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcountl } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#if __SIZEOF_LONG__ == 4 +const unsigned long m1 = 0x55555555UL; +const unsigned long m2 = 0x33333333UL; +const unsigned long m4 = 0x0F0F0F0FUL; +const unsigned long h01 = 0x01010101UL; +const int shift = 24; +#else +const unsigned long m1 = 0x5555555555555555UL; +const unsigned long m2 = 0x3333333333333333UL; +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; +const unsigned long h01 = 0x0101010101010101UL; +const int shift = 56; +#endif + + +int popcount64c(unsigned long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ + + diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c new file mode 100644 index 0000000..f3755f1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcountll } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned long long m1 = 0x5555555555555555ULL; +const unsigned long long m2 = 0x3333333333333333ULL; +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; +const unsigned long long h01 = 0x0101010101010101ULL; +const int shift = 56; + +int popcount64c(unsigned long long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 3c50b89..32bbad7 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { } "" ] } +# Return 1 if the target supports popcount on long long. + +proc check_effective_target_popcountll { } { + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { + int foo (long long b) + { + return __builtin_popcountll (b); + } + } "" ] +} + + +# Return 1 if the target supports popcount on int. + +proc check_effective_target_popcount { } { + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { + int foo (int b) + { + return __builtin_popcount (b); + } + } "" ] +} + # Return 1 if the target supports atomic operations on "long long" # and can execute them. # ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-05 15:35 [PATCH] PR tree-optimization/90836 Missing popcount pattern matching Dmitrij Pochepko @ 2019-09-06 10:23 ` Richard Biener 2019-09-09 18:54 ` Dmitrij Pochepko 2019-09-09 19:03 ` Dmitrij Pochepko 1 sibling, 1 reply; 13+ messages in thread From: Richard Biener @ 2019-09-06 10:23 UTC (permalink / raw) To: Dmitrij Pochepko; +Cc: GCC Patches On Thu, Sep 5, 2019 at 5:35 PM Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com> wrote: > > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > int > foo64 (unsigned long long a) > { > unsigned long long b = a; > b -= ((b>>1) & 0x5555555555555555ULL); > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > b *= 0x0101010101010101ULL; > return (int)(b >> 56); > } > > and > > int > foo32 (unsigned int a) > { > unsigned long b = a; > b -= ((b>>1) & 0x55555555UL); > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > b = ((b>>4) + b) & 0x0F0F0F0FUL; > b *= 0x01010101UL; > return (int)(b >> 24); > } > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > (I have no write access to repo) +(simplify + (convert + (rshift + (mult is the outer convert really necessary? That is, if we change the simplification result to (convert (BUILT_IN_POPCOUNT @0)) wouldn't that be correct as well? Is the Hamming weight popcount faster than the libgcc table-based approach? I wonder if we really need to restrict this conversion to the case where the target has an expander. + (mult + (bit_and:c this doesn't need :c (second operand is a constant). + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; + const unsigned long long c1 = 0x0101010101010101ULL >> shift, I think this mixes host and target properties. I guess intead of 'const unsigned long long' you want to use 'const uint64_t' and instead of TYPE_PRECISION (long_long_unsigned_type_node) 64? Since you are later comparing with unsigned HOST_WIDE_INT eventually unsigned HOST_WIDE_INT is better (that's always 64bit as well). You are using tree_to_uhwi but nowhere verifying if @0 is unsigned. What happens if 'prec' is > 64? (__int128 ...). Ah, I guess the final selection will simply select nothing... Otherwise the patch looks reasonable, even if the pattern is a bit unwieldly... ;) Does it work for targets where 'unsigned int' is smaller than 32bit? Thanks, Richard. > > Thanks, > Dmitrij > > > gcc/ChangeLog: > > PR tree-optimization/90836 > > * gcc/match.pd (popcount): New pattern. > > gcc/testsuite/ChangeLog: > > PR tree-optimization/90836 > > * lib/target-supports.exp (check_effective_target_popcount) > (check_effective_target_popcountll): New effective targets. > * gcc.dg/tree-ssa/popcount4.c: New test. > * gcc.dg/tree-ssa/popcount4l.c: New test. > * gcc.dg/tree-ssa/popcount4ll.c: New test. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-06 10:23 ` Richard Biener @ 2019-09-09 18:54 ` Dmitrij Pochepko 0 siblings, 0 replies; 13+ messages in thread From: Dmitrij Pochepko @ 2019-09-09 18:54 UTC (permalink / raw) To: Richard Biener; +Cc: GCC Patches Hi, thank you for looking into it. On Fri, Sep 06, 2019 at 12:23:40PM +0200, Richard Biener wrote: > On Thu, Sep 5, 2019 at 5:35 PM Dmitrij Pochepko > <dmitrij.pochepko@bell-sw.com> wrote: > > > > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > > > int > > foo64 (unsigned long long a) > > { > > unsigned long long b = a; > > b -= ((b>>1) & 0x5555555555555555ULL); > > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > > b *= 0x0101010101010101ULL; > > return (int)(b >> 56); > > } > > > > and > > > > int > > foo32 (unsigned int a) > > { > > unsigned long b = a; > > b -= ((b>>1) & 0x55555555UL); > > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > > b = ((b>>4) + b) & 0x0F0F0F0FUL; > > b *= 0x01010101UL; > > return (int)(b >> 24); > > } > > > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > > > (I have no write access to repo) > > +(simplify > + (convert > + (rshift > + (mult > > is the outer convert really necessary? That is, if we change > the simplification result to > > (convert (BUILT_IN_POPCOUNT @0)) > > wouldn't that be correct as well? Yes, this is better. I fixed it in the new version. > > Is the Hamming weight popcount > faster than the libgcc table-based approach? I wonder if we really > need to restrict this conversion to the case where the target > has an expander. > > + (mult > + (bit_and:c > > this doesn't need :c (second operand is a constant). Yes. Agree, this is redundant. > > + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; > + const unsigned long long c1 = 0x0101010101010101ULL >> shift, > > I think this mixes host and target properties. I guess intead of > 'const unsigned long long' you want to use 'const uint64_t' and > instead of TYPE_PRECISION (long_long_unsigned_type_node) 64? > Since you are later comparing with unsigned HOST_WIDE_INT > eventually unsigned HOST_WIDE_INT is better (that's always 64bit as well). Agree. It is better to use HOST_WIDE_INT. > > You are using tree_to_uhwi but nowhere verifying if @0 is unsigned. > What happens if 'prec' is > 64? (__int128 ...). Ah, I guess the > final selection will simply select nothing... > > Otherwise the patch looks reasonable, even if the pattern > is a bit unwieldly... ;) > > Does it work for targets where 'unsigned int' is smaller than 32bit? Yes. The only 16-bit-int architecture with popcount support on hw level is avr. I built gcc for avr and checked that 16-bit popcount algorithm is recognized successfully. Thanks, Dmitrij > > Thanks, > Richard. > > > > Thanks, > > Dmitrij > > > > > > gcc/ChangeLog: > > > > PR tree-optimization/90836 > > > > * gcc/match.pd (popcount): New pattern. > > > > gcc/testsuite/ChangeLog: > > > > PR tree-optimization/90836 > > > > * lib/target-supports.exp (check_effective_target_popcount) > > (check_effective_target_popcountll): New effective targets. > > * gcc.dg/tree-ssa/popcount4.c: New test. > > * gcc.dg/tree-ssa/popcount4l.c: New test. > > * gcc.dg/tree-ssa/popcount4ll.c: New test. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-05 15:35 [PATCH] PR tree-optimization/90836 Missing popcount pattern matching Dmitrij Pochepko 2019-09-06 10:23 ` Richard Biener @ 2019-09-09 19:03 ` Dmitrij Pochepko 2019-09-24 15:29 ` Dmitrij Pochepko 1 sibling, 1 reply; 13+ messages in thread From: Dmitrij Pochepko @ 2019-09-09 19:03 UTC (permalink / raw) To: gcc-patches [-- Attachment #1: Type: text/plain, Size: 8013 bytes --] Hi all. Please take a look at v2 (attached). I changed patch according to review comments. The same testing was performed again. Thanks, Dmitrij On Thu, Sep 05, 2019 at 06:34:49PM +0300, Dmitrij Pochepko wrote: > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > int > foo64 (unsigned long long a) > { > unsigned long long b = a; > b -= ((b>>1) & 0x5555555555555555ULL); > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > b *= 0x0101010101010101ULL; > return (int)(b >> 56); > } > > and > > int > foo32 (unsigned int a) > { > unsigned long b = a; > b -= ((b>>1) & 0x55555555UL); > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > b = ((b>>4) + b) & 0x0F0F0F0FUL; > b *= 0x01010101UL; > return (int)(b >> 24); > } > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > (I have no write access to repo) > > Thanks, > Dmitrij > > > gcc/ChangeLog: > > PR tree-optimization/90836 > > * gcc/match.pd (popcount): New pattern. > > gcc/testsuite/ChangeLog: > > PR tree-optimization/90836 > > * lib/target-supports.exp (check_effective_target_popcount) > (check_effective_target_popcountll): New effective targets. > * gcc.dg/tree-ssa/popcount4.c: New test. > * gcc.dg/tree-ssa/popcount4l.c: New test. > * gcc.dg/tree-ssa/popcount4ll.c: New test. > diff --git a/gcc/match.pd b/gcc/match.pd > index 0317bc7..b1867bf 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -5358,6 +5358,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (cmp (popcount @0) integer_zerop) > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > +/* 64- and 32-bits branchless implementations of popcount are detected: > + > + int popcount64c (uint64_t x) > + { > + x -= (x >> 1) & 0x5555555555555555ULL; > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > + return (x * 0x0101010101010101ULL) >> 56; > + } > + > + int popcount32c (uint32_t x) > + { > + x -= (x >> 1) & 0x55555555; > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > + x = (x + (x >> 4)) & 0x0f0f0f0f; > + return (x * 0x01010101) >> 24; > + } */ > +(simplify > + (convert > + (rshift > + (mult > + (bit_and:c > + (plus:c > + (rshift @8 INTEGER_CST@5) > + (plus:c@8 > + (bit_and @6 INTEGER_CST@7) > + (bit_and > + (rshift > + (minus@6 > + @0 > + (bit_and > + (rshift @0 INTEGER_CST@4) > + INTEGER_CST@11)) > + INTEGER_CST@10) > + INTEGER_CST@9))) > + INTEGER_CST@3) > + INTEGER_CST@2) > + INTEGER_CST@1)) > + /* Check constants and optab. */ > + (with > + { > + tree argtype = TREE_TYPE (@0); > + unsigned prec = TYPE_PRECISION (argtype); > + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; > + const unsigned long long c1 = 0x0101010101010101ULL >> shift, > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > + c3 = 0x3333333333333333ULL >> shift, > + c4 = 0x5555555555555555ULL >> shift; > + } > + (if (types_match (type, integer_type_node) && tree_to_uhwi (@4) == 1 > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > + != CODE_FOR_nothing) > + (switch > + (if (types_match (argtype, long_long_unsigned_type_node)) > + (BUILT_IN_POPCOUNTLL @0)) > + (if (types_match (argtype, long_unsigned_type_node)) > + (BUILT_IN_POPCOUNTL @0)) > + (if (types_match (argtype, unsigned_type_node)) > + (BUILT_IN_POPCOUNT @0)))))) > + > /* Simplify: > > a = a1 op a2 > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > new file mode 100644 > index 0000000..9f759f8 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcount } */ > +/* { dg-require-effective-target int32plus } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +const unsigned m1 = 0x55555555UL; > +const unsigned m2 = 0x33333333UL; > +const unsigned m4 = 0x0F0F0F0FUL; > +const unsigned h01 = 0x01010101UL; > +const int shift = 24; > + > +int popcount64c(unsigned x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + return (x * h01) >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > + > + > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > new file mode 100644 > index 0000000..ab33f79 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > @@ -0,0 +1,30 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcountl } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +#if __SIZEOF_LONG__ == 4 > +const unsigned long m1 = 0x55555555UL; > +const unsigned long m2 = 0x33333333UL; > +const unsigned long m4 = 0x0F0F0F0FUL; > +const unsigned long h01 = 0x01010101UL; > +const int shift = 24; > +#else > +const unsigned long m1 = 0x5555555555555555UL; > +const unsigned long m2 = 0x3333333333333333UL; > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > +const unsigned long h01 = 0x0101010101010101UL; > +const int shift = 56; > +#endif > + > + > +int popcount64c(unsigned long x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + return (x * h01) >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > + > + > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > new file mode 100644 > index 0000000..f3755f1 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcountll } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +const unsigned long long m1 = 0x5555555555555555ULL; > +const unsigned long long m2 = 0x3333333333333333ULL; > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > +const unsigned long long h01 = 0x0101010101010101ULL; > +const int shift = 56; > + > +int popcount64c(unsigned long long x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + return (x * h01) >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > index 3c50b89..32bbad7 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > } "" ] > } > > +# Return 1 if the target supports popcount on long long. > + > +proc check_effective_target_popcountll { } { > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > + int foo (long long b) > + { > + return __builtin_popcountll (b); > + } > + } "" ] > +} > + > + > +# Return 1 if the target supports popcount on int. > + > +proc check_effective_target_popcount { } { > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > + int foo (int b) > + { > + return __builtin_popcount (b); > + } > + } "" ] > +} > + > # Return 1 if the target supports atomic operations on "long long" > # and can execute them. > # [-- Attachment #2: popcount_v2.diff --] [-- Type: text/x-diff, Size: 6045 bytes --] diff --git a/gcc/match.pd b/gcc/match.pd index 0317bc7..896d5b6 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -5358,6 +5358,69 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (cmp (popcount @0) integer_zerop) (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) +/* 64- and 32-bits branchless implementations of popcount are detected: + + int popcount64c (uint64_t x) + { + x -= (x >> 1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> 56; + } + + int popcount32c (uint32_t x) + { + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0f0f0f0f; + return (x * 0x01010101) >> 24; + } */ +(simplify + (rshift + (mult + (bit_and + (plus:c + (rshift @8 INTEGER_CST@5) + (plus:c@8 + (bit_and @6 INTEGER_CST@7) + (bit_and + (rshift + (minus@6 + @0 + (bit_and + (rshift @0 INTEGER_CST@4) + INTEGER_CST@11)) + INTEGER_CST@10) + INTEGER_CST@9))) + INTEGER_CST@3) + INTEGER_CST@2) + INTEGER_CST@1) + /* Check constants and optab. */ + (with + { + tree argtype = TREE_TYPE (@0); + unsigned prec = TYPE_PRECISION (argtype); + int shift = 64 - prec; + const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, + c3 = 0x3333333333333333ULL >> shift, + c4 = 0x5555555555555555ULL >> shift; + } + (if (tree_to_uhwi (@4) == 1 + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 + && optab_handler (popcount_optab, TYPE_MODE (argtype)) + != CODE_FOR_nothing) + (switch + (if (types_match (argtype, long_long_unsigned_type_node)) + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) + (if (types_match (argtype, long_unsigned_type_node)) + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) + (if (types_match (argtype, unsigned_type_node)) + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) + /* Simplify: a = a1 op a2 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c new file mode 100644 index 0000000..9f759f8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcount } */ +/* { dg-require-effective-target int32plus } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned m1 = 0x55555555UL; +const unsigned m2 = 0x33333333UL; +const unsigned m4 = 0x0F0F0F0FUL; +const unsigned h01 = 0x01010101UL; +const int shift = 24; + +int popcount64c(unsigned x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ + + diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c new file mode 100644 index 0000000..ab33f79 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcountl } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#if __SIZEOF_LONG__ == 4 +const unsigned long m1 = 0x55555555UL; +const unsigned long m2 = 0x33333333UL; +const unsigned long m4 = 0x0F0F0F0FUL; +const unsigned long h01 = 0x01010101UL; +const int shift = 24; +#else +const unsigned long m1 = 0x5555555555555555UL; +const unsigned long m2 = 0x3333333333333333UL; +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; +const unsigned long h01 = 0x0101010101010101UL; +const int shift = 56; +#endif + + +int popcount64c(unsigned long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ + + diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c new file mode 100644 index 0000000..f3755f1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcountll } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned long long m1 = 0x5555555555555555ULL; +const unsigned long long m2 = 0x3333333333333333ULL; +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; +const unsigned long long h01 = 0x0101010101010101ULL; +const int shift = 56; + +int popcount64c(unsigned long long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 3c50b89..32bbad7 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { } "" ] } +# Return 1 if the target supports popcount on long long. + +proc check_effective_target_popcountll { } { + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { + int foo (long long b) + { + return __builtin_popcountll (b); + } + } "" ] +} + + +# Return 1 if the target supports popcount on int. + +proc check_effective_target_popcount { } { + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { + int foo (int b) + { + return __builtin_popcount (b); + } + } "" ] +} + # Return 1 if the target supports atomic operations on "long long" # and can execute them. # ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-09 19:03 ` Dmitrij Pochepko @ 2019-09-24 15:29 ` Dmitrij Pochepko 2019-09-26 7:47 ` Richard Biener 0 siblings, 1 reply; 13+ messages in thread From: Dmitrij Pochepko @ 2019-09-24 15:29 UTC (permalink / raw) To: gcc-patches; +Cc: richard.guenther, Wilco.Dijkstra Hi, can anybody take a look at v2? Thanks, Dmitrij On Mon, Sep 09, 2019 at 10:03:40PM +0300, Dmitrij Pochepko wrote: > Hi all. > > Please take a look at v2 (attached). > I changed patch according to review comments. The same testing was performed again. > > Thanks, > Dmitrij > > On Thu, Sep 05, 2019 at 06:34:49PM +0300, Dmitrij Pochepko wrote: > > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > > > int > > foo64 (unsigned long long a) > > { > > unsigned long long b = a; > > b -= ((b>>1) & 0x5555555555555555ULL); > > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > > b *= 0x0101010101010101ULL; > > return (int)(b >> 56); > > } > > > > and > > > > int > > foo32 (unsigned int a) > > { > > unsigned long b = a; > > b -= ((b>>1) & 0x55555555UL); > > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > > b = ((b>>4) + b) & 0x0F0F0F0FUL; > > b *= 0x01010101UL; > > return (int)(b >> 24); > > } > > > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > > > (I have no write access to repo) > > > > Thanks, > > Dmitrij > > > > > > gcc/ChangeLog: > > > > PR tree-optimization/90836 > > > > * gcc/match.pd (popcount): New pattern. > > > > gcc/testsuite/ChangeLog: > > > > PR tree-optimization/90836 > > > > * lib/target-supports.exp (check_effective_target_popcount) > > (check_effective_target_popcountll): New effective targets. > > * gcc.dg/tree-ssa/popcount4.c: New test. > > * gcc.dg/tree-ssa/popcount4l.c: New test. > > * gcc.dg/tree-ssa/popcount4ll.c: New test. > > > diff --git a/gcc/match.pd b/gcc/match.pd > > index 0317bc7..b1867bf 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -5358,6 +5358,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > (cmp (popcount @0) integer_zerop) > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > + > > + int popcount64c (uint64_t x) > > + { > > + x -= (x >> 1) & 0x5555555555555555ULL; > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > + return (x * 0x0101010101010101ULL) >> 56; > > + } > > + > > + int popcount32c (uint32_t x) > > + { > > + x -= (x >> 1) & 0x55555555; > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > + return (x * 0x01010101) >> 24; > > + } */ > > +(simplify > > + (convert > > + (rshift > > + (mult > > + (bit_and:c > > + (plus:c > > + (rshift @8 INTEGER_CST@5) > > + (plus:c@8 > > + (bit_and @6 INTEGER_CST@7) > > + (bit_and > > + (rshift > > + (minus@6 > > + @0 > > + (bit_and > > + (rshift @0 INTEGER_CST@4) > > + INTEGER_CST@11)) > > + INTEGER_CST@10) > > + INTEGER_CST@9))) > > + INTEGER_CST@3) > > + INTEGER_CST@2) > > + INTEGER_CST@1)) > > + /* Check constants and optab. */ > > + (with > > + { > > + tree argtype = TREE_TYPE (@0); > > + unsigned prec = TYPE_PRECISION (argtype); > > + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; > > + const unsigned long long c1 = 0x0101010101010101ULL >> shift, > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > + c3 = 0x3333333333333333ULL >> shift, > > + c4 = 0x5555555555555555ULL >> shift; > > + } > > + (if (types_match (type, integer_type_node) && tree_to_uhwi (@4) == 1 > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > + != CODE_FOR_nothing) > > + (switch > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > + (BUILT_IN_POPCOUNTLL @0)) > > + (if (types_match (argtype, long_unsigned_type_node)) > > + (BUILT_IN_POPCOUNTL @0)) > > + (if (types_match (argtype, unsigned_type_node)) > > + (BUILT_IN_POPCOUNT @0)))))) > > + > > /* Simplify: > > > > a = a1 op a2 > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > new file mode 100644 > > index 0000000..9f759f8 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > @@ -0,0 +1,22 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target popcount } */ > > +/* { dg-require-effective-target int32plus } */ > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > + > > +const unsigned m1 = 0x55555555UL; > > +const unsigned m2 = 0x33333333UL; > > +const unsigned m4 = 0x0F0F0F0FUL; > > +const unsigned h01 = 0x01010101UL; > > +const int shift = 24; > > + > > +int popcount64c(unsigned x) > > +{ > > + x -= (x >> 1) & m1; > > + x = (x & m2) + ((x >> 2) & m2); > > + x = (x + (x >> 4)) & m4; > > + return (x * h01) >> shift; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > + > > + > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > new file mode 100644 > > index 0000000..ab33f79 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > @@ -0,0 +1,30 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target popcountl } */ > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > + > > +#if __SIZEOF_LONG__ == 4 > > +const unsigned long m1 = 0x55555555UL; > > +const unsigned long m2 = 0x33333333UL; > > +const unsigned long m4 = 0x0F0F0F0FUL; > > +const unsigned long h01 = 0x01010101UL; > > +const int shift = 24; > > +#else > > +const unsigned long m1 = 0x5555555555555555UL; > > +const unsigned long m2 = 0x3333333333333333UL; > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > +const unsigned long h01 = 0x0101010101010101UL; > > +const int shift = 56; > > +#endif > > + > > + > > +int popcount64c(unsigned long x) > > +{ > > + x -= (x >> 1) & m1; > > + x = (x & m2) + ((x >> 2) & m2); > > + x = (x + (x >> 4)) & m4; > > + return (x * h01) >> shift; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > + > > + > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > new file mode 100644 > > index 0000000..f3755f1 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target popcountll } */ > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > + > > +const unsigned long long m1 = 0x5555555555555555ULL; > > +const unsigned long long m2 = 0x3333333333333333ULL; > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > +const unsigned long long h01 = 0x0101010101010101ULL; > > +const int shift = 56; > > + > > +int popcount64c(unsigned long long x) > > +{ > > + x -= (x >> 1) & m1; > > + x = (x & m2) + ((x >> 2) & m2); > > + x = (x + (x >> 4)) & m4; > > + return (x * h01) >> shift; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > index 3c50b89..32bbad7 100644 > > --- a/gcc/testsuite/lib/target-supports.exp > > +++ b/gcc/testsuite/lib/target-supports.exp > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > } "" ] > > } > > > > +# Return 1 if the target supports popcount on long long. > > + > > +proc check_effective_target_popcountll { } { > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > + int foo (long long b) > > + { > > + return __builtin_popcountll (b); > > + } > > + } "" ] > > +} > > + > > + > > +# Return 1 if the target supports popcount on int. > > + > > +proc check_effective_target_popcount { } { > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > + int foo (int b) > > + { > > + return __builtin_popcount (b); > > + } > > + } "" ] > > +} > > + > > # Return 1 if the target supports atomic operations on "long long" > > # and can execute them. > > # > > diff --git a/gcc/match.pd b/gcc/match.pd > index 0317bc7..896d5b6 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -5358,6 +5358,69 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (cmp (popcount @0) integer_zerop) > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > +/* 64- and 32-bits branchless implementations of popcount are detected: > + > + int popcount64c (uint64_t x) > + { > + x -= (x >> 1) & 0x5555555555555555ULL; > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > + return (x * 0x0101010101010101ULL) >> 56; > + } > + > + int popcount32c (uint32_t x) > + { > + x -= (x >> 1) & 0x55555555; > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > + x = (x + (x >> 4)) & 0x0f0f0f0f; > + return (x * 0x01010101) >> 24; > + } */ > +(simplify > + (rshift > + (mult > + (bit_and > + (plus:c > + (rshift @8 INTEGER_CST@5) > + (plus:c@8 > + (bit_and @6 INTEGER_CST@7) > + (bit_and > + (rshift > + (minus@6 > + @0 > + (bit_and > + (rshift @0 INTEGER_CST@4) > + INTEGER_CST@11)) > + INTEGER_CST@10) > + INTEGER_CST@9))) > + INTEGER_CST@3) > + INTEGER_CST@2) > + INTEGER_CST@1) > + /* Check constants and optab. */ > + (with > + { > + tree argtype = TREE_TYPE (@0); > + unsigned prec = TYPE_PRECISION (argtype); > + int shift = 64 - prec; > + const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > + c3 = 0x3333333333333333ULL >> shift, > + c4 = 0x5555555555555555ULL >> shift; > + } > + (if (tree_to_uhwi (@4) == 1 > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > + != CODE_FOR_nothing) > + (switch > + (if (types_match (argtype, long_long_unsigned_type_node)) > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > + (if (types_match (argtype, long_unsigned_type_node)) > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > + (if (types_match (argtype, unsigned_type_node)) > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > + > /* Simplify: > > a = a1 op a2 > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > new file mode 100644 > index 0000000..9f759f8 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcount } */ > +/* { dg-require-effective-target int32plus } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +const unsigned m1 = 0x55555555UL; > +const unsigned m2 = 0x33333333UL; > +const unsigned m4 = 0x0F0F0F0FUL; > +const unsigned h01 = 0x01010101UL; > +const int shift = 24; > + > +int popcount64c(unsigned x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + return (x * h01) >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > + > + > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > new file mode 100644 > index 0000000..ab33f79 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > @@ -0,0 +1,30 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcountl } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +#if __SIZEOF_LONG__ == 4 > +const unsigned long m1 = 0x55555555UL; > +const unsigned long m2 = 0x33333333UL; > +const unsigned long m4 = 0x0F0F0F0FUL; > +const unsigned long h01 = 0x01010101UL; > +const int shift = 24; > +#else > +const unsigned long m1 = 0x5555555555555555UL; > +const unsigned long m2 = 0x3333333333333333UL; > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > +const unsigned long h01 = 0x0101010101010101UL; > +const int shift = 56; > +#endif > + > + > +int popcount64c(unsigned long x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + return (x * h01) >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > + > + > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > new file mode 100644 > index 0000000..f3755f1 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcountll } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +const unsigned long long m1 = 0x5555555555555555ULL; > +const unsigned long long m2 = 0x3333333333333333ULL; > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > +const unsigned long long h01 = 0x0101010101010101ULL; > +const int shift = 56; > + > +int popcount64c(unsigned long long x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + return (x * h01) >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > index 3c50b89..32bbad7 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > } "" ] > } > > +# Return 1 if the target supports popcount on long long. > + > +proc check_effective_target_popcountll { } { > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > + int foo (long long b) > + { > + return __builtin_popcountll (b); > + } > + } "" ] > +} > + > + > +# Return 1 if the target supports popcount on int. > + > +proc check_effective_target_popcount { } { > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > + int foo (int b) > + { > + return __builtin_popcount (b); > + } > + } "" ] > +} > + > # Return 1 if the target supports atomic operations on "long long" > # and can execute them. > # ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-24 15:29 ` Dmitrij Pochepko @ 2019-09-26 7:47 ` Richard Biener 2019-10-01 11:49 ` Dmitrij Pochepko 0 siblings, 1 reply; 13+ messages in thread From: Richard Biener @ 2019-09-26 7:47 UTC (permalink / raw) To: Dmitrij Pochepko; +Cc: GCC Patches, Wilco Dijkstra, Richard Sandiford On Tue, Sep 24, 2019 at 5:29 PM Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com> wrote: > > Hi, > > can anybody take a look at v2? + (if (tree_to_uhwi (@4) == 1 + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 those will still ICE for large __int128_t constants. Since you do not match any conversions you should probably restrict the precision of 'type' like with (if (TYPE_PRECISION (type) <= 64 && tree_to_uhwi (@4) ... likewise tree_to_uhwi will fail for negative constants thus if the pattern assumes unsigned you should verify that as well with && TYPE_UNSIGNED (type). Your 'argtype' is simply 'type' so you can elide it. + (switch + (if (types_match (argtype, long_long_unsigned_type_node)) + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) + (if (types_match (argtype, long_unsigned_type_node)) + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) + (if (types_match (argtype, unsigned_type_node)) + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) Please test small types first so we can avoid popcountll when long == long long or long == int. I also wonder if we really want to use the builtins and check optab availability or if we nowadays should use direct_internal_fn_supported_p (IFN_POPCOUNT, integer_type_node, type, OPTIMIZE_FOR_BOTH) and (convert (IFN_POPCOUNT:type @0)) without the switch? Thanks, Richard. > Thanks, > Dmitrij > > On Mon, Sep 09, 2019 at 10:03:40PM +0300, Dmitrij Pochepko wrote: > > Hi all. > > > > Please take a look at v2 (attached). > > I changed patch according to review comments. The same testing was performed again. > > > > Thanks, > > Dmitrij > > > > On Thu, Sep 05, 2019 at 06:34:49PM +0300, Dmitrij Pochepko wrote: > > > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > > > > > int > > > foo64 (unsigned long long a) > > > { > > > unsigned long long b = a; > > > b -= ((b>>1) & 0x5555555555555555ULL); > > > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > > > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > > > b *= 0x0101010101010101ULL; > > > return (int)(b >> 56); > > > } > > > > > > and > > > > > > int > > > foo32 (unsigned int a) > > > { > > > unsigned long b = a; > > > b -= ((b>>1) & 0x55555555UL); > > > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > > > b = ((b>>4) + b) & 0x0F0F0F0FUL; > > > b *= 0x01010101UL; > > > return (int)(b >> 24); > > > } > > > > > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > > > > > (I have no write access to repo) > > > > > > Thanks, > > > Dmitrij > > > > > > > > > gcc/ChangeLog: > > > > > > PR tree-optimization/90836 > > > > > > * gcc/match.pd (popcount): New pattern. > > > > > > gcc/testsuite/ChangeLog: > > > > > > PR tree-optimization/90836 > > > > > > * lib/target-supports.exp (check_effective_target_popcount) > > > (check_effective_target_popcountll): New effective targets. > > > * gcc.dg/tree-ssa/popcount4.c: New test. > > > * gcc.dg/tree-ssa/popcount4l.c: New test. > > > * gcc.dg/tree-ssa/popcount4ll.c: New test. > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > index 0317bc7..b1867bf 100644 > > > --- a/gcc/match.pd > > > +++ b/gcc/match.pd > > > @@ -5358,6 +5358,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > (cmp (popcount @0) integer_zerop) > > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > > + > > > + int popcount64c (uint64_t x) > > > + { > > > + x -= (x >> 1) & 0x5555555555555555ULL; > > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > > + return (x * 0x0101010101010101ULL) >> 56; > > > + } > > > + > > > + int popcount32c (uint32_t x) > > > + { > > > + x -= (x >> 1) & 0x55555555; > > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > > + return (x * 0x01010101) >> 24; > > > + } */ > > > +(simplify > > > + (convert > > > + (rshift > > > + (mult > > > + (bit_and:c > > > + (plus:c > > > + (rshift @8 INTEGER_CST@5) > > > + (plus:c@8 > > > + (bit_and @6 INTEGER_CST@7) > > > + (bit_and > > > + (rshift > > > + (minus@6 > > > + @0 > > > + (bit_and > > > + (rshift @0 INTEGER_CST@4) > > > + INTEGER_CST@11)) > > > + INTEGER_CST@10) > > > + INTEGER_CST@9))) > > > + INTEGER_CST@3) > > > + INTEGER_CST@2) > > > + INTEGER_CST@1)) > > > + /* Check constants and optab. */ > > > + (with > > > + { > > > + tree argtype = TREE_TYPE (@0); > > > + unsigned prec = TYPE_PRECISION (argtype); > > > + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; > > > + const unsigned long long c1 = 0x0101010101010101ULL >> shift, > > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > > + c3 = 0x3333333333333333ULL >> shift, > > > + c4 = 0x5555555555555555ULL >> shift; > > > + } > > > + (if (types_match (type, integer_type_node) && tree_to_uhwi (@4) == 1 > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > > + != CODE_FOR_nothing) > > > + (switch > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > + (BUILT_IN_POPCOUNTLL @0)) > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > + (BUILT_IN_POPCOUNTL @0)) > > > + (if (types_match (argtype, unsigned_type_node)) > > > + (BUILT_IN_POPCOUNT @0)))))) > > > + > > > /* Simplify: > > > > > > a = a1 op a2 > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > new file mode 100644 > > > index 0000000..9f759f8 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > @@ -0,0 +1,22 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target popcount } */ > > > +/* { dg-require-effective-target int32plus } */ > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > + > > > +const unsigned m1 = 0x55555555UL; > > > +const unsigned m2 = 0x33333333UL; > > > +const unsigned m4 = 0x0F0F0F0FUL; > > > +const unsigned h01 = 0x01010101UL; > > > +const int shift = 24; > > > + > > > +int popcount64c(unsigned x) > > > +{ > > > + x -= (x >> 1) & m1; > > > + x = (x & m2) + ((x >> 2) & m2); > > > + x = (x + (x >> 4)) & m4; > > > + return (x * h01) >> shift; > > > +} > > > + > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > > + > > > + > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > new file mode 100644 > > > index 0000000..ab33f79 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > @@ -0,0 +1,30 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target popcountl } */ > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > + > > > +#if __SIZEOF_LONG__ == 4 > > > +const unsigned long m1 = 0x55555555UL; > > > +const unsigned long m2 = 0x33333333UL; > > > +const unsigned long m4 = 0x0F0F0F0FUL; > > > +const unsigned long h01 = 0x01010101UL; > > > +const int shift = 24; > > > +#else > > > +const unsigned long m1 = 0x5555555555555555UL; > > > +const unsigned long m2 = 0x3333333333333333UL; > > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > > +const unsigned long h01 = 0x0101010101010101UL; > > > +const int shift = 56; > > > +#endif > > > + > > > + > > > +int popcount64c(unsigned long x) > > > +{ > > > + x -= (x >> 1) & m1; > > > + x = (x & m2) + ((x >> 2) & m2); > > > + x = (x + (x >> 4)) & m4; > > > + return (x * h01) >> shift; > > > +} > > > + > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > > + > > > + > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > new file mode 100644 > > > index 0000000..f3755f1 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > @@ -0,0 +1,19 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target popcountll } */ > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > + > > > +const unsigned long long m1 = 0x5555555555555555ULL; > > > +const unsigned long long m2 = 0x3333333333333333ULL; > > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > > +const unsigned long long h01 = 0x0101010101010101ULL; > > > +const int shift = 56; > > > + > > > +int popcount64c(unsigned long long x) > > > +{ > > > + x -= (x >> 1) & m1; > > > + x = (x & m2) + ((x >> 2) & m2); > > > + x = (x + (x >> 4)) & m4; > > > + return (x * h01) >> shift; > > > +} > > > + > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > > index 3c50b89..32bbad7 100644 > > > --- a/gcc/testsuite/lib/target-supports.exp > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > > } "" ] > > > } > > > > > > +# Return 1 if the target supports popcount on long long. > > > + > > > +proc check_effective_target_popcountll { } { > > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > > + int foo (long long b) > > > + { > > > + return __builtin_popcountll (b); > > > + } > > > + } "" ] > > > +} > > > + > > > + > > > +# Return 1 if the target supports popcount on int. > > > + > > > +proc check_effective_target_popcount { } { > > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > > + int foo (int b) > > > + { > > > + return __builtin_popcount (b); > > > + } > > > + } "" ] > > > +} > > > + > > > # Return 1 if the target supports atomic operations on "long long" > > > # and can execute them. > > > # > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > index 0317bc7..896d5b6 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -5358,6 +5358,69 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > (cmp (popcount @0) integer_zerop) > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > + > > + int popcount64c (uint64_t x) > > + { > > + x -= (x >> 1) & 0x5555555555555555ULL; > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > + return (x * 0x0101010101010101ULL) >> 56; > > + } > > + > > + int popcount32c (uint32_t x) > > + { > > + x -= (x >> 1) & 0x55555555; > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > + return (x * 0x01010101) >> 24; > > + } */ > > +(simplify > > + (rshift > > + (mult > > + (bit_and > > + (plus:c > > + (rshift @8 INTEGER_CST@5) > > + (plus:c@8 > > + (bit_and @6 INTEGER_CST@7) > > + (bit_and > > + (rshift > > + (minus@6 > > + @0 > > + (bit_and > > + (rshift @0 INTEGER_CST@4) > > + INTEGER_CST@11)) > > + INTEGER_CST@10) > > + INTEGER_CST@9))) > > + INTEGER_CST@3) > > + INTEGER_CST@2) > > + INTEGER_CST@1) > > + /* Check constants and optab. */ > > + (with > > + { > > + tree argtype = TREE_TYPE (@0); > > + unsigned prec = TYPE_PRECISION (argtype); > > + int shift = 64 - prec; > > + const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > + c3 = 0x3333333333333333ULL >> shift, > > + c4 = 0x5555555555555555ULL >> shift; > > + } > > + (if (tree_to_uhwi (@4) == 1 > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > + != CODE_FOR_nothing) > > + (switch > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > > + (if (types_match (argtype, long_unsigned_type_node)) > > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > > + (if (types_match (argtype, unsigned_type_node)) > > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > > + > > /* Simplify: > > > > a = a1 op a2 > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > new file mode 100644 > > index 0000000..9f759f8 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > @@ -0,0 +1,22 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target popcount } */ > > +/* { dg-require-effective-target int32plus } */ > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > + > > +const unsigned m1 = 0x55555555UL; > > +const unsigned m2 = 0x33333333UL; > > +const unsigned m4 = 0x0F0F0F0FUL; > > +const unsigned h01 = 0x01010101UL; > > +const int shift = 24; > > + > > +int popcount64c(unsigned x) > > +{ > > + x -= (x >> 1) & m1; > > + x = (x & m2) + ((x >> 2) & m2); > > + x = (x + (x >> 4)) & m4; > > + return (x * h01) >> shift; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > + > > + > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > new file mode 100644 > > index 0000000..ab33f79 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > @@ -0,0 +1,30 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target popcountl } */ > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > + > > +#if __SIZEOF_LONG__ == 4 > > +const unsigned long m1 = 0x55555555UL; > > +const unsigned long m2 = 0x33333333UL; > > +const unsigned long m4 = 0x0F0F0F0FUL; > > +const unsigned long h01 = 0x01010101UL; > > +const int shift = 24; > > +#else > > +const unsigned long m1 = 0x5555555555555555UL; > > +const unsigned long m2 = 0x3333333333333333UL; > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > +const unsigned long h01 = 0x0101010101010101UL; > > +const int shift = 56; > > +#endif > > + > > + > > +int popcount64c(unsigned long x) > > +{ > > + x -= (x >> 1) & m1; > > + x = (x & m2) + ((x >> 2) & m2); > > + x = (x + (x >> 4)) & m4; > > + return (x * h01) >> shift; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > + > > + > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > new file mode 100644 > > index 0000000..f3755f1 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target popcountll } */ > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > + > > +const unsigned long long m1 = 0x5555555555555555ULL; > > +const unsigned long long m2 = 0x3333333333333333ULL; > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > +const unsigned long long h01 = 0x0101010101010101ULL; > > +const int shift = 56; > > + > > +int popcount64c(unsigned long long x) > > +{ > > + x -= (x >> 1) & m1; > > + x = (x & m2) + ((x >> 2) & m2); > > + x = (x + (x >> 4)) & m4; > > + return (x * h01) >> shift; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > index 3c50b89..32bbad7 100644 > > --- a/gcc/testsuite/lib/target-supports.exp > > +++ b/gcc/testsuite/lib/target-supports.exp > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > } "" ] > > } > > > > +# Return 1 if the target supports popcount on long long. > > + > > +proc check_effective_target_popcountll { } { > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > + int foo (long long b) > > + { > > + return __builtin_popcountll (b); > > + } > > + } "" ] > > +} > > + > > + > > +# Return 1 if the target supports popcount on int. > > + > > +proc check_effective_target_popcount { } { > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > + int foo (int b) > > + { > > + return __builtin_popcount (b); > > + } > > + } "" ] > > +} > > + > > # Return 1 if the target supports atomic operations on "long long" > > # and can execute them. > > # > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-26 7:47 ` Richard Biener @ 2019-10-01 11:49 ` Dmitrij Pochepko 2019-10-07 10:04 ` Richard Biener 0 siblings, 1 reply; 13+ messages in thread From: Dmitrij Pochepko @ 2019-10-01 11:49 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches, Wilco.Dijkstra [-- Attachment #1: Type: text/plain, Size: 19093 bytes --] Hi Richard, I updated patch according to all your comments. Also bootstrapped and tested again on x86_64-pc-linux-gnu and aarch64-linux-gnu, which took some time. attached v3. Thanks, Dmitrij On Thu, Sep 26, 2019 at 09:47:04AM +0200, Richard Biener wrote: > On Tue, Sep 24, 2019 at 5:29 PM Dmitrij Pochepko > <dmitrij.pochepko@bell-sw.com> wrote: > > > > Hi, > > > > can anybody take a look at v2? > > + (if (tree_to_uhwi (@4) == 1 > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > those will still ICE for large __int128_t constants. Since you do not match > any conversions you should probably restrict the precision of 'type' like > with > (if (TYPE_PRECISION (type) <= 64 > && tree_to_uhwi (@4) ... > > likewise tree_to_uhwi will fail for negative constants thus if the > pattern assumes > unsigned you should verify that as well with && TYPE_UNSIGNED (type). > > Your 'argtype' is simply 'type' so you can elide it. > > + (switch > + (if (types_match (argtype, long_long_unsigned_type_node)) > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > + (if (types_match (argtype, long_unsigned_type_node)) > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > + (if (types_match (argtype, unsigned_type_node)) > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > > Please test small types first so we can avoid popcountll when long == long long > or long == int. I also wonder if we really want to use the builtins and > check optab availability or if we nowadays should use > direct_internal_fn_supported_p (IFN_POPCOUNT, integer_type_node, type, > OPTIMIZE_FOR_BOTH) and > > (convert (IFN_POPCOUNT:type @0)) > > without the switch? > > Thanks, > Richard. > > > Thanks, > > Dmitrij > > > > On Mon, Sep 09, 2019 at 10:03:40PM +0300, Dmitrij Pochepko wrote: > > > Hi all. > > > > > > Please take a look at v2 (attached). > > > I changed patch according to review comments. The same testing was performed again. > > > > > > Thanks, > > > Dmitrij > > > > > > On Thu, Sep 05, 2019 at 06:34:49PM +0300, Dmitrij Pochepko wrote: > > > > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > > > > > > > int > > > > foo64 (unsigned long long a) > > > > { > > > > unsigned long long b = a; > > > > b -= ((b>>1) & 0x5555555555555555ULL); > > > > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > > > > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > > > > b *= 0x0101010101010101ULL; > > > > return (int)(b >> 56); > > > > } > > > > > > > > and > > > > > > > > int > > > > foo32 (unsigned int a) > > > > { > > > > unsigned long b = a; > > > > b -= ((b>>1) & 0x55555555UL); > > > > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > > > > b = ((b>>4) + b) & 0x0F0F0F0FUL; > > > > b *= 0x01010101UL; > > > > return (int)(b >> 24); > > > > } > > > > > > > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > > > > > > > (I have no write access to repo) > > > > > > > > Thanks, > > > > Dmitrij > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > PR tree-optimization/90836 > > > > > > > > * gcc/match.pd (popcount): New pattern. > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > PR tree-optimization/90836 > > > > > > > > * lib/target-supports.exp (check_effective_target_popcount) > > > > (check_effective_target_popcountll): New effective targets. > > > > * gcc.dg/tree-ssa/popcount4.c: New test. > > > > * gcc.dg/tree-ssa/popcount4l.c: New test. > > > > * gcc.dg/tree-ssa/popcount4ll.c: New test. > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > index 0317bc7..b1867bf 100644 > > > > --- a/gcc/match.pd > > > > +++ b/gcc/match.pd > > > > @@ -5358,6 +5358,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > > (cmp (popcount @0) integer_zerop) > > > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > > > + > > > > + int popcount64c (uint64_t x) > > > > + { > > > > + x -= (x >> 1) & 0x5555555555555555ULL; > > > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > > > + return (x * 0x0101010101010101ULL) >> 56; > > > > + } > > > > + > > > > + int popcount32c (uint32_t x) > > > > + { > > > > + x -= (x >> 1) & 0x55555555; > > > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > > > + return (x * 0x01010101) >> 24; > > > > + } */ > > > > +(simplify > > > > + (convert > > > > + (rshift > > > > + (mult > > > > + (bit_and:c > > > > + (plus:c > > > > + (rshift @8 INTEGER_CST@5) > > > > + (plus:c@8 > > > > + (bit_and @6 INTEGER_CST@7) > > > > + (bit_and > > > > + (rshift > > > > + (minus@6 > > > > + @0 > > > > + (bit_and > > > > + (rshift @0 INTEGER_CST@4) > > > > + INTEGER_CST@11)) > > > > + INTEGER_CST@10) > > > > + INTEGER_CST@9))) > > > > + INTEGER_CST@3) > > > > + INTEGER_CST@2) > > > > + INTEGER_CST@1)) > > > > + /* Check constants and optab. */ > > > > + (with > > > > + { > > > > + tree argtype = TREE_TYPE (@0); > > > > + unsigned prec = TYPE_PRECISION (argtype); > > > > + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; > > > > + const unsigned long long c1 = 0x0101010101010101ULL >> shift, > > > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > > > + c3 = 0x3333333333333333ULL >> shift, > > > > + c4 = 0x5555555555555555ULL >> shift; > > > > + } > > > > + (if (types_match (type, integer_type_node) && tree_to_uhwi (@4) == 1 > > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > > > + != CODE_FOR_nothing) > > > > + (switch > > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > > + (BUILT_IN_POPCOUNTLL @0)) > > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > > + (BUILT_IN_POPCOUNTL @0)) > > > > + (if (types_match (argtype, unsigned_type_node)) > > > > + (BUILT_IN_POPCOUNT @0)))))) > > > > + > > > > /* Simplify: > > > > > > > > a = a1 op a2 > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > new file mode 100644 > > > > index 0000000..9f759f8 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > @@ -0,0 +1,22 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-require-effective-target popcount } */ > > > > +/* { dg-require-effective-target int32plus } */ > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > + > > > > +const unsigned m1 = 0x55555555UL; > > > > +const unsigned m2 = 0x33333333UL; > > > > +const unsigned m4 = 0x0F0F0F0FUL; > > > > +const unsigned h01 = 0x01010101UL; > > > > +const int shift = 24; > > > > + > > > > +int popcount64c(unsigned x) > > > > +{ > > > > + x -= (x >> 1) & m1; > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > + x = (x + (x >> 4)) & m4; > > > > + return (x * h01) >> shift; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > > > + > > > > + > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > new file mode 100644 > > > > index 0000000..ab33f79 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > @@ -0,0 +1,30 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-require-effective-target popcountl } */ > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > + > > > > +#if __SIZEOF_LONG__ == 4 > > > > +const unsigned long m1 = 0x55555555UL; > > > > +const unsigned long m2 = 0x33333333UL; > > > > +const unsigned long m4 = 0x0F0F0F0FUL; > > > > +const unsigned long h01 = 0x01010101UL; > > > > +const int shift = 24; > > > > +#else > > > > +const unsigned long m1 = 0x5555555555555555UL; > > > > +const unsigned long m2 = 0x3333333333333333UL; > > > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > > > +const unsigned long h01 = 0x0101010101010101UL; > > > > +const int shift = 56; > > > > +#endif > > > > + > > > > + > > > > +int popcount64c(unsigned long x) > > > > +{ > > > > + x -= (x >> 1) & m1; > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > + x = (x + (x >> 4)) & m4; > > > > + return (x * h01) >> shift; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > > > + > > > > + > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > new file mode 100644 > > > > index 0000000..f3755f1 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > @@ -0,0 +1,19 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-require-effective-target popcountll } */ > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > + > > > > +const unsigned long long m1 = 0x5555555555555555ULL; > > > > +const unsigned long long m2 = 0x3333333333333333ULL; > > > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > > > +const unsigned long long h01 = 0x0101010101010101ULL; > > > > +const int shift = 56; > > > > + > > > > +int popcount64c(unsigned long long x) > > > > +{ > > > > + x -= (x >> 1) & m1; > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > + x = (x + (x >> 4)) & m4; > > > > + return (x * h01) >> shift; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > > > index 3c50b89..32bbad7 100644 > > > > --- a/gcc/testsuite/lib/target-supports.exp > > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > > > } "" ] > > > > } > > > > > > > > +# Return 1 if the target supports popcount on long long. > > > > + > > > > +proc check_effective_target_popcountll { } { > > > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > > > + int foo (long long b) > > > > + { > > > > + return __builtin_popcountll (b); > > > > + } > > > > + } "" ] > > > > +} > > > > + > > > > + > > > > +# Return 1 if the target supports popcount on int. > > > > + > > > > +proc check_effective_target_popcount { } { > > > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > > > + int foo (int b) > > > > + { > > > > + return __builtin_popcount (b); > > > > + } > > > > + } "" ] > > > > +} > > > > + > > > > # Return 1 if the target supports atomic operations on "long long" > > > > # and can execute them. > > > > # > > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > index 0317bc7..896d5b6 100644 > > > --- a/gcc/match.pd > > > +++ b/gcc/match.pd > > > @@ -5358,6 +5358,69 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > (cmp (popcount @0) integer_zerop) > > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > > + > > > + int popcount64c (uint64_t x) > > > + { > > > + x -= (x >> 1) & 0x5555555555555555ULL; > > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > > + return (x * 0x0101010101010101ULL) >> 56; > > > + } > > > + > > > + int popcount32c (uint32_t x) > > > + { > > > + x -= (x >> 1) & 0x55555555; > > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > > + return (x * 0x01010101) >> 24; > > > + } */ > > > +(simplify > > > + (rshift > > > + (mult > > > + (bit_and > > > + (plus:c > > > + (rshift @8 INTEGER_CST@5) > > > + (plus:c@8 > > > + (bit_and @6 INTEGER_CST@7) > > > + (bit_and > > > + (rshift > > > + (minus@6 > > > + @0 > > > + (bit_and > > > + (rshift @0 INTEGER_CST@4) > > > + INTEGER_CST@11)) > > > + INTEGER_CST@10) > > > + INTEGER_CST@9))) > > > + INTEGER_CST@3) > > > + INTEGER_CST@2) > > > + INTEGER_CST@1) > > > + /* Check constants and optab. */ > > > + (with > > > + { > > > + tree argtype = TREE_TYPE (@0); > > > + unsigned prec = TYPE_PRECISION (argtype); > > > + int shift = 64 - prec; > > > + const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, > > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > > + c3 = 0x3333333333333333ULL >> shift, > > > + c4 = 0x5555555555555555ULL >> shift; > > > + } > > > + (if (tree_to_uhwi (@4) == 1 > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > > + != CODE_FOR_nothing) > > > + (switch > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > > > + (if (types_match (argtype, unsigned_type_node)) > > > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > > > + > > > /* Simplify: > > > > > > a = a1 op a2 > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > new file mode 100644 > > > index 0000000..9f759f8 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > @@ -0,0 +1,22 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target popcount } */ > > > +/* { dg-require-effective-target int32plus } */ > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > + > > > +const unsigned m1 = 0x55555555UL; > > > +const unsigned m2 = 0x33333333UL; > > > +const unsigned m4 = 0x0F0F0F0FUL; > > > +const unsigned h01 = 0x01010101UL; > > > +const int shift = 24; > > > + > > > +int popcount64c(unsigned x) > > > +{ > > > + x -= (x >> 1) & m1; > > > + x = (x & m2) + ((x >> 2) & m2); > > > + x = (x + (x >> 4)) & m4; > > > + return (x * h01) >> shift; > > > +} > > > + > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > > + > > > + > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > new file mode 100644 > > > index 0000000..ab33f79 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > @@ -0,0 +1,30 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target popcountl } */ > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > + > > > +#if __SIZEOF_LONG__ == 4 > > > +const unsigned long m1 = 0x55555555UL; > > > +const unsigned long m2 = 0x33333333UL; > > > +const unsigned long m4 = 0x0F0F0F0FUL; > > > +const unsigned long h01 = 0x01010101UL; > > > +const int shift = 24; > > > +#else > > > +const unsigned long m1 = 0x5555555555555555UL; > > > +const unsigned long m2 = 0x3333333333333333UL; > > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > > +const unsigned long h01 = 0x0101010101010101UL; > > > +const int shift = 56; > > > +#endif > > > + > > > + > > > +int popcount64c(unsigned long x) > > > +{ > > > + x -= (x >> 1) & m1; > > > + x = (x & m2) + ((x >> 2) & m2); > > > + x = (x + (x >> 4)) & m4; > > > + return (x * h01) >> shift; > > > +} > > > + > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > > + > > > + > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > new file mode 100644 > > > index 0000000..f3755f1 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > @@ -0,0 +1,19 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target popcountll } */ > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > + > > > +const unsigned long long m1 = 0x5555555555555555ULL; > > > +const unsigned long long m2 = 0x3333333333333333ULL; > > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > > +const unsigned long long h01 = 0x0101010101010101ULL; > > > +const int shift = 56; > > > + > > > +int popcount64c(unsigned long long x) > > > +{ > > > + x -= (x >> 1) & m1; > > > + x = (x & m2) + ((x >> 2) & m2); > > > + x = (x + (x >> 4)) & m4; > > > + return (x * h01) >> shift; > > > +} > > > + > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > > index 3c50b89..32bbad7 100644 > > > --- a/gcc/testsuite/lib/target-supports.exp > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > > } "" ] > > > } > > > > > > +# Return 1 if the target supports popcount on long long. > > > + > > > +proc check_effective_target_popcountll { } { > > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > > + int foo (long long b) > > > + { > > > + return __builtin_popcountll (b); > > > + } > > > + } "" ] > > > +} > > > + > > > + > > > +# Return 1 if the target supports popcount on int. > > > + > > > +proc check_effective_target_popcount { } { > > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > > + int foo (int b) > > > + { > > > + return __builtin_popcount (b); > > > + } > > > + } "" ] > > > +} > > > + > > > # Return 1 if the target supports atomic operations on "long long" > > > # and can execute them. > > > # > > [-- Attachment #2: popcount_v3.diff --] [-- Type: text/x-diff, Size: 5739 bytes --] diff --git a/gcc/match.pd b/gcc/match.pd index 0317bc7..9d8131c 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -5358,6 +5358,64 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (cmp (popcount @0) integer_zerop) (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) +#if GIMPLE +/* 64- and 32-bits branchless implementations of popcount are detected: + + int popcount64c (uint64_t x) + { + x -= (x >> 1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> 56; + } + + int popcount32c (uint32_t x) + { + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0f0f0f0f; + return (x * 0x01010101) >> 24; + } */ +(simplify + (rshift + (mult + (bit_and + (plus:c + (rshift @8 INTEGER_CST@5) + (plus:c@8 + (bit_and @6 INTEGER_CST@7) + (bit_and + (rshift + (minus@6 + @0 + (bit_and + (rshift @0 INTEGER_CST@4) + INTEGER_CST@11)) + INTEGER_CST@10) + INTEGER_CST@9))) + INTEGER_CST@3) + INTEGER_CST@2) + INTEGER_CST@1) + /* Check constants and optab. */ + (with + { + unsigned prec = TYPE_PRECISION (type); + int shift = 64 - prec; + const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, + c3 = 0x3333333333333333ULL >> shift, + c4 = 0x5555555555555555ULL >> shift; + } + (if (prec <= 64 && TYPE_UNSIGNED (type) && tree_to_uhwi (@4) == 1 + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 + && direct_internal_fn_supported_p (IFN_POPCOUNT, type, + OPTIMIZE_FOR_BOTH)) + (convert (IFN_POPCOUNT:type @0))))) +#endif + /* Simplify: a = a1 op a2 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c new file mode 100644 index 0000000..bfb563b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcount } */ +/* { dg-require-effective-target int32plus } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned m1 = 0x55555555UL; +const unsigned m2 = 0x33333333UL; +const unsigned m4 = 0x0F0F0F0FUL; +const unsigned h01 = 0x01010101UL; +const int shift = 24; + +int popcount64c(unsigned x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ + + diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c new file mode 100644 index 0000000..69fb2d1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcountl } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#if __SIZEOF_LONG__ == 4 +const unsigned long m1 = 0x55555555UL; +const unsigned long m2 = 0x33333333UL; +const unsigned long m4 = 0x0F0F0F0FUL; +const unsigned long h01 = 0x01010101UL; +const int shift = 24; +#else +const unsigned long m1 = 0x5555555555555555UL; +const unsigned long m2 = 0x3333333333333333UL; +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; +const unsigned long h01 = 0x0101010101010101UL; +const int shift = 56; +#endif + + +int popcount64c(unsigned long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ + + diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c new file mode 100644 index 0000000..191d957 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target popcountll } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned long long m1 = 0x5555555555555555ULL; +const unsigned long long m2 = 0x3333333333333333ULL; +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; +const unsigned long long h01 = 0x0101010101010101ULL; +const int shift = 56; + +int popcount64c(unsigned long long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + return (x * h01) >> shift; +} + +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 3c50b89..32bbad7 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { } "" ] } +# Return 1 if the target supports popcount on long long. + +proc check_effective_target_popcountll { } { + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { + int foo (long long b) + { + return __builtin_popcountll (b); + } + } "" ] +} + + +# Return 1 if the target supports popcount on int. + +proc check_effective_target_popcount { } { + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { + int foo (int b) + { + return __builtin_popcount (b); + } + } "" ] +} + # Return 1 if the target supports atomic operations on "long long" # and can execute them. # ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-10-01 11:49 ` Dmitrij Pochepko @ 2019-10-07 10:04 ` Richard Biener 2019-10-08 22:14 ` Steve Ellcey 2019-12-29 22:27 ` Andrew Pinski 0 siblings, 2 replies; 13+ messages in thread From: Richard Biener @ 2019-10-07 10:04 UTC (permalink / raw) To: Dmitrij Pochepko; +Cc: GCC Patches, Wilco Dijkstra On Tue, Oct 1, 2019 at 1:48 PM Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com> wrote: > > Hi Richard, > > I updated patch according to all your comments. > Also bootstrapped and tested again on x86_64-pc-linux-gnu and aarch64-linux-gnu, which took some time. > > attached v3. OK. Thanks, Richard. > Thanks, > Dmitrij > > On Thu, Sep 26, 2019 at 09:47:04AM +0200, Richard Biener wrote: > > On Tue, Sep 24, 2019 at 5:29 PM Dmitrij Pochepko > > <dmitrij.pochepko@bell-sw.com> wrote: > > > > > > Hi, > > > > > > can anybody take a look at v2? > > > > + (if (tree_to_uhwi (@4) == 1 > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > > those will still ICE for large __int128_t constants. Since you do not match > > any conversions you should probably restrict the precision of 'type' like > > with > > (if (TYPE_PRECISION (type) <= 64 > > && tree_to_uhwi (@4) ... > > > > likewise tree_to_uhwi will fail for negative constants thus if the > > pattern assumes > > unsigned you should verify that as well with && TYPE_UNSIGNED (type). > > > > Your 'argtype' is simply 'type' so you can elide it. > > > > + (switch > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > > + (if (types_match (argtype, long_unsigned_type_node)) > > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > > + (if (types_match (argtype, unsigned_type_node)) > > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > > > > Please test small types first so we can avoid popcountll when long == long long > > or long == int. I also wonder if we really want to use the builtins and > > check optab availability or if we nowadays should use > > direct_internal_fn_supported_p (IFN_POPCOUNT, integer_type_node, type, > > OPTIMIZE_FOR_BOTH) and > > > > (convert (IFN_POPCOUNT:type @0)) > > > > without the switch? > > > > Thanks, > > Richard. > > > > > Thanks, > > > Dmitrij > > > > > > On Mon, Sep 09, 2019 at 10:03:40PM +0300, Dmitrij Pochepko wrote: > > > > Hi all. > > > > > > > > Please take a look at v2 (attached). > > > > I changed patch according to review comments. The same testing was performed again. > > > > > > > > Thanks, > > > > Dmitrij > > > > > > > > On Thu, Sep 05, 2019 at 06:34:49PM +0300, Dmitrij Pochepko wrote: > > > > > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > > > > > > > > > int > > > > > foo64 (unsigned long long a) > > > > > { > > > > > unsigned long long b = a; > > > > > b -= ((b>>1) & 0x5555555555555555ULL); > > > > > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > > > > > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > > > > > b *= 0x0101010101010101ULL; > > > > > return (int)(b >> 56); > > > > > } > > > > > > > > > > and > > > > > > > > > > int > > > > > foo32 (unsigned int a) > > > > > { > > > > > unsigned long b = a; > > > > > b -= ((b>>1) & 0x55555555UL); > > > > > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > > > > > b = ((b>>4) + b) & 0x0F0F0F0FUL; > > > > > b *= 0x01010101UL; > > > > > return (int)(b >> 24); > > > > > } > > > > > > > > > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > > > > > > > > > (I have no write access to repo) > > > > > > > > > > Thanks, > > > > > Dmitrij > > > > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > PR tree-optimization/90836 > > > > > > > > > > * gcc/match.pd (popcount): New pattern. > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > PR tree-optimization/90836 > > > > > > > > > > * lib/target-supports.exp (check_effective_target_popcount) > > > > > (check_effective_target_popcountll): New effective targets. > > > > > * gcc.dg/tree-ssa/popcount4.c: New test. > > > > > * gcc.dg/tree-ssa/popcount4l.c: New test. > > > > > * gcc.dg/tree-ssa/popcount4ll.c: New test. > > > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > > index 0317bc7..b1867bf 100644 > > > > > --- a/gcc/match.pd > > > > > +++ b/gcc/match.pd > > > > > @@ -5358,6 +5358,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > > > (cmp (popcount @0) integer_zerop) > > > > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > > > > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > > > > + > > > > > + int popcount64c (uint64_t x) > > > > > + { > > > > > + x -= (x >> 1) & 0x5555555555555555ULL; > > > > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > > > > + return (x * 0x0101010101010101ULL) >> 56; > > > > > + } > > > > > + > > > > > + int popcount32c (uint32_t x) > > > > > + { > > > > > + x -= (x >> 1) & 0x55555555; > > > > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > > > > + return (x * 0x01010101) >> 24; > > > > > + } */ > > > > > +(simplify > > > > > + (convert > > > > > + (rshift > > > > > + (mult > > > > > + (bit_and:c > > > > > + (plus:c > > > > > + (rshift @8 INTEGER_CST@5) > > > > > + (plus:c@8 > > > > > + (bit_and @6 INTEGER_CST@7) > > > > > + (bit_and > > > > > + (rshift > > > > > + (minus@6 > > > > > + @0 > > > > > + (bit_and > > > > > + (rshift @0 INTEGER_CST@4) > > > > > + INTEGER_CST@11)) > > > > > + INTEGER_CST@10) > > > > > + INTEGER_CST@9))) > > > > > + INTEGER_CST@3) > > > > > + INTEGER_CST@2) > > > > > + INTEGER_CST@1)) > > > > > + /* Check constants and optab. */ > > > > > + (with > > > > > + { > > > > > + tree argtype = TREE_TYPE (@0); > > > > > + unsigned prec = TYPE_PRECISION (argtype); > > > > > + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; > > > > > + const unsigned long long c1 = 0x0101010101010101ULL >> shift, > > > > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > > > > + c3 = 0x3333333333333333ULL >> shift, > > > > > + c4 = 0x5555555555555555ULL >> shift; > > > > > + } > > > > > + (if (types_match (type, integer_type_node) && tree_to_uhwi (@4) == 1 > > > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > > > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > > > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > > > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > > > > + != CODE_FOR_nothing) > > > > > + (switch > > > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > > > + (BUILT_IN_POPCOUNTLL @0)) > > > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > > > + (BUILT_IN_POPCOUNTL @0)) > > > > > + (if (types_match (argtype, unsigned_type_node)) > > > > > + (BUILT_IN_POPCOUNT @0)))))) > > > > > + > > > > > /* Simplify: > > > > > > > > > > a = a1 op a2 > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > > new file mode 100644 > > > > > index 0000000..9f759f8 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > > @@ -0,0 +1,22 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-require-effective-target popcount } */ > > > > > +/* { dg-require-effective-target int32plus } */ > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > + > > > > > +const unsigned m1 = 0x55555555UL; > > > > > +const unsigned m2 = 0x33333333UL; > > > > > +const unsigned m4 = 0x0F0F0F0FUL; > > > > > +const unsigned h01 = 0x01010101UL; > > > > > +const int shift = 24; > > > > > + > > > > > +int popcount64c(unsigned x) > > > > > +{ > > > > > + x -= (x >> 1) & m1; > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > + x = (x + (x >> 4)) & m4; > > > > > + return (x * h01) >> shift; > > > > > +} > > > > > + > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > > > > + > > > > > + > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > > new file mode 100644 > > > > > index 0000000..ab33f79 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > > @@ -0,0 +1,30 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-require-effective-target popcountl } */ > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > + > > > > > +#if __SIZEOF_LONG__ == 4 > > > > > +const unsigned long m1 = 0x55555555UL; > > > > > +const unsigned long m2 = 0x33333333UL; > > > > > +const unsigned long m4 = 0x0F0F0F0FUL; > > > > > +const unsigned long h01 = 0x01010101UL; > > > > > +const int shift = 24; > > > > > +#else > > > > > +const unsigned long m1 = 0x5555555555555555UL; > > > > > +const unsigned long m2 = 0x3333333333333333UL; > > > > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > > > > +const unsigned long h01 = 0x0101010101010101UL; > > > > > +const int shift = 56; > > > > > +#endif > > > > > + > > > > > + > > > > > +int popcount64c(unsigned long x) > > > > > +{ > > > > > + x -= (x >> 1) & m1; > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > + x = (x + (x >> 4)) & m4; > > > > > + return (x * h01) >> shift; > > > > > +} > > > > > + > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > > > > + > > > > > + > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > > new file mode 100644 > > > > > index 0000000..f3755f1 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > > @@ -0,0 +1,19 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-require-effective-target popcountll } */ > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > + > > > > > +const unsigned long long m1 = 0x5555555555555555ULL; > > > > > +const unsigned long long m2 = 0x3333333333333333ULL; > > > > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > > > > +const unsigned long long h01 = 0x0101010101010101ULL; > > > > > +const int shift = 56; > > > > > + > > > > > +int popcount64c(unsigned long long x) > > > > > +{ > > > > > + x -= (x >> 1) & m1; > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > + x = (x + (x >> 4)) & m4; > > > > > + return (x * h01) >> shift; > > > > > +} > > > > > + > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > > > > index 3c50b89..32bbad7 100644 > > > > > --- a/gcc/testsuite/lib/target-supports.exp > > > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > > > > } "" ] > > > > > } > > > > > > > > > > +# Return 1 if the target supports popcount on long long. > > > > > + > > > > > +proc check_effective_target_popcountll { } { > > > > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > > > > + int foo (long long b) > > > > > + { > > > > > + return __builtin_popcountll (b); > > > > > + } > > > > > + } "" ] > > > > > +} > > > > > + > > > > > + > > > > > +# Return 1 if the target supports popcount on int. > > > > > + > > > > > +proc check_effective_target_popcount { } { > > > > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > > > > + int foo (int b) > > > > > + { > > > > > + return __builtin_popcount (b); > > > > > + } > > > > > + } "" ] > > > > > +} > > > > > + > > > > > # Return 1 if the target supports atomic operations on "long long" > > > > > # and can execute them. > > > > > # > > > > > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > index 0317bc7..896d5b6 100644 > > > > --- a/gcc/match.pd > > > > +++ b/gcc/match.pd > > > > @@ -5358,6 +5358,69 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > > (cmp (popcount @0) integer_zerop) > > > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > > > + > > > > + int popcount64c (uint64_t x) > > > > + { > > > > + x -= (x >> 1) & 0x5555555555555555ULL; > > > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > > > + return (x * 0x0101010101010101ULL) >> 56; > > > > + } > > > > + > > > > + int popcount32c (uint32_t x) > > > > + { > > > > + x -= (x >> 1) & 0x55555555; > > > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > > > + return (x * 0x01010101) >> 24; > > > > + } */ > > > > +(simplify > > > > + (rshift > > > > + (mult > > > > + (bit_and > > > > + (plus:c > > > > + (rshift @8 INTEGER_CST@5) > > > > + (plus:c@8 > > > > + (bit_and @6 INTEGER_CST@7) > > > > + (bit_and > > > > + (rshift > > > > + (minus@6 > > > > + @0 > > > > + (bit_and > > > > + (rshift @0 INTEGER_CST@4) > > > > + INTEGER_CST@11)) > > > > + INTEGER_CST@10) > > > > + INTEGER_CST@9))) > > > > + INTEGER_CST@3) > > > > + INTEGER_CST@2) > > > > + INTEGER_CST@1) > > > > + /* Check constants and optab. */ > > > > + (with > > > > + { > > > > + tree argtype = TREE_TYPE (@0); > > > > + unsigned prec = TYPE_PRECISION (argtype); > > > > + int shift = 64 - prec; > > > > + const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, > > > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > > > + c3 = 0x3333333333333333ULL >> shift, > > > > + c4 = 0x5555555555555555ULL >> shift; > > > > + } > > > > + (if (tree_to_uhwi (@4) == 1 > > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > > > + != CODE_FOR_nothing) > > > > + (switch > > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > > > > + (if (types_match (argtype, unsigned_type_node)) > > > > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > > > > + > > > > /* Simplify: > > > > > > > > a = a1 op a2 > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > new file mode 100644 > > > > index 0000000..9f759f8 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > @@ -0,0 +1,22 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-require-effective-target popcount } */ > > > > +/* { dg-require-effective-target int32plus } */ > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > + > > > > +const unsigned m1 = 0x55555555UL; > > > > +const unsigned m2 = 0x33333333UL; > > > > +const unsigned m4 = 0x0F0F0F0FUL; > > > > +const unsigned h01 = 0x01010101UL; > > > > +const int shift = 24; > > > > + > > > > +int popcount64c(unsigned x) > > > > +{ > > > > + x -= (x >> 1) & m1; > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > + x = (x + (x >> 4)) & m4; > > > > + return (x * h01) >> shift; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > > > + > > > > + > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > new file mode 100644 > > > > index 0000000..ab33f79 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > @@ -0,0 +1,30 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-require-effective-target popcountl } */ > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > + > > > > +#if __SIZEOF_LONG__ == 4 > > > > +const unsigned long m1 = 0x55555555UL; > > > > +const unsigned long m2 = 0x33333333UL; > > > > +const unsigned long m4 = 0x0F0F0F0FUL; > > > > +const unsigned long h01 = 0x01010101UL; > > > > +const int shift = 24; > > > > +#else > > > > +const unsigned long m1 = 0x5555555555555555UL; > > > > +const unsigned long m2 = 0x3333333333333333UL; > > > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > > > +const unsigned long h01 = 0x0101010101010101UL; > > > > +const int shift = 56; > > > > +#endif > > > > + > > > > + > > > > +int popcount64c(unsigned long x) > > > > +{ > > > > + x -= (x >> 1) & m1; > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > + x = (x + (x >> 4)) & m4; > > > > + return (x * h01) >> shift; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > > > + > > > > + > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > new file mode 100644 > > > > index 0000000..f3755f1 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > @@ -0,0 +1,19 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-require-effective-target popcountll } */ > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > + > > > > +const unsigned long long m1 = 0x5555555555555555ULL; > > > > +const unsigned long long m2 = 0x3333333333333333ULL; > > > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > > > +const unsigned long long h01 = 0x0101010101010101ULL; > > > > +const int shift = 56; > > > > + > > > > +int popcount64c(unsigned long long x) > > > > +{ > > > > + x -= (x >> 1) & m1; > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > + x = (x + (x >> 4)) & m4; > > > > + return (x * h01) >> shift; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > > > index 3c50b89..32bbad7 100644 > > > > --- a/gcc/testsuite/lib/target-supports.exp > > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > > > } "" ] > > > > } > > > > > > > > +# Return 1 if the target supports popcount on long long. > > > > + > > > > +proc check_effective_target_popcountll { } { > > > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > > > + int foo (long long b) > > > > + { > > > > + return __builtin_popcountll (b); > > > > + } > > > > + } "" ] > > > > +} > > > > + > > > > + > > > > +# Return 1 if the target supports popcount on int. > > > > + > > > > +proc check_effective_target_popcount { } { > > > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > > > + int foo (int b) > > > > + { > > > > + return __builtin_popcount (b); > > > > + } > > > > + } "" ] > > > > +} > > > > + > > > > # Return 1 if the target supports atomic operations on "long long" > > > > # and can execute them. > > > > # > > > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-10-07 10:04 ` Richard Biener @ 2019-10-08 22:14 ` Steve Ellcey 2019-12-29 22:27 ` Andrew Pinski 1 sibling, 0 replies; 13+ messages in thread From: Steve Ellcey @ 2019-10-08 22:14 UTC (permalink / raw) To: dmitrij.pochepko, richard.guenther; +Cc: gcc-patches, Wilco.Dijkstra On Mon, 2019-10-07 at 12:04 +0200, Richard Biener wrote: > On Tue, Oct 1, 2019 at 1:48 PM Dmitrij Pochepko > <dmitrij.pochepko@bell-sw.com> wrote: > > > > Hi Richard, > > > > I updated patch according to all your comments. > > Also bootstrapped and tested again on x86_64-pc-linux-gnu and > > aarch64-linux-gnu, which took some time. > > > > attached v3. > > OK. > > Thanks, > Richard. Dmitrij, I checked in this patch for you. Steve Ellcey sellcey@marvell.com ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-10-07 10:04 ` Richard Biener 2019-10-08 22:14 ` Steve Ellcey @ 2019-12-29 22:27 ` Andrew Pinski 1 sibling, 0 replies; 13+ messages in thread From: Andrew Pinski @ 2019-12-29 22:27 UTC (permalink / raw) To: Richard Biener; +Cc: Dmitrij Pochepko, GCC Patches, Wilco Dijkstra On Mon, Oct 7, 2019 at 3:05 AM Richard Biener <richard.guenther@gmail.com> wrote: > > On Tue, Oct 1, 2019 at 1:48 PM Dmitrij Pochepko > <dmitrij.pochepko@bell-sw.com> wrote: > > > > Hi Richard, > > > > I updated patch according to all your comments. > > Also bootstrapped and tested again on x86_64-pc-linux-gnu and aarch64-linux-gnu, which took some time. > > > > attached v3. > > OK. This introduced PR 93098 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93098 ). Thanks, Andrew Pinski > > Thanks, > Richard. > > > Thanks, > > Dmitrij > > > > On Thu, Sep 26, 2019 at 09:47:04AM +0200, Richard Biener wrote: > > > On Tue, Sep 24, 2019 at 5:29 PM Dmitrij Pochepko > > > <dmitrij.pochepko@bell-sw.com> wrote: > > > > > > > > Hi, > > > > > > > > can anybody take a look at v2? > > > > > > + (if (tree_to_uhwi (@4) == 1 > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > > > > those will still ICE for large __int128_t constants. Since you do not match > > > any conversions you should probably restrict the precision of 'type' like > > > with > > > (if (TYPE_PRECISION (type) <= 64 > > > && tree_to_uhwi (@4) ... > > > > > > likewise tree_to_uhwi will fail for negative constants thus if the > > > pattern assumes > > > unsigned you should verify that as well with && TYPE_UNSIGNED (type). > > > > > > Your 'argtype' is simply 'type' so you can elide it. > > > > > > + (switch > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > > > + (if (types_match (argtype, unsigned_type_node)) > > > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > > > > > > Please test small types first so we can avoid popcountll when long == long long > > > or long == int. I also wonder if we really want to use the builtins and > > > check optab availability or if we nowadays should use > > > direct_internal_fn_supported_p (IFN_POPCOUNT, integer_type_node, type, > > > OPTIMIZE_FOR_BOTH) and > > > > > > (convert (IFN_POPCOUNT:type @0)) > > > > > > without the switch? > > > > > > Thanks, > > > Richard. > > > > > > > Thanks, > > > > Dmitrij > > > > > > > > On Mon, Sep 09, 2019 at 10:03:40PM +0300, Dmitrij Pochepko wrote: > > > > > Hi all. > > > > > > > > > > Please take a look at v2 (attached). > > > > > I changed patch according to review comments. The same testing was performed again. > > > > > > > > > > Thanks, > > > > > Dmitrij > > > > > > > > > > On Thu, Sep 05, 2019 at 06:34:49PM +0300, Dmitrij Pochepko wrote: > > > > > > This patch adds matching for Hamming weight (popcount) implementation. The following sources: > > > > > > > > > > > > int > > > > > > foo64 (unsigned long long a) > > > > > > { > > > > > > unsigned long long b = a; > > > > > > b -= ((b>>1) & 0x5555555555555555ULL); > > > > > > b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); > > > > > > b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; > > > > > > b *= 0x0101010101010101ULL; > > > > > > return (int)(b >> 56); > > > > > > } > > > > > > > > > > > > and > > > > > > > > > > > > int > > > > > > foo32 (unsigned int a) > > > > > > { > > > > > > unsigned long b = a; > > > > > > b -= ((b>>1) & 0x55555555UL); > > > > > > b = ((b>>2) & 0x33333333UL) + (b & 0x33333333UL); > > > > > > b = ((b>>4) + b) & 0x0F0F0F0FUL; > > > > > > b *= 0x01010101UL; > > > > > > return (int)(b >> 24); > > > > > > } > > > > > > > > > > > > and equivalents are now recognized as popcount for platforms with hw popcount support. Bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu systems with no regressions. > > > > > > > > > > > > (I have no write access to repo) > > > > > > > > > > > > Thanks, > > > > > > Dmitrij > > > > > > > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > PR tree-optimization/90836 > > > > > > > > > > > > * gcc/match.pd (popcount): New pattern. > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > PR tree-optimization/90836 > > > > > > > > > > > > * lib/target-supports.exp (check_effective_target_popcount) > > > > > > (check_effective_target_popcountll): New effective targets. > > > > > > * gcc.dg/tree-ssa/popcount4.c: New test. > > > > > > * gcc.dg/tree-ssa/popcount4l.c: New test. > > > > > > * gcc.dg/tree-ssa/popcount4ll.c: New test. > > > > > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > > > index 0317bc7..b1867bf 100644 > > > > > > --- a/gcc/match.pd > > > > > > +++ b/gcc/match.pd > > > > > > @@ -5358,6 +5358,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > > > > (cmp (popcount @0) integer_zerop) > > > > > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > > > > > > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > > > > > + > > > > > > + int popcount64c (uint64_t x) > > > > > > + { > > > > > > + x -= (x >> 1) & 0x5555555555555555ULL; > > > > > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > > > > > + return (x * 0x0101010101010101ULL) >> 56; > > > > > > + } > > > > > > + > > > > > > + int popcount32c (uint32_t x) > > > > > > + { > > > > > > + x -= (x >> 1) & 0x55555555; > > > > > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > > > > > + return (x * 0x01010101) >> 24; > > > > > > + } */ > > > > > > +(simplify > > > > > > + (convert > > > > > > + (rshift > > > > > > + (mult > > > > > > + (bit_and:c > > > > > > + (plus:c > > > > > > + (rshift @8 INTEGER_CST@5) > > > > > > + (plus:c@8 > > > > > > + (bit_and @6 INTEGER_CST@7) > > > > > > + (bit_and > > > > > > + (rshift > > > > > > + (minus@6 > > > > > > + @0 > > > > > > + (bit_and > > > > > > + (rshift @0 INTEGER_CST@4) > > > > > > + INTEGER_CST@11)) > > > > > > + INTEGER_CST@10) > > > > > > + INTEGER_CST@9))) > > > > > > + INTEGER_CST@3) > > > > > > + INTEGER_CST@2) > > > > > > + INTEGER_CST@1)) > > > > > > + /* Check constants and optab. */ > > > > > > + (with > > > > > > + { > > > > > > + tree argtype = TREE_TYPE (@0); > > > > > > + unsigned prec = TYPE_PRECISION (argtype); > > > > > > + int shift = TYPE_PRECISION (long_long_unsigned_type_node) - prec; > > > > > > + const unsigned long long c1 = 0x0101010101010101ULL >> shift, > > > > > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > > > > > + c3 = 0x3333333333333333ULL >> shift, > > > > > > + c4 = 0x5555555555555555ULL >> shift; > > > > > > + } > > > > > > + (if (types_match (type, integer_type_node) && tree_to_uhwi (@4) == 1 > > > > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > > > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > > > > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > > > > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > > > > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > > > > > + != CODE_FOR_nothing) > > > > > > + (switch > > > > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > > > > + (BUILT_IN_POPCOUNTLL @0)) > > > > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > > > > + (BUILT_IN_POPCOUNTL @0)) > > > > > > + (if (types_match (argtype, unsigned_type_node)) > > > > > > + (BUILT_IN_POPCOUNT @0)))))) > > > > > > + > > > > > > /* Simplify: > > > > > > > > > > > > a = a1 op a2 > > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > > > new file mode 100644 > > > > > > index 0000000..9f759f8 > > > > > > --- /dev/null > > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > > > @@ -0,0 +1,22 @@ > > > > > > +/* { dg-do compile } */ > > > > > > +/* { dg-require-effective-target popcount } */ > > > > > > +/* { dg-require-effective-target int32plus } */ > > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > > + > > > > > > +const unsigned m1 = 0x55555555UL; > > > > > > +const unsigned m2 = 0x33333333UL; > > > > > > +const unsigned m4 = 0x0F0F0F0FUL; > > > > > > +const unsigned h01 = 0x01010101UL; > > > > > > +const int shift = 24; > > > > > > + > > > > > > +int popcount64c(unsigned x) > > > > > > +{ > > > > > > + x -= (x >> 1) & m1; > > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > > + x = (x + (x >> 4)) & m4; > > > > > > + return (x * h01) >> shift; > > > > > > +} > > > > > > + > > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > > > > > + > > > > > > + > > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > > > new file mode 100644 > > > > > > index 0000000..ab33f79 > > > > > > --- /dev/null > > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > > > @@ -0,0 +1,30 @@ > > > > > > +/* { dg-do compile } */ > > > > > > +/* { dg-require-effective-target popcountl } */ > > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > > + > > > > > > +#if __SIZEOF_LONG__ == 4 > > > > > > +const unsigned long m1 = 0x55555555UL; > > > > > > +const unsigned long m2 = 0x33333333UL; > > > > > > +const unsigned long m4 = 0x0F0F0F0FUL; > > > > > > +const unsigned long h01 = 0x01010101UL; > > > > > > +const int shift = 24; > > > > > > +#else > > > > > > +const unsigned long m1 = 0x5555555555555555UL; > > > > > > +const unsigned long m2 = 0x3333333333333333UL; > > > > > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > > > > > +const unsigned long h01 = 0x0101010101010101UL; > > > > > > +const int shift = 56; > > > > > > +#endif > > > > > > + > > > > > > + > > > > > > +int popcount64c(unsigned long x) > > > > > > +{ > > > > > > + x -= (x >> 1) & m1; > > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > > + x = (x + (x >> 4)) & m4; > > > > > > + return (x * h01) >> shift; > > > > > > +} > > > > > > + > > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > > > > > + > > > > > > + > > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > > > new file mode 100644 > > > > > > index 0000000..f3755f1 > > > > > > --- /dev/null > > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > > > @@ -0,0 +1,19 @@ > > > > > > +/* { dg-do compile } */ > > > > > > +/* { dg-require-effective-target popcountll } */ > > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > > + > > > > > > +const unsigned long long m1 = 0x5555555555555555ULL; > > > > > > +const unsigned long long m2 = 0x3333333333333333ULL; > > > > > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > > > > > +const unsigned long long h01 = 0x0101010101010101ULL; > > > > > > +const int shift = 56; > > > > > > + > > > > > > +int popcount64c(unsigned long long x) > > > > > > +{ > > > > > > + x -= (x >> 1) & m1; > > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > > + x = (x + (x >> 4)) & m4; > > > > > > + return (x * h01) >> shift; > > > > > > +} > > > > > > + > > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > > > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > > > > > index 3c50b89..32bbad7 100644 > > > > > > --- a/gcc/testsuite/lib/target-supports.exp > > > > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > > > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > > > > > } "" ] > > > > > > } > > > > > > > > > > > > +# Return 1 if the target supports popcount on long long. > > > > > > + > > > > > > +proc check_effective_target_popcountll { } { > > > > > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > > > > > + int foo (long long b) > > > > > > + { > > > > > > + return __builtin_popcountll (b); > > > > > > + } > > > > > > + } "" ] > > > > > > +} > > > > > > + > > > > > > + > > > > > > +# Return 1 if the target supports popcount on int. > > > > > > + > > > > > > +proc check_effective_target_popcount { } { > > > > > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > > > > > + int foo (int b) > > > > > > + { > > > > > > + return __builtin_popcount (b); > > > > > > + } > > > > > > + } "" ] > > > > > > +} > > > > > > + > > > > > > # Return 1 if the target supports atomic operations on "long long" > > > > > > # and can execute them. > > > > > > # > > > > > > > > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > > index 0317bc7..896d5b6 100644 > > > > > --- a/gcc/match.pd > > > > > +++ b/gcc/match.pd > > > > > @@ -5358,6 +5358,69 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > > > (cmp (popcount @0) integer_zerop) > > > > > (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) > > > > > > > > > > +/* 64- and 32-bits branchless implementations of popcount are detected: > > > > > + > > > > > + int popcount64c (uint64_t x) > > > > > + { > > > > > + x -= (x >> 1) & 0x5555555555555555ULL; > > > > > + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); > > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; > > > > > + return (x * 0x0101010101010101ULL) >> 56; > > > > > + } > > > > > + > > > > > + int popcount32c (uint32_t x) > > > > > + { > > > > > + x -= (x >> 1) & 0x55555555; > > > > > + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > > > > > + x = (x + (x >> 4)) & 0x0f0f0f0f; > > > > > + return (x * 0x01010101) >> 24; > > > > > + } */ > > > > > +(simplify > > > > > + (rshift > > > > > + (mult > > > > > + (bit_and > > > > > + (plus:c > > > > > + (rshift @8 INTEGER_CST@5) > > > > > + (plus:c@8 > > > > > + (bit_and @6 INTEGER_CST@7) > > > > > + (bit_and > > > > > + (rshift > > > > > + (minus@6 > > > > > + @0 > > > > > + (bit_and > > > > > + (rshift @0 INTEGER_CST@4) > > > > > + INTEGER_CST@11)) > > > > > + INTEGER_CST@10) > > > > > + INTEGER_CST@9))) > > > > > + INTEGER_CST@3) > > > > > + INTEGER_CST@2) > > > > > + INTEGER_CST@1) > > > > > + /* Check constants and optab. */ > > > > > + (with > > > > > + { > > > > > + tree argtype = TREE_TYPE (@0); > > > > > + unsigned prec = TYPE_PRECISION (argtype); > > > > > + int shift = 64 - prec; > > > > > + const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, > > > > > + c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, > > > > > + c3 = 0x3333333333333333ULL >> shift, > > > > > + c4 = 0x5555555555555555ULL >> shift; > > > > > + } > > > > > + (if (tree_to_uhwi (@4) == 1 > > > > > + && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 > > > > > + && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 > > > > > + && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 > > > > > + && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 > > > > > + && optab_handler (popcount_optab, TYPE_MODE (argtype)) > > > > > + != CODE_FOR_nothing) > > > > > + (switch > > > > > + (if (types_match (argtype, long_long_unsigned_type_node)) > > > > > + (convert (BUILT_IN_POPCOUNTLL:integer_type_node @0))) > > > > > + (if (types_match (argtype, long_unsigned_type_node)) > > > > > + (convert (BUILT_IN_POPCOUNTL:integer_type_node @0))) > > > > > + (if (types_match (argtype, unsigned_type_node)) > > > > > + (convert (BUILT_IN_POPCOUNT:integer_type_node @0))))))) > > > > > + > > > > > /* Simplify: > > > > > > > > > > a = a1 op a2 > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > > new file mode 100644 > > > > > index 0000000..9f759f8 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4.c > > > > > @@ -0,0 +1,22 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-require-effective-target popcount } */ > > > > > +/* { dg-require-effective-target int32plus } */ > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > + > > > > > +const unsigned m1 = 0x55555555UL; > > > > > +const unsigned m2 = 0x33333333UL; > > > > > +const unsigned m4 = 0x0F0F0F0FUL; > > > > > +const unsigned h01 = 0x01010101UL; > > > > > +const int shift = 24; > > > > > + > > > > > +int popcount64c(unsigned x) > > > > > +{ > > > > > + x -= (x >> 1) & m1; > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > + x = (x + (x >> 4)) & m4; > > > > > + return (x * h01) >> shift; > > > > > +} > > > > > + > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ > > > > > + > > > > > + > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > > new file mode 100644 > > > > > index 0000000..ab33f79 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4l.c > > > > > @@ -0,0 +1,30 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-require-effective-target popcountl } */ > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > + > > > > > +#if __SIZEOF_LONG__ == 4 > > > > > +const unsigned long m1 = 0x55555555UL; > > > > > +const unsigned long m2 = 0x33333333UL; > > > > > +const unsigned long m4 = 0x0F0F0F0FUL; > > > > > +const unsigned long h01 = 0x01010101UL; > > > > > +const int shift = 24; > > > > > +#else > > > > > +const unsigned long m1 = 0x5555555555555555UL; > > > > > +const unsigned long m2 = 0x3333333333333333UL; > > > > > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > > > > > +const unsigned long h01 = 0x0101010101010101UL; > > > > > +const int shift = 56; > > > > > +#endif > > > > > + > > > > > + > > > > > +int popcount64c(unsigned long x) > > > > > +{ > > > > > + x -= (x >> 1) & m1; > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > + x = (x + (x >> 4)) & m4; > > > > > + return (x * h01) >> shift; > > > > > +} > > > > > + > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountl" 1 "optimized" } } */ > > > > > + > > > > > + > > > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > > new file mode 100644 > > > > > index 0000000..f3755f1 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount4ll.c > > > > > @@ -0,0 +1,19 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-require-effective-target popcountll } */ > > > > > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > > > > > + > > > > > +const unsigned long long m1 = 0x5555555555555555ULL; > > > > > +const unsigned long long m2 = 0x3333333333333333ULL; > > > > > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > > > > > +const unsigned long long h01 = 0x0101010101010101ULL; > > > > > +const int shift = 56; > > > > > + > > > > > +int popcount64c(unsigned long long x) > > > > > +{ > > > > > + x -= (x >> 1) & m1; > > > > > + x = (x & m2) + ((x >> 2) & m2); > > > > > + x = (x + (x >> 4)) & m4; > > > > > + return (x * h01) >> shift; > > > > > +} > > > > > + > > > > > +/* { dg-final { scan-tree-dump-times "__builtin_popcountll" 1 "optimized" } } */ > > > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > > > > > index 3c50b89..32bbad7 100644 > > > > > --- a/gcc/testsuite/lib/target-supports.exp > > > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > > > @@ -6855,6 +6855,29 @@ proc check_effective_target_popcountl { } { > > > > > } "" ] > > > > > } > > > > > > > > > > +# Return 1 if the target supports popcount on long long. > > > > > + > > > > > +proc check_effective_target_popcountll { } { > > > > > + return [check_no_messages_and_pattern popcountll "!\\(call" rtl-expand { > > > > > + int foo (long long b) > > > > > + { > > > > > + return __builtin_popcountll (b); > > > > > + } > > > > > + } "" ] > > > > > +} > > > > > + > > > > > + > > > > > +# Return 1 if the target supports popcount on int. > > > > > + > > > > > +proc check_effective_target_popcount { } { > > > > > + return [check_no_messages_and_pattern popcount "!\\(call" rtl-expand { > > > > > + int foo (int b) > > > > > + { > > > > > + return __builtin_popcount (b); > > > > > + } > > > > > + } "" ] > > > > > +} > > > > > + > > > > > # Return 1 if the target supports atomic operations on "long long" > > > > > # and can execute them. > > > > > # > > > > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching @ 2019-09-06 12:13 Wilco Dijkstra 2019-09-09 8:24 ` Richard Biener 2019-09-09 18:59 ` Dmitrij Pochepko 0 siblings, 2 replies; 13+ messages in thread From: Wilco Dijkstra @ 2019-09-06 12:13 UTC (permalink / raw) To: Richard Biener, dmitrij.pochepko, GCC Patches; +Cc: nd Hi, +(simplify + (convert + (rshift + (mult > is the outer convert really necessary? That is, if we change > the simplification result to Indeed that should be "convert?" to make it optional. > Is the Hamming weight popcount > faster than the libgcc table-based approach? I wonder if we really > need to restrict this conversion to the case where the target > has an expander. Well libgcc uses the exact same sequence (not a table): objdump -d ./aarch64-unknown-linux-gnu/libgcc/_popcountsi2.o 0000000000000000 <__popcountdi2>: 0: d341fc01 lsr x1, x0, #1 4: b200c3e3 mov x3, #0x101010101010101 // #72340172838076673 8: 9200f021 and x1, x1, #0x5555555555555555 c: cb010001 sub x1, x0, x1 10: 9200e422 and x2, x1, #0x3333333333333333 14: d342fc21 lsr x1, x1, #2 18: 9200e421 and x1, x1, #0x3333333333333333 1c: 8b010041 add x1, x2, x1 20: 8b411021 add x1, x1, x1, lsr #4 24: 9200cc20 and x0, x1, #0xf0f0f0f0f0f0f0f 28: 9b037c00 mul x0, x0, x3 2c: d378fc00 lsr x0, x0, #56 30: d65f03c0 ret So if you don't check for an expander you get an endless loop in libgcc since the makefile doesn't appear to use -fno-builtin anywhere... Wilco ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-06 12:13 Wilco Dijkstra @ 2019-09-09 8:24 ` Richard Biener 2019-09-09 18:59 ` Dmitrij Pochepko 1 sibling, 0 replies; 13+ messages in thread From: Richard Biener @ 2019-09-09 8:24 UTC (permalink / raw) To: Wilco Dijkstra; +Cc: dmitrij.pochepko, GCC Patches, nd On Fri, Sep 6, 2019 at 2:13 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > > Hi, > > +(simplify > + (convert > + (rshift > + (mult > > > is the outer convert really necessary? That is, if we change > > the simplification result to > > Indeed that should be "convert?" to make it optional. Rather drop it, a generated conversion should be elided by conversion simplification. > > Is the Hamming weight popcount > > faster than the libgcc table-based approach? I wonder if we really > > need to restrict this conversion to the case where the target > > has an expander. > > Well libgcc uses the exact same sequence (not a table): > > objdump -d ./aarch64-unknown-linux-gnu/libgcc/_popcountsi2.o > > 0000000000000000 <__popcountdi2>: > 0: d341fc01 lsr x1, x0, #1 > 4: b200c3e3 mov x3, #0x101010101010101 // #72340172838076673 > 8: 9200f021 and x1, x1, #0x5555555555555555 > c: cb010001 sub x1, x0, x1 > 10: 9200e422 and x2, x1, #0x3333333333333333 > 14: d342fc21 lsr x1, x1, #2 > 18: 9200e421 and x1, x1, #0x3333333333333333 > 1c: 8b010041 add x1, x2, x1 > 20: 8b411021 add x1, x1, x1, lsr #4 > 24: 9200cc20 and x0, x1, #0xf0f0f0f0f0f0f0f > 28: 9b037c00 mul x0, x0, x3 > 2c: d378fc00 lsr x0, x0, #56 > 30: d65f03c0 ret > > So if you don't check for an expander you get an endless loop in libgcc since > the makefile doesn't appear to use -fno-builtin anywhere... Hm, must be aarch specific. But indeed it should use -fno-builtin ... Richard. > > Wilco > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] PR tree-optimization/90836 Missing popcount pattern matching 2019-09-06 12:13 Wilco Dijkstra 2019-09-09 8:24 ` Richard Biener @ 2019-09-09 18:59 ` Dmitrij Pochepko 1 sibling, 0 replies; 13+ messages in thread From: Dmitrij Pochepko @ 2019-09-09 18:59 UTC (permalink / raw) To: Wilco Dijkstra; +Cc: Richard Biener, GCC Patches, nd Hi, thank you for looking into it. On Fri, Sep 06, 2019 at 12:13:34PM +0000, Wilco Dijkstra wrote: > Hi, > > +(simplify > + (convert > + (rshift > + (mult > > > is the outer convert really necessary? That is, if we change > > the simplification result to > > Indeed that should be "convert?" to make it optional. > I removed this one as Richard suggested in the new patch version. > > Is the Hamming weight popcount > > faster than the libgcc table-based approach? I wonder if we really > > need to restrict this conversion to the case where the target > > has an expander. > > Well libgcc uses the exact same sequence (not a table): > > objdump -d ./aarch64-unknown-linux-gnu/libgcc/_popcountsi2.o > > 0000000000000000 <__popcountdi2>: > 0: d341fc01 lsr x1, x0, #1 > 4: b200c3e3 mov x3, #0x101010101010101 // #72340172838076673 > 8: 9200f021 and x1, x1, #0x5555555555555555 > c: cb010001 sub x1, x0, x1 > 10: 9200e422 and x2, x1, #0x3333333333333333 > 14: d342fc21 lsr x1, x1, #2 > 18: 9200e421 and x1, x1, #0x3333333333333333 > 1c: 8b010041 add x1, x2, x1 > 20: 8b411021 add x1, x1, x1, lsr #4 > 24: 9200cc20 and x0, x1, #0xf0f0f0f0f0f0f0f > 28: 9b037c00 mul x0, x0, x3 > 2c: d378fc00 lsr x0, x0, #56 > 30: d65f03c0 ret > > So if you don't check for an expander you get an endless loop in libgcc since > the makefile doesn't appear to use -fno-builtin anywhere... The patch is designed to avoid such endless loop - libgcc popcount call is compiled into popcount cpu instruction(s) on supported platforms and the patch is only allowing simplification on such platforms. This is implemented via "optab_handler (popcount_optab, TYPE_MODE (argtype)) != CODE_FOR_nothing" check. Thanks, Dmitrij > > Wilco > ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2019-12-29 19:49 UTC | newest] Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2019-09-05 15:35 [PATCH] PR tree-optimization/90836 Missing popcount pattern matching Dmitrij Pochepko 2019-09-06 10:23 ` Richard Biener 2019-09-09 18:54 ` Dmitrij Pochepko 2019-09-09 19:03 ` Dmitrij Pochepko 2019-09-24 15:29 ` Dmitrij Pochepko 2019-09-26 7:47 ` Richard Biener 2019-10-01 11:49 ` Dmitrij Pochepko 2019-10-07 10:04 ` Richard Biener 2019-10-08 22:14 ` Steve Ellcey 2019-12-29 22:27 ` Andrew Pinski 2019-09-06 12:13 Wilco Dijkstra 2019-09-09 8:24 ` Richard Biener 2019-09-09 18:59 ` Dmitrij Pochepko
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).