public inbox for gcc-patches@gcc.gnu.org
* [PATCH] New hook adjust_iv_update_pos
@ 2021-06-25  8:31 Xionghu Luo
  2021-06-25  8:54 ` Richard Biener
  0 siblings, 1 reply; 14+ messages in thread
From: Xionghu Luo @ 2021-06-25  8:31 UTC (permalink / raw)
  To: gcc-patches; +Cc: segher, dje.gcc, wschmidt, guojiufu, linkw, Xiong Hu Luo

From: Xiong Hu Luo <luoxhu@linux.ibm.com>

adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
on Power.  For example, it generates mismatched address offsets after
adjusting the position of the iv update statement:

<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
_34 = ref_180 + 18446744073709551615;
_86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
if (_84 == _86)
  goto <bb 56>; [94.50%]
  else
  goto <bb 87>; [5.50%]

Disabling it produces:

<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
_86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
if (_84 == _86)
  goto <bb 56>; [94.50%]
  else
  goto <bb 87>; [5.50%]

The later loop unroll pass can then benefit from the same address offset
being used with different base addresses, which reduces register
dependencies.  This patch improves performance by 10% for a typical case
on Power; no performance change is observed on X86 or AArch64 because
these small loops are not unrolled on those platforms.  Any comments?

.L67:
  lbzx %r7,%r8,%r6
  lbzx %r12,%r25,%r4
  cmpw %cr0,%r7,%r12
  bne %cr0,.L11
  lbzx %r7,%r8,%r4
  mr %r6,%r4
  addi %r4,%r4,1
  lbzx %r12,%r25,%r4
  mr %r11,%r6
  cmpw %cr0,%r7,%r12
  bne %cr0,.L11
  mr %r6,%r4
.L12:
  cmpdi %cr0,%r10,1
  addi %r4,%r6,1
  mr %r11,%r6
  addi %r10,%r10,-1
  bne %cr0,.L67

vs.

.L67:
  lbzx %r25,%r8,%r6
  lbzx %r12,%r7,%r6
  addi %r4,%r6,1
  cmpw %cr0,%r25,%r12
  bne %cr0,.L11
  lbzx %r12,%r8,%r4
  lbzx %r25,%r7,%r4
  mr %r6,%r4
  mr %r11,%r4
  cmpw %cr0,%r12,%r25
  bne %cr0,.L11
  addi %r6,%r4,1
.L12:
  cmpdi %cr0,%r10,1
  mr %r11,%r6
  addi %r10,%r10,-1
  bne %cr0,.L67

gcc/ChangeLog:

	* config/rs6000/rs6000.c (TARGET_ADJUST_IV_UPDATE_POS): Define.
	(rs6000_adjust_iv_update_pos): New function.
	* doc/tm.texi: Regenerate.
	* doc/tm.texi.in (TARGET_ADJUST_IV_UPDATE_POS): Add hook.
	* target.def (adjust_iv_update_pos): New hook.
	* targhooks.c (default_adjust_iv_update_pos): New function.
	* targhooks.h (default_adjust_iv_update_pos): Declare.
	* tree-ssa-loop-ivopts.c (rewrite_use_address): Call it.
---
 gcc/config/rs6000/rs6000.c | 11 +++++++++++
 gcc/doc/tm.texi            |  5 +++++
 gcc/doc/tm.texi.in         |  2 ++
 gcc/target.def             |  7 +++++++
 gcc/targhooks.c            |  6 ++++++
 gcc/targhooks.h            |  2 ++
 gcc/tree-ssa-loop-ivopts.c |  3 ++-
 7 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index cd130dea611..e7725997793 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1455,6 +1455,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_LOOP_UNROLL_ADJUST
 #define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
 
+#undef TARGET_ADJUST_IV_UPDATE_POS
+#define TARGET_ADJUST_IV_UPDATE_POS rs6000_adjust_iv_update_pos
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS rs6000_init_builtins
 #undef TARGET_BUILTIN_DECL
@@ -5457,6 +5460,14 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
   return nunroll;
 }
 
+/* Implement targetm.adjust_iv_update_pos.  */
+
+bool
+rs6000_adjust_iv_update_pos (void)
+{
+  return false;
+}
+
 /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
    library with vectorized intrinsics.  */
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b272fa4806d..07ce40eb053 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11768,6 +11768,11 @@ By default, the RTL loop optimizer does not use a present doloop pattern for
 loops containing function calls or branch on table instructions.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_ADJUST_IV_UPDATE_POS (void)
+If this hook returns true, the iv update statement may be reordered so
+that the memory reference uses the iv value after the update.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_LEGITIMATE_COMBINED_INSN (rtx_insn *@var{insn})
 Take an instruction in @var{insn} and return @code{false} if the instruction is not appropriate as a combination of two or more instructions.  The default is to accept all instructions.
 @end deftypefn
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index bf724dc093c..87d02089588 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7979,6 +7979,8 @@ to by @var{ce_info}.
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
 
+@hook TARGET_ADJUST_IV_UPDATE_POS
+
 @hook TARGET_LEGITIMATE_COMBINED_INSN
 
 @hook TARGET_CAN_FOLLOW_JUMP
diff --git a/gcc/target.def b/gcc/target.def
index d7b94bd8e5d..aead7cb79ff 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4398,6 +4398,13 @@ loops containing function calls or branch on table instructions.",
  const char *, (const rtx_insn *insn),
  default_invalid_within_doloop)
 
+/* Function to adjust iv update statement position.  */
+DEFHOOK
+(adjust_iv_update_pos,
+ "if adjust_iv_update_pos is enabled, reorder the iv update statement,\n\
+ then mem ref uses the iv value after update.",
+ bool, (void), default_adjust_iv_update_pos)
+
 /* Returns true for a legitimate combined insn.  */
 DEFHOOK
 (legitimate_combined_insn,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index d69c9a2d819..2a93a3489e6 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -679,6 +679,12 @@ default_invalid_within_doloop (const rtx_insn *insn)
   return NULL;
 }
 
+bool
+default_adjust_iv_update_pos (void)
+{
+  return true;
+}
+
 /* Mapping of builtin functions to vectorized variants.  */
 
 tree
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 39a6f82f143..298ecd4fc99 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -90,6 +90,8 @@ extern bool default_has_ifunc_p (void);
 extern bool default_predict_doloop_p (class loop *);
 extern const char * default_invalid_within_doloop (const rtx_insn *);
 
+extern bool default_adjust_iv_update_pos (void);
+
 extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
 extern tree default_builtin_md_vectorized_function (tree, tree, tree);
 
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 4012ae3f19d..5dbc306862c 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -7438,7 +7438,8 @@ rewrite_use_address (struct ivopts_data *data,
   aff_tree aff;
   bool ok;
 
-  adjust_iv_update_pos (cand, use);
+  if (targetm.adjust_iv_update_pos ())
+    adjust_iv_update_pos (cand, use);
   ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff);
   gcc_assert (ok);
   unshare_aff_combination (&aff);
-- 
2.25.1



* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-25  8:31 [PATCH] New hook adjust_iv_update_pos Xionghu Luo
@ 2021-06-25  8:54 ` Richard Biener
  2021-06-25  9:41   ` Xionghu Luo
  0 siblings, 1 reply; 14+ messages in thread
From: Richard Biener @ 2021-06-25  8:54 UTC (permalink / raw)
  To: Xionghu Luo
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, linkw, David Edelsohn

On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
>
> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> on Power.  For example, it generates mismatched address offset after
> adjust iv update statement position:
>
> <bb 32> [local count: 70988443]:
> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> ivtmp.30_415 = ivtmp.30_414 + 1;
> _34 = ref_180 + 18446744073709551615;
> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> if (_84 == _86)
>   goto <bb 56>; [94.50%]
>   else
>   goto <bb 87>; [5.50%]
>
> Disable it will produce:
>
> <bb 32> [local count: 70988443]:
> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> ivtmp.30_415 = ivtmp.30_414 + 1;
> if (_84 == _86)
>   goto <bb 56>; [94.50%]
>   else
>   goto <bb 87>; [5.50%]
>
> Then later pass loop unroll could benefit from same address offset
> with different base address and reduces register dependency.
> This patch could improve performance by 10% for typical case on Power,
> no performance change observed for X86 or Aarch64 due to small loops
> not unrolled on these platforms.  Any comments?

The case you quote is special in that, if we hoisted the IV update before
the other MEM _also_ used in the condition, it would be fine again.

Now, adjust_iv_update_pos doesn't seem to check that the
condition actually uses the IV use stmt def, so it likely applies to
too many cases.

Unfortunately the revision that introduced it didn't come with a testcase,
but I still think fixing up adjust_iv_update_pos is better than
introducing a way to short-cut it per target decision.

One "fix" might be to add a check that either the condition
lhs or rhs is the def of the IV use and the other operand
is invariant.  Or if it's of similar structure hoist across the
other iv-use as well.  Not that I understand the argument
about the overlapping life-range.
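
As a rough, untested sketch of the first variant (cond_stmt below is the
GIMPLE_COND ending the block, bb the block of the increment as in the
existing code, and use->stmt the load feeding the compare - the exact
predicate would need refinement):

  tree lhs = gimple_cond_lhs (cond_stmt);
  tree rhs = gimple_cond_rhs (cond_stmt);
  tree use_def = gimple_assign_lhs (use->stmt);

  /* Only reorder when the condition compares the IV use's value against
     something not computed in this block; if the other operand is another
     IV-indexed load here, moving the increment between the two loads just
     forces a "base - 1" rewrite of one of them.  */
  if (lhs != use_def && rhs != use_def)
    return;
  tree other = lhs == use_def ? rhs : lhs;
  if (TREE_CODE (other) == SSA_NAME
      && gimple_bb (SSA_NAME_DEF_STMT (other)) == bb)
    return;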

You also don't provide a complete testcase ...

Richard.


> .L67:
>   lbzx %r7,%r8,%r6
>   lbzx %r12,%r25,%r4
>   cmpw %cr0,%r7,%r12
>   bne %cr0,.L11
>   lbzx %r7,%r8,%r4
>   mr %r6,%r4
>   addi %r4,%r4,1
>   lbzx %r12,%r25,%r4
>   mr %r11,%r6
>   cmpw %cr0,%r7,%r12
>   bne %cr0,.L11
>   mr %r6,%r4
> .L12:
>   cmpdi %cr0,%r10,1
>   addi %r4,%r6,1
>   mr %r11,%r6
>   addi %r10,%r10,-1
>   bne %cr0,.L67
>
> vs.
>
> .L67:
>   lbzx %r25,%r8,%r6
>   lbzx %r12,%r7,%r6
>   addi %r4,%r6,1
>   cmpw %cr0,%r25,%r12
>   bne %cr0,.L11
>   lbzx %r12,%r8,%r4
>   lbzx %r25,%r7,%r4
>   mr %r6,%r4
>   mr %r11,%r4
>   cmpw %cr0,%r12,%r25
>   bne %cr0,.L11
>   addi %r6,%r4,1
> .L12:
>   cmpdi %cr0,%r10,1
>   mr %r11,%r6
>   addi %r10,%r10,-1
>   bne %cr0,.L67
>
> gcc/ChangeLog:
>
>         * config/rs6000/rs6000.c (TARGET_ADJUST_IV_UPDATE_POS):
>         (rs6000_adjust_iv_update_pos):
>         * doc/tm.texi:
>         * doc/tm.texi.in:
>         * target.def:
>         * targhooks.c (default_adjust_iv_update_pos):
>         * targhooks.h (default_adjust_iv_update_pos):
>         * tree-ssa-loop-ivopts.c (rewrite_use_address):
> ---
>  gcc/config/rs6000/rs6000.c | 11 +++++++++++
>  gcc/doc/tm.texi            |  5 +++++
>  gcc/doc/tm.texi.in         |  2 ++
>  gcc/target.def             |  7 +++++++
>  gcc/targhooks.c            |  6 ++++++
>  gcc/targhooks.h            |  2 ++
>  gcc/tree-ssa-loop-ivopts.c |  3 ++-
>  7 files changed, 35 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index cd130dea611..e7725997793 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -1455,6 +1455,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
>  #undef TARGET_LOOP_UNROLL_ADJUST
>  #define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
>
> +#undef TARGET_ADJUST_IV_UPDATE_POS
> +#define TARGET_ADJUST_IV_UPDATE_POS rs6000_adjust_iv_update_pos
> +
>  #undef TARGET_INIT_BUILTINS
>  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>  #undef TARGET_BUILTIN_DECL
> @@ -5457,6 +5460,14 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
>    return nunroll;
>  }
>
> +/* Implement targetm.adjust_iv_update_pos.  */
> +
> +bool
> +rs6000_adjust_iv_update_pos (void)
> +{
> +  return false;
> +}
> +
>  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>     library with vectorized intrinsics.  */
>
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index b272fa4806d..07ce40eb053 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -11768,6 +11768,11 @@ By default, the RTL loop optimizer does not use a present doloop pattern for
>  loops containing function calls or branch on table instructions.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_ADJUST_IV_UPDATE_POS (void)
> +if adjust_iv_update_pos is enabled, reorder the iv update statement,
> + then mem ref uses the iv value after update.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_LEGITIMATE_COMBINED_INSN (rtx_insn *@var{insn})
>  Take an instruction in @var{insn} and return @code{false} if the instruction is not appropriate as a combination of two or more instructions.  The default is to accept all instructions.
>  @end deftypefn
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index bf724dc093c..87d02089588 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7979,6 +7979,8 @@ to by @var{ce_info}.
>
>  @hook TARGET_INVALID_WITHIN_DOLOOP
>
> +@hook TARGET_ADJUST_IV_UPDATE_POS
> +
>  @hook TARGET_LEGITIMATE_COMBINED_INSN
>
>  @hook TARGET_CAN_FOLLOW_JUMP
> diff --git a/gcc/target.def b/gcc/target.def
> index d7b94bd8e5d..aead7cb79ff 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -4398,6 +4398,13 @@ loops containing function calls or branch on table instructions.",
>   const char *, (const rtx_insn *insn),
>   default_invalid_within_doloop)
>
> +/* Function to adjust iv update statment position.  */
> +DEFHOOK
> +(adjust_iv_update_pos,
> + "if adjust_iv_update_pos is enabled, reorder the iv update statement,\n\
> + then mem ref uses the iv value after update.",
> + bool, (void), default_adjust_iv_update_pos)
> +
>  /* Returns true for a legitimate combined insn.  */
>  DEFHOOK
>  (legitimate_combined_insn,
> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
> index d69c9a2d819..2a93a3489e6 100644
> --- a/gcc/targhooks.c
> +++ b/gcc/targhooks.c
> @@ -679,6 +679,12 @@ default_invalid_within_doloop (const rtx_insn *insn)
>    return NULL;
>  }
>
> +bool
> +default_adjust_iv_update_pos (void)
> +{
> +  return true;
> +}
> +
>  /* Mapping of builtin functions to vectorized variants.  */
>
>  tree
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 39a6f82f143..298ecd4fc99 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -90,6 +90,8 @@ extern bool default_has_ifunc_p (void);
>  extern bool default_predict_doloop_p (class loop *);
>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>
> +extern bool default_adjust_iv_update_pos (void);
> +
>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
>  extern tree default_builtin_md_vectorized_function (tree, tree, tree);
>
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index 4012ae3f19d..5dbc306862c 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -7438,7 +7438,8 @@ rewrite_use_address (struct ivopts_data *data,
>    aff_tree aff;
>    bool ok;
>
> -  adjust_iv_update_pos (cand, use);
> +  if (targetm.adjust_iv_update_pos ())
> +    adjust_iv_update_pos (cand, use);
>    ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff);
>    gcc_assert (ok);
>    unshare_aff_combination (&aff);
> --
> 2.25.1
>


* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-25  8:54 ` Richard Biener
@ 2021-06-25  9:41   ` Xionghu Luo
  2021-06-25 10:02     ` Richard Biener
  0 siblings, 1 reply; 14+ messages in thread
From: Xionghu Luo @ 2021-06-25  9:41 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, linkw, David Edelsohn

[-- Attachment #1: Type: text/plain, Size: 3756 bytes --]



On 2021/6/25 16:54, Richard Biener wrote:
> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
>>
>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
>> on Power.  For example, it generates mismatched address offset after
>> adjust iv update statement position:
>>
>> <bb 32> [local count: 70988443]:
>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>> ivtmp.30_415 = ivtmp.30_414 + 1;
>> _34 = ref_180 + 18446744073709551615;
>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>> if (_84 == _86)
>>    goto <bb 56>; [94.50%]
>>    else
>>    goto <bb 87>; [5.50%]
>>
>> Disable it will produce:
>>
>> <bb 32> [local count: 70988443]:
>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
>> ivtmp.30_415 = ivtmp.30_414 + 1;
>> if (_84 == _86)
>>    goto <bb 56>; [94.50%]
>>    else
>>    goto <bb 87>; [5.50%]
>>
>> Then later pass loop unroll could benefit from same address offset
>> with different base address and reduces register dependency.
>> This patch could improve performance by 10% for typical case on Power,
>> no performance change observed for X86 or Aarch64 due to small loops
>> not unrolled on these platforms.  Any comments?
> 
> The case you quote is special in that if we hoisted the IV update before
> the other MEM _also_ used in the condition it would be fine again.

Thanks.  I tried hoisting the IV update statement before the first MEM (Fix 2);
it shows even worse performance because the loop is no longer unrolled (two more
"base-1" statements are generated in gimple, so loop->ninsns is 11 and the
small-loop unrolling is skipped).  Changing the threshold from 10 to 12 in
rs6000_loop_unroll_adjust makes it unroll 2 times as well, and then the
performance is the SAME as with the IV update statement in the *MIDDLE* (trunk).
From the ASM we can see the index register %r4 is used in two iterations, which
may be a bottleneck for hiding instruction latency.

Then it seems reasonable that performance is better if the IV update
statement is kept *LAST* (Fix 1).

(Fix 2):
  <bb 32> [local count: 70988443]:
  ivtmp.30_415 = ivtmp.30_414 + 1;
  _34 = ip_229 + 18446744073709551615;
  _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
  _33 = ref_180 + 18446744073709551615;
  _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
  if (_84 == _86)
    goto <bb 56>; [94.50%]
  else
    goto <bb 87>; [5.50%]


.L67:
        lbzx %r12,%r24,%r4
        lbzx %r25,%r7,%r4
        cmpw %cr0,%r12,%r25
        bne %cr0,.L11
        mr %r26,%r4
        addi %r4,%r4,1
        lbzx %r12,%r24,%r4
        lbzx %r25,%r7,%r4
        mr %r6,%r26
        cmpw %cr0,%r12,%r25
        bne %cr0,.L11
        mr %r26,%r4
.L12:
        cmpdi %cr0,%r10,1
        addi %r4,%r26,1
        mr %r6,%r26
        addi %r10,%r10,-1
        bne %cr0,.L67

> 
> Now, adjust_iv_update_pos doesn't seem to check that the
> condition actually uses the IV use stmt def, so it likely applies to
> too many cases.
> 
> Unfortunately the introducing rev didn't come with a testcase,
> but still I think fixing up adjust_iv_update_pos is better than
> introducing a way to short-cut it per target decision.
> 
> One "fix" might be to add a check that either the condition
> lhs or rhs is the def of the IV use and the other operand
> is invariant.  Or if it's of similar structure hoist across the
> other iv-use as well.  Not that I understand the argument
> about the overlapping life-range.
> 
> You also don't provide a complete testcase ...
> 

The test code is attached; I will also add it to the patch in a future version.
The issue comes from a very small hot loop:

	do {
	  len++;
	} while(len < maxlen && ip[len] == ref[len]);


-- 
Thanks,
Xionghu

[-- Attachment #2: test_i2_4_6.c --]
[-- Type: text/plain, Size: 4754 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

# define HLOG 16
#define        MAX_LIT        (1 <<  5)
typedef const uint8_t *LZF_HSLOT;
typedef LZF_HSLOT LZF_STATE[1 << (HLOG)];

int compute_on_bytes(uint8_t *, int, uint8_t *, int );

int main(int argc, char **argv)
{
	//Declarations
	FILE *fptr;
	int len_limit;
	uint8_t *inputbuf,*outputbuf;

	uint8_t *ip,*op;
	int len, i;
	long int sum=0;

	//randomness
	if (argv[1] != NULL )
		len=atoi(argv[1]);

	//read
	fptr = fopen(argv[2],"rb");	

	if(fptr == NULL)
	{	  
	       printf("Error!");   
	       exit(1);             
	}

	fseek( fptr , 0L , SEEK_END);
	len_limit = ftell( fptr );
	rewind( fptr );

	/* allocate memory for entire input content */
	inputbuf = calloc( 1, len_limit+1 );
	if( !inputbuf ) fclose(fptr),fputs("memory alloc fails",stderr),exit(1);
	
	/* allocate memory for entire output */
	outputbuf = calloc( 1, len_limit+1 );
	if( !outputbuf ) fclose(fptr),fputs("memory alloc fails",stderr),exit(1);

	/* copy the file into the buffer */
	if( 1!=fread( inputbuf , len_limit, 1 , fptr) )
	  fclose(fptr),free(inputbuf),fputs("entire read fails",stderr),exit(1);

	//compare
	ip=inputbuf;
	op=outputbuf;

	for (i=0; i < len_limit; i=i+(len/4))
		sum+=compute_on_bytes(ip+i,len,op,len-4);
	fclose(fptr);
	free(inputbuf);
	return sum;
}


//__attribute__((noinline))  int compute_on_bytes(uint8_t *ip, uint8_t *ref, int len_limit, int temp2 )
 int compute_on_bytes(uint8_t *in_data, int in_len, uint8_t *out_data, int out_len)
{
	LZF_STATE htab;

	uint8_t *ip = in_data;
	uint8_t *op = out_data;
	uint8_t *in_end = ip + in_len;
	uint8_t *out_end = op + out_len;
	uint8_t *ref;

	unsigned long off;
	unsigned int hval;
	int lit;

	if (!in_len || !out_len)
	    return 0;

	lit = 0; op++;
	hval = (((ip[0]) << 8) | ip[1]);

	while( ip < in_end - 2 )
	{
		uint8_t *hslot;

		hval = (((hval) << 8) | ip[2]);
		hslot = htab + ((( hval >> (3*8 - 16)) - hval*5) & ((1 << (16)) - 1));

		ref = *hslot + in_data; 
		*hslot = ip - in_data;
		
		if (1 && (off = ip - ref - 1) < (1 << 13) && ref > in_data && ref[2] == ip[2] && ((ref[1] << 8) | ref[0]) == ((ip[1] << 8) | ip[0]) )
		{
			unsigned int len = 2;
			unsigned int maxlen = in_end - ip - len;
	       		maxlen = maxlen > ((1 << 8) + (1 << 3)) ? ((1 << 8) + (1 << 3)) : maxlen;

			if ((op + 3 + 1 >= out_end) != 0)
				if (op - !lit + 3 + 1 >= out_end)
					return 0;

			op [- lit - 1] = lit - 1;
			op -= !lit;

			for (;;)
			{
				if ( maxlen > 16 )
	               		{ 
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;

	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;

	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;

	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	               			len++; if (ref [len] != ip [len]) break;
	             		}

	       			do {
	               	  		len++;

	       	    		}while(len < maxlen && ip[len] == ref[len]);

				break;
			}

			len -= 2;
          		ip++;

	            	if (len < 7)
		         { 
				*op++ = (off >> 8) + (len << 5);
			}
		        else
			{ 
				*op++ = (off >> 8) + ( 7 << 5);
				*op++ = len - 7;
			}
		        *op++ = off;
			lit = 0; op++;
			ip += len + 1;

			if (ip >= in_end - 2) 
				break;

			--ip;
			--ip;

			hval = (((ip[0]) << 8) | ip[1]);
			hval = (((hval) << 8) | ip[2]);
			htab[((( hval >> (3*8 - 16)) - hval*5) & ((1 << (16)) - 1))] = ip - in_data;
			ip++;

			hval = (((hval) << 8) | ip[2]);
			htab[((( hval >> (3*8 - 16)) - hval*5) & ((1 << (16)) - 1))] = ip - in_data;
			ip++;

		}
	      else  {
		        if(op >= out_end)
				return 0;

			lit++; *op++ = *ip++;

			if (lit == (1 << 5)) 
			{ 
				op [- lit - 1] = lit - 1;
				lit = 0; op++;
			}
		}
	  }
	if (op + 3 > out_end) /* at most 3 bytes can be missing here */
		    return 0;

	while (ip < in_end)
	{ 
		lit++; *op++ = *ip++;
		if (lit == MAX_LIT)
		{ 
			op [- lit - 1] = lit - 1; /* stop run */
			lit = 0; op++; /* start run */
		}
	}

	op [- lit - 1] = lit - 1; /* end run */
	op -= !lit; /* undo run if length is zero */

	return op - out_data;

}


* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-25  9:41   ` Xionghu Luo
@ 2021-06-25 10:02     ` Richard Biener
  2021-06-25 12:58       ` Xionghu Luo
                         ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Richard Biener @ 2021-06-25 10:02 UTC (permalink / raw)
  To: Xionghu Luo
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, linkw,
	David Edelsohn, H. J. Lu

On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>
>
>
> On 2021/6/25 16:54, Richard Biener wrote:
> > On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
> >>
> >> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> >> on Power.  For example, it generates mismatched address offset after
> >> adjust iv update statement position:
> >>
> >> <bb 32> [local count: 70988443]:
> >> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >> ivtmp.30_415 = ivtmp.30_414 + 1;
> >> _34 = ref_180 + 18446744073709551615;
> >> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> >> if (_84 == _86)
> >>    goto <bb 56>; [94.50%]
> >>    else
> >>    goto <bb 87>; [5.50%]
> >>
> >> Disable it will produce:
> >>
> >> <bb 32> [local count: 70988443]:
> >> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> >> ivtmp.30_415 = ivtmp.30_414 + 1;
> >> if (_84 == _86)
> >>    goto <bb 56>; [94.50%]
> >>    else
> >>    goto <bb 87>; [5.50%]
> >>
> >> Then later pass loop unroll could benefit from same address offset
> >> with different base address and reduces register dependency.
> >> This patch could improve performance by 10% for typical case on Power,
> >> no performance change observed for X86 or Aarch64 due to small loops
> >> not unrolled on these platforms.  Any comments?
> >
> > The case you quote is special in that if we hoisted the IV update before
> > the other MEM _also_ used in the condition it would be fine again.
>
> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
> From the ASM, we can see the index register %r4 is used in two iterations which
> maybe a bottle neck for hiding instruction latency?
>
> Then it seems reasonable the performance would be better if keep the IV update
> statement at *LAST* (Fix 1).
>
> (Fix 2):
>   <bb 32> [local count: 70988443]:
>   ivtmp.30_415 = ivtmp.30_414 + 1;
>   _34 = ip_229 + 18446744073709551615;
>   _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>   _33 = ref_180 + 18446744073709551615;
>   _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
>   if (_84 == _86)
>     goto <bb 56>; [94.50%]
>   else
>     goto <bb 87>; [5.50%]
>
>
> .L67:
>         lbzx %r12,%r24,%r4
>         lbzx %r25,%r7,%r4
>         cmpw %cr0,%r12,%r25
>         bne %cr0,.L11
>         mr %r26,%r4
>         addi %r4,%r4,1
>         lbzx %r12,%r24,%r4
>         lbzx %r25,%r7,%r4
>         mr %r6,%r26
>         cmpw %cr0,%r12,%r25
>         bne %cr0,.L11
>         mr %r26,%r4
> .L12:
>         cmpdi %cr0,%r10,1
>         addi %r4,%r26,1
>         mr %r6,%r26
>         addi %r10,%r10,-1
>         bne %cr0,.L67
>
> >
> > Now, adjust_iv_update_pos doesn't seem to check that the
> > condition actually uses the IV use stmt def, so it likely applies to
> > too many cases.
> >
> > Unfortunately the introducing rev didn't come with a testcase,
> > but still I think fixing up adjust_iv_update_pos is better than
> > introducing a way to short-cut it per target decision.
> >
> > One "fix" might be to add a check that either the condition
> > lhs or rhs is the def of the IV use and the other operand
> > is invariant.  Or if it's of similar structure hoist across the
> > other iv-use as well.  Not that I understand the argument
> > about the overlapping life-range.
> >
> > You also don't provide a complete testcase ...
> >
>
> Attached the test code, will also add it it patch in future version.
> The issue comes from a very small hot loop:
>
>         do {
>           len++;
>         } while(len < maxlen && ip[len] == ref[len]);

unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
{
  unsigned int len = 2;
  do {
      len++;
  }while(len < maxlen && ip[len] == ref[len]);
  return len;
}

I can see the effect on this loop on x86_64 as well; we end up with

.L6:
        movzbl  (%rdi,%rax), %ecx
        addq    $1, %rax
        cmpb    -1(%rsi,%rax), %cl
        jne     .L1
.L3:
        movl    %eax, %r8d
        cmpl    %edx, %eax
        jb      .L6

but without the trick it is

.L6:
        movzbl  (%rdi,%rax), %r8d
        movzbl  (%rsi,%rax), %ecx
        addq    $1, %rax
        cmpb    %cl, %r8b
        jne     .L1
.L3:
        movl    %eax, %r9d
        cmpl    %edx, %eax
        jb      .L6

so here you can see the missed fusion.  Of course
in this case the IV update could have been sunk into
the .L3 block and replicated on the exit edge as well.

I'm not sure if the motivation for the change introducing this
trick was the above kind of combination or not, but I guess
so.  The dependence distance of the IV increment to the
use is now shorter, so I'm not sure the combined variant is
better.

Richard.


>
> --
> Thanks,
> Xionghu


* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-25 10:02     ` Richard Biener
@ 2021-06-25 12:58       ` Xionghu Luo
  2021-06-25 13:12       ` Xionghu Luo
  2021-06-28  8:07       ` Xionghu Luo
  2 siblings, 0 replies; 14+ messages in thread
From: Xionghu Luo @ 2021-06-25 12:58 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, davidxl,
	David Edelsohn, H. J. Lu

+cc Xinliang David Li since he introduced the function
adjust_iv_update_pos.  Looking forward to your input. :)


On 2021/6/25 18:02, Richard Biener wrote:
> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>
>>
>>
>> On 2021/6/25 16:54, Richard Biener wrote:
>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>
>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
>>>>
>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
>>>> on Power.  For example, it generates mismatched address offset after
>>>> adjust iv update statement position:
>>>>
>>>> <bb 32> [local count: 70988443]:
>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>> _34 = ref_180 + 18446744073709551615;
>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>>> if (_84 == _86)
>>>>     goto <bb 56>; [94.50%]
>>>>     else
>>>>     goto <bb 87>; [5.50%]
>>>>
>>>> Disable it will produce:
>>>>
>>>> <bb 32> [local count: 70988443]:
>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>> if (_84 == _86)
>>>>     goto <bb 56>; [94.50%]
>>>>     else
>>>>     goto <bb 87>; [5.50%]
>>>>
>>>> Then later pass loop unroll could benefit from same address offset
>>>> with different base address and reduces register dependency.
>>>> This patch could improve performance by 10% for typical case on Power,
>>>> no performance change observed for X86 or Aarch64 due to small loops
>>>> not unrolled on these platforms.  Any comments?
>>>
>>> The case you quote is special in that if we hoisted the IV update before
>>> the other MEM _also_ used in the condition it would be fine again.
>>
>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
>> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
>> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
>> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
>> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
>>  From the ASM, we can see the index register %r4 is used in two iterations which
>> maybe a bottle neck for hiding instruction latency?
>>
>> Then it seems reasonable the performance would be better if keep the IV update
>> statement at *LAST* (Fix 1).
>>
>> (Fix 2):
>>    <bb 32> [local count: 70988443]:
>>    ivtmp.30_415 = ivtmp.30_414 + 1;
>>    _34 = ip_229 + 18446744073709551615;
>>    _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>    _33 = ref_180 + 18446744073709551615;
>>    _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
>>    if (_84 == _86)
>>      goto <bb 56>; [94.50%]
>>    else
>>      goto <bb 87>; [5.50%]
>>
>>
>> .L67:
>>          lbzx %r12,%r24,%r4
>>          lbzx %r25,%r7,%r4
>>          cmpw %cr0,%r12,%r25
>>          bne %cr0,.L11
>>          mr %r26,%r4
>>          addi %r4,%r4,1
>>          lbzx %r12,%r24,%r4
>>          lbzx %r25,%r7,%r4
>>          mr %r6,%r26
>>          cmpw %cr0,%r12,%r25
>>          bne %cr0,.L11
>>          mr %r26,%r4
>> .L12:
>>          cmpdi %cr0,%r10,1
>>          addi %r4,%r26,1
>>          mr %r6,%r26
>>          addi %r10,%r10,-1
>>          bne %cr0,.L67
>>
>>>
>>> Now, adjust_iv_update_pos doesn't seem to check that the
>>> condition actually uses the IV use stmt def, so it likely applies to
>>> too many cases.
>>>
>>> Unfortunately the introducing rev didn't come with a testcase,
>>> but still I think fixing up adjust_iv_update_pos is better than
>>> introducing a way to short-cut it per target decision.
>>>
>>> One "fix" might be to add a check that either the condition
>>> lhs or rhs is the def of the IV use and the other operand
>>> is invariant.  Or if it's of similar structure hoist across the
>>> other iv-use as well.  Not that I understand the argument
>>> about the overlapping life-range.
>>>
>>> You also don't provide a complete testcase ...
>>>
>>
>> Attached the test code, will also add it it patch in future version.
>> The issue comes from a very small hot loop:
>>
>>          do {
>>            len++;
>>          } while(len < maxlen && ip[len] == ref[len]);
> 
> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> {
>    unsigned int len = 2;
>    do {
>        len++;
>    }while(len < maxlen && ip[len] == ref[len]);
>    return len;
> }
> 
> I can see the effect on this loop on x86_64 as well, we end up with
> 
> .L6:
>          movzbl  (%rdi,%rax), %ecx
>          addq    $1, %rax
>          cmpb    -1(%rsi,%rax), %cl
>          jne     .L1
> .L3:
>          movl    %eax, %r8d
>          cmpl    %edx, %eax
>          jb      .L6
> 
> but without the trick it is
> 
> .L6:
>          movzbl  (%rdi,%rax), %r8d
>          movzbl  (%rsi,%rax), %ecx
>          addq    $1, %rax
>          cmpb    %cl, %r8b
>          jne     .L1
> .L3:
>          movl    %eax, %r9d
>          cmpl    %edx, %eax
>          jb      .L6
> 
> so here you can see the missed fusion.  Of course
> in this case the IV update could have been sunk into
> the .L3 block and replicated on the exit edge as well.
> 
> I'm not sure if the motivation for the change introducing this
> trick was the above kind of combination or not, but I guess
> so.  The dependence distance of the IV increment to the
> use is now shorter, so I'm not sure the combined variant is
> better.
> 
> Richard.
> 
> 
>>
>> --
>> Thanks,
>> Xionghu

-- 
Thanks,
Xionghu


* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-25 10:02     ` Richard Biener
  2021-06-25 12:58       ` Xionghu Luo
@ 2021-06-25 13:12       ` Xionghu Luo
  2021-06-28  8:07       ` Xionghu Luo
  2 siblings, 0 replies; 14+ messages in thread
From: Xionghu Luo @ 2021-06-25 13:12 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, davidxl, davidxl,
	H. J. Lu



On 2021/6/25 18:02, Richard Biener wrote:
> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>
>>
>>
>> On 2021/6/25 16:54, Richard Biener wrote:
>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>
>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
>>>>
>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
>>>> on Power.  For example, it generates mismatched address offset after
>>>> adjust iv update statement position:
>>>>
>>>> <bb 32> [local count: 70988443]:
>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>> _34 = ref_180 + 18446744073709551615;
>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>>> if (_84 == _86)
>>>>     goto <bb 56>; [94.50%]
>>>>     else
>>>>     goto <bb 87>; [5.50%]
>>>>
>>>> Disable it will produce:
>>>>
>>>> <bb 32> [local count: 70988443]:
>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>> if (_84 == _86)
>>>>     goto <bb 56>; [94.50%]
>>>>     else
>>>>     goto <bb 87>; [5.50%]
>>>>
>>>> Then later pass loop unroll could benefit from same address offset
>>>> with different base address and reduces register dependency.
>>>> This patch could improve performance by 10% for typical case on Power,
>>>> no performance change observed for X86 or Aarch64 due to small loops
>>>> not unrolled on these platforms.  Any comments?
>>>
>>> The case you quote is special in that if we hoisted the IV update before
>>> the other MEM _also_ used in the condition it would be fine again.
>>
>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
>> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
>> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
>> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
>> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
>>  From the ASM, we can see the index register %r4 is used in two iterations which
>> maybe a bottle neck for hiding instruction latency?
>>
>> Then it seems reasonable the performance would be better if keep the IV update
>> statement at *LAST* (Fix 1).
>>
>> (Fix 2):
>>    <bb 32> [local count: 70988443]:
>>    ivtmp.30_415 = ivtmp.30_414 + 1;
>>    _34 = ip_229 + 18446744073709551615;
>>    _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>    _33 = ref_180 + 18446744073709551615;
>>    _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
>>    if (_84 == _86)
>>      goto <bb 56>; [94.50%]
>>    else
>>      goto <bb 87>; [5.50%]
>>
>>
>> .L67:
>>          lbzx %r12,%r24,%r4
>>          lbzx %r25,%r7,%r4
>>          cmpw %cr0,%r12,%r25
>>          bne %cr0,.L11
>>          mr %r26,%r4
>>          addi %r4,%r4,1
>>          lbzx %r12,%r24,%r4
>>          lbzx %r25,%r7,%r4
>>          mr %r6,%r26
>>          cmpw %cr0,%r12,%r25
>>          bne %cr0,.L11
>>          mr %r26,%r4
>> .L12:
>>          cmpdi %cr0,%r10,1
>>          addi %r4,%r26,1
>>          mr %r6,%r26
>>          addi %r10,%r10,-1
>>          bne %cr0,.L67
>>
>>>
>>> Now, adjust_iv_update_pos doesn't seem to check that the
>>> condition actually uses the IV use stmt def, so it likely applies to
>>> too many cases.
>>>
>>> Unfortunately the introducing rev didn't come with a testcase,
>>> but still I think fixing up adjust_iv_update_pos is better than
>>> introducing a way to short-cut it per target decision.
>>>
>>> One "fix" might be to add a check that either the condition
>>> lhs or rhs is the def of the IV use and the other operand
>>> is invariant.  Or if it's of similar structure hoist across the
>>> other iv-use as well.  Not that I understand the argument
>>> about the overlapping life-range.
>>>
>>> You also don't provide a complete testcase ...
>>>
>>
>> Attached the test code, will also add it it patch in future version.
>> The issue comes from a very small hot loop:
>>
>>          do {
>>            len++;
>>          } while(len < maxlen && ip[len] == ref[len]);
> 
> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> {
>    unsigned int len = 2;
>    do {
>        len++;
>    }while(len < maxlen && ip[len] == ref[len]);
>    return len;
> }
> 
> I can see the effect on this loop on x86_64 as well, we end up with
> 
> .L6:
>          movzbl  (%rdi,%rax), %ecx
>          addq    $1, %rax
>          cmpb    -1(%rsi,%rax), %cl
>          jne     .L1
> .L3:
>          movl    %eax, %r8d
>          cmpl    %edx, %eax
>          jb      .L6
> 
> but without the trick it is
> 
> .L6:
>          movzbl  (%rdi,%rax), %r8d
>          movzbl  (%rsi,%rax), %ecx
>          addq    $1, %rax
>          cmpb    %cl, %r8b
>          jne     .L1
> .L3:
>          movl    %eax, %r9d
>          cmpl    %edx, %eax
>          jb      .L6
> 
> so here you can see the missed fusion.  Of course
> in this case the IV update could have been sunk into
> the .L3 block and replicated on the exit edge as well.

It is not sunk by the recently added gimple sink2 pass; the reason is
that the latch block is an empty block.  Removing this check would cause
some SPEC performance regressions.

tree-ssa-sink.c
static bool
statement_sink_location (gimple *stmt, basic_block frombb,
			 gimple_stmt_iterator *togsi, bool *zero_uses_p)
{
...
   /* If the latch block is empty, don't make it non-empty by sinking
      something into it.  */
   if (sinkbb == frombb->loop_father->latch
       && empty_block_p (sinkbb))
     return false;
...
}


   <bb 31> [local count: 75120046]:
   # ivtmp.30_414 = PHI <ivtmp.30_416(30), ivtmp.30_415(57)>
   _418 = (unsigned int) ivtmp.30_414;
   if (maxlen_184 > _418)
     goto <bb 32>; [94.50%]
   else
     goto <bb 87>; [5.50%]

   <bb 32> [local count: 70988443]:
   _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
   _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
   ivtmp.30_415 = ivtmp.30_414 + 1;
   if (_84 == _86)
     goto <bb 57>; [94.50%]
   else
     goto <bb 88>; [5.50%]

   <bb 88> [local count: 3904364]:
   goto <bb 33>; [100.00%]

   <bb 57> [local count: 67084079]:
   goto <bb 31>; [100.00%]


> 
> I'm not sure if the motivation for the change introducing this
> trick was the above kind of combination or not, but I guess
> so.  The dependence distance of the IV increment to the
> use is now shorter, so I'm not sure the combined variant is
> better.
> 
> Richard.
> 
> 
>>
>> --
>> Thanks,
>> Xionghu

-- 
Thanks,
Xionghu


* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-25 10:02     ` Richard Biener
  2021-06-25 12:58       ` Xionghu Luo
  2021-06-25 13:12       ` Xionghu Luo
@ 2021-06-28  8:07       ` Xionghu Luo
  2021-06-28  8:25         ` Richard Biener
  2 siblings, 1 reply; 14+ messages in thread
From: Xionghu Luo @ 2021-06-28  8:07 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, linkw,
	David Edelsohn, H. J. Lu



On 2021/6/25 18:02, Richard Biener wrote:
> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>
>>
>>
>> On 2021/6/25 16:54, Richard Biener wrote:
>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>
>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
>>>>
>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
>>>> on Power.  For example, it generates mismatched address offset after
>>>> adjust iv update statement position:
>>>>
>>>> <bb 32> [local count: 70988443]:
>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>> _34 = ref_180 + 18446744073709551615;
>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>>> if (_84 == _86)
>>>>     goto <bb 56>; [94.50%]
>>>>     else
>>>>     goto <bb 87>; [5.50%]
>>>>
>>>> Disable it will produce:
>>>>
>>>> <bb 32> [local count: 70988443]:
>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>> if (_84 == _86)
>>>>     goto <bb 56>; [94.50%]
>>>>     else
>>>>     goto <bb 87>; [5.50%]
>>>>
>>>> Then later pass loop unroll could benefit from same address offset
>>>> with different base address and reduces register dependency.
>>>> This patch could improve performance by 10% for typical case on Power,
>>>> no performance change observed for X86 or Aarch64 due to small loops
>>>> not unrolled on these platforms.  Any comments?
>>>
>>> The case you quote is special in that if we hoisted the IV update before
>>> the other MEM _also_ used in the condition it would be fine again.
>>
>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
>> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
>> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
>> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
>> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
>>  From the ASM, we can see the index register %r4 is used in two iterations which
>> maybe a bottle neck for hiding instruction latency?
>>
>> Then it seems reasonable the performance would be better if keep the IV update
>> statement at *LAST* (Fix 1).
>>
>> (Fix 2):
>>    <bb 32> [local count: 70988443]:
>>    ivtmp.30_415 = ivtmp.30_414 + 1;
>>    _34 = ip_229 + 18446744073709551615;
>>    _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>    _33 = ref_180 + 18446744073709551615;
>>    _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
>>    if (_84 == _86)
>>      goto <bb 56>; [94.50%]
>>    else
>>      goto <bb 87>; [5.50%]
>>
>>
>> .L67:
>>          lbzx %r12,%r24,%r4
>>          lbzx %r25,%r7,%r4
>>          cmpw %cr0,%r12,%r25
>>          bne %cr0,.L11
>>          mr %r26,%r4
>>          addi %r4,%r4,1
>>          lbzx %r12,%r24,%r4
>>          lbzx %r25,%r7,%r4
>>          mr %r6,%r26
>>          cmpw %cr0,%r12,%r25
>>          bne %cr0,.L11
>>          mr %r26,%r4
>> .L12:
>>          cmpdi %cr0,%r10,1
>>          addi %r4,%r26,1
>>          mr %r6,%r26
>>          addi %r10,%r10,-1
>>          bne %cr0,.L67
>>
>>>
>>> Now, adjust_iv_update_pos doesn't seem to check that the
>>> condition actually uses the IV use stmt def, so it likely applies to
>>> too many cases.
>>>
>>> Unfortunately the introducing rev didn't come with a testcase,
>>> but still I think fixing up adjust_iv_update_pos is better than
>>> introducing a way to short-cut it per target decision.
>>>
>>> One "fix" might be to add a check that either the condition
>>> lhs or rhs is the def of the IV use and the other operand
>>> is invariant.  Or if it's of similar structure hoist across the
>>> other iv-use as well.  Not that I understand the argument
>>> about the overlapping life-range.
>>>
>>> You also don't provide a complete testcase ...
>>>
>>
>> Attached the test code, will also add it it patch in future version.
>> The issue comes from a very small hot loop:
>>
>>          do {
>>            len++;
>>          } while(len < maxlen && ip[len] == ref[len]);
> 
> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> {
>    unsigned int len = 2;
>    do {
>        len++;
>    }while(len < maxlen && ip[len] == ref[len]);
>    return len;
> }
> 
> I can see the effect on this loop on x86_64 as well, we end up with
> 
> .L6:
>          movzbl  (%rdi,%rax), %ecx
>          addq    $1, %rax
>          cmpb    -1(%rsi,%rax), %cl
>          jne     .L1
> .L3:
>          movl    %eax, %r8d
>          cmpl    %edx, %eax
>          jb      .L6
> 
> but without the trick it is
> 
> .L6:
>          movzbl  (%rdi,%rax), %r8d
>          movzbl  (%rsi,%rax), %ecx
>          addq    $1, %rax
>          cmpb    %cl, %r8b
>          jne     .L1
> .L3:
>          movl    %eax, %r9d
>          cmpl    %edx, %eax
>          jb      .L6

I verified this small piece of code on X86; there is no performance
change with or without adjust_iv_update_pos (I've checked that the ASM
is exactly the same as yours):

luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1

real    0m7.003s
user    0m6.996s
sys     0m0.000s
luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/gcc/ -O2 test.c
luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1

real    0m7.070s
user    0m7.068s
sys     0m0.000s


But for AArch64, current GCC also generates similar code with
or without adjust_iv_update_pos; the runtime is 10.705s for both.

.L6:
        ldrb    w4, [x6, x3]
        add     x3, x3, 1
        ldrb    w5, [x1, x3]
        cmp     w5, w4
        bne     .L1
.L3:
        mov     w0, w3
        cmp     w2, w3
        bhi     .L6

No adjust_iv_update_pos:

.L6:
        ldrb    w5, [x6, x3]
        ldrb    w4, [x1, x3]
        add     x3, x3, 1
        cmp     w5, w4
        bne     .L1
.L3:
        mov     w0, w3
        cmp     w2, w3
        bhi     .L6


When built with an older GCC (gcc version 7.4.1 20190424), it generates
worse code and the runtime is 11.664s:

.L6:
        ldrb    w4, [x6, x3]
        add     x3, x3, 1
        add     x5, x1, x3
        ldrb    w5, [x5, -1]
        cmp     w5, w4
        bne     .L1
.L3:
        cmp     w3, w2
        mov     w0, w3
        bcc     .L6


In general, only Power shows a performance regression with adjust_iv_update_pos;
that's why I tried to add a target hook for it.  Is this reasonable?  Or should
we just remove adjust_iv_update_pos, since it doesn't help performance on X86 or
other targets?



test.c
#include <stdlib.h>
__attribute__((noinline)) unsigned int
foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
{
        unsigned int len = 2;
        do {
                len++;
        }while(len < maxlen && ip[len] == ref[len]);
        return len;
}

int main (int argc, char* argv[])
{
  unsigned char string_a [] = "abcdefghijklmnopqrstuvwxyzmnpppppppppppaaaaaaabbbbbbeeee";
  unsigned char string_b [] = "abcdefghijklmnopqrstuvwxyzmnpppppppppppaaaaaaabbbbbbeene";
  unsigned long ret = 0;

  for (long i = 0; i < atoi(argv[1]) * 100000000; i++)
          ret += foo (string_a, string_b, sizeof(string_a));
  return ret;
}


> 
> so here you can see the missed fusion.  Of course
> in this case the IV update could have been sunk into
> the .L3 block and replicated on the exit edge as well.
> 
> I'm not sure if the motivation for the change introducing this
> trick was the above kind of combination or not, but I guess
> so.  The dependence distance of the IV increment to the
> use is now shorter, so I'm not sure the combined variant is
> better.
> 
> Richard.
> 
> 
>>
>> --
>> Thanks,
>> Xionghu

-- 
Thanks,
Xionghu


* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-28  8:07       ` Xionghu Luo
@ 2021-06-28  8:25         ` Richard Biener
  2021-06-29  9:19           ` Xionghu Luo
  2021-07-07  5:51           ` Bin.Cheng
  0 siblings, 2 replies; 14+ messages in thread
From: Richard Biener @ 2021-06-28  8:25 UTC (permalink / raw)
  To: Xionghu Luo, Amker.Cheng
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, linkw,
	David Edelsohn, H. J. Lu

On Mon, Jun 28, 2021 at 10:07 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>
>
>
> On 2021/6/25 18:02, Richard Biener wrote:
> > On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >>
> >>
> >>
> >> On 2021/6/25 16:54, Richard Biener wrote:
> >>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
> >>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>
> >>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
> >>>>
> >>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> >>>> on Power.  For example, it generates mismatched address offset after
> >>>> adjust iv update statement position:
> >>>>
> >>>> <bb 32> [local count: 70988443]:
> >>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>> _34 = ref_180 + 18446744073709551615;
> >>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> >>>> if (_84 == _86)
> >>>>     goto <bb 56>; [94.50%]
> >>>>     else
> >>>>     goto <bb 87>; [5.50%]
> >>>>
> >>>> Disable it will produce:
> >>>>
> >>>> <bb 32> [local count: 70988443]:
> >>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> >>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>> if (_84 == _86)
> >>>>     goto <bb 56>; [94.50%]
> >>>>     else
> >>>>     goto <bb 87>; [5.50%]
> >>>>
> >>>> Then later pass loop unroll could benefit from same address offset
> >>>> with different base address and reduces register dependency.
> >>>> This patch could improve performance by 10% for typical case on Power,
> >>>> no performance change observed for X86 or Aarch64 due to small loops
> >>>> not unrolled on these platforms.  Any comments?
> >>>
> >>> The case you quote is special in that if we hoisted the IV update before
> >>> the other MEM _also_ used in the condition it would be fine again.
> >>
> >> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
> >> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
> >> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
> >> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
> >> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
> >>  From the ASM, we can see the index register %r4 is used in two iterations which
> >> maybe a bottle neck for hiding instruction latency?
> >>
> >> Then it seems reasonable the performance would be better if keep the IV update
> >> statement at *LAST* (Fix 1).
> >>
> >> (Fix 2):
> >>    <bb 32> [local count: 70988443]:
> >>    ivtmp.30_415 = ivtmp.30_414 + 1;
> >>    _34 = ip_229 + 18446744073709551615;
> >>    _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> >>    _33 = ref_180 + 18446744073709551615;
> >>    _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
> >>    if (_84 == _86)
> >>      goto <bb 56>; [94.50%]
> >>    else
> >>      goto <bb 87>; [5.50%]
> >>
> >>
> >> .L67:
> >>          lbzx %r12,%r24,%r4
> >>          lbzx %r25,%r7,%r4
> >>          cmpw %cr0,%r12,%r25
> >>          bne %cr0,.L11
> >>          mr %r26,%r4
> >>          addi %r4,%r4,1
> >>          lbzx %r12,%r24,%r4
> >>          lbzx %r25,%r7,%r4
> >>          mr %r6,%r26
> >>          cmpw %cr0,%r12,%r25
> >>          bne %cr0,.L11
> >>          mr %r26,%r4
> >> .L12:
> >>          cmpdi %cr0,%r10,1
> >>          addi %r4,%r26,1
> >>          mr %r6,%r26
> >>          addi %r10,%r10,-1
> >>          bne %cr0,.L67
> >>
> >>>
> >>> Now, adjust_iv_update_pos doesn't seem to check that the
> >>> condition actually uses the IV use stmt def, so it likely applies to
> >>> too many cases.
> >>>
> >>> Unfortunately the introducing rev didn't come with a testcase,
> >>> but still I think fixing up adjust_iv_update_pos is better than
> >>> introducing a way to short-cut it per target decision.
> >>>
> >>> One "fix" might be to add a check that either the condition
> >>> lhs or rhs is the def of the IV use and the other operand
> >>> is invariant.  Or if it's of similar structure hoist across the
> >>> other iv-use as well.  Not that I understand the argument
> >>> about the overlapping life-range.
> >>>
> >>> You also don't provide a complete testcase ...
> >>>
> >>
> >> Attached the test code, will also add it it patch in future version.
> >> The issue comes from a very small hot loop:
> >>
> >>          do {
> >>            len++;
> >>          } while(len < maxlen && ip[len] == ref[len]);
> >
> > unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> > {
> >    unsigned int len = 2;
> >    do {
> >        len++;
> >    }while(len < maxlen && ip[len] == ref[len]);
> >    return len;
> > }
> >
> > I can see the effect on this loop on x86_64 as well, we end up with
> >
> > .L6:
> >          movzbl  (%rdi,%rax), %ecx
> >          addq    $1, %rax
> >          cmpb    -1(%rsi,%rax), %cl
> >          jne     .L1
> > .L3:
> >          movl    %eax, %r8d
> >          cmpl    %edx, %eax
> >          jb      .L6
> >
> > but without the trick it is
> >
> > .L6:
> >          movzbl  (%rdi,%rax), %r8d
> >          movzbl  (%rsi,%rax), %ecx
> >          addq    $1, %rax
> >          cmpb    %cl, %r8b
> >          jne     .L1
> > .L3:
> >          movl    %eax, %r9d
> >          cmpl    %edx, %eax
> >          jb      .L6
>
> Verified this small piece of code on X86, there is no performance
> change with or without adjust_iv_update_pos (I've checked the ASM
> exactly same with yours):
>
> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
>
> real    0m7.003s
> user    0m6.996s
> sys     0m0.000s
> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/g
> cc/ -O2 test.c
> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
>
> real    0m7.070s
> user    0m7.068s
> sys     0m0.000s
>
>
> But for AArch64, current GCC code also generates similar code with
> or without adjust_iv_update_pos, the runtime is 10.705s for them.
>
> L6:
>         ldrb    w4, [x6, x3]
>         add     x3, x3, 1
>         ldrb    w5, [x1, x3]
>         cmp     w5, w4
>         bne     .L1
> .L3:
>         mov     w0, w3
>         cmp     w2, w3
>         bhi     .L6
>
> No adjust_iv_update_pos:
>
> .L6:
>         ldrb    w5, [x6, x3]
>         ldrb    w4, [x1, x3]
>         add     x3, x3, 1
>         cmp     w5, w4
>         bne     .L1
> .L3:
>         mov     w0, w3
>         cmp     w2, w3
>         bhi     .L6
>
>
> While built with old GCC(gcc version 7.4.1 20190424), it generates
> worse code and runtime is 11.664s:
>
> .L6:
>         ldrb    w4, [x6, x3]
>         add     x3, x3, 1
>         add     x5, x1, x3
>         ldrb    w5, [x5, -1]
>         cmp     w5, w4
>         bne     .L1
> .L3:
>         cmp     w3, w2
>         mov     w0, w3
>         bcc     .L6
>
>
> In general, only Power shows negative performance with adjust_iv_update_pos,
> that's why I tried to add target hook for it, is this reasonable?  Or we should
> just remove adjust_iv_update_pos since it doesn't help performance for X86 or
> other targets?

It was Bin who added the functionality - maybe he can remember the cases
he saw improvements for.

I think we should instead try to change the code so that it doesn't apply
to CONDs where both operands are defined by stmts using the IV?
And this time add a testcase ;)

Richard.

>
>
> test.c
> #include <stdlib.h>
> __attribute__((noinline)) unsigned int
> foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> {
>         unsigned int len = 2;
>         do {
>                 len++;
>         }while(len < maxlen && ip[len] == ref[len]);
>         return len;
> }
>
> int main (int argc, char* argv[])
> {
>   unsigned char string_a [] = "abcdefghijklmnopqrstuvwxyzmnpppppppppppaaaaaaabbbbbbeeee";
>   unsigned char string_b [] = "abcdefghijklmnopqrstuvwxyzmnpppppppppppaaaaaaabbbbbbeene";
>   unsigned long ret = 0;
>
>   for (long i = 0; i < atoi(argv[1]) * 100000000; i++)
>           ret += foo (string_a, string_b, sizeof(string_a));
>   return ret;
> }
>
>
> >
> > so here you can see the missed fusion.  Of course
> > in this case the IV update could have been sunk into
> > the .L3 block and replicated on the exit edge as well.
> >
> > I'm not sure if the motivation for the change introducing this
> > trick was the above kind of combination or not, but I guess
> > so.  The dependence distance of the IV increment to the
> > use is now shorter, so I'm not sure the combined variant is
> > better.
> >
> > Richard.
> >
> >
> >>
> >> --
> >> Thanks,
> >> Xionghu
>
> --
> Thanks,
> Xionghu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-28  8:25         ` Richard Biener
@ 2021-06-29  9:19           ` Xionghu Luo
  2021-07-07 13:20             ` Richard Biener
  2021-07-07  5:51           ` Bin.Cheng
  1 sibling, 1 reply; 14+ messages in thread
From: Xionghu Luo @ 2021-06-29  9:19 UTC (permalink / raw)
  To: Richard Biener, Amker.Cheng
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, linkw,
	David Edelsohn, H. J. Lu



On 2021/6/28 16:25, Richard Biener wrote:
> On Mon, Jun 28, 2021 at 10:07 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>
>>
>>
>> On 2021/6/25 18:02, Richard Biener wrote:
>>> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2021/6/25 16:54, Richard Biener wrote:
>>>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>>
>>>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
>>>>>>
>>>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
>>>>>> on Power.  For example, it generates mismatched address offset after
>>>>>> adjust iv update statement position:
>>>>>>
>>>>>> <bb 32> [local count: 70988443]:
>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>>>> _34 = ref_180 + 18446744073709551615;
>>>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>>>>> if (_84 == _86)
>>>>>>      goto <bb 56>; [94.50%]
>>>>>>      else
>>>>>>      goto <bb 87>; [5.50%]
>>>>>>
>>>>>> Disable it will produce:
>>>>>>
>>>>>> <bb 32> [local count: 70988443]:
>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>>>> if (_84 == _86)
>>>>>>      goto <bb 56>; [94.50%]
>>>>>>      else
>>>>>>      goto <bb 87>; [5.50%]
>>>>>>
>>>>>> Then later pass loop unroll could benefit from same address offset
>>>>>> with different base address and reduces register dependency.
>>>>>> This patch could improve performance by 10% for typical case on Power,
>>>>>> no performance change observed for X86 or Aarch64 due to small loops
>>>>>> not unrolled on these platforms.  Any comments?
>>>>>
>>>>> The case you quote is special in that if we hoisted the IV update before
>>>>> the other MEM _also_ used in the condition it would be fine again.
>>>>
>>>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
>>>> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
>>>> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
>>>> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
>>>> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
>>>>   From the ASM, we can see the index register %r4 is used in two iterations which
>>>> maybe a bottle neck for hiding instruction latency?
>>>>
>>>> Then it seems reasonable the performance would be better if keep the IV update
>>>> statement at *LAST* (Fix 1).
>>>>
>>>> (Fix 2):
>>>>     <bb 32> [local count: 70988443]:
>>>>     ivtmp.30_415 = ivtmp.30_414 + 1;
>>>>     _34 = ip_229 + 18446744073709551615;
>>>>     _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>>>     _33 = ref_180 + 18446744073709551615;
>>>>     _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
>>>>     if (_84 == _86)
>>>>       goto <bb 56>; [94.50%]
>>>>     else
>>>>       goto <bb 87>; [5.50%]
>>>>
>>>>
>>>> .L67:
>>>>           lbzx %r12,%r24,%r4
>>>>           lbzx %r25,%r7,%r4
>>>>           cmpw %cr0,%r12,%r25
>>>>           bne %cr0,.L11
>>>>           mr %r26,%r4
>>>>           addi %r4,%r4,1
>>>>           lbzx %r12,%r24,%r4
>>>>           lbzx %r25,%r7,%r4
>>>>           mr %r6,%r26
>>>>           cmpw %cr0,%r12,%r25
>>>>           bne %cr0,.L11
>>>>           mr %r26,%r4
>>>> .L12:
>>>>           cmpdi %cr0,%r10,1
>>>>           addi %r4,%r26,1
>>>>           mr %r6,%r26
>>>>           addi %r10,%r10,-1
>>>>           bne %cr0,.L67
>>>>
>>>>>
>>>>> Now, adjust_iv_update_pos doesn't seem to check that the
>>>>> condition actually uses the IV use stmt def, so it likely applies to
>>>>> too many cases.
>>>>>
>>>>> Unfortunately the introducing rev didn't come with a testcase,
>>>>> but still I think fixing up adjust_iv_update_pos is better than
>>>>> introducing a way to short-cut it per target decision.
>>>>>
>>>>> One "fix" might be to add a check that either the condition
>>>>> lhs or rhs is the def of the IV use and the other operand
>>>>> is invariant.  Or if it's of similar structure hoist across the
>>>>> other iv-use as well.  Not that I understand the argument
>>>>> about the overlapping life-range.
>>>>>
>>>>> You also don't provide a complete testcase ...
>>>>>
>>>>
>>>> Attached the test code, will also add it it patch in future version.
>>>> The issue comes from a very small hot loop:
>>>>
>>>>           do {
>>>>             len++;
>>>>           } while(len < maxlen && ip[len] == ref[len]);
>>>
>>> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
>>> {
>>>     unsigned int len = 2;
>>>     do {
>>>         len++;
>>>     }while(len < maxlen && ip[len] == ref[len]);
>>>     return len;
>>> }
>>>
>>> I can see the effect on this loop on x86_64 as well, we end up with
>>>
>>> .L6:
>>>           movzbl  (%rdi,%rax), %ecx
>>>           addq    $1, %rax
>>>           cmpb    -1(%rsi,%rax), %cl
>>>           jne     .L1
>>> .L3:
>>>           movl    %eax, %r8d
>>>           cmpl    %edx, %eax
>>>           jb      .L6
>>>
>>> but without the trick it is
>>>
>>> .L6:
>>>           movzbl  (%rdi,%rax), %r8d
>>>           movzbl  (%rsi,%rax), %ecx
>>>           addq    $1, %rax
>>>           cmpb    %cl, %r8b
>>>           jne     .L1
>>> .L3:
>>>           movl    %eax, %r9d
>>>           cmpl    %edx, %eax
>>>           jb      .L6
>>
>> Verified this small piece of code on X86, there is no performance
>> change with or without adjust_iv_update_pos (I've checked the ASM
>> exactly same with yours):
>>
>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
>>
>> real    0m7.003s
>> user    0m6.996s
>> sys     0m0.000s
>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/g
>> cc/ -O2 test.c
>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
>>
>> real    0m7.070s
>> user    0m7.068s
>> sys     0m0.000s
>>
>>
>> But for AArch64, current GCC code also generates similar code with
>> or without adjust_iv_update_pos, the runtime is 10.705s for them.
>>
>> L6:
>>          ldrb    w4, [x6, x3]
>>          add     x3, x3, 1
>>          ldrb    w5, [x1, x3]
>>          cmp     w5, w4
>>          bne     .L1
>> .L3:
>>          mov     w0, w3
>>          cmp     w2, w3
>>          bhi     .L6
>>
>> No adjust_iv_update_pos:
>>
>> .L6:
>>          ldrb    w5, [x6, x3]
>>          ldrb    w4, [x1, x3]
>>          add     x3, x3, 1
>>          cmp     w5, w4
>>          bne     .L1
>> .L3:
>>          mov     w0, w3
>>          cmp     w2, w3
>>          bhi     .L6
>>
>>
>> While built with old GCC(gcc version 7.4.1 20190424), it generates
>> worse code and runtime is 11.664s:
>>
>> .L6:
>>          ldrb    w4, [x6, x3]
>>          add     x3, x3, 1
>>          add     x5, x1, x3
>>          ldrb    w5, [x5, -1]
>>          cmp     w5, w4
>>          bne     .L1
>> .L3:
>>          cmp     w3, w2
>>          mov     w0, w3
>>          bcc     .L6
>>
>>
>> In general, only Power shows negative performance with adjust_iv_update_pos,
>> that's why I tried to add target hook for it, is this reasonable?  Or we should
>> just remove adjust_iv_update_pos since it doesn't help performance for X86 or
>> other targets?
> 
> It was Bin who added the functionality - maybe he can remember the cases
> he saw improvements for.
> 
> I think we should instead try to change the code that it doesn't apply to
> CONDs where both operands are defined by stmts using the IV?
> And this time add a testcase ;)

Thanks. I am not sure whether we should check both cond_lhs and cond_rhs,
or whether checking just cond_lhs is enough if the COND expr's LHS is
always processed first?

For example, adjust_iv_update_pos will be called twice for this case;
after the first call, the gimple is updated to:

<bb 4> [local count: 1014686026]:
_1 = (sizetype) len_8;
_2 = ip_10(D) + _1;
_3 = MEM[(unsigned char *)ip_10(D) + ivtmp.15_15 * 1];
_5 = ref_12(D) + _1;
_6 = *_5;
ivtmp.15_16 = ivtmp.15_15 + 1;
if (_3 == _6)
  goto <bb 6>; [94.50%]
else
  goto <bb 10>; [5.50%]


At this moment, use->stmt is "_6 = *_5;", so it is not using the IV directly?



[PATCH] ivopts: Don't adjust IV update statement if both operands use the IV in COND [PR101250]

gcc/ChangeLog:

	PR middle-end/101250
	* tree-ssa-loop-ivopts.c (adjust_iv_update_pos): Check whether both
	operands of the COND use the IV.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr101250.c | 15 ++++++++++++
 gcc/tree-ssa-loop-ivopts.c               | 31 +++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr101250.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr101250.c b/gcc/testsuite/gcc.dg/tree-ssa/pr101250.c
new file mode 100644
index 00000000000..70d3701de49
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr101250.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
+{
+  unsigned int len = 2;
+  do
+    {
+      len++;
+    }
+  while (len < maxlen && ip[len] == ref[len]);
+  return len;
+}
+
+/* { dg-final { scan-tree-dump-not "Reordering" "ivopts" } } */
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 12a8a49a307..082097bff38 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -7352,7 +7352,7 @@ static void
 adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use)
 {
   tree var_after;
-  gimple *iv_update, *stmt;
+  gimple *iv_update, *stmt, *cond_stmt;
   basic_block bb;
   gimple_stmt_iterator gsi, gsi_iv;
 
@@ -7370,6 +7370,7 @@ adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use)
   if (gimple_code (stmt) != GIMPLE_COND)
     return;
 
+  cond_stmt = stmt;
   gsi_prev_nondebug (&gsi);
   stmt = gsi_stmt (gsi);
   if (stmt != iv_update)
@@ -7386,6 +7387,34 @@ adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use)
   if (stmt != use->stmt)
     return;
 
+  gcond *gcond_stmt = as_a <gcond *> (cond_stmt);
+  tree *cond_lhs = gimple_cond_lhs_ptr (gcond_stmt);
+  tree *cond_rhs = gimple_cond_rhs_ptr (gcond_stmt);
+  if (TREE_CODE (*cond_lhs) == SSA_NAME && TREE_CODE (*cond_rhs) == SSA_NAME)
+    {
+      /* If both sides are memory operations that use iv var_before,
+       adjusting the iv update stmt before the second memory operation
+       would cause an offset index mismatch, which may hurt performance
+       if the loop is unrolled, so return here to avoid the reordering.
+
+       _1 = (sizetype) len_8;
+       _2 = ip_10(D) + _1;
+       _3 = MEM[(unsigned char *)ip_10(D) + ivtmp.15_15 * 1];
+       _5 = ref_12(D) + _1;
+       _6 = *_5;
+       ivtmp.15_16 = ivtmp.15_15 + 1;
+       if (_3 == _6).   */
+
+      gimple *lhs_mem = SSA_NAME_DEF_STMT (*cond_lhs);
+      gimple *rhs_mem = SSA_NAME_DEF_STMT (*cond_rhs);
+      gimple *use_stmt;
+      imm_use_iterator use_iter;
+      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, cand->var_before)
+	if ((use_stmt == lhs_mem || use_stmt == rhs_mem)
+	    && use_stmt != use->stmt)
+	  return;
+    }
+
   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
     return;
 
-- 
2.27.0.90.geebb51ba8c



-- 
Thanks,
Xionghu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-28  8:25         ` Richard Biener
  2021-06-29  9:19           ` Xionghu Luo
@ 2021-07-07  5:51           ` Bin.Cheng
  1 sibling, 0 replies; 14+ messages in thread
From: Bin.Cheng @ 2021-07-07  5:51 UTC (permalink / raw)
  To: Richard Biener
  Cc: Xionghu Luo, GCC Patches, Segher Boessenkool, Bill Schmidt,
	linkw, David Edelsohn, H. J. Lu

On Mon, Jun 28, 2021 at 4:26 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, Jun 28, 2021 at 10:07 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >
> >
> >
> > On 2021/6/25 18:02, Richard Biener wrote:
> > > On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> > >>
> > >>
> > >>
> > >> On 2021/6/25 16:54, Richard Biener wrote:
> > >>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
> > >>> <gcc-patches@gcc.gnu.org> wrote:
> > >>>>
> > >>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
> > >>>>
> > >>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> > >>>> on Power.  For example, it generates mismatched address offset after
> > >>>> adjust iv update statement position:
> > >>>>
> > >>>> <bb 32> [local count: 70988443]:
> > >>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> > >>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> > >>>> _34 = ref_180 + 18446744073709551615;
> > >>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> > >>>> if (_84 == _86)
> > >>>>     goto <bb 56>; [94.50%]
> > >>>>     else
> > >>>>     goto <bb 87>; [5.50%]
> > >>>>
> > >>>> Disable it will produce:
> > >>>>
> > >>>> <bb 32> [local count: 70988443]:
> > >>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> > >>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> > >>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> > >>>> if (_84 == _86)
> > >>>>     goto <bb 56>; [94.50%]
> > >>>>     else
> > >>>>     goto <bb 87>; [5.50%]
> > >>>>
> > >>>> Then later pass loop unroll could benefit from same address offset
> > >>>> with different base address and reduces register dependency.
> > >>>> This patch could improve performance by 10% for typical case on Power,
> > >>>> no performance change observed for X86 or Aarch64 due to small loops
> > >>>> not unrolled on these platforms.  Any comments?
> > >>>
> > >>> The case you quote is special in that if we hoisted the IV update before
> > >>> the other MEM _also_ used in the condition it would be fine again.
> > >>
> > >> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
> > >> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
> > >> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
> > >> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
> > >> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
> > >>  From the ASM, we can see the index register %r4 is used in two iterations which
> > >> maybe a bottle neck for hiding instruction latency?
> > >>
> > >> Then it seems reasonable the performance would be better if keep the IV update
> > >> statement at *LAST* (Fix 1).
> > >>
> > >> (Fix 2):
> > >>    <bb 32> [local count: 70988443]:
> > >>    ivtmp.30_415 = ivtmp.30_414 + 1;
> > >>    _34 = ip_229 + 18446744073709551615;
> > >>    _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> > >>    _33 = ref_180 + 18446744073709551615;
> > >>    _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
> > >>    if (_84 == _86)
> > >>      goto <bb 56>; [94.50%]
> > >>    else
> > >>      goto <bb 87>; [5.50%]
> > >>
> > >>
> > >> .L67:
> > >>          lbzx %r12,%r24,%r4
> > >>          lbzx %r25,%r7,%r4
> > >>          cmpw %cr0,%r12,%r25
> > >>          bne %cr0,.L11
> > >>          mr %r26,%r4
> > >>          addi %r4,%r4,1
> > >>          lbzx %r12,%r24,%r4
> > >>          lbzx %r25,%r7,%r4
> > >>          mr %r6,%r26
> > >>          cmpw %cr0,%r12,%r25
> > >>          bne %cr0,.L11
> > >>          mr %r26,%r4
> > >> .L12:
> > >>          cmpdi %cr0,%r10,1
> > >>          addi %r4,%r26,1
> > >>          mr %r6,%r26
> > >>          addi %r10,%r10,-1
> > >>          bne %cr0,.L67
> > >>
> > >>>
> > >>> Now, adjust_iv_update_pos doesn't seem to check that the
> > >>> condition actually uses the IV use stmt def, so it likely applies to
> > >>> too many cases.
> > >>>
> > >>> Unfortunately the introducing rev didn't come with a testcase,
> > >>> but still I think fixing up adjust_iv_update_pos is better than
> > >>> introducing a way to short-cut it per target decision.
> > >>>
> > >>> One "fix" might be to add a check that either the condition
> > >>> lhs or rhs is the def of the IV use and the other operand
> > >>> is invariant.  Or if it's of similar structure hoist across the
> > >>> other iv-use as well.  Not that I understand the argument
> > >>> about the overlapping life-range.
> > >>>
> > >>> You also don't provide a complete testcase ...
> > >>>
> > >>
> > >> Attached the test code, will also add it it patch in future version.
> > >> The issue comes from a very small hot loop:
> > >>
> > >>          do {
> > >>            len++;
> > >>          } while(len < maxlen && ip[len] == ref[len]);
> > >
> > > unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> > > {
> > >    unsigned int len = 2;
> > >    do {
> > >        len++;
> > >    }while(len < maxlen && ip[len] == ref[len]);
> > >    return len;
> > > }
> > >
> > > I can see the effect on this loop on x86_64 as well, we end up with
> > >
> > > .L6:
> > >          movzbl  (%rdi,%rax), %ecx
> > >          addq    $1, %rax
> > >          cmpb    -1(%rsi,%rax), %cl
> > >          jne     .L1
> > > .L3:
> > >          movl    %eax, %r8d
> > >          cmpl    %edx, %eax
> > >          jb      .L6
> > >
> > > but without the trick it is
> > >
> > > .L6:
> > >          movzbl  (%rdi,%rax), %r8d
> > >          movzbl  (%rsi,%rax), %ecx
> > >          addq    $1, %rax
> > >          cmpb    %cl, %r8b
> > >          jne     .L1
> > > .L3:
> > >          movl    %eax, %r9d
> > >          cmpl    %edx, %eax
> > >          jb      .L6
> >
> > Verified this small piece of code on X86, there is no performance
> > change with or without adjust_iv_update_pos (I've checked the ASM
> > exactly same with yours):
> >
> > luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
> > luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> >
> > real    0m7.003s
> > user    0m6.996s
> > sys     0m0.000s
> > luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/g
> > cc/ -O2 test.c
> > luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> >
> > real    0m7.070s
> > user    0m7.068s
> > sys     0m0.000s
> >
> >
> > But for AArch64, current GCC code also generates similar code with
> > or without adjust_iv_update_pos, the runtime is 10.705s for them.
> >
> > L6:
> >         ldrb    w4, [x6, x3]
> >         add     x3, x3, 1
> >         ldrb    w5, [x1, x3]
> >         cmp     w5, w4
> >         bne     .L1
> > .L3:
> >         mov     w0, w3
> >         cmp     w2, w3
> >         bhi     .L6
> >
> > No adjust_iv_update_pos:
> >
> > .L6:
> >         ldrb    w5, [x6, x3]
> >         ldrb    w4, [x1, x3]
> >         add     x3, x3, 1
> >         cmp     w5, w4
> >         bne     .L1
> > .L3:
> >         mov     w0, w3
> >         cmp     w2, w3
> >         bhi     .L6
> >
> >
> > While built with old GCC(gcc version 7.4.1 20190424), it generates
> > worse code and runtime is 11.664s:
> >
> > .L6:
> >         ldrb    w4, [x6, x3]
> >         add     x3, x3, 1
> >         add     x5, x1, x3
> >         ldrb    w5, [x5, -1]
> >         cmp     w5, w4
> >         bne     .L1
> > .L3:
> >         cmp     w3, w2
> >         mov     w0, w3
> >         bcc     .L6
> >
> >
> > In general, only Power shows negative performance with adjust_iv_update_pos,
> > that's why I tried to add target hook for it, is this reasonable?  Or we should
> > just remove adjust_iv_update_pos since it doesn't help performance for X86 or
> > other targets?
>
> It was Bin who added the functionality - maybe he can remember the cases
> he saw improvements for.
Hmm, not sure if you meant that David added this code?
In general, I am for removing special handling in IVOPTs; however,
this might break micro-optimization cases such as the x86 one.  Could
you please investigate it further before making any decision?  (For
example, whether the issue could be fixed by other means?)

Thanks,
bin
>
> I think we should instead try to change the code that it doesn't apply to
> CONDs where both operands are defined by stmts using the IV?
> And this time add a testcase ;)
>
> Richard.
>
> >
> >
> > test.c
> > #include <stdlib.h>
> > __attribute__((noinline)) unsigned int
> > foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> > {
> >         unsigned int len = 2;
> >         do {
> >                 len++;
> >         }while(len < maxlen && ip[len] == ref[len]);
> >         return len;
> > }
> >
> > int main (int argc, char* argv[])
> > {
> >   unsigned char string_a [] = "abcdefghijklmnopqrstuvwxyzmnpppppppppppaaaaaaabbbbbbeeee";
> >   unsigned char string_b [] = "abcdefghijklmnopqrstuvwxyzmnpppppppppppaaaaaaabbbbbbeene";
> >   unsigned long ret = 0;
> >
> >   for (long i = 0; i < atoi(argv[1]) * 100000000; i++)
> >           ret += foo (string_a, string_b, sizeof(string_a));
> >   return ret;
> > }
> >
> >
> > >
> > > so here you can see the missed fusion.  Of course
> > > in this case the IV update could have been sunk into
> > > the .L3 block and replicated on the exit edge as well.
> > >
> > > I'm not sure if the motivation for the change introducing this
> > > trick was the above kind of combination or not, but I guess
> > > so.  The dependence distance of the IV increment to the
> > > use is now shorter, so I'm not sure the combined variant is
> > > better.
> > >
> > > Richard.
> > >
> > >
> > >>
> > >> --
> > >> Thanks,
> > >> Xionghu
> >
> > --
> > Thanks,
> > Xionghu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] New hook adjust_iv_update_pos
  2021-06-29  9:19           ` Xionghu Luo
@ 2021-07-07 13:20             ` Richard Biener
  2021-07-12  8:13               ` Xionghu Luo
  0 siblings, 1 reply; 14+ messages in thread
From: Richard Biener @ 2021-07-07 13:20 UTC (permalink / raw)
  To: Xionghu Luo
  Cc: Amker.Cheng, GCC Patches, Segher Boessenkool, Bill Schmidt,
	linkw, David Edelsohn, H. J. Lu

On Tue, Jun 29, 2021 at 11:19 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>
>
>
> On 2021/6/28 16:25, Richard Biener wrote:
> > On Mon, Jun 28, 2021 at 10:07 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >>
> >>
> >>
> >> On 2021/6/25 18:02, Richard Biener wrote:
> >>> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 2021/6/25 16:54, Richard Biener wrote:
> >>>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
> >>>>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>>>
> >>>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
> >>>>>>
> >>>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> >>>>>> on Power.  For example, it generates mismatched address offset after
> >>>>>> adjust iv update statement position:
> >>>>>>
> >>>>>> <bb 32> [local count: 70988443]:
> >>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>>>> _34 = ref_180 + 18446744073709551615;
> >>>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> >>>>>> if (_84 == _86)
> >>>>>>      goto <bb 56>; [94.50%]
> >>>>>>      else
> >>>>>>      goto <bb 87>; [5.50%]
> >>>>>>
> >>>>>> Disable it will produce:
> >>>>>>
> >>>>>> <bb 32> [local count: 70988443]:
> >>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >>>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> >>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>>>> if (_84 == _86)
> >>>>>>      goto <bb 56>; [94.50%]
> >>>>>>      else
> >>>>>>      goto <bb 87>; [5.50%]
> >>>>>>
> >>>>>> Then later pass loop unroll could benefit from same address offset
> >>>>>> with different base address and reduces register dependency.
> >>>>>> This patch could improve performance by 10% for typical case on Power,
> >>>>>> no performance change observed for X86 or Aarch64 due to small loops
> >>>>>> not unrolled on these platforms.  Any comments?
> >>>>>
> >>>>> The case you quote is special in that if we hoisted the IV update before
> >>>>> the other MEM _also_ used in the condition it would be fine again.
> >>>>
> >>>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
> >>>> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
> >>>> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
> >>>> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
> >>>> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
> >>>>   From the ASM, we can see the index register %r4 is used in two iterations which
> >>>> maybe a bottle neck for hiding instruction latency?
> >>>>
> >>>> Then it seems reasonable the performance would be better if keep the IV update
> >>>> statement at *LAST* (Fix 1).
> >>>>
> >>>> (Fix 2):
> >>>>     <bb 32> [local count: 70988443]:
> >>>>     ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>>     _34 = ip_229 + 18446744073709551615;
> >>>>     _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> >>>>     _33 = ref_180 + 18446744073709551615;
> >>>>     _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
> >>>>     if (_84 == _86)
> >>>>       goto <bb 56>; [94.50%]
> >>>>     else
> >>>>       goto <bb 87>; [5.50%]
> >>>>
> >>>>
> >>>> .L67:
> >>>>           lbzx %r12,%r24,%r4
> >>>>           lbzx %r25,%r7,%r4
> >>>>           cmpw %cr0,%r12,%r25
> >>>>           bne %cr0,.L11
> >>>>           mr %r26,%r4
> >>>>           addi %r4,%r4,1
> >>>>           lbzx %r12,%r24,%r4
> >>>>           lbzx %r25,%r7,%r4
> >>>>           mr %r6,%r26
> >>>>           cmpw %cr0,%r12,%r25
> >>>>           bne %cr0,.L11
> >>>>           mr %r26,%r4
> >>>> .L12:
> >>>>           cmpdi %cr0,%r10,1
> >>>>           addi %r4,%r26,1
> >>>>           mr %r6,%r26
> >>>>           addi %r10,%r10,-1
> >>>>           bne %cr0,.L67
> >>>>
> >>>>>
> >>>>> Now, adjust_iv_update_pos doesn't seem to check that the
> >>>>> condition actually uses the IV use stmt def, so it likely applies to
> >>>>> too many cases.
> >>>>>
> >>>>> Unfortunately the introducing rev didn't come with a testcase,
> >>>>> but still I think fixing up adjust_iv_update_pos is better than
> >>>>> introducing a way to short-cut it per target decision.
> >>>>>
> >>>>> One "fix" might be to add a check that either the condition
> >>>>> lhs or rhs is the def of the IV use and the other operand
> >>>>> is invariant.  Or if it's of similar structure hoist across the
> >>>>> other iv-use as well.  Not that I understand the argument
> >>>>> about the overlapping life-range.
> >>>>>
> >>>>> You also don't provide a complete testcase ...
> >>>>>
> >>>>
> >>>> Attached the test code, will also add it it patch in future version.
> >>>> The issue comes from a very small hot loop:
> >>>>
> >>>>           do {
> >>>>             len++;
> >>>>           } while(len < maxlen && ip[len] == ref[len]);
> >>>
> >>> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> >>> {
> >>>     unsigned int len = 2;
> >>>     do {
> >>>         len++;
> >>>     }while(len < maxlen && ip[len] == ref[len]);
> >>>     return len;
> >>> }
> >>>
> >>> I can see the effect on this loop on x86_64 as well, we end up with
> >>>
> >>> .L6:
> >>>           movzbl  (%rdi,%rax), %ecx
> >>>           addq    $1, %rax
> >>>           cmpb    -1(%rsi,%rax), %cl
> >>>           jne     .L1
> >>> .L3:
> >>>           movl    %eax, %r8d
> >>>           cmpl    %edx, %eax
> >>>           jb      .L6
> >>>
> >>> but without the trick it is
> >>>
> >>> .L6:
> >>>           movzbl  (%rdi,%rax), %r8d
> >>>           movzbl  (%rsi,%rax), %ecx
> >>>           addq    $1, %rax
> >>>           cmpb    %cl, %r8b
> >>>           jne     .L1
> >>> .L3:
> >>>           movl    %eax, %r9d
> >>>           cmpl    %edx, %eax
> >>>           jb      .L6
> >>
> >> Verified this small piece of code on X86, there is no performance
> >> change with or without adjust_iv_update_pos (I've checked the ASM
> >> exactly same with yours):
> >>
> >> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
> >> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> >>
> >> real    0m7.003s
> >> user    0m6.996s
> >> sys     0m0.000s
> >> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/g
> >> cc/ -O2 test.c
> >> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> >>
> >> real    0m7.070s
> >> user    0m7.068s
> >> sys     0m0.000s
> >>
> >>
> >> But for AArch64, current GCC code also generates similar code with
> >> or without adjust_iv_update_pos, the runtime is 10.705s for them.
> >>
> >> L6:
> >>          ldrb    w4, [x6, x3]
> >>          add     x3, x3, 1
> >>          ldrb    w5, [x1, x3]
> >>          cmp     w5, w4
> >>          bne     .L1
> >> .L3:
> >>          mov     w0, w3
> >>          cmp     w2, w3
> >>          bhi     .L6
> >>
> >> No adjust_iv_update_pos:
> >>
> >> .L6:
> >>          ldrb    w5, [x6, x3]
> >>          ldrb    w4, [x1, x3]
> >>          add     x3, x3, 1
> >>          cmp     w5, w4
> >>          bne     .L1
> >> .L3:
> >>          mov     w0, w3
> >>          cmp     w2, w3
> >>          bhi     .L6
> >>
> >>
> >> While built with old GCC(gcc version 7.4.1 20190424), it generates
> >> worse code and runtime is 11.664s:
> >>
> >> .L6:
> >>          ldrb    w4, [x6, x3]
> >>          add     x3, x3, 1
> >>          add     x5, x1, x3
> >>          ldrb    w5, [x5, -1]
> >>          cmp     w5, w4
> >>          bne     .L1
> >> .L3:
> >>          cmp     w3, w2
> >>          mov     w0, w3
> >>          bcc     .L6
> >>
> >>
> >> In general, only Power shows negative performance with adjust_iv_update_pos,
> >> that's why I tried to add target hook for it, is this reasonable?  Or we should
> >> just remove adjust_iv_update_pos since it doesn't help performance for X86 or
> >> other targets?
> >
> > It was Bin who added the functionality - maybe he can remember the cases
> > he saw improvements for.
> >
> > I think we should instead try to change the code that it doesn't apply to
> > CONDs where both operands are defined by stmts using the IV?
> > And this time add a testcase ;)
>
> Thanks. I am not sure whether should check both cond_lhs and cond_rhs or just
> cond_lhs is enough if COND expr's LHS is always processed first?
>
> For example, adjust_iv_update_pos will be called twice for this case,
> after the first call, the gimple will be updated to:
>
> <bb 4> [local count: 1014686026]:
> _1 = (sizetype) len_8;
> _2 = ip_10(D) + _1;
> _3 = MEM[(unsigned char *)ip_10(D) + ivtmp.15_15 * 1];
> _5 = ref_12(D) + _1;
> _6 = *_5;
> ivtmp.15_16 = ivtmp.15_15 + 1;
> if (_3 == _6)
>   goto <bb 6>; [94.50%]
> else
>   goto <bb 10>; [5.50%]
>
>
> At this moment, use->stmt is "_6 = *_5;", it is not using IV directly?

So looking around, I think you want to pass down the IV group from
rewrite_groups to rewrite_use_address and then see whether the
definition of the RHS is one of the IV USE stmts that are going to be
expressed using the same candidate.  In fact, I guess (to avoid
linearly searching the group) we might want to restrict considering
adjust_iv_update_pos to IV candidates with a single [address] use?
That would be more restrictive of course (OTOH the list shouldn't be
too big - hopefully).  If the other side of the conditional is a
memory access using a different IV, the replacement should still be OK.
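
For illustration, a rough sketch of that single-use restriction could look
like the following; the extra GROUP parameter and the surrounding plumbing
are assumptions made for the example, not actual trunk code:

/* Sketch only: pass the group down from rewrite_groups and skip the
   reordering unless this candidate rewrites a single address use, so no
   other use of the same candidate can end up between the moved IV update
   and the GIMPLE_COND with a mismatched offset.  */

static void
rewrite_use_address (struct ivopts_data *data, struct iv_group *group,
		     struct iv_use *use, struct iv_cand *cand)
{
  /* Only consider moving the IV update for a single-use group.  */
  if (group->vuses.length () == 1)
    adjust_iv_update_pos (cand, use);

  /* ... the existing rewriting of USE based on CAND stays unchanged ...  */
}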

Richard.

>
>
> [PATCH] ivopts: Don't adjust IV update statement if both operands use the IV in COND [PR101250]
>
> gcc/ChangeLog:
>
>         PR middle-end/101250
>         * tree-ssa-loop-ivopts.c (adjust_iv_update_pos): Check the COND
>         operands whether both use IV.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/pr101250.c | 15 ++++++++++++
>  gcc/tree-ssa-loop-ivopts.c               | 31 +++++++++++++++++++++++-
>  2 files changed, 45 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr101250.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr101250.c b/gcc/testsuite/gcc.dg/tree-ssa/pr101250.c
> new file mode 100644
> index 00000000000..70d3701de49
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr101250.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
> +
> +unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> +{
> +  unsigned int len = 2;
> +  do
> +    {
> +      len++;
> +    }
> +  while (len < maxlen && ip[len] == ref[len]);
> +  return len;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "Reordering" "ivopts" } } */
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index 12a8a49a307..082097bff38 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -7352,7 +7352,7 @@ static void
>  adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use)
>  {
>    tree var_after;
> -  gimple *iv_update, *stmt;
> +  gimple *iv_update, *stmt, *cond_stmt;
>    basic_block bb;
>    gimple_stmt_iterator gsi, gsi_iv;
>
> @@ -7370,6 +7370,7 @@ adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use)
>    if (gimple_code (stmt) != GIMPLE_COND)
>      return;
>
> +  cond_stmt = stmt;
>    gsi_prev_nondebug (&gsi);
>    stmt = gsi_stmt (gsi);
>    if (stmt != iv_update)
> @@ -7386,6 +7387,34 @@ adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use)
>    if (stmt != use->stmt)
>      return;
>
> +  gcond *gcond_stmt = as_a <gcond *> (cond_stmt);
> +  tree *cond_lhs = gimple_cond_lhs_ptr (gcond_stmt);
> +  tree *cond_rhs = gimple_cond_rhs_ptr (gcond_stmt);
> +  if (TREE_CODE (*cond_lhs) == SSA_NAME && TREE_CODE (*cond_rhs) == SSA_NAME)
> +    {
> +      /* If both sides are memory operations use iv var_before, adjust the
> +       iv update stmt before second meory operations will cause offset index
> +       mismatch, which may hurt performance if unroll, so return here to avoid
> +       reorder.
> +
> +       _1 = (sizetype) len_8;
> +       _2 = ip_10(D) + _1;
> +       _3 = MEM[(unsigned char *)ip_10(D) + ivtmp.15_15 * 1];
> +       _5 = ref_12(D) + _1;
> +       _6 = *_5;
> +       ivtmp.15_16 = ivtmp.15_15 + 1;
> +       if (_3 == _6).   */
> +
> +      gimple *lhs_mem = SSA_NAME_DEF_STMT (*cond_lhs);
> +      gimple *rhs_mem = SSA_NAME_DEF_STMT (*cond_rhs);
> +      gimple *use_stmt;
> +      imm_use_iterator use_iter;
> +      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, cand->var_before)
> +       if ((use_stmt == lhs_mem || use_stmt == rhs_mem)
> +           && use_stmt != use->stmt)
> +         return;
> +    }
> +
>    if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
>      return;
>
> --
> 2.27.0.90.geebb51ba8c
>
>
>
> --
> Thanks,
> Xionghu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] New hook adjust_iv_update_pos
  2021-07-07 13:20             ` Richard Biener
@ 2021-07-12  8:13               ` Xionghu Luo
  2021-07-12  9:05                 ` Hongtao Liu
  0 siblings, 1 reply; 14+ messages in thread
From: Xionghu Luo @ 2021-07-12  8:13 UTC (permalink / raw)
  To: Richard Biener
  Cc: Amker.Cheng, GCC Patches, Segher Boessenkool, Bill Schmidt,
	hongtao.liu, Uros Bizjak, H. J. Lu



On 2021/7/7 21:20, Richard Biener wrote:
> On Tue, Jun 29, 2021 at 11:19 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>
>>
>>
>> On 2021/6/28 16:25, Richard Biener wrote:
>>> On Mon, Jun 28, 2021 at 10:07 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2021/6/25 18:02, Richard Biener wrote:
>>>>> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2021/6/25 16:54, Richard Biener wrote:
>>>>>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
>>>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>>>>
>>>>>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
>>>>>>>>
>>>>>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
>>>>>>>> on Power.  For example, it generates mismatched address offset after
>>>>>>>> adjust iv update statement position:
>>>>>>>>
>>>>>>>> <bb 32> [local count: 70988443]:
>>>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>>>>>> _34 = ref_180 + 18446744073709551615;
>>>>>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>>>>>>> if (_84 == _86)
>>>>>>>>       goto <bb 56>; [94.50%]
>>>>>>>>       else
>>>>>>>>       goto <bb 87>; [5.50%]
>>>>>>>>
>>>>>>>> Disable it will produce:
>>>>>>>>
>>>>>>>> <bb 32> [local count: 70988443]:
>>>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
>>>>>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
>>>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
>>>>>>>> if (_84 == _86)
>>>>>>>>       goto <bb 56>; [94.50%]
>>>>>>>>       else
>>>>>>>>       goto <bb 87>; [5.50%]
>>>>>>>>
>>>>>>>> Then later pass loop unroll could benefit from same address offset
>>>>>>>> with different base address and reduces register dependency.
>>>>>>>> This patch could improve performance by 10% for typical case on Power,
>>>>>>>> no performance change observed for X86 or Aarch64 due to small loops
>>>>>>>> not unrolled on these platforms.  Any comments?
>>>>>>>
>>>>>>> The case you quote is special in that if we hoisted the IV update before
>>>>>>> the other MEM _also_ used in the condition it would be fine again.
>>>>>>
>>>>>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
>>>>>> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
>>>>>> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
>>>>>> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
>>>>>> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
>>>>>>    From the ASM, we can see the index register %r4 is used in two iterations which
>>>>>> maybe a bottle neck for hiding instruction latency?
>>>>>>
>>>>>> Then it seems reasonable the performance would be better if keep the IV update
>>>>>> statement at *LAST* (Fix 1).
>>>>>>
>>>>>> (Fix 2):
>>>>>>      <bb 32> [local count: 70988443]:
>>>>>>      ivtmp.30_415 = ivtmp.30_414 + 1;
>>>>>>      _34 = ip_229 + 18446744073709551615;
>>>>>>      _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
>>>>>>      _33 = ref_180 + 18446744073709551615;
>>>>>>      _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
>>>>>>      if (_84 == _86)
>>>>>>        goto <bb 56>; [94.50%]
>>>>>>      else
>>>>>>        goto <bb 87>; [5.50%]
>>>>>>
>>>>>>
>>>>>> .L67:
>>>>>>            lbzx %r12,%r24,%r4
>>>>>>            lbzx %r25,%r7,%r4
>>>>>>            cmpw %cr0,%r12,%r25
>>>>>>            bne %cr0,.L11
>>>>>>            mr %r26,%r4
>>>>>>            addi %r4,%r4,1
>>>>>>            lbzx %r12,%r24,%r4
>>>>>>            lbzx %r25,%r7,%r4
>>>>>>            mr %r6,%r26
>>>>>>            cmpw %cr0,%r12,%r25
>>>>>>            bne %cr0,.L11
>>>>>>            mr %r26,%r4
>>>>>> .L12:
>>>>>>            cmpdi %cr0,%r10,1
>>>>>>            addi %r4,%r26,1
>>>>>>            mr %r6,%r26
>>>>>>            addi %r10,%r10,-1
>>>>>>            bne %cr0,.L67
>>>>>>
>>>>>>>
>>>>>>> Now, adjust_iv_update_pos doesn't seem to check that the
>>>>>>> condition actually uses the IV use stmt def, so it likely applies to
>>>>>>> too many cases.
>>>>>>>
>>>>>>> Unfortunately the introducing rev didn't come with a testcase,
>>>>>>> but still I think fixing up adjust_iv_update_pos is better than
>>>>>>> introducing a way to short-cut it per target decision.
>>>>>>>
>>>>>>> One "fix" might be to add a check that either the condition
>>>>>>> lhs or rhs is the def of the IV use and the other operand
>>>>>>> is invariant.  Or if it's of similar structure hoist across the
>>>>>>> other iv-use as well.  Not that I understand the argument
>>>>>>> about the overlapping life-range.
>>>>>>>
>>>>>>> You also don't provide a complete testcase ...
>>>>>>>
>>>>>>
>>>>>> Attached the test code, will also add it it patch in future version.
>>>>>> The issue comes from a very small hot loop:
>>>>>>
>>>>>>            do {
>>>>>>              len++;
>>>>>>            } while(len < maxlen && ip[len] == ref[len]);
>>>>>
>>>>> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
>>>>> {
>>>>>      unsigned int len = 2;
>>>>>      do {
>>>>>          len++;
>>>>>      }while(len < maxlen && ip[len] == ref[len]);
>>>>>      return len;
>>>>> }
>>>>>
>>>>> I can see the effect on this loop on x86_64 as well, we end up with
>>>>>
>>>>> .L6:
>>>>>            movzbl  (%rdi,%rax), %ecx
>>>>>            addq    $1, %rax
>>>>>            cmpb    -1(%rsi,%rax), %cl
>>>>>            jne     .L1
>>>>> .L3:
>>>>>            movl    %eax, %r8d
>>>>>            cmpl    %edx, %eax
>>>>>            jb      .L6
>>>>>
>>>>> but without the trick it is
>>>>>
>>>>> .L6:
>>>>>            movzbl  (%rdi,%rax), %r8d
>>>>>            movzbl  (%rsi,%rax), %ecx
>>>>>            addq    $1, %rax
>>>>>            cmpb    %cl, %r8b
>>>>>            jne     .L1
>>>>> .L3:
>>>>>            movl    %eax, %r9d
>>>>>            cmpl    %edx, %eax
>>>>>            jb      .L6
>>>>
>>>> Verified this small piece of code on X86, there is no performance
>>>> change with or without adjust_iv_update_pos (I've checked the ASM
>>>> exactly same with yours):
>>>>
>>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
>>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
>>>>
>>>> real    0m7.003s
>>>> user    0m6.996s
>>>> sys     0m0.000s
>>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/g
>>>> cc/ -O2 test.c
>>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
>>>>
>>>> real    0m7.070s
>>>> user    0m7.068s
>>>> sys     0m0.000s
>>>>
>>>>
>>>> But for AArch64, current GCC code also generates similar code with
>>>> or without adjust_iv_update_pos, the runtime is 10.705s for them.
>>>>
>>>> L6:
>>>>           ldrb    w4, [x6, x3]
>>>>           add     x3, x3, 1
>>>>           ldrb    w5, [x1, x3]
>>>>           cmp     w5, w4
>>>>           bne     .L1
>>>> .L3:
>>>>           mov     w0, w3
>>>>           cmp     w2, w3
>>>>           bhi     .L6
>>>>
>>>> No adjust_iv_update_pos:
>>>>
>>>> .L6:
>>>>           ldrb    w5, [x6, x3]
>>>>           ldrb    w4, [x1, x3]
>>>>           add     x3, x3, 1
>>>>           cmp     w5, w4
>>>>           bne     .L1
>>>> .L3:
>>>>           mov     w0, w3
>>>>           cmp     w2, w3
>>>>           bhi     .L6
>>>>
>>>>
>>>> While built with old GCC(gcc version 7.4.1 20190424), it generates
>>>> worse code and runtime is 11.664s:
>>>>
>>>> .L6:
>>>>           ldrb    w4, [x6, x3]
>>>>           add     x3, x3, 1
>>>>           add     x5, x1, x3
>>>>           ldrb    w5, [x5, -1]
>>>>           cmp     w5, w4
>>>>           bne     .L1
>>>> .L3:
>>>>           cmp     w3, w2
>>>>           mov     w0, w3
>>>>           bcc     .L6
>>>>
>>>>
>>>> In general, only Power shows negative performance with adjust_iv_update_pos,
>>>> that's why I tried to add target hook for it, is this reasonable?  Or we should
>>>> just remove adjust_iv_update_pos since it doesn't help performance for X86 or
>>>> other targets?
>>>
>>> It was Bin who added the functionality - maybe he can remember the cases
>>> he saw improvements for.
>>>
>>> I think we should instead try to change the code that it doesn't apply to
>>> CONDs where both operands are defined by stmts using the IV?
>>> And this time add a testcase ;)
>>
>> Thanks. I am not sure whether should check both cond_lhs and cond_rhs or just
>> cond_lhs is enough if COND expr's LHS is always processed first?
>>
>> For example, adjust_iv_update_pos will be called twice for this case,
>> after the first call, the gimple will be updated to:
>>
>> <bb 4> [local count: 1014686026]:
>> _1 = (sizetype) len_8;
>> _2 = ip_10(D) + _1;
>> _3 = MEM[(unsigned char *)ip_10(D) + ivtmp.15_15 * 1];
>> _5 = ref_12(D) + _1;
>> _6 = *_5;
>> ivtmp.15_16 = ivtmp.15_15 + 1;
>> if (_3 == _6)
>>    goto <bb 6>; [94.50%]
>> else
>>    goto <bb 10>; [5.50%]
>>
>>
>> At this moment, use->stmt is "_6 = *_5;", it is not using IV directly?
> 
> So looking around I think you want to pass down the IV group from
> rewrite_groups to rewrite_use_address and then see whether the
> definition of the RHS is one of the IV USE stmts that are going to be
> expressed using the same candidate.  In fact, I guess (to avoid
> linear searching the group), we likely might want to restrict considering
> adjust_iv_update_pos for IV candidates with a single [address] use?
> That would be more restrictive of course (OTOH the list shouldn't be
> too big - hopefully).  In case the other side of the conditional is
> a memory using a different IV the replacement should still be OK.
> 
> Richard.

Thanks, however those methods would still break the X86 optimization
with worse ASM (one more movzbl), though no performance change was
observed.  CCed the X86 maintainers to see which one they prefer, and
whether a target hook is a better solution?


GCC trunk:

.L6:
          movzbl  (%rdi,%rax), %ecx
          addq    $1, %rax
          cmpb    -1(%rsi,%rax), %cl
          jne     .L1
.L3:
          movl    %eax, %r8d
          cmpl    %edx, %eax
          jb      .L6

With adjust_iv_update_pos disabled:

.L6:
          movzbl  (%rdi,%rax), %r8d
          movzbl  (%rsi,%rax), %ecx
          addq    $1, %rax
          cmpb    %cl, %r8b
          jne     .L1
.L3:
          movl    %eax, %r9d
          cmpl    %edx, %eax
          jb      .L6



-- 
Thanks,
Xionghu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] New hook adjust_iv_update_pos
  2021-07-12  8:13               ` Xionghu Luo
@ 2021-07-12  9:05                 ` Hongtao Liu
  2021-07-12  9:49                   ` Richard Biener
  0 siblings, 1 reply; 14+ messages in thread
From: Hongtao Liu @ 2021-07-12  9:05 UTC (permalink / raw)
  To: Xionghu Luo, Uros Bizjak
  Cc: Richard Biener, Segher Boessenkool, Bill Schmidt, GCC Patches,
	Liu, Hongtao

On Mon, Jul 12, 2021 at 4:14 PM Xionghu Luo via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> On 2021/7/7 21:20, Richard Biener wrote:
> > On Tue, Jun 29, 2021 at 11:19 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >>
> >>
> >>
> >> On 2021/6/28 16:25, Richard Biener wrote:
> >>> On Mon, Jun 28, 2021 at 10:07 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 2021/6/25 18:02, Richard Biener wrote:
> >>>>> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 2021/6/25 16:54, Richard Biener wrote:
> >>>>>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
> >>>>>>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>>>>>
> >>>>>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
> >>>>>>>>
> >>>>>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> >>>>>>>> on Power.  For example, it generates mismatched address offset after
> >>>>>>>> adjust iv update statement position:
> >>>>>>>>
> >>>>>>>> <bb 32> [local count: 70988443]:
> >>>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >>>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>>>>>> _34 = ref_180 + 18446744073709551615;
> >>>>>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> >>>>>>>> if (_84 == _86)
> >>>>>>>>       goto <bb 56>; [94.50%]
> >>>>>>>>       else
> >>>>>>>>       goto <bb 87>; [5.50%]
> >>>>>>>>
> >>>>>>>> Disable it will produce:
> >>>>>>>>
> >>>>>>>> <bb 32> [local count: 70988443]:
> >>>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> >>>>>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> >>>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>>>>>> if (_84 == _86)
> >>>>>>>>       goto <bb 56>; [94.50%]
> >>>>>>>>       else
> >>>>>>>>       goto <bb 87>; [5.50%]
> >>>>>>>>
> >>>>>>>> Then later pass loop unroll could benefit from same address offset
> >>>>>>>> with different base address and reduces register dependency.
> >>>>>>>> This patch could improve performance by 10% for typical case on Power,
> >>>>>>>> no performance change observed for X86 or Aarch64 due to small loops
> >>>>>>>> not unrolled on these platforms.  Any comments?
> >>>>>>>
> >>>>>>> The case you quote is special in that if we hoisted the IV update before
> >>>>>>> the other MEM _also_ used in the condition it would be fine again.
> >>>>>>
> >>>>>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2), it
> >>>>>> shows even worse performance due to not unroll(two more "base-1" is generated in gimple,
> >>>>>> then loop->ninsns is 11 so small loops is not unrolled), change the threshold from
> >>>>>> 10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
> >>>>>> performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
> >>>>>>    From the ASM, we can see the index register %r4 is used in two iterations which
> >>>>>> maybe a bottle neck for hiding instruction latency?
> >>>>>>
> >>>>>> Then it seems reasonable the performance would be better if keep the IV update
> >>>>>> statement at *LAST* (Fix 1).
> >>>>>>
> >>>>>> (Fix 2):
> >>>>>>      <bb 32> [local count: 70988443]:
> >>>>>>      ivtmp.30_415 = ivtmp.30_414 + 1;
> >>>>>>      _34 = ip_229 + 18446744073709551615;
> >>>>>>      _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> >>>>>>      _33 = ref_180 + 18446744073709551615;
> >>>>>>      _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
> >>>>>>      if (_84 == _86)
> >>>>>>        goto <bb 56>; [94.50%]
> >>>>>>      else
> >>>>>>        goto <bb 87>; [5.50%]
> >>>>>>
> >>>>>>
> >>>>>> .L67:
> >>>>>>            lbzx %r12,%r24,%r4
> >>>>>>            lbzx %r25,%r7,%r4
> >>>>>>            cmpw %cr0,%r12,%r25
> >>>>>>            bne %cr0,.L11
> >>>>>>            mr %r26,%r4
> >>>>>>            addi %r4,%r4,1
> >>>>>>            lbzx %r12,%r24,%r4
> >>>>>>            lbzx %r25,%r7,%r4
> >>>>>>            mr %r6,%r26
> >>>>>>            cmpw %cr0,%r12,%r25
> >>>>>>            bne %cr0,.L11
> >>>>>>            mr %r26,%r4
> >>>>>> .L12:
> >>>>>>            cmpdi %cr0,%r10,1
> >>>>>>            addi %r4,%r26,1
> >>>>>>            mr %r6,%r26
> >>>>>>            addi %r10,%r10,-1
> >>>>>>            bne %cr0,.L67
> >>>>>>
> >>>>>>>
> >>>>>>> Now, adjust_iv_update_pos doesn't seem to check that the
> >>>>>>> condition actually uses the IV use stmt def, so it likely applies to
> >>>>>>> too many cases.
> >>>>>>>
> >>>>>>> Unfortunately the introducing rev didn't come with a testcase,
> >>>>>>> but still I think fixing up adjust_iv_update_pos is better than
> >>>>>>> introducing a way to short-cut it per target decision.
> >>>>>>>
> >>>>>>> One "fix" might be to add a check that either the condition
> >>>>>>> lhs or rhs is the def of the IV use and the other operand
> >>>>>>> is invariant.  Or if it's of similar structure hoist across the
> >>>>>>> other iv-use as well.  Not that I understand the argument
> >>>>>>> about the overlapping life-range.
> >>>>>>>
> >>>>>>> You also don't provide a complete testcase ...
> >>>>>>>
> >>>>>>
> >>>>>> Attached the test code, will also add it it patch in future version.
> >>>>>> The issue comes from a very small hot loop:
> >>>>>>
> >>>>>>            do {
> >>>>>>              len++;
> >>>>>>            } while(len < maxlen && ip[len] == ref[len]);
> >>>>>
> >>>>> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> >>>>> {
> >>>>>      unsigned int len = 2;
> >>>>>      do {
> >>>>>          len++;
> >>>>>      }while(len < maxlen && ip[len] == ref[len]);
> >>>>>      return len;
> >>>>> }
> >>>>>
> >>>>> I can see the effect on this loop on x86_64 as well, we end up with
> >>>>>
> >>>>> .L6:
> >>>>>            movzbl  (%rdi,%rax), %ecx
> >>>>>            addq    $1, %rax
> >>>>>            cmpb    -1(%rsi,%rax), %cl
> >>>>>            jne     .L1
> >>>>> .L3:
> >>>>>            movl    %eax, %r8d
> >>>>>            cmpl    %edx, %eax
> >>>>>            jb      .L6
> >>>>>
> >>>>> but without the trick it is
> >>>>>
> >>>>> .L6:
> >>>>>            movzbl  (%rdi,%rax), %r8d
> >>>>>            movzbl  (%rsi,%rax), %ecx
> >>>>>            addq    $1, %rax
> >>>>>            cmpb    %cl, %r8b
> >>>>>            jne     .L1
> >>>>> .L3:
> >>>>>            movl    %eax, %r9d
> >>>>>            cmpl    %edx, %eax
> >>>>>            jb      .L6
> >>>>
> >>>> Verified this small piece of code on X86, there is no performance
> >>>> change with or without adjust_iv_update_pos (I've checked the ASM
> >>>> exactly same with yours):
> >>>>
> >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
> >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> >>>>
> >>>> real    0m7.003s
> >>>> user    0m6.996s
> >>>> sys     0m0.000s
> >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/g
> >>>> cc/ -O2 test.c
> >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> >>>>
> >>>> real    0m7.070s
> >>>> user    0m7.068s
> >>>> sys     0m0.000s
> >>>>
> >>>>
> >>>> But for AArch64, current GCC generates similar code with or
> >>>> without adjust_iv_update_pos; the runtime is 10.705s for both.
> >>>>
> >>>> .L6:
> >>>>           ldrb    w4, [x6, x3]
> >>>>           add     x3, x3, 1
> >>>>           ldrb    w5, [x1, x3]
> >>>>           cmp     w5, w4
> >>>>           bne     .L1
> >>>> .L3:
> >>>>           mov     w0, w3
> >>>>           cmp     w2, w3
> >>>>           bhi     .L6
> >>>>
> >>>> No adjust_iv_update_pos:
> >>>>
> >>>> .L6:
> >>>>           ldrb    w5, [x6, x3]
> >>>>           ldrb    w4, [x1, x3]
> >>>>           add     x3, x3, 1
> >>>>           cmp     w5, w4
> >>>>           bne     .L1
> >>>> .L3:
> >>>>           mov     w0, w3
> >>>>           cmp     w2, w3
> >>>>           bhi     .L6
> >>>>
> >>>>
> >>>> When built with an old GCC (version 7.4.1 20190424), it generates
> >>>> worse code and the runtime is 11.664s:
> >>>>
> >>>> .L6:
> >>>>           ldrb    w4, [x6, x3]
> >>>>           add     x3, x3, 1
> >>>>           add     x5, x1, x3
> >>>>           ldrb    w5, [x5, -1]
> >>>>           cmp     w5, w4
> >>>>           bne     .L1
> >>>> .L3:
> >>>>           cmp     w3, w2
> >>>>           mov     w0, w3
> >>>>           bcc     .L6
> >>>>
> >>>>
> >>>> In general, only Power shows a performance regression with
> >>>> adjust_iv_update_pos; that's why I tried to add a target hook for it.
> >>>> Is this reasonable?  Or should we just remove adjust_iv_update_pos,
> >>>> since it doesn't help performance on X86 or other targets?
> >>>
> >>> It was Bin who added the functionality - maybe he can remember the cases
> >>> he saw improvements for.
> >>>
> >>> I think we should instead try to change the code so that it doesn't
> >>> apply to CONDs where both operands are defined by stmts using the IV?
> >>> And this time add a testcase ;)
> >>
> >> Thanks. I am not sure whether we should check both cond_lhs and
> >> cond_rhs, or whether checking just cond_lhs is enough if the COND
> >> expr's LHS is always processed first?
> >>
> >> For example, adjust_iv_update_pos will be called twice for this case;
> >> after the first call, the gimple will be updated to:
> >>
> >> <bb 4> [local count: 1014686026]:
> >> _1 = (sizetype) len_8;
> >> _2 = ip_10(D) + _1;
> >> _3 = MEM[(unsigned char *)ip_10(D) + ivtmp.15_15 * 1];
> >> _5 = ref_12(D) + _1;
> >> _6 = *_5;
> >> ivtmp.15_16 = ivtmp.15_15 + 1;
> >> if (_3 == _6)
> >>    goto <bb 6>; [94.50%]
> >> else
> >>    goto <bb 10>; [5.50%]
> >>
> >>
> >> At this moment, use->stmt is "_6 = *_5;", so it is not using the IV
> >> directly?
> >
> > So looking around I think you want to pass down the IV group from
> > rewrite_groups to rewrite_use_address and then see whether the
> > definition of the RHS is one of the IV USE stmts that are going to be
> > expressed using the same candidate.  In fact, I guess (to avoid
> > linear searching the group), we likely might want to restrict considering
> > adjust_iv_update_pos for IV candidates with a single [address] use?
> > That would be more restrictive of course (OTOH the list shouldn't be
> > too big - hopefully).  In case the other side of the conditional is
> > a memory using a different IV the replacement should still be OK.
> >
> > Richard.
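A sketch of the group-based check suggested above, again only an
illustration and not the actual change: it assumes iv_group keeps its uses
in a vuses vector and that each iv_use records its stmt (treat the field
names as illustrative), and the function name is made up.

static bool
cond_other_op_defined_in_group_p (struct iv_group *group, tree other_op)
{
  if (TREE_CODE (other_op) != SSA_NAME)
    return false;

  /* If the other side of the condition is produced by another use of
     the same group (rewritten with the same candidate), moving the IV
     update would place it between the two loads, as in the Power
     example.  */
  gimple *def = SSA_NAME_DEF_STMT (other_op);
  for (unsigned i = 0; i < group->vuses.length (); i++)
    if (group->vuses[i]->stmt == def)
      return true;

  return false;
}

Restricting adjust_iv_update_pos to candidates whose group has a single
address use, as suggested, would make the loop above unnecessary.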
>
> Thanks; however, those methods would still break the X86 optimization
> with worse ASM (one more movzbl), though no performance change was
> observed.  CCed the X86 maintainers to see which one they prefer, and
> whether a target hook is a better solution?
>
>
> GCC trunk:
>
> .L6:
>           movzbl  (%rdi,%rax), %ecx
>           addq    $1, %rax
>           cmpb    -1(%rsi,%rax), %cl
>           jne     .L1
> .L3:
>           movl    %eax, %r8d
>           cmpl    %edx, %eax
>           jb      .L6
>
> Disable adjust_iv_update_pos:
>
> .L6:
>           movzbl  (%rdi,%rax), %r8d
>           movzbl  (%rsi,%rax), %ecx
>           addq    $1, %rax
>           cmpb    %cl, %r8b
>           jne     .L1
> .L3:
>           movl    %eax, %r9d
>           cmpl    %edx, %eax
>           jb      .L6
>
Latency should be the same:
  movzbl mem, reg    5
  cmp    reg8, reg8  1
  total              6
vs.
  cmp    mem8, reg8  6

Guess they're the same at the uop level as well.

CC uros, any opinion?

>
>
> --
> Thanks,
> Xionghu



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] New hook adjust_iv_update_pos
  2021-07-12  9:05                 ` Hongtao Liu
@ 2021-07-12  9:49                   ` Richard Biener
  0 siblings, 0 replies; 14+ messages in thread
From: Richard Biener @ 2021-07-12  9:49 UTC (permalink / raw)
  To: Hongtao Liu
  Cc: Xionghu Luo, Uros Bizjak, Segher Boessenkool, Bill Schmidt,
	GCC Patches, Liu, Hongtao

On Mon, Jul 12, 2021 at 11:00 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 4:14 PM Xionghu Luo via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> >
> >
> > On 2021/7/7 21:20, Richard Biener wrote:
> > > On Tue, Jun 29, 2021 at 11:19 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> > >>
> > >>
> > >>
> > >> On 2021/6/28 16:25, Richard Biener wrote:
> > >>> On Mon, Jun 28, 2021 at 10:07 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> > >>>>
> > >>>>
> > >>>>
> > >>>> On 2021/6/25 18:02, Richard Biener wrote:
> > >>>>> On Fri, Jun 25, 2021 at 11:41 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> > >>>>>>
> > >>>>>>
> > >>>>>>
> > >>>>>> On 2021/6/25 16:54, Richard Biener wrote:
> > >>>>>>> On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
> > >>>>>>> <gcc-patches@gcc.gnu.org> wrote:
> > >>>>>>>>
> > >>>>>>>> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
> > >>>>>>>>
> > >>>>>>>> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> > >>>>>>>> on Power.  For example, it generates mismatched address offset after
> > >>>>>>>> adjust iv update statement position:
> > >>>>>>>>
> > >>>>>>>> <bb 32> [local count: 70988443]:
> > >>>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> > >>>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> > >>>>>>>> _34 = ref_180 + 18446744073709551615;
> > >>>>>>>> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> > >>>>>>>> if (_84 == _86)
> > >>>>>>>>       goto <bb 56>; [94.50%]
> > >>>>>>>>       else
> > >>>>>>>>       goto <bb 87>; [5.50%]
> > >>>>>>>>
> > >>>>>>>> Disable it will produce:
> > >>>>>>>>
> > >>>>>>>> <bb 32> [local count: 70988443]:
> > >>>>>>>> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> > >>>>>>>> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> > >>>>>>>> ivtmp.30_415 = ivtmp.30_414 + 1;
> > >>>>>>>> if (_84 == _86)
> > >>>>>>>>       goto <bb 56>; [94.50%]
> > >>>>>>>>       else
> > >>>>>>>>       goto <bb 87>; [5.50%]
> > >>>>>>>>
> > >>>>>>>> Then later pass loop unroll could benefit from same address offset
> > >>>>>>>> with different base address and reduces register dependency.
> > >>>>>>>> This patch could improve performance by 10% for typical case on Power,
> > >>>>>>>> no performance change observed for X86 or Aarch64 due to small loops
> > >>>>>>>> not unrolled on these platforms.  Any comments?
> > >>>>>>>
> > >>>>>>> The case you quote is special in that if we hoisted the IV update before
> > >>>>>>> the other MEM _also_ used in the condition it would be fine again.
> > >>>>>>
> > >>>>>> Thanks.  I tried to hoist the IV update statement before the first MEM (Fix 2); it
> > >>>>>> shows even worse performance because the loop is then not unrolled (two more "base-1"
> > >>>>>> statements are generated in gimple, so loop->ninsns is 11 and small loops are not
> > >>>>>> unrolled).  Changing the threshold from 10 to 12 in rs6000_loop_unroll_adjust would
> > >>>>>> make it also unroll 2 times, and the performance is then the SAME as with the IV
> > >>>>>> update statement in the *MIDDLE* (trunk).  From the ASM, we can see the index
> > >>>>>> register %r4 is used in two iterations, which may be a bottleneck for hiding
> > >>>>>> instruction latency?
> > >>>>>>
> > >>>>>> Then it seems reasonable that the performance is better if the IV update
> > >>>>>> statement is kept *LAST* (Fix 1).
> > >>>>>>
> > >>>>>> (Fix 2):
> > >>>>>>      <bb 32> [local count: 70988443]:
> > >>>>>>      ivtmp.30_415 = ivtmp.30_414 + 1;
> > >>>>>>      _34 = ip_229 + 18446744073709551615;
> > >>>>>>      _84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> > >>>>>>      _33 = ref_180 + 18446744073709551615;
> > >>>>>>      _86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
> > >>>>>>      if (_84 == _86)
> > >>>>>>        goto <bb 56>; [94.50%]
> > >>>>>>      else
> > >>>>>>        goto <bb 87>; [5.50%]
> > >>>>>>
> > >>>>>>
> > >>>>>> .L67:
> > >>>>>>            lbzx %r12,%r24,%r4
> > >>>>>>            lbzx %r25,%r7,%r4
> > >>>>>>            cmpw %cr0,%r12,%r25
> > >>>>>>            bne %cr0,.L11
> > >>>>>>            mr %r26,%r4
> > >>>>>>            addi %r4,%r4,1
> > >>>>>>            lbzx %r12,%r24,%r4
> > >>>>>>            lbzx %r25,%r7,%r4
> > >>>>>>            mr %r6,%r26
> > >>>>>>            cmpw %cr0,%r12,%r25
> > >>>>>>            bne %cr0,.L11
> > >>>>>>            mr %r26,%r4
> > >>>>>> .L12:
> > >>>>>>            cmpdi %cr0,%r10,1
> > >>>>>>            addi %r4,%r26,1
> > >>>>>>            mr %r6,%r26
> > >>>>>>            addi %r10,%r10,-1
> > >>>>>>            bne %cr0,.L67
> > >>>>>>
> > >>>>>>>
> > >>>>>>> Now, adjust_iv_update_pos doesn't seem to check that the
> > >>>>>>> condition actually uses the IV use stmt def, so it likely applies to
> > >>>>>>> too many cases.
> > >>>>>>>
> > >>>>>>> Unfortunately the introducing rev didn't come with a testcase,
> > >>>>>>> but still I think fixing up adjust_iv_update_pos is better than
> > >>>>>>> introducing a way to short-cut it per target decision.
> > >>>>>>>
> > >>>>>>> One "fix" might be to add a check that either the condition
> > >>>>>>> lhs or rhs is the def of the IV use and the other operand
> > >>>>>>> is invariant.  Or if it's of similar structure hoist across the
> > >>>>>>> other iv-use as well.  Not that I understand the argument
> > >>>>>>> about the overlapping life-range.
> > >>>>>>>
> > >>>>>>> You also don't provide a complete testcase ...
> > >>>>>>>
> > >>>>>>
> > >>>>>> Attached the test code; I will also add it to the patch in a
> > >>>>>> future version.
> > >>>>>> The issue comes from a very small hot loop:
> > >>>>>>
> > >>>>>>            do {
> > >>>>>>              len++;
> > >>>>>>            } while(len < maxlen && ip[len] == ref[len]);
> > >>>>>
> > >>>>> unsigned int foo (unsigned char *ip, unsigned char *ref, unsigned int maxlen)
> > >>>>> {
> > >>>>>      unsigned int len = 2;
> > >>>>>      do {
> > >>>>>          len++;
> > >>>>>      }while(len < maxlen && ip[len] == ref[len]);
> > >>>>>      return len;
> > >>>>> }
> > >>>>>
> > >>>>> I can see the effect on this loop on x86_64 as well, we end up with
> > >>>>>
> > >>>>> .L6:
> > >>>>>            movzbl  (%rdi,%rax), %ecx
> > >>>>>            addq    $1, %rax
> > >>>>>            cmpb    -1(%rsi,%rax), %cl
> > >>>>>            jne     .L1
> > >>>>> .L3:
> > >>>>>            movl    %eax, %r8d
> > >>>>>            cmpl    %edx, %eax
> > >>>>>            jb      .L6
> > >>>>>
> > >>>>> but without the trick it is
> > >>>>>
> > >>>>> .L6:
> > >>>>>            movzbl  (%rdi,%rax), %r8d
> > >>>>>            movzbl  (%rsi,%rax), %ecx
> > >>>>>            addq    $1, %rax
> > >>>>>            cmpb    %cl, %r8b
> > >>>>>            jne     .L1
> > >>>>> .L3:
> > >>>>>            movl    %eax, %r9d
> > >>>>>            cmpl    %edx, %eax
> > >>>>>            jb      .L6
> > >>>>
> > >>>> Verified this small piece of code on X86: there is no performance
> > >>>> change with or without adjust_iv_update_pos (I've checked, the ASM
> > >>>> is exactly the same as yours):
> > >>>>
> > >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ gcc -O2 test.c
> > >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> > >>>>
> > >>>> real    0m7.003s
> > >>>> user    0m6.996s
> > >>>> sys     0m0.000s
> > >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ /home/luoxhu/workspace/build/gcc/xgcc -B/home/luoxhu/workspace/build/g
> > >>>> cc/ -O2 test.c
> > >>>> luoxhu@gcc14:~/workspace/lzf_compress_testcase$ time ./a.out  1
> > >>>>
> > >>>> real    0m7.070s
> > >>>> user    0m7.068s
> > >>>> sys     0m0.000s
> > >>>>
> > >>>>
> > >>>> But for AArch64, current GCC generates similar code with or
> > >>>> without adjust_iv_update_pos; the runtime is 10.705s for both.
> > >>>>
> > >>>> .L6:
> > >>>>           ldrb    w4, [x6, x3]
> > >>>>           add     x3, x3, 1
> > >>>>           ldrb    w5, [x1, x3]
> > >>>>           cmp     w5, w4
> > >>>>           bne     .L1
> > >>>> .L3:
> > >>>>           mov     w0, w3
> > >>>>           cmp     w2, w3
> > >>>>           bhi     .L6
> > >>>>
> > >>>> No adjust_iv_update_pos:
> > >>>>
> > >>>> .L6:
> > >>>>           ldrb    w5, [x6, x3]
> > >>>>           ldrb    w4, [x1, x3]
> > >>>>           add     x3, x3, 1
> > >>>>           cmp     w5, w4
> > >>>>           bne     .L1
> > >>>> .L3:
> > >>>>           mov     w0, w3
> > >>>>           cmp     w2, w3
> > >>>>           bhi     .L6
> > >>>>
> > >>>>
> > >>>> When built with an old GCC (version 7.4.1 20190424), it generates
> > >>>> worse code and the runtime is 11.664s:
> > >>>>
> > >>>> .L6:
> > >>>>           ldrb    w4, [x6, x3]
> > >>>>           add     x3, x3, 1
> > >>>>           add     x5, x1, x3
> > >>>>           ldrb    w5, [x5, -1]
> > >>>>           cmp     w5, w4
> > >>>>           bne     .L1
> > >>>> .L3:
> > >>>>           cmp     w3, w2
> > >>>>           mov     w0, w3
> > >>>>           bcc     .L6
> > >>>>
> > >>>>
> > >>>> In general, only Power shows a performance regression with
> > >>>> adjust_iv_update_pos; that's why I tried to add a target hook for it.
> > >>>> Is this reasonable?  Or should we just remove adjust_iv_update_pos,
> > >>>> since it doesn't help performance on X86 or other targets?
> > >>>
> > >>> It was Bin who added the functionality - maybe he can remember the cases
> > >>> he saw improvements for.
> > >>>
> > >>> I think we should instead try to change the code so that it doesn't
> > >>> apply to CONDs where both operands are defined by stmts using the IV?
> > >>> And this time add a testcase ;)
> > >>
> > >> Thanks. I am not sure whether we should check both cond_lhs and
> > >> cond_rhs, or whether checking just cond_lhs is enough if the COND
> > >> expr's LHS is always processed first?
> > >>
> > >> For example, adjust_iv_update_pos will be called twice for this case;
> > >> after the first call, the gimple will be updated to:
> > >>
> > >> <bb 4> [local count: 1014686026]:
> > >> _1 = (sizetype) len_8;
> > >> _2 = ip_10(D) + _1;
> > >> _3 = MEM[(unsigned char *)ip_10(D) + ivtmp.15_15 * 1];
> > >> _5 = ref_12(D) + _1;
> > >> _6 = *_5;
> > >> ivtmp.15_16 = ivtmp.15_15 + 1;
> > >> if (_3 == _6)
> > >>    goto <bb 6>; [94.50%]
> > >> else
> > >>    goto <bb 10>; [5.50%]
> > >>
> > >>
> > >> At this moment, use->stmt is "_6 = *_5;", so it is not using the IV
> > >> directly?
> > >
> > > So looking around I think you want to pass down the IV group from
> > > rewrite_groups to rewrite_use_address and then see whether the
> > > definition of the RHS is one of the IV USE stmts that are going to be
> > > expressed using the same candidate.  In fact, I guess (to avoid
> > > linear searching the group), we likely might want to restrict considering
> > > adjust_iv_update_pos for IV candidates with a single [address] use?
> > > That would be more restrictive of course (OTOH the list shouldn't be
> > > too big - hopefully).  In case the other side of the conditional is
> > > a memory using a different IV the replacement should still be OK.
> > >
> > > Richard.
> >
> > Thanks; however, those methods would still break the X86 optimization
> > with worse ASM (one more movzbl), though no performance change was
> > observed.  CCed the X86 maintainers to see which one they prefer, and
> > whether a target hook is a better solution?
> >
> >
> > GCC trunk:
> >
> > .L6:
> >           movzbl  (%rdi,%rax), %ecx
> >           addq    $1, %rax
> >           cmpb    -1(%rsi,%rax), %cl
> >           jne     .L1
> > .L3:
> >           movl    %eax, %r8d
> >           cmpl    %edx, %eax
> >           jb      .L6
> >
> > Disable adjust_iv_update_pos:
> >
> > .L6:
> >           movzbl  (%rdi,%rax), %r8d
> >           movzbl  (%rsi,%rax), %ecx
> >           addq    $1, %rax
> >           cmpb    %cl, %r8b
> >           jne     .L1
> > .L3:
> >           movl    %eax, %r9d
> >           cmpl    %edx, %eax
> >           jb      .L6
> >
> Latency should be the same:
>   movzbl mem, reg    5
>   cmp    reg8, reg8  1
>   total              6
> vs.
>   cmp    mem8, reg8  6
>
> Guess they're the same at the uop level as well.

OTOH the dependence on the add on trunk might tip it to the worse
side; when adjust_iv_update_pos is disabled the add can execute
independently (OTOH AGU + add might compete for the same resources).
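To make the dependence concrete, here are the two x86 inner loops from
above again, with annotations added for illustration (the comments are
not part of the compiler output):

GCC trunk (adjust_iv_update_pos applied):

.L6:
        movzbl  (%rdi,%rax), %ecx       # load ip[len] with the old index
        addq    $1, %rax                # IV update between the two uses
        cmpb    -1(%rsi,%rax), %cl      # ref[len] re-addressed off the new
        jne     .L1                     # index, so this load waits for addq

adjust_iv_update_pos disabled:

.L6:
        movzbl  (%rdi,%rax), %r8d       # both loads use the same,
        movzbl  (%rsi,%rax), %ecx       # pre-increment index
        addq    $1, %rax                # IV update off the loads' path
        cmpb    %cl, %r8b
        jne     .L1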

Richard.

> CC uros, any opinion?
>
> >
> >
> > --
> > Thanks,
> > Xionghu
>
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-07-12  9:49 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-25  8:31 [PATCH] New hook adjust_iv_update_pos Xionghu Luo
2021-06-25  8:54 ` Richard Biener
2021-06-25  9:41   ` Xionghu Luo
2021-06-25 10:02     ` Richard Biener
2021-06-25 12:58       ` Xionghu Luo
2021-06-25 13:12       ` Xionghu Luo
2021-06-28  8:07       ` Xionghu Luo
2021-06-28  8:25         ` Richard Biener
2021-06-29  9:19           ` Xionghu Luo
2021-07-07 13:20             ` Richard Biener
2021-07-12  8:13               ` Xionghu Luo
2021-07-12  9:05                 ` Hongtao Liu
2021-07-12  9:49                   ` Richard Biener
2021-07-07  5:51           ` Bin.Cheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).