* [PATCH v3 1/5] AArch64: Improve A64FX memset
@ 2021-07-22 15:59 Wilco Dijkstra
2021-07-28 8:10 ` naohirot
0 siblings, 1 reply; 9+ messages in thread
From: Wilco Dijkstra @ 2021-07-22 15:59 UTC (permalink / raw)
To: naohirot; +Cc: 'GNU C Library'
Improve performance of small copies by reducing instruction counts and improving
alignment. Bench-memset shows 35-45% performance gain for small sizes.
---
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -30,7 +30,6 @@
#define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
#define CACHE_LINE_SIZE 256
#define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
-#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
#define rest x8
#define vector_length x9
#define vl_remainder x10 // vector_length remainder
@@ -51,78 +50,54 @@
.endm
.macro st1b_unroll first=0, last=7
- st1b z0.b, p0, [dst, #\first, mul vl]
+ st1b z0.b, p0, [dst, \first, mul vl]
.if \last-\first
st1b_unroll "(\first+1)", \last
.endif
.endm
- .macro shortcut_for_small_size exit
- // if rest <= vector_length * 2
- whilelo p0.b, xzr, count
- whilelo p1.b, vector_length, count
- b.last 1f
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- ret
-1: // if rest > vector_length * 8
- cmp count, vector_length, lsl 3 // vector_length * 8
- b.hi \exit
- // if rest <= vector_length * 4
- lsl tmp1, vector_length, 1 // vector_length * 2
- whilelo p2.b, tmp1, count
- incb tmp1
- whilelo p3.b, tmp1, count
- b.last 1f
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- st1b z0.b, p2, [dstin, #2, mul vl]
- st1b z0.b, p3, [dstin, #3, mul vl]
- ret
-1: // if rest <= vector_length * 8
- lsl tmp1, vector_length, 2 // vector_length * 4
- whilelo p4.b, tmp1, count
- incb tmp1
- whilelo p5.b, tmp1, count
- b.last 1f
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- st1b z0.b, p2, [dstin, #2, mul vl]
- st1b z0.b, p3, [dstin, #3, mul vl]
- st1b z0.b, p4, [dstin, #4, mul vl]
- st1b z0.b, p5, [dstin, #5, mul vl]
- ret
-1: lsl tmp1, vector_length, 2 // vector_length * 4
- incb tmp1 // vector_length * 5
- incb tmp1 // vector_length * 6
- whilelo p6.b, tmp1, count
- incb tmp1
- whilelo p7.b, tmp1, count
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- st1b z0.b, p2, [dstin, #2, mul vl]
- st1b z0.b, p3, [dstin, #3, mul vl]
- st1b z0.b, p4, [dstin, #4, mul vl]
- st1b z0.b, p5, [dstin, #5, mul vl]
- st1b z0.b, p6, [dstin, #6, mul vl]
- st1b z0.b, p7, [dstin, #7, mul vl]
- ret
- .endm
-ENTRY (MEMSET)
+#undef BTI_C
+#define BTI_C
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
- cbnz count, 1f
- ret
-1: dup z0.b, valw
cntb vector_length
- // shortcut for less than vector_length * 8
- // gives a free ptrue to p0.b for n >= vector_length
- shortcut_for_small_size L(vl_agnostic)
- // end of shortcut
+ dup z0.b, valw
+ whilelo p0.b, vector_length, count
+ b.last 1f
+ whilelo p1.b, xzr, count
+ st1b z0.b, p1, [dstin, 0, mul vl]
+ st1b z0.b, p0, [dstin, 1, mul vl]
+ ret
+
+ // count >= vector_length * 2
+1: cmp count, vector_length, lsl 2
+ add dstend, dstin, count
+ b.hi 1f
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z0.b, p0, [dstin, 1, mul vl]
+ st1b z0.b, p0, [dstend, -2, mul vl]
+ st1b z0.b, p0, [dstend, -1, mul vl]
+ ret
+
+ // count > vector_length * 4
+1: lsl tmp1, vector_length, 3
+ cmp count, tmp1
+ b.hi L(vl_agnostic)
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z0.b, p0, [dstin, 1, mul vl]
+ st1b z0.b, p0, [dstin, 2, mul vl]
+ st1b z0.b, p0, [dstin, 3, mul vl]
+ st1b z0.b, p0, [dstend, -4, mul vl]
+ st1b z0.b, p0, [dstend, -3, mul vl]
+ st1b z0.b, p0, [dstend, -2, mul vl]
+ st1b z0.b, p0, [dstend, -1, mul vl]
+ ret
+ .p2align 4
L(vl_agnostic): // VL Agnostic
mov rest, count
mov dst, dstin
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-07-22 15:59 [PATCH v3 1/5] AArch64: Improve A64FX memset Wilco Dijkstra
@ 2021-07-28 8:10 ` naohirot
2021-08-02 13:53 ` naohirot
0 siblings, 1 reply; 9+ messages in thread
From: naohirot @ 2021-07-28 8:10 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: 'GNU C Library'
Hi Wilco,
Thanks for the patch.
I confirmed that the performance is improved than the master as show
in the graphs [1].
There are two comments, please find them.
Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
Tested-by: Naohiro Tamura <naohirot@fujitsu.com>
[1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing
> [PATCH v3 1/5] AArch64: Improve A64FX memset
>
Would you update the commit title so as not to be the same among 5
patches?
Because we need to ask distro to backport these patches.
If all commit titles are the same, it will increase the room to happen
confusion and mistake.
How about "AArch64: Improve A64FX memset for less than 512B" ?
> Improve performance of small copies by reducing instruction counts and improving
> alignment. Bench-memset shows 35-45% performance gain for small sizes.
>
> ---
>
> diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
> --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> @@ -30,7 +30,6 @@
> #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
> #define CACHE_LINE_SIZE 256
> #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
> -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
This caused compile error.
> #define rest x8
> #define vector_length x9
> #define vl_remainder x10 // vector_length remainder
> @@ -51,78 +50,54 @@
> .endm
>
> .macro st1b_unroll first=0, last=7
> - st1b z0.b, p0, [dst, #\first, mul vl]
> + st1b z0.b, p0, [dst, \first, mul vl]
> .if \last-\first
> st1b_unroll "(\first+1)", \last
> .endif
> .endm
>
> - .macro shortcut_for_small_size exit
> - // if rest <= vector_length * 2
> - whilelo p0.b, xzr, count
> - whilelo p1.b, vector_length, count
> - b.last 1f
> - st1b z0.b, p0, [dstin, #0, mul vl]
> - st1b z0.b, p1, [dstin, #1, mul vl]
> - ret
> -1: // if rest > vector_length * 8
> - cmp count, vector_length, lsl 3 // vector_length * 8
> - b.hi \exit
> - // if rest <= vector_length * 4
> - lsl tmp1, vector_length, 1 // vector_length * 2
> - whilelo p2.b, tmp1, count
> - incb tmp1
> - whilelo p3.b, tmp1, count
> - b.last 1f
> - st1b z0.b, p0, [dstin, #0, mul vl]
> - st1b z0.b, p1, [dstin, #1, mul vl]
> - st1b z0.b, p2, [dstin, #2, mul vl]
> - st1b z0.b, p3, [dstin, #3, mul vl]
> - ret
> -1: // if rest <= vector_length * 8
> - lsl tmp1, vector_length, 2 // vector_length * 4
> - whilelo p4.b, tmp1, count
> - incb tmp1
> - whilelo p5.b, tmp1, count
> - b.last 1f
> - st1b z0.b, p0, [dstin, #0, mul vl]
> - st1b z0.b, p1, [dstin, #1, mul vl]
> - st1b z0.b, p2, [dstin, #2, mul vl]
> - st1b z0.b, p3, [dstin, #3, mul vl]
> - st1b z0.b, p4, [dstin, #4, mul vl]
> - st1b z0.b, p5, [dstin, #5, mul vl]
> - ret
> -1: lsl tmp1, vector_length, 2 // vector_length * 4
> - incb tmp1 // vector_length * 5
> - incb tmp1 // vector_length * 6
> - whilelo p6.b, tmp1, count
> - incb tmp1
> - whilelo p7.b, tmp1, count
> - st1b z0.b, p0, [dstin, #0, mul vl]
> - st1b z0.b, p1, [dstin, #1, mul vl]
> - st1b z0.b, p2, [dstin, #2, mul vl]
> - st1b z0.b, p3, [dstin, #3, mul vl]
> - st1b z0.b, p4, [dstin, #4, mul vl]
> - st1b z0.b, p5, [dstin, #5, mul vl]
> - st1b z0.b, p6, [dstin, #6, mul vl]
> - st1b z0.b, p7, [dstin, #7, mul vl]
> - ret
> - .endm
>
> -ENTRY (MEMSET)
> +#undef BTI_C
> +#define BTI_C
>
> +ENTRY (MEMSET)
> PTR_ARG (0)
> SIZE_ARG (2)
>
> - cbnz count, 1f
> - ret
> -1: dup z0.b, valw
> cntb vector_length
> - // shortcut for less than vector_length * 8
> - // gives a free ptrue to p0.b for n >= vector_length
> - shortcut_for_small_size L(vl_agnostic)
> - // end of shortcut
> + dup z0.b, valw
> + whilelo p0.b, vector_length, count
> + b.last 1f
> + whilelo p1.b, xzr, count
> + st1b z0.b, p1, [dstin, 0, mul vl]
> + st1b z0.b, p0, [dstin, 1, mul vl]
> + ret
> +
> + // count >= vector_length * 2
> +1: cmp count, vector_length, lsl 2
> + add dstend, dstin, count
> + b.hi 1f
> + st1b z0.b, p0, [dstin, 0, mul vl]
> + st1b z0.b, p0, [dstin, 1, mul vl]
> + st1b z0.b, p0, [dstend, -2, mul vl]
> + st1b z0.b, p0, [dstend, -1, mul vl]
> + ret
> +
> + // count > vector_length * 4
> +1: lsl tmp1, vector_length, 3
> + cmp count, tmp1
> + b.hi L(vl_agnostic)
> + st1b z0.b, p0, [dstin, 0, mul vl]
> + st1b z0.b, p0, [dstin, 1, mul vl]
> + st1b z0.b, p0, [dstin, 2, mul vl]
> + st1b z0.b, p0, [dstin, 3, mul vl]
> + st1b z0.b, p0, [dstend, -4, mul vl]
> + st1b z0.b, p0, [dstend, -3, mul vl]
> + st1b z0.b, p0, [dstend, -2, mul vl]
> + st1b z0.b, p0, [dstend, -1, mul vl]
> + ret
>
> + .p2align 4
> L(vl_agnostic): // VL Agnostic
> mov rest, count
> mov dst, dstin
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-07-28 8:10 ` naohirot
@ 2021-08-02 13:53 ` naohirot
2021-08-02 14:38 ` Wilco Dijkstra
0 siblings, 1 reply; 9+ messages in thread
From: naohirot @ 2021-08-02 13:53 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: 'GNU C Library'
Hi Wilco,
I have one question below.
> -----Original Message-----
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, July 28, 2021 5:11 PM
> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Cc: 'GNU C Library' <libc-alpha@sourceware.org>
> Subject: Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
>
> Hi Wilco,
>
> Thanks for the patch.
>
> I confirmed that the performance is improved than the master as show
> in the graphs [1].
> There are two comments, please find them.
>
> Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
> Tested-by: Naohiro Tamura <naohirot@fujitsu.com>
>
> [1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing
>
> > [PATCH v3 1/5] AArch64: Improve A64FX memset
> >
>
> Would you update the commit title so as not to be the same among 5
> patches?
> Because we need to ask distro to backport these patches.
> If all commit titles are the same, it will increase the room to happen
> confusion and mistake.
>
> How about "AArch64: Improve A64FX memset for less than 512B" ?
>
> > Improve performance of small copies by reducing instruction counts and improving
> > alignment. Bench-memset shows 35-45% performance gain for small sizes.
> >
> > ---
> >
> > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
> > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > @@ -30,7 +30,6 @@
> > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
> > #define CACHE_LINE_SIZE 256
> > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
> > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
>
> This caused compile error.
>
> > #define rest x8
> > #define vector_length x9
> > #define vl_remainder x10 // vector_length remainder
> > @@ -51,78 +50,54 @@
> > .endm
> >
> > .macro st1b_unroll first=0, last=7
> > - st1b z0.b, p0, [dst, #\first, mul vl]
> > + st1b z0.b, p0, [dst, \first, mul vl]
> > .if \last-\first
> > st1b_unroll "(\first+1)", \last
> > .endif
> > .endm
> >
> > - .macro shortcut_for_small_size exit
> > - // if rest <= vector_length * 2
> > - whilelo p0.b, xzr, count
> > - whilelo p1.b, vector_length, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - ret
> > -1: // if rest > vector_length * 8
> > - cmp count, vector_length, lsl 3 // vector_length * 8
> > - b.hi \exit
> > - // if rest <= vector_length * 4
> > - lsl tmp1, vector_length, 1 // vector_length * 2
> > - whilelo p2.b, tmp1, count
> > - incb tmp1
> > - whilelo p3.b, tmp1, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - ret
> > -1: // if rest <= vector_length * 8
> > - lsl tmp1, vector_length, 2 // vector_length * 4
> > - whilelo p4.b, tmp1, count
> > - incb tmp1
> > - whilelo p5.b, tmp1, count
> > - b.last 1f
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - st1b z0.b, p4, [dstin, #4, mul vl]
> > - st1b z0.b, p5, [dstin, #5, mul vl]
> > - ret
> > -1: lsl tmp1, vector_length, 2 // vector_length * 4
> > - incb tmp1 // vector_length * 5
> > - incb tmp1 // vector_length * 6
> > - whilelo p6.b, tmp1, count
> > - incb tmp1
> > - whilelo p7.b, tmp1, count
> > - st1b z0.b, p0, [dstin, #0, mul vl]
> > - st1b z0.b, p1, [dstin, #1, mul vl]
> > - st1b z0.b, p2, [dstin, #2, mul vl]
> > - st1b z0.b, p3, [dstin, #3, mul vl]
> > - st1b z0.b, p4, [dstin, #4, mul vl]
> > - st1b z0.b, p5, [dstin, #5, mul vl]
> > - st1b z0.b, p6, [dstin, #6, mul vl]
> > - st1b z0.b, p7, [dstin, #7, mul vl]
> > - ret
> > - .endm
> >
> > -ENTRY (MEMSET)
> > +#undef BTI_C
> > +#define BTI_C
We discussed how should be defined BTI_C macro before, at that time conclusion
was "NOP" rather than empty unless HAVE_AARCH64_BTI.
Now the above code defines BTI_C as empty unconditionally.
A64FX doesn't support BTI, so this code is OK.
But I'm just interested in the reason why it is changed.
Thanks.
Naohiro
> >
> > +ENTRY (MEMSET)
> > PTR_ARG (0)
> > SIZE_ARG (2)
> >
> > - cbnz count, 1f
> > - ret
> > -1: dup z0.b, valw
> > cntb vector_length
> > - // shortcut for less than vector_length * 8
> > - // gives a free ptrue to p0.b for n >= vector_length
> > - shortcut_for_small_size L(vl_agnostic)
> > - // end of shortcut
> > + dup z0.b, valw
> > + whilelo p0.b, vector_length, count
> > + b.last 1f
> > + whilelo p1.b, xzr, count
> > + st1b z0.b, p1, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + ret
> > +
> > + // count >= vector_length * 2
> > +1: cmp count, vector_length, lsl 2
> > + add dstend, dstin, count
> > + b.hi 1f
> > + st1b z0.b, p0, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + st1b z0.b, p0, [dstend, -2, mul vl]
> > + st1b z0.b, p0, [dstend, -1, mul vl]
> > + ret
> > +
> > + // count > vector_length * 4
> > +1: lsl tmp1, vector_length, 3
> > + cmp count, tmp1
> > + b.hi L(vl_agnostic)
> > + st1b z0.b, p0, [dstin, 0, mul vl]
> > + st1b z0.b, p0, [dstin, 1, mul vl]
> > + st1b z0.b, p0, [dstin, 2, mul vl]
> > + st1b z0.b, p0, [dstin, 3, mul vl]
> > + st1b z0.b, p0, [dstend, -4, mul vl]
> > + st1b z0.b, p0, [dstend, -3, mul vl]
> > + st1b z0.b, p0, [dstend, -2, mul vl]
> > + st1b z0.b, p0, [dstend, -1, mul vl]
> > + ret
> >
> > + .p2align 4
> > L(vl_agnostic): // VL Agnostic
> > mov rest, count
> > mov dst, dstin
> >
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-08-02 13:53 ` naohirot
@ 2021-08-02 14:38 ` Wilco Dijkstra
2021-08-02 14:50 ` Szabolcs Nagy
2021-08-03 2:56 ` naohirot
0 siblings, 2 replies; 9+ messages in thread
From: Wilco Dijkstra @ 2021-08-02 14:38 UTC (permalink / raw)
To: naohirot; +Cc: 'GNU C Library'
Hi Naohiro,
> Would you update the commit title so as not to be the same among 5
> patches?
> Because we need to ask distro to backport these patches.
> If all commit titles are the same, it will increase the room to happen
> confusion and mistake.
>
> How about "AArch64: Improve A64FX memset for less than 512B" ?
Generally the commit title in a patch series would include the series number,
however it's also easy to add something to the title as suggested. As for
backporting, one uses the hash of the patch in the cherry-pick rather than
the title, so once you have the right hashes, there should be no possibility
of confusion.
> > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
>
> This caused compile error.
Sorry, that should be part of the 2nd patch.
> > -ENTRY (MEMSET)
> > +#undef BTI_C
> > +#define BTI_C
>
> We discussed how should be defined BTI_C macro before, at that time conclusion
> was "NOP" rather than empty unless HAVE_AARCH64_BTI.
> Now the above code defines BTI_C as empty unconditionally.
> A64FX doesn't support BTI, so this code is OK.
> But I'm just interested in the reason why it is changed.
We changed to NOP in the generic code, so that works for all string functions.
In this specific case removing the initial NOP as well allows all performance critical
code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks.
Cheers,
Wilco
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-08-02 14:38 ` Wilco Dijkstra
@ 2021-08-02 14:50 ` Szabolcs Nagy
2021-08-03 2:57 ` naohirot
2021-08-03 2:56 ` naohirot
1 sibling, 1 reply; 9+ messages in thread
From: Szabolcs Nagy @ 2021-08-02 14:50 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: naohirot, 'GNU C Library'
The 08/02/2021 14:38, Wilco Dijkstra via Libc-alpha wrote:
> > We discussed how should be defined BTI_C macro before, at that time conclusion
> > was "NOP" rather than empty unless HAVE_AARCH64_BTI.
> > Now the above code defines BTI_C as empty unconditionally.
> > A64FX doesn't support BTI, so this code is OK.
> > But I'm just interested in the reason why it is changed.
>
> We changed to NOP in the generic code, so that works for all string functions.
> In this specific case removing the initial NOP as well allows all performance critical
> code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks.
yes, this makes sense:
originally BTI_C was always hint 34, but since that can be
slow it was changed for !HAVE_AARCH64_BTI. We don't want the
layout of asm code to change based on toolchain configuration
so BTI_C is defined as a place holder nop then.
but in a64fx specific code bti is never needed so we also
don't need the place holder nop, BTI_C can be unconditionally
empty.
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-08-02 14:38 ` Wilco Dijkstra
2021-08-02 14:50 ` Szabolcs Nagy
@ 2021-08-03 2:56 ` naohirot
1 sibling, 0 replies; 9+ messages in thread
From: naohirot @ 2021-08-03 2:56 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: 'GNU C Library'
Hi Wilco,
> > Would you update the commit title so as not to be the same among 5
> > patches?
> > Because we need to ask distro to backport these patches.
> > If all commit titles are the same, it will increase the room to happen
> > confusion and mistake.
> >
> > How about "AArch64: Improve A64FX memset for less than 512B" ?
>
> Generally the commit title in a patch series would include the series number,
> however it's also easy to add something to the title as suggested. As for
> backporting, one uses the hash of the patch in the cherry-pick rather than
> the title, so once you have the right hashes, there should be no possibility
> of confusion.
Thank you for considering it.
If communication made among engineers, I think the series number is
enough.
But we don't know if people in the middle of engineer to engineer
communication knows git :-)
Thanks.
Naohiro
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-08-02 14:50 ` Szabolcs Nagy
@ 2021-08-03 2:57 ` naohirot
2021-08-03 8:01 ` Szabolcs Nagy
0 siblings, 1 reply; 9+ messages in thread
From: naohirot @ 2021-08-03 2:57 UTC (permalink / raw)
To: Szabolcs Nagy, Wilco Dijkstra; +Cc: 'GNU C Library'
Hi Szabolcs, Wilco,
> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Sent: Monday, August 2, 2021 11:50 PM
> The 08/02/2021 14:38, Wilco Dijkstra via Libc-alpha wrote:
> > > We discussed how should be defined BTI_C macro before, at that time conclusion
> > > was "NOP" rather than empty unless HAVE_AARCH64_BTI.
> > > Now the above code defines BTI_C as empty unconditionally.
> > > A64FX doesn't support BTI, so this code is OK.
> > > But I'm just interested in the reason why it is changed.
> >
> > We changed to NOP in the generic code, so that works for all string functions.
> > In this specific case removing the initial NOP as well allows all performance critical
> > code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks.
>
> yes, this makes sense:
>
> originally BTI_C was always hint 34, but since that can be
> slow it was changed for !HAVE_AARCH64_BTI. We don't want the
> layout of asm code to change based on toolchain configuration
> so BTI_C is defined as a place holder nop then.
Now I understood the difference between nop and empty is the layout.
When we discussed BTI_C before, I didn't ask the difference so as not
to prolong the discussion because there is no performance difference.
> but in a64fx specific code bti is never needed so we also
> don't need the place holder nop, BTI_C can be unconditionally
> empty.
Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same
way too.
Thanks.
Naohiro
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-08-03 2:57 ` naohirot
@ 2021-08-03 8:01 ` Szabolcs Nagy
2021-09-24 7:56 ` naohirot
0 siblings, 1 reply; 9+ messages in thread
From: Szabolcs Nagy @ 2021-08-03 8:01 UTC (permalink / raw)
To: naohirot; +Cc: Wilco Dijkstra, 'GNU C Library'
The 08/03/2021 02:57, naohirot@fujitsu.com wrote:
> > but in a64fx specific code bti is never needed so we also
> > don't need the place holder nop, BTI_C can be unconditionally
> > empty.
>
> Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same
> way too.
sounds good to me.
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [PATCH v3 1/5] AArch64: Improve A64FX memset
2021-08-03 8:01 ` Szabolcs Nagy
@ 2021-09-24 7:56 ` naohirot
0 siblings, 0 replies; 9+ messages in thread
From: naohirot @ 2021-09-24 7:56 UTC (permalink / raw)
To: Szabolcs Nagy; +Cc: Wilco Dijkstra, 'GNU C Library'
Hi Szabolcs, Wilco,
> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Sent: Tuesday, August 3, 2021 5:02 PM
>
> The 08/03/2021 02:57, naohirot@fujitsu.com wrote:
> > > but in a64fx specific code bti is never needed so we also
> > > don't need the place holder nop, BTI_C can be unconditionally
> > > empty.
> >
> > Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same
> > way too.
>
> sounds good to me.
I submitted a patch [1], please review it if it's OK or not.
[1] https://sourceware.org/pipermail/libc-alpha/2021-September/131356.html
Thanks.
Naohiro
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2021-09-24 7:56 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-22 15:59 [PATCH v3 1/5] AArch64: Improve A64FX memset Wilco Dijkstra
2021-07-28 8:10 ` naohirot
2021-08-02 13:53 ` naohirot
2021-08-02 14:38 ` Wilco Dijkstra
2021-08-02 14:50 ` Szabolcs Nagy
2021-08-03 2:57 ` naohirot
2021-08-03 8:01 ` Szabolcs Nagy
2021-09-24 7:56 ` naohirot
2021-08-03 2:56 ` naohirot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).