* [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy
@ 2016-04-20 20:55 Gabriel F. T. Gomes
2016-04-20 21:25 ` Adhemerval Zanella
0 siblings, 1 reply; 5+ messages in thread
From: Gabriel F. T. Gomes @ 2016-04-20 20:55 UTC (permalink / raw)
To: libc-alpha; +Cc: munroesj, tuliom
Call __memset_power8 to pad, with zeros, the remaining bytes in the
dest string on __strncpy_power8 and __stpncpy_power8. This improves
performance when n is larger than the input string, giving ~30% gain for
larger strings without impacting much shorter strings.
2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
remaining bytes in the dest string, with zeros.
---
sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
1 file changed, 56 insertions(+), 67 deletions(-)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
index 17c3afb..0bb3bd4 100644
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -24,6 +24,8 @@
# define FUNC_NAME strncpy
#endif
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
/* Implements the function
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
@@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
addi r10,r4,16
rlwinm r9,r4,0,19,19
- /* Since it is a leaf function, save some non-volatile registers on the
- protected/red zone. */
+ /* Save some non-volatile registers on the stack. */
std r26,-48(r1)
std r27,-40(r1)
@@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
std r30,-16(r1)
std r31,-8(r1)
+ /* Update CFI. */
+ cfi_offset(r26, -48)
+ cfi_offset(r27, -40)
+ cfi_offset(r28, -32)
+ cfi_offset(r29, -24)
+ cfi_offset(r30, -16)
+ cfi_offset(r31, -8)
+
beq cr7,L(unaligned_lt_16)
rldicl r9,r4,0,61
subfic r8,r9,8
@@ -180,74 +189,58 @@ L(short_path_loop_end):
ld r31,-8(r1)
blr
- /* This code pads the remainder dest with NULL bytes. The algorithm
- calculate the remanining size and issues a doubleword unrolled
- loops followed by a byte a byte set. */
+ /* This code pads the remainder of dest with NULL bytes. The algorithm
+ calculates the remaining size and calls memset. */
.align 4
L(zero_pad_start):
mr r5,r10
mr r9,r6
L(zero_pad_start_1):
- srdi. r8,r5,r3
- mr r10,r9
-#ifdef USE_AS_STPNCPY
- mr r3,r9
+ /* At this point:
+ - r5 holds the number of bytes that still have to be written to
+ dest.
+ - r9 points to the position, in dest, where the first null byte
+ will be written.
+ The above statements are true both when control reaches this label
+ from a branch or when falling through the previous lines. */
+#ifndef USE_AS_STPNCPY
+ mr r30,r3 /* Save the return value of strncpy. */
+#endif
+ /* Prepare the call to memset. */
+ mr r3,r9 /* Pointer to the area to be zero-filled. */
+ li r4,0 /* Byte to be written (zero). */
+
+ /* We delayed the creation of the stack frame, as well as the saving of
+ the link register, because only at this point, we are sure that
+ doing so is actually needed. */
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl __memset_power8
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+#ifndef USE_AS_STPNCPY
+ mr r3,r30 /* Restore the return value of strncpy, i.e.:
+ dest. For stpncpy, the return value is the
+ same as return value of memset. */
#endif
- beq- cr0,L(zero_pad_loop_b_start)
- cmpldi cr7,r8,1
- li cr7,0
- std r7,0(r9)
- beq cr7,L(zero_pad_loop_b_prepare)
- addic. r8,r8,-2
- addi r10,r9,r16
- std r7,8(r9)
- beq cr0,L(zero_pad_loop_dw_2)
- std r7,16(r9)
- li r9,0
- b L(zero_pad_loop_dw_1)
-
- .align 4
-L(zero_pad_loop_dw):
- addi r10,r10,16
- std r9,-8(r10)
- beq cr0,L(zero_pad_loop_dw_2)
- std r9,0(r10)
-L(zero_pad_loop_dw_1):
- cmpldi cr7,r8,1
- std r9,0(r10)
- addic. r8,r8,-2
- bne cr7,L(zero_pad_loop_dw)
- addi r10,r10,8
-L(zero_pad_loop_dw_2):
- rldicl r5,r5,0,61
-L(zero_pad_loop_b_start):
- cmpdi cr7,r5,0
- addi r5,r5,-1
- addi r9,r10,-1
- add r10,r10,5
- subf r10,r9,r10
- li r8,0
- beq- cr7,L(short_path_loop_end)
-
- /* Write remaining 1-8 bytes. */
- .align 4
- addi r9,r9,1
- mtocrf 0x1,r10
- bf 29,4f
- stw r8,0(r9)
- addi r9,r9,4
-
- .align 4
-4: bf 30,2f
- sth r8,0(r9)
- addi r9,r9,2
-
- .align 4
-2: bf 31,1f
- stb r8,0(r9)
- /* Restore non-volatile registers. */
-1: ld r26,-48(r1)
+ /* Restore non-volatile registers and return. */
+ ld r26,-48(r1)
ld r27,-40(r1)
ld r28,-32(r1)
ld r29,-24(r1)
@@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
mr r4,r28
mr r9,r29
b L(short_path_2)
-L(zero_pad_loop_b_prepare):
- addi r10,r9,8
- rldicl r5,r5,0,61
- b L(zero_pad_loop_b_start)
L(zero_pad_start_prepare_1):
mr r5,r6
mr r9,r8
--
2.4.11
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy
2016-04-20 20:55 [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy Gabriel F. T. Gomes
@ 2016-04-20 21:25 ` Adhemerval Zanella
2016-04-26 13:41 ` Gabriel F. T. Gomes
0 siblings, 1 reply; 5+ messages in thread
From: Adhemerval Zanella @ 2016-04-20 21:25 UTC (permalink / raw)
To: libc-alpha
I am ok with this patch, the only nit is it now calls memset for all
padding size, where for small padding (less than 16 or maybe 32) I think it would
be still faster than calling memset (since dcbtst will be used only for
large inputs). However I still not sure if embedding the memset write_LT_32
would yield much gain as well.
On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
> Call __memset_power8 to pad, with zeros, the remaining bytes in the
> dest string on __strncpy_power8 and __stpncpy_power8. This improves
> performance when n is larger than the input string, giving ~30% gain for
> larger strings without impacting much shorter strings.
>
> 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com>
>
> * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
> remaining bytes in the dest string, with zeros.
> ---
> sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
> 1 file changed, 56 insertions(+), 67 deletions(-)
>
> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> index 17c3afb..0bb3bd4 100644
> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> @@ -24,6 +24,8 @@
> # define FUNC_NAME strncpy
> #endif
>
> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
> +
> /* Implements the function
>
> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
> addi r10,r4,16
> rlwinm r9,r4,0,19,19
>
> - /* Since it is a leaf function, save some non-volatile registers on the
> - protected/red zone. */
> + /* Save some non-volatile registers on the stack. */
> std r26,-48(r1)
> std r27,-40(r1)
>
> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
> std r30,-16(r1)
> std r31,-8(r1)
>
> + /* Update CFI. */
> + cfi_offset(r26, -48)
> + cfi_offset(r27, -40)
> + cfi_offset(r28, -32)
> + cfi_offset(r29, -24)
> + cfi_offset(r30, -16)
> + cfi_offset(r31, -8)
> +
> beq cr7,L(unaligned_lt_16)
> rldicl r9,r4,0,61
> subfic r8,r9,8
> @@ -180,74 +189,58 @@ L(short_path_loop_end):
> ld r31,-8(r1)
> blr
>
> - /* This code pads the remainder dest with NULL bytes. The algorithm
> - calculate the remanining size and issues a doubleword unrolled
> - loops followed by a byte a byte set. */
> + /* This code pads the remainder of dest with NULL bytes. The algorithm
> + calculates the remaining size and calls memset. */
> .align 4
> L(zero_pad_start):
> mr r5,r10
> mr r9,r6
> L(zero_pad_start_1):
> - srdi. r8,r5,r3
> - mr r10,r9
> -#ifdef USE_AS_STPNCPY
> - mr r3,r9
> + /* At this point:
> + - r5 holds the number of bytes that still have to be written to
> + dest.
> + - r9 points to the position, in dest, where the first null byte
> + will be written.
> + The above statements are true both when control reaches this label
> + from a branch or when falling through the previous lines. */
> +#ifndef USE_AS_STPNCPY
> + mr r30,r3 /* Save the return value of strncpy. */
> +#endif
> + /* Prepare the call to memset. */
> + mr r3,r9 /* Pointer to the area to be zero-filled. */
> + li r4,0 /* Byte to be written (zero). */
> +
> + /* We delayed the creation of the stack frame, as well as the saving of
> + the link register, because only at this point, we are sure that
> + doing so is actually needed. */
> +
> + /* Save the link register. */
> + mflr r0
> + std r0,16(r1)
> + cfi_offset(lr, 16)
> +
> + /* Create the stack frame. */
> + stdu r1,-FRAMESIZE(r1)
> + cfi_adjust_cfa_offset(FRAMESIZE)
> +
> + bl __memset_power8
> + nop
> +
> + /* Restore the stack frame. */
> + addi r1,r1,FRAMESIZE
> + cfi_adjust_cfa_offset(-FRAMESIZE)
> + /* Restore the link register. */
> + ld r0,16(r1)
> + mtlr r0
> +
> +#ifndef USE_AS_STPNCPY
> + mr r3,r30 /* Restore the return value of strncpy, i.e.:
> + dest. For stpncpy, the return value is the
> + same as return value of memset. */
> #endif
> - beq- cr0,L(zero_pad_loop_b_start)
> - cmpldi cr7,r8,1
> - li cr7,0
> - std r7,0(r9)
> - beq cr7,L(zero_pad_loop_b_prepare)
> - addic. r8,r8,-2
> - addi r10,r9,r16
> - std r7,8(r9)
> - beq cr0,L(zero_pad_loop_dw_2)
> - std r7,16(r9)
> - li r9,0
> - b L(zero_pad_loop_dw_1)
> -
> - .align 4
> -L(zero_pad_loop_dw):
> - addi r10,r10,16
> - std r9,-8(r10)
> - beq cr0,L(zero_pad_loop_dw_2)
> - std r9,0(r10)
> -L(zero_pad_loop_dw_1):
> - cmpldi cr7,r8,1
> - std r9,0(r10)
> - addic. r8,r8,-2
> - bne cr7,L(zero_pad_loop_dw)
> - addi r10,r10,8
> -L(zero_pad_loop_dw_2):
> - rldicl r5,r5,0,61
> -L(zero_pad_loop_b_start):
> - cmpdi cr7,r5,0
> - addi r5,r5,-1
> - addi r9,r10,-1
> - add r10,r10,5
> - subf r10,r9,r10
> - li r8,0
> - beq- cr7,L(short_path_loop_end)
> -
> - /* Write remaining 1-8 bytes. */
> - .align 4
> - addi r9,r9,1
> - mtocrf 0x1,r10
> - bf 29,4f
> - stw r8,0(r9)
> - addi r9,r9,4
> -
> - .align 4
> -4: bf 30,2f
> - sth r8,0(r9)
> - addi r9,r9,2
> -
> - .align 4
> -2: bf 31,1f
> - stb r8,0(r9)
>
> - /* Restore non-volatile registers. */
> -1: ld r26,-48(r1)
> + /* Restore non-volatile registers and return. */
> + ld r26,-48(r1)
> ld r27,-40(r1)
> ld r28,-32(r1)
> ld r29,-24(r1)
> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
> mr r4,r28
> mr r9,r29
> b L(short_path_2)
> -L(zero_pad_loop_b_prepare):
> - addi r10,r9,8
> - rldicl r5,r5,0,61
> - b L(zero_pad_loop_b_start)
> L(zero_pad_start_prepare_1):
> mr r5,r6
> mr r9,r8
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy
2016-04-20 21:25 ` Adhemerval Zanella
@ 2016-04-26 13:41 ` Gabriel F. T. Gomes
2016-04-26 14:17 ` Adhemerval Zanella
0 siblings, 1 reply; 5+ messages in thread
From: Gabriel F. T. Gomes @ 2016-04-26 13:41 UTC (permalink / raw)
To: Adhemerval Zanella; +Cc: libc-alpha
On Wed, 20 Apr 2016 18:25:37 -0300
Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
> I am ok with this patch, the only nit is it now calls memset for all
> padding size, where for small padding (less than 16 or maybe 32) I think it would
> be still faster than calling memset (since dcbtst will be used only for
> large inputs). However I still not sure if embedding the memset write_LT_32
> would yield much gain as well.
Hi, Adhemerval. Thanks for your review.
I also expected that for small amounts of padding, calling memset would
harm performance. However, I ran some tests, where memset was only
called for padding amounts greater than 32 bytes, and I didn't notice
any significant changes in performance. That's the reason why I sent
this version of the patch, which always calls memset.
I also noticed that bench-strncpy exposes a load-hit-store problem,
which might be suppressing the difference for small paddings. I have
plans to fix this in the future, and, if necessary, limit memset to
greater paddings.
However, right now, I don't see a benefit in adding the check.
> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
> > Call __memset_power8 to pad, with zeros, the remaining bytes in the
> > dest string on __strncpy_power8 and __stpncpy_power8. This improves
> > performance when n is larger than the input string, giving ~30% gain for
> > larger strings without impacting much shorter strings.
> >
> > 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com>
> >
> > * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
> > remaining bytes in the dest string, with zeros.
> > ---
> > sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
> > 1 file changed, 56 insertions(+), 67 deletions(-)
> >
> > diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> > index 17c3afb..0bb3bd4 100644
> > --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
> > +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> > @@ -24,6 +24,8 @@
> > # define FUNC_NAME strncpy
> > #endif
> >
> > +#define FRAMESIZE (FRAME_MIN_SIZE+48)
> > +
> > /* Implements the function
> >
> > char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> > @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
> > addi r10,r4,16
> > rlwinm r9,r4,0,19,19
> >
> > - /* Since it is a leaf function, save some non-volatile registers on the
> > - protected/red zone. */
> > + /* Save some non-volatile registers on the stack. */
> > std r26,-48(r1)
> > std r27,-40(r1)
> >
> > @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
> > std r30,-16(r1)
> > std r31,-8(r1)
> >
> > + /* Update CFI. */
> > + cfi_offset(r26, -48)
> > + cfi_offset(r27, -40)
> > + cfi_offset(r28, -32)
> > + cfi_offset(r29, -24)
> > + cfi_offset(r30, -16)
> > + cfi_offset(r31, -8)
> > +
> > beq cr7,L(unaligned_lt_16)
> > rldicl r9,r4,0,61
> > subfic r8,r9,8
> > @@ -180,74 +189,58 @@ L(short_path_loop_end):
> > ld r31,-8(r1)
> > blr
> >
> > - /* This code pads the remainder dest with NULL bytes. The algorithm
> > - calculate the remanining size and issues a doubleword unrolled
> > - loops followed by a byte a byte set. */
> > + /* This code pads the remainder of dest with NULL bytes. The algorithm
> > + calculates the remaining size and calls memset. */
> > .align 4
> > L(zero_pad_start):
> > mr r5,r10
> > mr r9,r6
> > L(zero_pad_start_1):
> > - srdi. r8,r5,r3
> > - mr r10,r9
> > -#ifdef USE_AS_STPNCPY
> > - mr r3,r9
> > + /* At this point:
> > + - r5 holds the number of bytes that still have to be written to
> > + dest.
> > + - r9 points to the position, in dest, where the first null byte
> > + will be written.
> > + The above statements are true both when control reaches this label
> > + from a branch or when falling through the previous lines. */
> > +#ifndef USE_AS_STPNCPY
> > + mr r30,r3 /* Save the return value of strncpy. */
> > +#endif
> > + /* Prepare the call to memset. */
> > + mr r3,r9 /* Pointer to the area to be zero-filled. */
> > + li r4,0 /* Byte to be written (zero). */
> > +
> > + /* We delayed the creation of the stack frame, as well as the saving of
> > + the link register, because only at this point, we are sure that
> > + doing so is actually needed. */
> > +
> > + /* Save the link register. */
> > + mflr r0
> > + std r0,16(r1)
> > + cfi_offset(lr, 16)
> > +
> > + /* Create the stack frame. */
> > + stdu r1,-FRAMESIZE(r1)
> > + cfi_adjust_cfa_offset(FRAMESIZE)
> > +
> > + bl __memset_power8
> > + nop
> > +
> > + /* Restore the stack frame. */
> > + addi r1,r1,FRAMESIZE
> > + cfi_adjust_cfa_offset(-FRAMESIZE)
> > + /* Restore the link register. */
> > + ld r0,16(r1)
> > + mtlr r0
> > +
> > +#ifndef USE_AS_STPNCPY
> > + mr r3,r30 /* Restore the return value of strncpy, i.e.:
> > + dest. For stpncpy, the return value is the
> > + same as return value of memset. */
> > #endif
> > - beq- cr0,L(zero_pad_loop_b_start)
> > - cmpldi cr7,r8,1
> > - li cr7,0
> > - std r7,0(r9)
> > - beq cr7,L(zero_pad_loop_b_prepare)
> > - addic. r8,r8,-2
> > - addi r10,r9,r16
> > - std r7,8(r9)
> > - beq cr0,L(zero_pad_loop_dw_2)
> > - std r7,16(r9)
> > - li r9,0
> > - b L(zero_pad_loop_dw_1)
> > -
> > - .align 4
> > -L(zero_pad_loop_dw):
> > - addi r10,r10,16
> > - std r9,-8(r10)
> > - beq cr0,L(zero_pad_loop_dw_2)
> > - std r9,0(r10)
> > -L(zero_pad_loop_dw_1):
> > - cmpldi cr7,r8,1
> > - std r9,0(r10)
> > - addic. r8,r8,-2
> > - bne cr7,L(zero_pad_loop_dw)
> > - addi r10,r10,8
> > -L(zero_pad_loop_dw_2):
> > - rldicl r5,r5,0,61
> > -L(zero_pad_loop_b_start):
> > - cmpdi cr7,r5,0
> > - addi r5,r5,-1
> > - addi r9,r10,-1
> > - add r10,r10,5
> > - subf r10,r9,r10
> > - li r8,0
> > - beq- cr7,L(short_path_loop_end)
> > -
> > - /* Write remaining 1-8 bytes. */
> > - .align 4
> > - addi r9,r9,1
> > - mtocrf 0x1,r10
> > - bf 29,4f
> > - stw r8,0(r9)
> > - addi r9,r9,4
> > -
> > - .align 4
> > -4: bf 30,2f
> > - sth r8,0(r9)
> > - addi r9,r9,2
> > -
> > - .align 4
> > -2: bf 31,1f
> > - stb r8,0(r9)
> >
> > - /* Restore non-volatile registers. */
> > -1: ld r26,-48(r1)
> > + /* Restore non-volatile registers and return. */
> > + ld r26,-48(r1)
> > ld r27,-40(r1)
> > ld r28,-32(r1)
> > ld r29,-24(r1)
> > @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
> > mr r4,r28
> > mr r9,r29
> > b L(short_path_2)
> > -L(zero_pad_loop_b_prepare):
> > - addi r10,r9,8
> > - rldicl r5,r5,0,61
> > - b L(zero_pad_loop_b_start)
> > L(zero_pad_start_prepare_1):
> > mr r5,r6
> > mr r9,r8
> >
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy
2016-04-26 13:41 ` Gabriel F. T. Gomes
@ 2016-04-26 14:17 ` Adhemerval Zanella
2016-04-29 13:48 ` Gabriel F. T. Gomes
0 siblings, 1 reply; 5+ messages in thread
From: Adhemerval Zanella @ 2016-04-26 14:17 UTC (permalink / raw)
To: Gabriel F. T. Gomes; +Cc: libc-alpha
On 26/04/2016 10:41, Gabriel F. T. Gomes wrote:
> On Wed, 20 Apr 2016 18:25:37 -0300
> Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
>
>> I am ok with this patch, the only nit is it now calls memset for all
>> padding size, where for small padding (less than 16 or maybe 32) I think it would
>> be still faster than calling memset (since dcbtst will be used only for
>> large inputs). However I still not sure if embedding the memset write_LT_32
>> would yield much gain as well.
>
> Hi, Adhemerval. Thanks for your review.
>
> I also expected that for small amounts of padding, calling memset would
> harm performance. However, I ran some tests, where memset was only
> called for padding amounts greater than 32 bytes, and I didn't notice
> any significant changes in performance. That's the reason why I sent
> this version of the patch, which always calls memset.
>
> I also noticed that bench-strncpy exposes a load-hit-store problem,
> which might be suppressing the difference for small paddings. I have
> plans to fix this in the future, and, if necessary, limit memset to
> greater paddings.
>
> However, right now, I don't see a benefit in adding the check.
>
>
Fair enough then.
>> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
>>> Call __memset_power8 to pad, with zeros, the remaining bytes in the
>>> dest string on __strncpy_power8 and __stpncpy_power8. This improves
>>> performance when n is larger than the input string, giving ~30% gain for
>>> larger strings without impacting much shorter strings.
>>>
>>> 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com>
>>>
>>> * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
>>> remaining bytes in the dest string, with zeros.
>>> ---
>>> sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
>>> 1 file changed, 56 insertions(+), 67 deletions(-)
>>>
>>> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
>>> index 17c3afb..0bb3bd4 100644
>>> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
>>> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
>>> @@ -24,6 +24,8 @@
>>> # define FUNC_NAME strncpy
>>> #endif
>>>
>>> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
>>> +
>>> /* Implements the function
>>>
>>> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>>> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
>>> addi r10,r4,16
>>> rlwinm r9,r4,0,19,19
>>>
>>> - /* Since it is a leaf function, save some non-volatile registers on the
>>> - protected/red zone. */
>>> + /* Save some non-volatile registers on the stack. */
>>> std r26,-48(r1)
>>> std r27,-40(r1)
>>>
>>> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
>>> std r30,-16(r1)
>>> std r31,-8(r1)
>>>
>>> + /* Update CFI. */
>>> + cfi_offset(r26, -48)
>>> + cfi_offset(r27, -40)
>>> + cfi_offset(r28, -32)
>>> + cfi_offset(r29, -24)
>>> + cfi_offset(r30, -16)
>>> + cfi_offset(r31, -8)
>>> +
>>> beq cr7,L(unaligned_lt_16)
>>> rldicl r9,r4,0,61
>>> subfic r8,r9,8
>>> @@ -180,74 +189,58 @@ L(short_path_loop_end):
>>> ld r31,-8(r1)
>>> blr
>>>
>>> - /* This code pads the remainder dest with NULL bytes. The algorithm
>>> - calculate the remanining size and issues a doubleword unrolled
>>> - loops followed by a byte a byte set. */
>>> + /* This code pads the remainder of dest with NULL bytes. The algorithm
>>> + calculates the remaining size and calls memset. */
>>> .align 4
>>> L(zero_pad_start):
>>> mr r5,r10
>>> mr r9,r6
>>> L(zero_pad_start_1):
>>> - srdi. r8,r5,r3
>>> - mr r10,r9
>>> -#ifdef USE_AS_STPNCPY
>>> - mr r3,r9
>>> + /* At this point:
>>> + - r5 holds the number of bytes that still have to be written to
>>> + dest.
>>> + - r9 points to the position, in dest, where the first null byte
>>> + will be written.
>>> + The above statements are true both when control reaches this label
>>> + from a branch or when falling through the previous lines. */
>>> +#ifndef USE_AS_STPNCPY
>>> + mr r30,r3 /* Save the return value of strncpy. */
>>> +#endif
>>> + /* Prepare the call to memset. */
>>> + mr r3,r9 /* Pointer to the area to be zero-filled. */
>>> + li r4,0 /* Byte to be written (zero). */
>>> +
>>> + /* We delayed the creation of the stack frame, as well as the saving of
>>> + the link register, because only at this point, we are sure that
>>> + doing so is actually needed. */
>>> +
>>> + /* Save the link register. */
>>> + mflr r0
>>> + std r0,16(r1)
>>> + cfi_offset(lr, 16)
>>> +
>>> + /* Create the stack frame. */
>>> + stdu r1,-FRAMESIZE(r1)
>>> + cfi_adjust_cfa_offset(FRAMESIZE)
>>> +
>>> + bl __memset_power8
>>> + nop
>>> +
>>> + /* Restore the stack frame. */
>>> + addi r1,r1,FRAMESIZE
>>> + cfi_adjust_cfa_offset(-FRAMESIZE)
>>> + /* Restore the link register. */
>>> + ld r0,16(r1)
>>> + mtlr r0
>>> +
>>> +#ifndef USE_AS_STPNCPY
>>> + mr r3,r30 /* Restore the return value of strncpy, i.e.:
>>> + dest. For stpncpy, the return value is the
>>> + same as return value of memset. */
>>> #endif
>>> - beq- cr0,L(zero_pad_loop_b_start)
>>> - cmpldi cr7,r8,1
>>> - li cr7,0
>>> - std r7,0(r9)
>>> - beq cr7,L(zero_pad_loop_b_prepare)
>>> - addic. r8,r8,-2
>>> - addi r10,r9,r16
>>> - std r7,8(r9)
>>> - beq cr0,L(zero_pad_loop_dw_2)
>>> - std r7,16(r9)
>>> - li r9,0
>>> - b L(zero_pad_loop_dw_1)
>>> -
>>> - .align 4
>>> -L(zero_pad_loop_dw):
>>> - addi r10,r10,16
>>> - std r9,-8(r10)
>>> - beq cr0,L(zero_pad_loop_dw_2)
>>> - std r9,0(r10)
>>> -L(zero_pad_loop_dw_1):
>>> - cmpldi cr7,r8,1
>>> - std r9,0(r10)
>>> - addic. r8,r8,-2
>>> - bne cr7,L(zero_pad_loop_dw)
>>> - addi r10,r10,8
>>> -L(zero_pad_loop_dw_2):
>>> - rldicl r5,r5,0,61
>>> -L(zero_pad_loop_b_start):
>>> - cmpdi cr7,r5,0
>>> - addi r5,r5,-1
>>> - addi r9,r10,-1
>>> - add r10,r10,5
>>> - subf r10,r9,r10
>>> - li r8,0
>>> - beq- cr7,L(short_path_loop_end)
>>> -
>>> - /* Write remaining 1-8 bytes. */
>>> - .align 4
>>> - addi r9,r9,1
>>> - mtocrf 0x1,r10
>>> - bf 29,4f
>>> - stw r8,0(r9)
>>> - addi r9,r9,4
>>> -
>>> - .align 4
>>> -4: bf 30,2f
>>> - sth r8,0(r9)
>>> - addi r9,r9,2
>>> -
>>> - .align 4
>>> -2: bf 31,1f
>>> - stb r8,0(r9)
>>>
>>> - /* Restore non-volatile registers. */
>>> -1: ld r26,-48(r1)
>>> + /* Restore non-volatile registers and return. */
>>> + ld r26,-48(r1)
>>> ld r27,-40(r1)
>>> ld r28,-32(r1)
>>> ld r29,-24(r1)
>>> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
>>> mr r4,r28
>>> mr r9,r29
>>> b L(short_path_2)
>>> -L(zero_pad_loop_b_prepare):
>>> - addi r10,r9,8
>>> - rldicl r5,r5,0,61
>>> - b L(zero_pad_loop_b_start)
>>> L(zero_pad_start_prepare_1):
>>> mr r5,r6
>>> mr r9,r8
>>>
>>
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy
2016-04-26 14:17 ` Adhemerval Zanella
@ 2016-04-29 13:48 ` Gabriel F. T. Gomes
0 siblings, 0 replies; 5+ messages in thread
From: Gabriel F. T. Gomes @ 2016-04-29 13:48 UTC (permalink / raw)
To: Adhemerval Zanella; +Cc: libc-alpha
Thank you for reviewing, Adhemerval.
Pushed as 72c11b353ede72931cc474c9071d143d9a05c0d7.
On Tue, 26 Apr 2016 11:16:48 -0300
Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
> On 26/04/2016 10:41, Gabriel F. T. Gomes wrote:
> > On Wed, 20 Apr 2016 18:25:37 -0300
> > Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
> >
> >> I am ok with this patch, the only nit is it now calls memset for all
> >> padding size, where for small padding (less than 16 or maybe 32) I think it would
> >> be still faster than calling memset (since dcbtst will be used only for
> >> large inputs). However I still not sure if embedding the memset write_LT_32
> >> would yield much gain as well.
> >
> > Hi, Adhemerval. Thanks for your review.
> >
> > I also expected that for small amounts of padding, calling memset would
> > harm performance. However, I ran some tests, where memset was only
> > called for padding amounts greater than 32 bytes, and I didn't notice
> > any significant changes in performance. That's the reason why I sent
> > this version of the patch, which always calls memset.
> >
> > I also noticed that bench-strncpy exposes a load-hit-store problem,
> > which might be suppressing the difference for small paddings. I have
> > plans to fix this in the future, and, if necessary, limit memset to
> > greater paddings.
> >
> > However, right now, I don't see a benefit in adding the check.
> >
> >
>
> Fair enough then.
>
> >> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
> >>> Call __memset_power8 to pad, with zeros, the remaining bytes in the
> >>> dest string on __strncpy_power8 and __stpncpy_power8. This improves
> >>> performance when n is larger than the input string, giving ~30% gain for
> >>> larger strings without impacting much shorter strings.
> >>>
> >>> 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com>
> >>>
> >>> * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
> >>> remaining bytes in the dest string, with zeros.
> >>> ---
> >>> sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
> >>> 1 file changed, 56 insertions(+), 67 deletions(-)
> >>>
> >>> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> >>> index 17c3afb..0bb3bd4 100644
> >>> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
> >>> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> >>> @@ -24,6 +24,8 @@
> >>> # define FUNC_NAME strncpy
> >>> #endif
> >>>
> >>> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
> >>> +
> >>> /* Implements the function
> >>>
> >>> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> >>> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
> >>> addi r10,r4,16
> >>> rlwinm r9,r4,0,19,19
> >>>
> >>> - /* Since it is a leaf function, save some non-volatile registers on the
> >>> - protected/red zone. */
> >>> + /* Save some non-volatile registers on the stack. */
> >>> std r26,-48(r1)
> >>> std r27,-40(r1)
> >>>
> >>> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
> >>> std r30,-16(r1)
> >>> std r31,-8(r1)
> >>>
> >>> + /* Update CFI. */
> >>> + cfi_offset(r26, -48)
> >>> + cfi_offset(r27, -40)
> >>> + cfi_offset(r28, -32)
> >>> + cfi_offset(r29, -24)
> >>> + cfi_offset(r30, -16)
> >>> + cfi_offset(r31, -8)
> >>> +
> >>> beq cr7,L(unaligned_lt_16)
> >>> rldicl r9,r4,0,61
> >>> subfic r8,r9,8
> >>> @@ -180,74 +189,58 @@ L(short_path_loop_end):
> >>> ld r31,-8(r1)
> >>> blr
> >>>
> >>> - /* This code pads the remainder dest with NULL bytes. The algorithm
> >>> - calculate the remanining size and issues a doubleword unrolled
> >>> - loops followed by a byte a byte set. */
> >>> + /* This code pads the remainder of dest with NULL bytes. The algorithm
> >>> + calculates the remaining size and calls memset. */
> >>> .align 4
> >>> L(zero_pad_start):
> >>> mr r5,r10
> >>> mr r9,r6
> >>> L(zero_pad_start_1):
> >>> - srdi. r8,r5,r3
> >>> - mr r10,r9
> >>> -#ifdef USE_AS_STPNCPY
> >>> - mr r3,r9
> >>> + /* At this point:
> >>> + - r5 holds the number of bytes that still have to be written to
> >>> + dest.
> >>> + - r9 points to the position, in dest, where the first null byte
> >>> + will be written.
> >>> + The above statements are true both when control reaches this label
> >>> + from a branch or when falling through the previous lines. */
> >>> +#ifndef USE_AS_STPNCPY
> >>> + mr r30,r3 /* Save the return value of strncpy. */
> >>> +#endif
> >>> + /* Prepare the call to memset. */
> >>> + mr r3,r9 /* Pointer to the area to be zero-filled. */
> >>> + li r4,0 /* Byte to be written (zero). */
> >>> +
> >>> + /* We delayed the creation of the stack frame, as well as the saving of
> >>> + the link register, because only at this point, we are sure that
> >>> + doing so is actually needed. */
> >>> +
> >>> + /* Save the link register. */
> >>> + mflr r0
> >>> + std r0,16(r1)
> >>> + cfi_offset(lr, 16)
> >>> +
> >>> + /* Create the stack frame. */
> >>> + stdu r1,-FRAMESIZE(r1)
> >>> + cfi_adjust_cfa_offset(FRAMESIZE)
> >>> +
> >>> + bl __memset_power8
> >>> + nop
> >>> +
> >>> + /* Restore the stack frame. */
> >>> + addi r1,r1,FRAMESIZE
> >>> + cfi_adjust_cfa_offset(-FRAMESIZE)
> >>> + /* Restore the link register. */
> >>> + ld r0,16(r1)
> >>> + mtlr r0
> >>> +
> >>> +#ifndef USE_AS_STPNCPY
> >>> + mr r3,r30 /* Restore the return value of strncpy, i.e.:
> >>> + dest. For stpncpy, the return value is the
> >>> + same as return value of memset. */
> >>> #endif
> >>> - beq- cr0,L(zero_pad_loop_b_start)
> >>> - cmpldi cr7,r8,1
> >>> - li cr7,0
> >>> - std r7,0(r9)
> >>> - beq cr7,L(zero_pad_loop_b_prepare)
> >>> - addic. r8,r8,-2
> >>> - addi r10,r9,r16
> >>> - std r7,8(r9)
> >>> - beq cr0,L(zero_pad_loop_dw_2)
> >>> - std r7,16(r9)
> >>> - li r9,0
> >>> - b L(zero_pad_loop_dw_1)
> >>> -
> >>> - .align 4
> >>> -L(zero_pad_loop_dw):
> >>> - addi r10,r10,16
> >>> - std r9,-8(r10)
> >>> - beq cr0,L(zero_pad_loop_dw_2)
> >>> - std r9,0(r10)
> >>> -L(zero_pad_loop_dw_1):
> >>> - cmpldi cr7,r8,1
> >>> - std r9,0(r10)
> >>> - addic. r8,r8,-2
> >>> - bne cr7,L(zero_pad_loop_dw)
> >>> - addi r10,r10,8
> >>> -L(zero_pad_loop_dw_2):
> >>> - rldicl r5,r5,0,61
> >>> -L(zero_pad_loop_b_start):
> >>> - cmpdi cr7,r5,0
> >>> - addi r5,r5,-1
> >>> - addi r9,r10,-1
> >>> - add r10,r10,5
> >>> - subf r10,r9,r10
> >>> - li r8,0
> >>> - beq- cr7,L(short_path_loop_end)
> >>> -
> >>> - /* Write remaining 1-8 bytes. */
> >>> - .align 4
> >>> - addi r9,r9,1
> >>> - mtocrf 0x1,r10
> >>> - bf 29,4f
> >>> - stw r8,0(r9)
> >>> - addi r9,r9,4
> >>> -
> >>> - .align 4
> >>> -4: bf 30,2f
> >>> - sth r8,0(r9)
> >>> - addi r9,r9,2
> >>> -
> >>> - .align 4
> >>> -2: bf 31,1f
> >>> - stb r8,0(r9)
> >>>
> >>> - /* Restore non-volatile registers. */
> >>> -1: ld r26,-48(r1)
> >>> + /* Restore non-volatile registers and return. */
> >>> + ld r26,-48(r1)
> >>> ld r27,-40(r1)
> >>> ld r28,-32(r1)
> >>> ld r29,-24(r1)
> >>> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
> >>> mr r4,r28
> >>> mr r9,r29
> >>> b L(short_path_2)
> >>> -L(zero_pad_loop_b_prepare):
> >>> - addi r10,r9,8
> >>> - rldicl r5,r5,0,61
> >>> - b L(zero_pad_loop_b_start)
> >>> L(zero_pad_start_prepare_1):
> >>> mr r5,r6
> >>> mr r9,r8
> >>>
> >>
> >
>
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2016-04-29 13:48 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-20 20:55 [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy Gabriel F. T. Gomes
2016-04-20 21:25 ` Adhemerval Zanella
2016-04-26 13:41 ` Gabriel F. T. Gomes
2016-04-26 14:17 ` Adhemerval Zanella
2016-04-29 13:48 ` Gabriel F. T. Gomes
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).