From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 120946 invoked by alias); 26 Apr 2016 13:41:42 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Received: (qmail 120935 invoked by uid 89); 26 Apr 2016 13:41:41 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=BAYES_00,KAM_LAZY_DOMAIN_SECURITY,RP_MATCHES_RCVD autolearn=ham version=3.3.2 spammy=zeros, leaf, paddings, rldicl X-HELO: e24smtp02.br.ibm.com X-IBM-Helo: d24dlp02.br.ibm.com X-IBM-MailFrom: gftg@linux.vnet.ibm.com X-IBM-RcptTo: libc-alpha@sourceware.org Date: Tue, 26 Apr 2016 13:41:00 -0000 From: "Gabriel F. T. Gomes" To: Adhemerval Zanella Cc: libc-alpha@sourceware.org Subject: Re: [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy Message-ID: <20160426104120.535e2f92@keller> In-Reply-To: <5717F3D1.5090003@linaro.org> References: <1461185716-7332-1-git-send-email-gftg@linux.vnet.ibm.com> <5717F3D1.5090003@linaro.org> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit X-TM-AS-MML: disable X-Content-Scanned: Fidelis XPS MAILER x-cbid: 16042613-0021-0000-0000-00001A6C20A4 X-IBM-AV-DETECTION: SAVI=unused REMOTE=unused XFE=unused X-SW-Source: 2016-04/txt/msg00644.txt.bz2 On Wed, 20 Apr 2016 18:25:37 -0300 Adhemerval Zanella wrote: > I am ok with this patch, the only nit is it now calls memset for all > padding size, where for small padding (less than 16 or maybe 32) I think it would > be still faster than calling memset (since dcbtst will be used only for > large inputs). However I still not sure if embedding the memset write_LT_32 > would yield much gain as well. Hi, Adhemerval. Thanks for your review. I also expected that for small amounts of padding, calling memset would harm performance. However, I ran some tests, where memset was only called for padding amounts greater than 32 bytes, and I didn't notice any significant changes in performance. That's the reason why I sent this version of the patch, which always calls memset. I also noticed that bench-strncpy exposes a load-hit-store problem, which might be suppressing the difference for small paddings. I have plans to fix this in the future, and, if necessary, limit memset to greater paddings. However, right now, I don't see a benefit in adding the check. > On 20-04-2016 17:55, Gabriel F. T. Gomes wrote: > > Call __memset_power8 to pad, with zeros, the remaining bytes in the > > dest string on __strncpy_power8 and __stpncpy_power8. This improves > > performance when n is larger than the input string, giving ~30% gain for > > larger strings without impacting much shorter strings. > > > > 2016-04-18 Gabriel F. T. Gomes > > > > * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the > > remaining bytes in the dest string, with zeros. > > --- > > sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++---------------- > > 1 file changed, 56 insertions(+), 67 deletions(-) > > > > diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S > > index 17c3afb..0bb3bd4 100644 > > --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S > > +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S > > @@ -24,6 +24,8 @@ > > # define FUNC_NAME strncpy > > #endif > > > > +#define FRAMESIZE (FRAME_MIN_SIZE+48) > > + > > /* Implements the function > > > > char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > > @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) > > addi r10,r4,16 > > rlwinm r9,r4,0,19,19 > > > > - /* Since it is a leaf function, save some non-volatile registers on the > > - protected/red zone. */ > > + /* Save some non-volatile registers on the stack. */ > > std r26,-48(r1) > > std r27,-40(r1) > > > > @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) > > std r30,-16(r1) > > std r31,-8(r1) > > > > + /* Update CFI. */ > > + cfi_offset(r26, -48) > > + cfi_offset(r27, -40) > > + cfi_offset(r28, -32) > > + cfi_offset(r29, -24) > > + cfi_offset(r30, -16) > > + cfi_offset(r31, -8) > > + > > beq cr7,L(unaligned_lt_16) > > rldicl r9,r4,0,61 > > subfic r8,r9,8 > > @@ -180,74 +189,58 @@ L(short_path_loop_end): > > ld r31,-8(r1) > > blr > > > > - /* This code pads the remainder dest with NULL bytes. The algorithm > > - calculate the remanining size and issues a doubleword unrolled > > - loops followed by a byte a byte set. */ > > + /* This code pads the remainder of dest with NULL bytes. The algorithm > > + calculates the remaining size and calls memset. */ > > .align 4 > > L(zero_pad_start): > > mr r5,r10 > > mr r9,r6 > > L(zero_pad_start_1): > > - srdi. r8,r5,r3 > > - mr r10,r9 > > -#ifdef USE_AS_STPNCPY > > - mr r3,r9 > > + /* At this point: > > + - r5 holds the number of bytes that still have to be written to > > + dest. > > + - r9 points to the position, in dest, where the first null byte > > + will be written. > > + The above statements are true both when control reaches this label > > + from a branch or when falling through the previous lines. */ > > +#ifndef USE_AS_STPNCPY > > + mr r30,r3 /* Save the return value of strncpy. */ > > +#endif > > + /* Prepare the call to memset. */ > > + mr r3,r9 /* Pointer to the area to be zero-filled. */ > > + li r4,0 /* Byte to be written (zero). */ > > + > > + /* We delayed the creation of the stack frame, as well as the saving of > > + the link register, because only at this point, we are sure that > > + doing so is actually needed. */ > > + > > + /* Save the link register. */ > > + mflr r0 > > + std r0,16(r1) > > + cfi_offset(lr, 16) > > + > > + /* Create the stack frame. */ > > + stdu r1,-FRAMESIZE(r1) > > + cfi_adjust_cfa_offset(FRAMESIZE) > > + > > + bl __memset_power8 > > + nop > > + > > + /* Restore the stack frame. */ > > + addi r1,r1,FRAMESIZE > > + cfi_adjust_cfa_offset(-FRAMESIZE) > > + /* Restore the link register. */ > > + ld r0,16(r1) > > + mtlr r0 > > + > > +#ifndef USE_AS_STPNCPY > > + mr r3,r30 /* Restore the return value of strncpy, i.e.: > > + dest. For stpncpy, the return value is the > > + same as return value of memset. */ > > #endif > > - beq- cr0,L(zero_pad_loop_b_start) > > - cmpldi cr7,r8,1 > > - li cr7,0 > > - std r7,0(r9) > > - beq cr7,L(zero_pad_loop_b_prepare) > > - addic. r8,r8,-2 > > - addi r10,r9,r16 > > - std r7,8(r9) > > - beq cr0,L(zero_pad_loop_dw_2) > > - std r7,16(r9) > > - li r9,0 > > - b L(zero_pad_loop_dw_1) > > - > > - .align 4 > > -L(zero_pad_loop_dw): > > - addi r10,r10,16 > > - std r9,-8(r10) > > - beq cr0,L(zero_pad_loop_dw_2) > > - std r9,0(r10) > > -L(zero_pad_loop_dw_1): > > - cmpldi cr7,r8,1 > > - std r9,0(r10) > > - addic. r8,r8,-2 > > - bne cr7,L(zero_pad_loop_dw) > > - addi r10,r10,8 > > -L(zero_pad_loop_dw_2): > > - rldicl r5,r5,0,61 > > -L(zero_pad_loop_b_start): > > - cmpdi cr7,r5,0 > > - addi r5,r5,-1 > > - addi r9,r10,-1 > > - add r10,r10,5 > > - subf r10,r9,r10 > > - li r8,0 > > - beq- cr7,L(short_path_loop_end) > > - > > - /* Write remaining 1-8 bytes. */ > > - .align 4 > > - addi r9,r9,1 > > - mtocrf 0x1,r10 > > - bf 29,4f > > - stw r8,0(r9) > > - addi r9,r9,4 > > - > > - .align 4 > > -4: bf 30,2f > > - sth r8,0(r9) > > - addi r9,r9,2 > > - > > - .align 4 > > -2: bf 31,1f > > - stb r8,0(r9) > > > > - /* Restore non-volatile registers. */ > > -1: ld r26,-48(r1) > > + /* Restore non-volatile registers and return. */ > > + ld r26,-48(r1) > > ld r27,-40(r1) > > ld r28,-32(r1) > > ld r29,-24(r1) > > @@ -443,10 +436,6 @@ L(short_path_prepare_2_3): > > mr r4,r28 > > mr r9,r29 > > b L(short_path_2) > > -L(zero_pad_loop_b_prepare): > > - addi r10,r9,8 > > - rldicl r5,r5,0,61 > > - b L(zero_pad_loop_b_start) > > L(zero_pad_start_prepare_1): > > mr r5,r6 > > mr r9,r8 > > >