From: "H.J. Lu"
Date: Tue, 22 Jun 2021 14:24:55 -0700
Subject: Re: [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20210622181111.185897-2-goldstein.w.n@gmail.com>

On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied on
> n * sizeof(wchar_t), which is not guaranteed by the standard.
>
> The new overflow tests added in the previous commit now pass
> (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein
> ---
>  sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
>  2 files changed, 98 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
> index beff2708de..3ddc4655cf 100644
> --- a/sysdeps/x86_64/memchr.S
> +++ b/sysdeps/x86_64/memchr.S
> @@ -21,9 +21,11 @@
>  #ifdef USE_AS_WMEMCHR
>  # define MEMCHR wmemchr
>  # define PCMPEQ pcmpeqd
> +# define CHAR_PER_VEC 4
>  #else
>  # define MEMCHR memchr
>  # define PCMPEQ pcmpeqb
> +# define CHAR_PER_VEC 16
>  #endif
>
>  /* fast SSE2 version with using pmaxub and 64 byte loop */
> @@ -33,15 +35,14 @@ ENTRY(MEMCHR)
>  	movd %esi, %xmm1
>  	mov %edi, %ecx
>
> +#ifdef __ILP32__
> +	/* Clear the upper 32 bits.  */
> +	movl %edx, %edx
> +#endif
>  #ifdef USE_AS_WMEMCHR
>  	test %RDX_LP, %RDX_LP
>  	jz L(return_null)
> -	shl $2, %RDX_LP
>  #else
> -# ifdef __ILP32__
> -	/* Clear the upper 32 bits.  */
> -	movl %edx, %edx
> -# endif
>  	punpcklbw %xmm1, %xmm1
>  	test %RDX_LP, %RDX_LP
>  	jz L(return_null)
> @@ -60,13 +61,16 @@ ENTRY(MEMCHR)
>  	test %eax, %eax
>
>  	jnz L(matches_1)
> -	sub $16, %rdx
> +	sub $CHAR_PER_VEC, %rdx
>  	jbe L(return_null)
>  	add $16, %rdi
>  	and $15, %ecx
>  	and $-16, %rdi
> +#ifdef USE_AS_WMEMCHR
> +	shr $2, %ecx
> +#endif
>  	add %rcx, %rdx
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>  	jmp L(loop_prolog)
>
> @@ -77,16 +81,21 @@ L(crosscache):
>  	movdqa (%rdi), %xmm0
>
>  	PCMPEQ %xmm1, %xmm0
> -/* Check if there is a match.  */
> +	/* Check if there is a match.  */
>  	pmovmskb %xmm0, %eax
> -/* Remove the leading bytes.  */
> +	/* Remove the leading bytes.  */
>  	sar %cl, %eax
>  	test %eax, %eax
>  	je L(unaligned_no_match)
> -/* Check which byte is a match.  */
> +	/* Check which byte is a match.  */
>  	bsf %eax, %eax
> -
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	add %rdi, %rax
>  	add %rcx, %rax
> @@ -94,15 +103,18 @@ L(crosscache):
>
>  	.p2align 4
>  L(unaligned_no_match):
> -	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
> +	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
>  	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
>  	   possible addition overflow.  */
>  	neg %rcx
>  	add $16, %rcx
> +#ifdef USE_AS_WMEMCHR
> +	shr $2, %ecx
> +#endif
>  	sub %rcx, %rdx
>  	jbe L(return_null)
>  	add $16, %rdi
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>
>  	.p2align 4
> @@ -135,7 +147,7 @@ L(loop_prolog):
>  	test $0x3f, %rdi
>  	jz L(align64_loop)
>
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>
>  	movdqa (%rdi), %xmm0
> @@ -167,11 +179,14 @@ L(loop_prolog):
>  	mov %rdi, %rcx
>  	and $-64, %rdi
>  	and $63, %ecx
> +#ifdef USE_AS_WMEMCHR
> +	shr $2, %ecx
> +#endif
>  	add %rcx, %rdx
>
>  	.p2align 4
>  L(align64_loop):
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>  	movdqa (%rdi), %xmm0
>  	movdqa 16(%rdi), %xmm2
> @@ -218,7 +233,7 @@ L(align64_loop):
>
>  	.p2align 4
>  L(exit_loop):
> -	add $32, %edx
> +	add $(CHAR_PER_VEC * 2), %edx
>  	jle L(exit_loop_32)
>
>  	movdqa (%rdi), %xmm0
> @@ -238,7 +253,7 @@ L(exit_loop):
>  	pmovmskb %xmm3, %eax
>  	test %eax, %eax
>  	jnz L(matches32_1)
> -	sub $16, %edx
> +	sub $CHAR_PER_VEC, %edx
>  	jle L(return_null)
>
>  	PCMPEQ 48(%rdi), %xmm1
> @@ -250,13 +265,13 @@ L(exit_loop):
>
>  	.p2align 4
>  L(exit_loop_32):
> -	add $32, %edx
> +	add $(CHAR_PER_VEC * 2), %edx
>  	movdqa (%rdi), %xmm0
>  	PCMPEQ %xmm1, %xmm0
>  	pmovmskb %xmm0, %eax
>  	test %eax, %eax
>  	jnz L(matches_1)
> -	sub $16, %edx
> +	sub $CHAR_PER_VEC, %edx
>  	jbe L(return_null)
>
>  	PCMPEQ 16(%rdi), %xmm1
> @@ -293,7 +308,13 @@ L(matches32):
>  	.p2align 4
>  L(matches_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	add %rdi, %rax
>  	ret
> @@ -301,7 +322,13 @@ L(matches_1):
>  	.p2align 4
>  L(matches16_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	lea 16(%rdi, %rax), %rax
>  	ret
> @@ -309,7 +336,13 @@ L(matches16_1):
>  	.p2align 4
>  L(matches32_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	lea 32(%rdi, %rax), %rax
>  	ret
> @@ -317,7 +350,13 @@ L(matches32_1):
>  	.p2align 4
>  L(matches48_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	lea 48(%rdi, %rax), %rax
>  	ret
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 0d8758e3e7..afdb956502 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -54,21 +54,19 @@
>
>  # define VEC_SIZE 32
>  # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
>  	.section SECTION(.text),"ax",@progbits
>  ENTRY (MEMCHR)
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check for zero length.  */
> -	test %RDX_LP, %RDX_LP
> -	jz L(null)
> -# endif
> -# ifdef USE_AS_WMEMCHR
> -	shl $2, %RDX_LP
> -# else
>  # ifdef __ILP32__
> -	/* Clear the upper 32 bits.  */
> -	movl %edx, %edx
> +	/* Clear upper bits.  */
> +	and %RDX_LP, %RDX_LP
> +# else
> +	test %RDX_LP, %RDX_LP
>  # endif
> +	jz L(null)
>  # endif
>  	/* Broadcast CHAR to YMMMATCH.  */
>  	vmovd %esi, %xmm0
> @@ -84,7 +82,7 @@ ENTRY (MEMCHR)
>  	vpmovmskb %ymm1, %eax
>  # ifndef USE_AS_RAWMEMCHR
>  	/* If length < CHAR_PER_VEC handle special.  */
> -	cmpq $VEC_SIZE, %rdx
> +	cmpq $CHAR_PER_VEC, %rdx
>  	jbe L(first_vec_x0)
>  # endif
>  	testl %eax, %eax
> @@ -98,6 +96,10 @@ ENTRY (MEMCHR)
>  L(first_vec_x0):
>  	/* Check if first match was before length.  */
>  	tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply length by 4 to get byte count.  */
> +	sall $2, %edx
> +# endif
>  	xorl %ecx, %ecx
>  	cmpl %eax, %edx
>  	leaq (%rdi, %rax), %rax
> @@ -110,12 +112,12 @@ L(null):
>  # endif
>  	.p2align 4
>  L(cross_page_boundary):
> -	/* Save pointer before aligning as its original value is necessary
> -	   for computer return address if byte is found or adjusting length
> -	   if it is not and this is memchr.  */
> +	/* Save pointer before aligning as its original value is
> +	   necessary for computer return address if byte is found or
> +	   adjusting length if it is not and this is memchr.  */
>  	movq %rdi, %rcx
> -	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr and
> -	   rdi for rawmemchr.  */
> +	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
> +	   and rdi for rawmemchr.  */
>  	orq $(VEC_SIZE - 1), %ALGN_PTR_REG
>  	VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> @@ -124,6 +126,10 @@ L(cross_page_boundary):
>  	   match).  */
>  	leaq 1(%ALGN_PTR_REG), %rsi
>  	subq %RRAW_PTR_REG, %rsi
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get wchar_t count.  */
> +	shrl $2, %esi
> +# endif
>  # endif
>  	/* Remove the leading bytes.  */
>  	sarxl %ERAW_PTR_REG, %eax, %eax
> @@ -181,6 +187,10 @@ L(cross_page_continue):
>  	orq $(VEC_SIZE - 1), %rdi
>  	/* esi is for adjusting length to see if near the end.  */
>  	leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
> +	sarl $2, %esi
> +# endif
>  # else
>  	orq $(VEC_SIZE - 1), %rdi
>  L(cross_page_continue):
> @@ -213,7 +223,7 @@ L(cross_page_continue):
>
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check if at last VEC_SIZE * 4 length.  */
> -	subq $(VEC_SIZE * 4), %rdx
> +	subq $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(last_4x_vec_or_less_cmpeq)
>  	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
>  	   length.  */
> @@ -221,6 +231,10 @@ L(cross_page_continue):
>  	movl %edi, %ecx
>  	orq $(VEC_SIZE * 4 - 1), %rdi
>  	andl $(VEC_SIZE * 4 - 1), %ecx
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
> +	sarl $2, %ecx
> +# endif
>  	addq %rcx, %rdx
>  # else
>  	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
> @@ -250,15 +264,19 @@ L(loop_4x_vec):
>
>  	subq $-(VEC_SIZE * 4), %rdi
>
> -	subq $(VEC_SIZE * 4), %rdx
> +	subq $(CHAR_PER_VEC * 4), %rdx
>  	ja L(loop_4x_vec)
>
> -	/* Fall through into less than 4 remaining vectors of length case.
> -	 */
> +	/* Fall through into less than 4 remaining vectors of length
> +	   case.  */
>  	VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
>  	.p2align 4
>  L(last_4x_vec_or_less):
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply length by 4 to get byte count.  */
> +	sall $2, %edx
> +# endif
>  	/* Check if first VEC contained match.  */
>  	testl %eax, %eax
>  	jnz L(first_vec_x1_check)
> @@ -355,6 +373,10 @@ L(last_vec_x2_return):
>  L(last_4x_vec_or_less_cmpeq):
>  	VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply length by 4 to get byte count.  */
> +	sall $2, %edx
> +# endif
>  	subq $-(VEC_SIZE * 4), %rdi
>  	/* Check first VEC regardless.  */
>  	testl %eax, %eax
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu

Thanks.

--
H.J.
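
For readers following along, the overflow described in the commit message is easy to see in isolation. Below is a minimal, hypothetical C sketch -- not part of the patch or of glibc -- assuming a 4-byte wchar_t and 64-bit size_t as on x86-64 Linux; the count value and helper variables are made up for illustration. It shows why scaling the element count with "shl $2, %RDX_LP" (i.e. computing n * sizeof(wchar_t)) can wrap around, while keeping the remaining length in wchar_t units, as the patch does, cannot.

/* Hypothetical demo, not glibc code: wmemchr's length argument counts
   wchar_t elements, and nothing guarantees that n * sizeof (wchar_t)
   fits in a size_t.  */
#include <stdint.h>
#include <stdio.h>
#include <wchar.h>

int
main (void)
{
  /* A legal, effectively unbounded count: the standard only promises
     that at most n wide characters are examined.  */
  size_t n = SIZE_MAX / 4 + 2;

  /* The old code effectively did "shl $2, %RDX_LP": */
  size_t byte_len = n * sizeof (wchar_t);      /* wraps around to 4 */

  printf ("n (wchar_t count)       = %#zx\n", n);
  printf ("n * 4 (wrapped bytes)   = %#zx\n", byte_len);

  /* With the wrapped byte count the search would stop after a single
     wchar_t.  The fixed code keeps the remaining length in wchar_t
     units (CHAR_PER_VEC) and instead converts byte offsets down with
     "shr $2", which cannot overflow: */
  size_t bytes_scanned = 64;                   /* one 4-vector block */
  size_t chars_scanned = bytes_scanned / sizeof (wchar_t);
  n -= chars_scanned;                          /* no wrap possible */
  printf ("remaining wchar_t count = %#zx\n", n);
  return 0;
}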