From: "H.J. Lu"
Date: Tue, 22 Jun 2021 14:24:55 -0700
Subject: Re: [PATCH v2 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ #27974]
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20210622181111.185897-2-goldstein.w.n@gmail.com>

On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied on
> n * sizeof(wchar_t), which is not guaranteed by the standard.
>
> The new overflow tests added in the previous commit now pass
> (as well as all the other tests).
>
> Signed-off-by: Noah Goldstein
> ---
>  sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
>  2 files changed, 98 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
> index beff2708de..3ddc4655cf 100644
> --- a/sysdeps/x86_64/memchr.S
> +++ b/sysdeps/x86_64/memchr.S
> @@ -21,9 +21,11 @@
>  #ifdef USE_AS_WMEMCHR
>  # define MEMCHR wmemchr
>  # define PCMPEQ pcmpeqd
> +# define CHAR_PER_VEC 4
>  #else
>  # define MEMCHR memchr
>  # define PCMPEQ pcmpeqb
> +# define CHAR_PER_VEC 16
>  #endif
>
>  /* fast SSE2 version with using pmaxub and 64 byte loop */
> @@ -33,15 +35,14 @@ ENTRY(MEMCHR)
>  	movd %esi, %xmm1
>  	mov %edi, %ecx
>
> +#ifdef __ILP32__
> +	/* Clear the upper 32 bits.  */
> +	movl %edx, %edx
> +#endif
>  #ifdef USE_AS_WMEMCHR
>  	test %RDX_LP, %RDX_LP
>  	jz L(return_null)
> -	shl $2, %RDX_LP
>  #else
> -# ifdef __ILP32__
> -	/* Clear the upper 32 bits.  */
> -	movl %edx, %edx
> -# endif
>  	punpcklbw %xmm1, %xmm1
>  	test %RDX_LP, %RDX_LP
>  	jz L(return_null)
> @@ -60,13 +61,16 @@ ENTRY(MEMCHR)
>  	test %eax, %eax
>
>  	jnz L(matches_1)
> -	sub $16, %rdx
> +	sub $CHAR_PER_VEC, %rdx
>  	jbe L(return_null)
>  	add $16, %rdi
>  	and $15, %ecx
>  	and $-16, %rdi
> +#ifdef USE_AS_WMEMCHR
> +	shr $2, %ecx
> +#endif
>  	add %rcx, %rdx
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>  	jmp L(loop_prolog)
>
> @@ -77,16 +81,21 @@ L(crosscache):
>  	movdqa (%rdi), %xmm0
>
>  	PCMPEQ %xmm1, %xmm0
> -/* Check if there is a match.  */
> +	/* Check if there is a match.  */
>  	pmovmskb %xmm0, %eax
> -/* Remove the leading bytes.  */
> +	/* Remove the leading bytes.  */
>  	sar %cl, %eax
>  	test %eax, %eax
>  	je L(unaligned_no_match)
> -/* Check which byte is a match.  */
> +	/* Check which byte is a match.  */
>  	bsf %eax, %eax
> -
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	add %rdi, %rax
>  	add %rcx, %rax
> @@ -94,15 +103,18 @@ L(crosscache):
>
>  	.p2align 4
>  L(unaligned_no_match):
> -	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
> +	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
>  	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
>  	   possible addition overflow.  */
>  	neg %rcx
>  	add $16, %rcx
> +#ifdef USE_AS_WMEMCHR
> +	shr $2, %ecx
> +#endif
>  	sub %rcx, %rdx
>  	jbe L(return_null)
>  	add $16, %rdi
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>
>  	.p2align 4
> @@ -135,7 +147,7 @@ L(loop_prolog):
>  	test $0x3f, %rdi
>  	jz L(align64_loop)
>
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>
>  	movdqa (%rdi), %xmm0
> @@ -167,11 +179,14 @@ L(loop_prolog):
>  	mov %rdi, %rcx
>  	and $-64, %rdi
>  	and $63, %ecx
> +#ifdef USE_AS_WMEMCHR
> +	shr $2, %ecx
> +#endif
>  	add %rcx, %rdx
>
>  	.p2align 4
>  L(align64_loop):
> -	sub $64, %rdx
> +	sub $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(exit_loop)
>  	movdqa (%rdi), %xmm0
>  	movdqa 16(%rdi), %xmm2
> @@ -218,7 +233,7 @@ L(align64_loop):
>
>  	.p2align 4
>  L(exit_loop):
> -	add $32, %edx
> +	add $(CHAR_PER_VEC * 2), %edx
>  	jle L(exit_loop_32)
>
>  	movdqa (%rdi), %xmm0
> @@ -238,7 +253,7 @@ L(exit_loop):
>  	pmovmskb %xmm3, %eax
>  	test %eax, %eax
>  	jnz L(matches32_1)
> -	sub $16, %edx
> +	sub $CHAR_PER_VEC, %edx
>  	jle L(return_null)
>
>  	PCMPEQ 48(%rdi), %xmm1
> @@ -250,13 +265,13 @@ L(exit_loop):
>
>  	.p2align 4
>  L(exit_loop_32):
> -	add $32, %edx
> +	add $(CHAR_PER_VEC * 2), %edx
>  	movdqa (%rdi), %xmm0
>  	PCMPEQ %xmm1, %xmm0
>  	pmovmskb %xmm0, %eax
>  	test %eax, %eax
>  	jnz L(matches_1)
> -	sub $16, %edx
> +	sub $CHAR_PER_VEC, %edx
>  	jbe L(return_null)
>
>  	PCMPEQ 16(%rdi), %xmm1
> @@ -293,7 +308,13 @@ L(matches32):
>  	.p2align 4
>  L(matches_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	add %rdi, %rax
>  	ret
> @@ -301,7 +322,13 @@ L(matches_1):
>  	.p2align 4
>  L(matches16_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	lea 16(%rdi, %rax), %rax
>  	ret
> @@ -309,7 +336,13 @@ L(matches16_1):
>  	.p2align 4
>  L(matches32_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	lea 32(%rdi, %rax), %rax
>  	ret
> @@ -317,7 +350,13 @@ L(matches32_1):
>  	.p2align 4
>  L(matches48_1):
>  	bsf %eax, %eax
> +#ifdef USE_AS_WMEMCHR
> +	mov %eax, %esi
> +	shr $2, %esi
> +	sub %rsi, %rdx
> +#else
>  	sub %rax, %rdx
> +#endif
>  	jbe L(return_null)
>  	lea 48(%rdi, %rax), %rax
>  	ret
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 0d8758e3e7..afdb956502 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -54,21 +54,19 @@
>
>  # define VEC_SIZE 32
>  # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
>  	.section SECTION(.text),"ax",@progbits
>  ENTRY (MEMCHR)
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check for zero length.  */
> -	test %RDX_LP, %RDX_LP
> -	jz L(null)
> -# endif
> -# ifdef USE_AS_WMEMCHR
> -	shl $2, %RDX_LP
> -# else
>  # ifdef __ILP32__
> -	/* Clear the upper 32 bits.  */
> -	movl %edx, %edx
> +	/* Clear upper bits.  */
> +	and %RDX_LP, %RDX_LP
> +# else
> +	test %RDX_LP, %RDX_LP
>  # endif
> +	jz L(null)
>  # endif
>  	/* Broadcast CHAR to YMMMATCH.  */
>  	vmovd %esi, %xmm0
> @@ -84,7 +82,7 @@ ENTRY (MEMCHR)
>  	vpmovmskb %ymm1, %eax
>  # ifndef USE_AS_RAWMEMCHR
>  	/* If length < CHAR_PER_VEC handle special.  */
> -	cmpq $VEC_SIZE, %rdx
> +	cmpq $CHAR_PER_VEC, %rdx
>  	jbe L(first_vec_x0)
>  # endif
>  	testl %eax, %eax
> @@ -98,6 +96,10 @@ ENTRY (MEMCHR)
>  L(first_vec_x0):
>  	/* Check if first match was before length.  */
>  	tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply length by 4 to get byte count.  */
> +	sall $2, %edx
> +# endif
>  	xorl %ecx, %ecx
>  	cmpl %eax, %edx
>  	leaq (%rdi, %rax), %rax
> @@ -110,12 +112,12 @@ L(null):
>  # endif
>  	.p2align 4
>  L(cross_page_boundary):
> -	/* Save pointer before aligning as its original value is necessary
> -	   for computer return address if byte is found or adjusting length
> -	   if it is not and this is memchr.  */
> +	/* Save pointer before aligning as its original value is
> +	   necessary for computer return address if byte is found or
> +	   adjusting length if it is not and this is memchr.  */
>  	movq %rdi, %rcx
> -	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr and
> -	   rdi for rawmemchr.  */
> +	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
> +	   and rdi for rawmemchr.  */
>  	orq $(VEC_SIZE - 1), %ALGN_PTR_REG
>  	VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> @@ -124,6 +126,10 @@ L(cross_page_boundary):
>  	   match).  */
>  	leaq 1(%ALGN_PTR_REG), %rsi
>  	subq %RRAW_PTR_REG, %rsi
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get wchar_t count.  */
> +	shrl $2, %esi
> +# endif
>  # endif
>  	/* Remove the leading bytes.  */
>  	sarxl %ERAW_PTR_REG, %eax, %eax
> @@ -181,6 +187,10 @@ L(cross_page_continue):
>  	orq $(VEC_SIZE - 1), %rdi
>  	/* esi is for adjusting length to see if near the end.  */
>  	leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
> +	sarl $2, %esi
> +# endif
>  # else
>  	orq $(VEC_SIZE - 1), %rdi
>  L(cross_page_continue):
> @@ -213,7 +223,7 @@ L(cross_page_continue):
>
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check if at last VEC_SIZE * 4 length.  */
> -	subq $(VEC_SIZE * 4), %rdx
> +	subq $(CHAR_PER_VEC * 4), %rdx
>  	jbe L(last_4x_vec_or_less_cmpeq)
>  	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
>  	   length.  */
> @@ -221,6 +231,10 @@ L(cross_page_continue):
>  	movl %edi, %ecx
>  	orq $(VEC_SIZE * 4 - 1), %rdi
>  	andl $(VEC_SIZE * 4 - 1), %ecx
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
> +	sarl $2, %ecx
> +# endif
>  	addq %rcx, %rdx
>  # else
>  	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
> @@ -250,15 +264,19 @@ L(loop_4x_vec):
>
>  	subq $-(VEC_SIZE * 4), %rdi
>
> -	subq $(VEC_SIZE * 4), %rdx
> +	subq $(CHAR_PER_VEC * 4), %rdx
>  	ja L(loop_4x_vec)
>
> -	/* Fall through into less than 4 remaining vectors of length case.
> -	 */
> +	/* Fall through into less than 4 remaining vectors of length
> +	   case.  */
>  	VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
>  	.p2align 4
>  L(last_4x_vec_or_less):
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply length by 4 to get byte count.  */
> +	sall $2, %edx
> +# endif
>  	/* Check if first VEC contained match.  */
>  	testl %eax, %eax
>  	jnz L(first_vec_x1_check)
> @@ -355,6 +373,10 @@ L(last_vec_x2_return):
>  L(last_4x_vec_or_less_cmpeq):
>  	VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply length by 4 to get byte count.  */
> +	sall $2, %edx
> +# endif
>  	subq $-(VEC_SIZE * 4), %rdi
>  	/* Check first VEC regardless.  */
>  	testl %eax, %eax
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu

Thanks.

--
H.J.
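
For readers following along, the overflow described in the commit message is easy to see in isolation. Below is a minimal, hypothetical C sketch -- not part of the patch or of glibc -- assuming a 4-byte wchar_t and 64-bit size_t as on x86-64 Linux; the count value and helper variables are made up for illustration. It shows why scaling the element count with "shl $2, %RDX_LP" (i.e. computing n * sizeof(wchar_t)) can wrap around, while keeping the remaining length in wchar_t units, as the patch does, cannot.

/* Hypothetical demo, not glibc code: wmemchr's length argument counts
   wchar_t elements, and nothing guarantees that n * sizeof (wchar_t)
   fits in a size_t.  */
#include <stdint.h>
#include <stdio.h>
#include <wchar.h>

int
main (void)
{
  /* A legal, effectively unbounded count: the standard only promises
     that at most n wide characters are examined.  */
  size_t n = SIZE_MAX / 4 + 2;

  /* The old code effectively did "shl $2, %RDX_LP": */
  size_t byte_len = n * sizeof (wchar_t);      /* wraps around to 4 */

  printf ("n (wchar_t count)       = %#zx\n", n);
  printf ("n * 4 (wrapped bytes)   = %#zx\n", byte_len);

  /* With the wrapped byte count the search would stop after a single
     wchar_t.  The fixed code keeps the remaining length in wchar_t
     units (CHAR_PER_VEC) and instead converts byte offsets down with
     "shr $2", which cannot overflow: */
  size_t bytes_scanned = 64;                   /* one 4-vector block */
  size_t chars_scanned = bytes_scanned / sizeof (wchar_t);
  n -= chars_scanned;                          /* no wrap possible */
  printf ("remaining wchar_t count = %#zx\n", n);
  return 0;
}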