From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 25605 invoked by alias); 17 Mar 2005 21:48:22 -0000 Mailing-List: contact libc-hacker-help@sources.redhat.com; run by ezmlm Precedence: bulk List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-hacker-owner@sources.redhat.com Received: (qmail 25584 invoked from network); 17 Mar 2005 21:48:22 -0000 Received: from unknown (HELO sunsite.mff.cuni.cz) (195.113.15.26) by sourceware.org with SMTP; 17 Mar 2005 21:48:22 -0000 Received: from sunsite.mff.cuni.cz (sunsite.mff.cuni.cz [127.0.0.1]) by sunsite.mff.cuni.cz (8.13.1/8.13.1) with ESMTP id j2HLmBik027475; Thu, 17 Mar 2005 22:48:11 +0100 Received: (from jj@localhost) by sunsite.mff.cuni.cz (8.13.1/8.13.1/Submit) id j2HLmBC2027470; Thu, 17 Mar 2005 22:48:11 +0100 Date: Thu, 17 Mar 2005 21:48:00 -0000 From: Jakub Jelinek To: Ulrich Drepper , Roland McGrath Cc: Glibc hackers Subject: [PATCH] Fix regex mb char handling Message-ID: <20050317214810.GG4961@sunsite.mff.cuni.cz> Reply-To: Jakub Jelinek Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.4.1i X-SW-Source: 2005-03/txt/msg00046.txt.bz2 Hi! The following patch fixes segfault on: #include #include #include int main (void) { setlocale (LC_ALL, "zh_CN"); regex_t re; if (regcomp (&re, "abc", REG_ICASE)) { puts ("regcomp failed"); return 1; } if (regexec (&re, "\xa8\xa7", 0, NULL, 0) != REG_NOMATCH) { puts ("regexec unexpectedly succeeded"); return 1; } regfree (&re); return 0; } testcase (not included yet in the patch, because I want to find out if some already built testing locale has similar property, particularly that for a mb char its towupper (resp. towlower) char is not represent in the locale's character set). 2005-03-17 Jakub Jelinek * posix/regcomp.c (re_compile_fastmap_iter): Fix check for failed __wcrtomb. Check return values of other __wcrtomb calls. * posix/regex_internal.c (build_wcs_buffer, re_string_skip_chars): Change mbclen type to size_t. (build_wcs_upper_buffer): Change mbclen and mbcdlen type to size_t. Handle mb chars whose upper case doesn't have multibyte representation in locale's charset. --- libc/posix/regcomp.c.jj 2005-02-22 10:02:42.000000000 +0100 +++ libc/posix/regcomp.c 2005-03-17 21:06:43.622233856 +0100 @@ -359,7 +359,8 @@ re_compile_fastmap_iter (bufp, init_stat memset (&state, 0, sizeof (state)); if (mbrtowc (&wc, (const char *) buf, p - buf, &state) == p - buf - && __wcrtomb ((char *) buf, towlower (wc), &state) > 0) + && (__wcrtomb ((char *) buf, towlower (wc), &state) + != (size_t) -1)) re_set_fastmap (fastmap, 0, buf[0]); } #endif @@ -409,12 +410,13 @@ re_compile_fastmap_iter (bufp, init_stat char buf[256]; mbstate_t state; memset (&state, '\0', sizeof (state)); - __wcrtomb (buf, cset->mbchars[i], &state); - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) { - __wcrtomb (buf, towlower (cset->mbchars[i]), &state); - re_set_fastmap (fastmap, 0, *(unsigned char *) buf); + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, 0, *(unsigned char *) buf); } } } --- libc/posix/regex_internal.c.jj 2005-03-08 13:05:18.000000000 +0100 +++ libc/posix/regex_internal.c 2005-03-17 21:17:48.069329822 +0100 @@ -220,7 +220,8 @@ build_wcs_buffer (pstr) unsigned char buf[64]; #endif mbstate_t prev_st; - int byte_idx, end_idx, mbclen, remain_len; + int byte_idx, end_idx, remain_len; + size_t mbclen; /* Build the buffers from pstr->valid_len to either pstr->len or pstr->bufs_len. */ @@ -281,7 +282,8 @@ build_wcs_upper_buffer (pstr) re_string_t *pstr; { mbstate_t prev_st; - int src_idx, byte_idx, end_idx, mbclen, remain_len; + int src_idx, byte_idx, end_idx, remain_len; + size_t mbclen; #ifdef _LIBC char buf[MB_CUR_MAX]; assert (MB_CUR_MAX >= pstr->mb_cur_max); @@ -318,12 +320,12 @@ build_wcs_upper_buffer (pstr) mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx), remain_len, &pstr->cur_state); - if (BE (mbclen > 0, 1)) + if (BE (mbclen + 2 > 2, 1)) { wchar_t wcu = wc; if (iswlower (wc)) { - int mbcdlen; + size_t mbcdlen; wcu = towupper (wc); mbcdlen = wcrtomb (buf, wcu, &prev_st); @@ -386,20 +388,20 @@ build_wcs_upper_buffer (pstr) else p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); - if (BE (mbclen > 0, 1)) + if (BE (mbclen + 2 > 2, 1)) { wchar_t wcu = wc; if (iswlower (wc)) { - int mbcdlen; + size_t mbcdlen; wcu = towupper (wc); mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); if (BE (mbclen == mbcdlen, 1)) memcpy (pstr->mbs + byte_idx, buf, mbclen); - else + else if (mbcdlen != (size_t) -1) { - int i; + size_t i; if (byte_idx + mbcdlen > pstr->bufs_len) { @@ -416,7 +418,7 @@ build_wcs_upper_buffer (pstr) } if (!pstr->offsets_needed) { - for (i = 0; i < byte_idx; ++i) + for (i = 0; i < (size_t) byte_idx; ++i) pstr->offsets[i] = i; pstr->offsets_needed = 1; } @@ -439,13 +441,15 @@ build_wcs_upper_buffer (pstr) src_idx += mbclen; continue; } + else + memcpy (pstr->mbs + byte_idx, p, mbclen); } else memcpy (pstr->mbs + byte_idx, p, mbclen); if (BE (pstr->offsets_needed != 0, 0)) { - int i; + size_t i; for (i = 0; i < mbclen; ++i) pstr->offsets[byte_idx + i] = src_idx + i; } @@ -496,7 +500,8 @@ re_string_skip_chars (pstr, new_raw_idx, wint_t *last_wc; { mbstate_t prev_st; - int rawbuf_idx, mbclen; + int rawbuf_idx; + size_t mbclen; wchar_t wc = 0; /* Skip the characters which are not necessary to check. */ Jakub