From ecec45845d05235eb3c02f50f4bbef3311b4e9f6 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Fri, 16 Aug 2013 20:50:12 +0200 Subject: [PATCH] fix Cygwin UTF-16 surrogate-pair handling with -i grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin) when converting an input string containing certain 4-byte UTF-8 sequences to lower case. The conversions to wchar_t and back to a UTF-8 multibyte string did not take surrogate pairs into account. * src/searchutils.c (mbtolower) [__CYGWIN__]: Detect and handle surrogate pairs when converting. * tests/surrogate-pair: New test. * tests/Makefile.am (TESTS): Add it. Reported by: Jim Burwell --- NEWS | 6 ++++++ src/searchutils.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++- tests/Makefile.am | 1 + tests/surrogate-pair | 43 +++++++++++++++++++++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) create mode 100755 tests/surrogate-pair diff --git a/NEWS b/NEWS index 8796a1b..55445c0 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,12 @@ GNU grep NEWS -*- outline -*- ** Bug fixes + grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin) + when converting an input string containing certain 4-byte UTF-8 + sequences to lower case. The conversions to wchar_t and back to + a UTF-8 multibyte string did not take surrogate pairs into account. + [bug present since grep-2.6] + grep -E would segfault when given a regexp like '([^.]*[M]){1,2}' for any multibyte character M. [bug introduced in grep-2.6, which would segfault, but 2.7 and 2.8 had no problem, and 2.9 through 2.14 would diff --git a/src/searchutils.c b/src/searchutils.c index 160f80d..f27a01b 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -111,6 +111,37 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) { wchar_t wc; size_t mbclen = mbrtowc (&wc, beg, end - beg, &is); +#ifdef __CYGWIN__ + /* Handle a UTF-8 sequence for a character beyond the base plane. + Cygwin's wchar_t is UTF-16, as in the underlying OS. This + results in surrogate pairs which need some extra attention. */ + wint_t wci = 0; + if (mbclen == 3 && (wc & 0xdc00) == 0xd800) + { + /* We got the start of a 4 byte UTF-8 sequence. This is returned + as a UTF-16 surrogate pair. The first call to mbrtowc returned 3 + and wc has been set to a high surrogate value, now we're going + to fetch the matching low surrogate. This second call to mbrtowc + is supposed to return 1 to complete the 4 byte UTF-8 sequence. */ + wchar_t wc_2; + size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen, + &is); + if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00) + { + /* Match. Convert this to a 4 byte wint_t which constitutes + a 32-bit UTF-32 value. */ + wci = ( (((wint_t) (wc - 0xd800)) << 10) + | ((wint_t) (wc_2 - 0xdc00))) + + 0x10000; + ++mbclen; + } + else + { + /* Invalid UTF-8 sequence. */ + mbclen = (size_t) -1; + } + } +#endif if (outlen + mb_cur_max >= outalloc) { size_t dm = m - len_map; @@ -132,8 +163,35 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) } else { + size_t ombclen; beg += mbclen; - size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os); +#ifdef __CYGWIN__ + /* Handle Unicode characters beyond the base plane. */ + if (mbclen == 4) + { + /* towlower, taking wint_t (4 bytes), handles UCS-4 values. */ + wci = towlower (wci); + if (wci >= 0x10000) + { + wci -= 0x10000; + wc = (wci >> 10) | 0xd800; + /* No need to check the return value. When reading the + high surrogate, the return value will be 0 and only the + mbstate indicates that we're in the middle of reading a + surrogate pair. The next wcrtomb call reading the low + surrogate will then return 4 and reset the mbstate. */ + wcrtomb (p, wc, &os); + wc = (wci & 0x3ff) | 0xdc00; + } + else + { + wc = (wchar_t) wci; + } + ombclen = wcrtomb (p, wc, &os); + } + else +#endif + ombclen = wcrtomb (p, towlower ((wint_t) wc), &os); *m = mbclen - ombclen; memset (m + 1, 0, ombclen - 1); m += ombclen; diff --git a/tests/Makefile.am b/tests/Makefile.am index c96ce06..581f688 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -87,6 +87,7 @@ TESTS = \ spencer1 \ spencer1-locale \ status \ + surrogate-pair \ symlink \ turkish-I \ turkish-I-without-dot \ diff --git a/tests/surrogate-pair b/tests/surrogate-pair new file mode 100755 index 0000000..80def1d --- /dev/null +++ b/tests/surrogate-pair @@ -0,0 +1,43 @@ +#!/bin/sh +# Trigger a segfault-inducing bug with -i in grep-2.14 on Cygwin. + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +require_en_utf8_locale_ +require_compiled_in_MB_support + +fail=0 + +printf '\xf0\x90\x90\x85\n' > in || framework_failure_ + +LC_ALL=en_US.UTF-8 +export LC_ALL + +# On Cygwin, before grep-2.15, this would segfault. +grep -i anything-else in > out 2>&1 +# Require not just non-zero exit status, but exactly 1. +test $? = 1 || fail=1 +# Expect no output. +test -s out && fail=1 + +for opt in '' -i -E -F -iE -iF; do + grep --file=in $opt in > out 2>&1 || fail=1 + compare out in || fail=1 +done + +Exit $fail -- 1.8.4.rc0.11.g35f5eaa