>From 678f829c869059cd9cb0fe38b87880ef0a78d210 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Wed, 23 Sep 2020 18:57:57 -0700 Subject: [PATCH 3/5] grep: fix more Turkish-eyes bugs Fix more bugs recently uncovered by Norihiro Tanaka (Bug#43577). * NEWS: Mention new bug report. * src/grep.c (ok_fold): New static var. (setup_ok_fold): New function. (fgrep_icase_charlen): Reject single-byte characters if they match some multibyte characters when ignoring case. This part of the patch is partly derived from , which means it is: Co-authored-by: Norihiro Tanaka (main): Call setup_ok_fold if ok_fold might be needed. * src/searchutils.c (kwsinit): With the grep.c changes, this code can now revert to classic 7th Edition Unix style; aborting would be wrong. * tests/turkish-eyes: Add tests for these bugs. --- NEWS | 2 +- src/grep.c | 116 +++++++++++++++++++++++++++++++-------------- src/searchutils.c | 23 ++------- tests/turkish-eyes | 18 +++++-- 4 files changed, 102 insertions(+), 57 deletions(-) diff --git a/NEWS b/NEWS index 36e423d..ab00ff2 100644 --- a/NEWS +++ b/NEWS @@ -36,7 +36,7 @@ GNU grep NEWS -*- outline -*- characters. For example, 'LC_ALL=tr_TR.utf8 grep -i i' no longer dumps core merely because 'i' matches 'İ' (U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE) in Turkish when ignoring case. - [Bug#43577 introduced in grep 3.4] + [Bug#43577 introduced partly in grep 2.28 and partly in grep 3.4] A performance regression with -E and many patterns has been mostly fixed. "Mostly" as there is a performance tradeoff between Bug#22357 and Bug#40634. diff --git a/src/grep.c b/src/grep.c index 11856d8..1a52c89 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2300,37 +2300,75 @@ contains_encoding_error (char const *pat, size_t patlen) return false; } +/* When ignoring case and (-E or -F or -G), then for each single-byte + character I, ok_fold[I] is 1 if every case folded counterpart of I + is also single-byte, and is -1 otherwise. */ +static signed char ok_fold[NCHAR]; +static void +setup_ok_fold (void) +{ + for (int i = 0; i < NCHAR; i++) + { + wint_t wi = localeinfo.sbctowc[i]; + if (wi == WEOF) + continue; + + int ok = 1; + wchar_t folded[CASE_FOLDED_BUFSIZE]; + for (int n = case_folded_counterparts (wi, folded); 0 <= --n; ) + { + char buf[MB_LEN_MAX]; + mbstate_t s = { 0 }; + if (wcrtomb (buf, folded[n], &s) != 1) + { + ok = -1; + break; + } + } + ok_fold[i] = ok; + } +} + /* Return the number of bytes in the initial character of PAT, of size PATLEN, if Fcompile can handle that character. Return -1 if Fcompile cannot handle it. MBS is the multibyte conversion state. - - Fcompile can handle a character C if C is single-byte, or if C has no - case folded counterparts and toupper translates none of its bytes. */ + PATLEN must be nonzero. */ static int fgrep_icase_charlen (char const *pat, size_t patlen, mbstate_t *mbs) { - int n = localeinfo.sbclen[to_uchar (*pat)]; - if (n < 0) + unsigned char pat0 = pat[0]; + + /* If PAT starts with a single-byte character, Fcompile works if + every case folded counterpart is also single-byte. */ + if (localeinfo.sbctowc[pat0] != WEOF) + return ok_fold[pat0]; + + wchar_t wc; + size_t wn = mbrtowc (&wc, pat, patlen, mbs); + + /* If PAT starts with an encoding error, Fcompile does not work. */ + if (MB_LEN_MAX < wn) + return -1; + + /* PAT starts with a multibyte character. Fcompile works if the + character has no case folded counterparts and toupper translates + none of its encoding's bytes. */ + wchar_t folded[CASE_FOLDED_BUFSIZE]; + if (case_folded_counterparts (wc, folded)) + return -1; + for (int i = wn; 0 < --i; ) { - wchar_t wc; - wchar_t folded[CASE_FOLDED_BUFSIZE]; - size_t wn = mbrtowc (&wc, pat, patlen, mbs); - if (MB_LEN_MAX < wn || case_folded_counterparts (wc, folded)) + unsigned char c = pat[i]; + if (toupper (c) != c) return -1; - for (int i = wn; 0 < --i; ) - { - unsigned char c = pat[i]; - if (toupper (c) != c) - return -1; - } - n = wn; } - return n; + return wn; } /* Return true if the -F patterns PAT, of size PATLEN, contain only - single-byte characters or characters not subject to case folding, + single-byte characters that case-fold only to single-byte + characters, or multibyte characters not subject to case folding, and so can be processed by Fcompile. */ static bool @@ -2950,26 +2988,34 @@ main (int argc, char **argv) if (matcher < 0) matcher = G_MATCHER_INDEX; - /* In a single-byte locale, switch from -F to -G if it is a single - pattern that matches words, where -G is typically faster. In a - multi-byte locale, switch if the patterns have an encoding error - (where -F does not work) or if -i and the patterns will not work - for -iF. */ if (matcher == F_MATCHER_INDEX - && (! localeinfo.multibyte - ? n_patterns == 1 && match_words - : (contains_encoding_error (keys, keycc) - || (match_icase && !fgrep_icase_available (keys, keycc))))) + || matcher == E_MATCHER_INDEX || matcher == G_MATCHER_INDEX) { - fgrep_to_grep_pattern (&pattern_array, &keycc); - keys = pattern_array; - matcher = G_MATCHER_INDEX; + if (match_icase) + setup_ok_fold (); + + /* In a single-byte locale, switch from -F to -G if it is a single + pattern that matches words, where -G is typically faster. In a + multibyte locale, switch if the patterns have an encoding error + (where -F does not work) or if -i and the patterns will not work + for -iF. */ + if (matcher == F_MATCHER_INDEX) + { + if (! localeinfo.multibyte + ? n_patterns == 1 && match_words + : (contains_encoding_error (keys, keycc) + || (match_icase && !fgrep_icase_available (keys, keycc)))) + { + fgrep_to_grep_pattern (&pattern_array, &keycc); + keys = pattern_array; + matcher = G_MATCHER_INDEX; + } + } + /* With two or more patterns, if -F works then switch from either -E + or -G, as -F is probably faster then. */ + else if (1 < n_patterns) + matcher = try_fgrep_pattern (matcher, keys, &keycc); } - /* With two or more patterns, if -F works then switch from either -E - or -G, as -F is probably faster then. */ - else if ((matcher == G_MATCHER_INDEX || matcher == E_MATCHER_INDEX) - && 1 < n_patterns) - matcher = try_fgrep_pattern (matcher, keys, &keycc); execute = matchers[matcher].execute; compiled_pattern = diff --git a/src/searchutils.c b/src/searchutils.c index c4bb802..aa11063 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -48,24 +48,11 @@ kwsinit (bool mb_trans) if (match_icase && (MB_CUR_MAX == 1 || mb_trans)) { trans = xmalloc (NCHAR); - if (MB_CUR_MAX == 1) - for (int i = 0; i < NCHAR; i++) - trans[i] = toupper (i); - else - for (int i = 0; i < NCHAR; i++) - { - wint_t wc = localeinfo.sbctowc[i]; - wint_t uwc = towupper (wc); - if (uwc != wc) - { - mbstate_t mbs = { 0 }; - size_t len = wcrtomb (&trans[i], uwc, &mbs); - if (len != 1) - abort (); - } - else - trans[i] = i; - } + /* If I is a single-byte character that becomes a different + single-byte character when uppercased, set trans[I] + to that character. Otherwise, set trans[I] to I. */ + for (int i = 0; i < NCHAR; i++) + trans[i] = toupper (i); } return kwsalloc (trans); diff --git a/tests/turkish-eyes b/tests/turkish-eyes index ba1ea33..879b59d 100755 --- a/tests/turkish-eyes +++ b/tests/turkish-eyes @@ -36,11 +36,23 @@ i=$(printf '\304\261') # lowercase dotless i data="I:$I $i:i" search_str="$i:i I:$I" -printf "$data\n" > in || framework_failure_ +printf "$data\\n" > in || framework_failure_ for opt in -E -F -G; do - LC_ALL=$L grep $opt -i "$search_str" in > out || fail=1 - compare out in || fail=1 + for pat in i I "$i" "$I" " " : "$search_str"; do + LC_ALL=$L grep $opt -i "$pat" in > out || fail=1 + compare in out || fail=1 + + case $pat in + i|"$I") printf "$I\\ni\\n";; + I|"$i") printf "I\\n$i\\n";; + :) printf ":\\n:\\n";; + ' ') printf " \\n";; + *) cat in;; + esac >exp || framework_failure_ + LC_ALL=$L grep -o $opt -i "$pat" in > out || fail=1 + compare exp out || fail=1 + done done Exit $fail -- 2.17.1