From 678f829c869059cd9cb0fe38b87880ef0a78d210 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Wed, 23 Sep 2020 18:57:57 -0700
Subject: [PATCH 3/5] grep: fix more Turkish-eyes bugs

Fix more bugs recently uncovered by Norihiro Tanaka (Bug#43577).
* NEWS: Mention new bug report.
* src/grep.c (ok_fold): New static var.
(setup_ok_fold): New function.
(fgrep_icase_charlen): Reject single-byte characters
if they match some multibyte characters when ignoring case.
This part of the patch is partly derived from
<https://bugs.gnu.org/43577#14>, which means it is:
Co-authored-by: Norihiro Tanaka <noritnk@kcn.ne.jp>
(main): Call setup_ok_fold if ok_fold might be needed.
* src/searchutils.c (kwsinit): With the grep.c changes,
this code can now revert to classic 7th Edition Unix style;
aborting would be wrong.
* tests/turkish-eyes: Add tests for these bugs.
---
 NEWS               |   2 +-
 src/grep.c         | 116 +++++++++++++++++++++++++++++++--------------
 src/searchutils.c  |  23 ++-------
 tests/turkish-eyes |  18 +++++--
 4 files changed, 102 insertions(+), 57 deletions(-)

diff --git a/NEWS b/NEWS
index 36e423d..ab00ff2 100644
--- a/NEWS
+++ b/NEWS
@@ -36,7 +36,7 @@ GNU grep NEWS                                    -*- outline -*-
   characters.  For example, 'LC_ALL=tr_TR.utf8 grep -i i' no longer
   dumps core merely because 'i' matches 'İ' (U+0130 LATIN CAPITAL
   LETTER I WITH DOT ABOVE) in Turkish when ignoring case.
-  [Bug#43577 introduced in grep 3.4]
+  [Bug#43577 introduced partly in grep 2.28 and partly in grep 3.4]
 
   A performance regression with -E and many patterns has been mostly fixed.
   "Mostly" as there is a performance tradeoff between Bug#22357 and Bug#40634.
diff --git a/src/grep.c b/src/grep.c
index 11856d8..1a52c89 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2300,37 +2300,75 @@ contains_encoding_error (char const *pat, size_t patlen)
   return false;
 }
 
+/* When ignoring case with -E, -F or -G, for each single-byte character
+   I, ok_fold[I] is 1 if every case folded counterpart of I is also
+   single-byte, and -1 otherwise; entries for other bytes stay 0.  */
+static signed char ok_fold[NCHAR];
+static void
+setup_ok_fold (void)
+{
+  for (int i = 0; i < NCHAR; i++)
+    {
+      wint_t wi = localeinfo.sbctowc[i];
+      if (wi == WEOF)
+        continue;  /* Byte I is not a single-byte character.  */
+
+      int ok = 1;  /* Until a multibyte counterpart turns up.  */
+      wchar_t folded[CASE_FOLDED_BUFSIZE];
+      for (int n = case_folded_counterparts (wi, folded); 0 <= --n; )
+        {
+          char buf[MB_LEN_MAX];
+          mbstate_t s = { 0 };  /* Initial conversion state.  */
+          if (wcrtomb (buf, folded[n], &s) != 1)  /* Not a single byte.  */
+            {
+              ok = -1;
+              break;
+            }
+        }
+      ok_fold[i] = ok;
+    }
+}
+
 /* Return the number of bytes in the initial character of PAT, of size
    PATLEN, if Fcompile can handle that character.  Return -1 if
    Fcompile cannot handle it.  MBS is the multibyte conversion state.
-
-   Fcompile can handle a character C if C is single-byte, or if C has no
-   case folded counterparts and toupper translates none of its bytes.  */
+   PATLEN must be nonzero.  */
 
 static int
 fgrep_icase_charlen (char const *pat, size_t patlen, mbstate_t *mbs)
 {
-  int n = localeinfo.sbclen[to_uchar (*pat)];
-  if (n < 0)
+  unsigned char pat0 = pat[0];  /* Unsigned, for use as a table index.  */
+
+  /* If PAT starts with a single-byte character, Fcompile works if
+     every case folded counterpart is also single-byte.  */
+  if (localeinfo.sbctowc[pat0] != WEOF)
+    return ok_fold[pat0];  /* 1 if Fcompile works, else -1.  */
+
+  wchar_t wc;
+  size_t wn = mbrtowc (&wc, pat, patlen, mbs);
+
+  /* If PAT starts with an encoding error, Fcompile does not work.  */
+  if (MB_LEN_MAX < wn)  /* mbrtowc returned (size_t) -1 or (size_t) -2.  */
+    return -1;
+
+  /* PAT starts with a multibyte character.  Fcompile works if the
+     character has no case folded counterparts and toupper translates
+     none of its encoding's bytes.  */
+  wchar_t folded[CASE_FOLDED_BUFSIZE];
+  if (case_folded_counterparts (wc, folded))
+    return -1;
+  for (int i = wn; 0 < --i; )  /* I runs from WN - 1 down to 1.  */
     {
-      wchar_t wc;
-      wchar_t folded[CASE_FOLDED_BUFSIZE];
-      size_t wn = mbrtowc (&wc, pat, patlen, mbs);
-      if (MB_LEN_MAX < wn || case_folded_counterparts (wc, folded))
+      unsigned char c = pat[i];
+      if (toupper (c) != c)
         return -1;
-      for (int i = wn; 0 < --i; )
-        {
-          unsigned char c = pat[i];
-          if (toupper (c) != c)
-            return -1;
-        }
-      n = wn;
     }
-  return n;
+  return wn;
 }
 
 /* Return true if the -F patterns PAT, of size PATLEN, contain only
-   single-byte characters or characters not subject to case folding,
+   single-byte characters that case-fold only to single-byte
+   characters, or multibyte characters not subject to case folding,
    and so can be processed by Fcompile.  */
 
 static bool
@@ -2950,26 +2988,34 @@ main (int argc, char **argv)
   if (matcher < 0)
     matcher = G_MATCHER_INDEX;
 
-  /* In a single-byte locale, switch from -F to -G if it is a single
-     pattern that matches words, where -G is typically faster.  In a
-     multi-byte locale, switch if the patterns have an encoding error
-     (where -F does not work) or if -i and the patterns will not work
-     for -iF.  */
   if (matcher == F_MATCHER_INDEX
-      && (! localeinfo.multibyte
-          ? n_patterns == 1 && match_words
-          : (contains_encoding_error (keys, keycc)
-             || (match_icase && !fgrep_icase_available (keys, keycc)))))
+      || matcher == E_MATCHER_INDEX || matcher == G_MATCHER_INDEX)
     {
-      fgrep_to_grep_pattern (&pattern_array, &keycc);
-      keys = pattern_array;
-      matcher = G_MATCHER_INDEX;
+      if (match_icase)
+        setup_ok_fold ();
+
+      /* In a single-byte locale, switch from -F to -G if it is a single
+         pattern that matches words, where -G is typically faster.  In a
+         multibyte locale, switch if the patterns have an encoding error
+         (where -F does not work) or if -i and the patterns will not work
+         for -iF.  */
+      if (matcher == F_MATCHER_INDEX)
+        {
+          if (! localeinfo.multibyte
+              ? n_patterns == 1 && match_words
+              : (contains_encoding_error (keys, keycc)
+                 || (match_icase && !fgrep_icase_available (keys, keycc))))
+            {
+              fgrep_to_grep_pattern (&pattern_array, &keycc);
+              keys = pattern_array;
+              matcher = G_MATCHER_INDEX;
+            }
+        }
+      /* With two or more patterns, if -F works then switch from either -E
+         or -G, as -F is probably faster then.  */
+      else if (1 < n_patterns)
+        matcher = try_fgrep_pattern (matcher, keys, &keycc);
     }
-  /* With two or more patterns, if -F works then switch from either -E
-     or -G, as -F is probably faster then.  */
-  else if ((matcher == G_MATCHER_INDEX || matcher == E_MATCHER_INDEX)
-           && 1 < n_patterns)
-    matcher = try_fgrep_pattern (matcher, keys, &keycc);
 
   execute = matchers[matcher].execute;
   compiled_pattern =
diff --git a/src/searchutils.c b/src/searchutils.c
index c4bb802..aa11063 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -48,24 +48,11 @@ kwsinit (bool mb_trans)
   if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
     {
       trans = xmalloc (NCHAR);
-      if (MB_CUR_MAX == 1)
-        for (int i = 0; i < NCHAR; i++)
-          trans[i] = toupper (i);
-      else
-        for (int i = 0; i < NCHAR; i++)
-          {
-            wint_t wc = localeinfo.sbctowc[i];
-            wint_t uwc = towupper (wc);
-            if (uwc != wc)
-              {
-                mbstate_t mbs = { 0 };
-                size_t len = wcrtomb (&trans[i], uwc, &mbs);
-                if (len != 1)
-                  abort ();
-              }
-            else
-              trans[i] = i;
-          }
+      /* If I is a single-byte character that becomes a different
+         single-byte character when uppercased, set trans[I]
+         to that character.  Otherwise, set trans[I] to I.  */
+      for (int i = 0; i < NCHAR; i++)
+        trans[i] = toupper (i);
     }
 
   return kwsalloc (trans);
diff --git a/tests/turkish-eyes b/tests/turkish-eyes
index ba1ea33..879b59d 100755
--- a/tests/turkish-eyes
+++ b/tests/turkish-eyes
@@ -36,11 +36,23 @@ i=$(printf '\304\261') # lowercase dotless i
 
       data="I:$I $i:i"
 search_str="$i:i I:$I"
-printf "$data\n" > in || framework_failure_
+printf "$data\\n" > in || framework_failure_
 
 for opt in -E -F -G; do
-  LC_ALL=$L grep $opt -i "$search_str" in > out || fail=1
-  compare out in || fail=1
+  for pat in i I "$i" "$I" " " : "$search_str"; do
+    LC_ALL=$L grep $opt -i "$pat" in > out || fail=1
+    compare in out || fail=1
+
+    case $pat in
+      i|"$I") printf "$I\\ni\\n";;
+      I|"$i") printf "I\\n$i\\n";;
+      :) printf ":\\n:\\n";;
+      ' ') printf " \\n";;
+      *) cat in;;
+    esac >exp || framework_failure_
+    LC_ALL=$L grep -o $opt -i "$pat" in > out || fail=1
+    compare exp out || fail=1
+  done
 done
 
 Exit $fail
-- 
2.17.1