From ca64fe63d9404b65754b6f6b07adc1910ddae5a4 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Sun, 24 Nov 2013 18:49:31 -0800
Subject: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in
 multibyte locales

These days, nearly everyone uses a multibyte locale, and grep is often
used with the --ignore-case (-i) option, but that option imposes a very
high cost in order to handle some unusual cases in just a few multibyte
locales.  This change gets most of the performance of using LC_ALL=C
without eliminating the ability to search for multibyte strings.

With the following example, I see an 11x speed-up with a 2.3GHz i7 and an SSD:
Generate a 10M-line file, with each line consisting of 40 'j's:

    yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 > k

Time searching it for the simple/noexistent string "foobar",
first with this patch (best-of-5 trials):

    LC_ALL=en_US.UTF-8 env time src/grep -i foobar k
        1.10 real         1.03 user         0.07 sys

Back out that commit (temporarily), recompile, and rerun the experiment:

    git log -1 -p|patch -R -p1; make
    LC_ALL=en_US.UTF-8 env time src/grep -i foobar k
        25.11 real        17.45 user         0.17 sys

The trick is to realize that for some search strings, it is easy
to convert to an equivalent one that is handled much more efficiently.
E.g., convert this command:

  grep -i foobar k

to this:

  grep '[fF][oO][oO][bB][aA][rR]' k

That allows the matcher to search in buffer mode, rather than having to
extract/case-convert/search each line separately.  Currently, we perform
this conversion only when search strings are all ASCII and contain neither
'\' nor '['.  See the comments for more detail.

* src/main.c (trivial_case_convert): New function.
(main): When possible, transform the regexp so we can drop the -i.
---
 src/main.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/src/main.c b/src/main.c
index 2168a0a..8b7b492 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1644,13 +1644,14 @@ if any error occurs and -q is not given, the exit status is 2.\n"));
   exit (status);
 }

+static char const *matcher;
+
 /* If M is NULL, initialize the matcher to the default.  Otherwise set the
    matcher to M if available.  Exit in case of conflicts or if M is not
    available.  */
 static void
 setmatcher (char const *m)
 {
-  static char const *matcher;
   unsigned int i;

   if (!m)
@@ -1865,6 +1866,50 @@ parse_grep_colors (void)
       return;
 }

+/* If the regexp KEYS, is amenable to trivial case conversion,
+   create a new regexp, (*NEW_KEYS, *NEW_KEYCC) that is equivalent,
+   but matches case-insensitively.  */
+static bool
+trivial_case_convert (size_t keycc, char const *keys,
+                      size_t *new_keycc, char **new_keys)
+{
+  /* FIXME: consider removing the following restriction:
+     Reject if KEYS contains '\\' or '['.  */
+  if (memchr (keys, '\\', keycc) || memchr (keys, '[', keycc))
+    return false;
+
+  /* Worst case is that every byte of keys will be alpha,
+     so every byte B will map to the sequence of 4 bytes [Bb].  */
+  *new_keys = xnmalloc (4, keycc + 1);
+  char *p = *new_keys;
+  while (*keys)
+    {
+      /* FIXME: consider removing this ascii-only restriction.  */
+      if (!isascii (*keys))
+        {
+          free (*new_keys);
+          return false;
+        }
+      if (!isalpha (*keys))
+        {
+          *p++ = *keys;
+        }
+      else
+        {
+          *p++ = '[';
+          *p++ = *keys;
+          *p++ = islower (*keys) ? toupper (*keys) : tolower (*keys);
+          *p++ = ']';
+        }
+
+      ++keys;
+    }
+
+  *new_keycc = p - *new_keys;
+
+  return true;
+}
+
 int
 main (int argc, char **argv)
 {
@@ -2263,6 +2308,35 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);

+  /* As currently implemented, case-insensitive matching is expensive in
+     multi-byte locales because of a few outlier locales in which some
+     characters change size when converted to upper or lower case.  To
+     accommodate those, we revert to searching the input one line at a
+     time, rather than using the much more efficient buffer search.
+     However, if we have a plain ascii search string, /foo/, we can
+     convert it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
+     avoid the expensive read-and-process-a-line-at-a-time requirement.
+     Optimize-away the "-i" option, when possible, converting each
+     candidate alpha, C, in the regexp to [Cc].  */
+  if (match_icase)
+    {
+      size_t new_keycc;
+      char *new_keys;
+      /* It is not possible with -F, not useful with -P (pcre) and there is no
+         point when there is no regexp.  It also depends on which constructs
+         appear in the regexp.  See trivial_case_convert for those details.  */
+      if (keycc
+          && ! (matcher
+                && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre")))
+          && trivial_case_convert (keycc, keys, &new_keycc, &new_keys))
+        {
+          match_icase = 0;
+          free (keys);
+          keys = new_keys;
+          keycc = new_keycc;
+        }
+    }
+
   compile (keys, keycc);
   free (keys);

-- 
1.8.5.rc2.6.gc6f1b92