From 740048e66e7c55a8e42f4f7e4c24256a61506f70 Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Fri, 23 Dec 2016 12:25:24 -0800
Subject: [PATCH 4/8] grep: specialize word-finding functions

This improves performance a bit.
* src/dfasearch.c, src/kwsearch.c (wordchar):
Remove; now in searchutils.c.
* src/grep.c (main): Call wordinit if -w.
* src/search.h: Adjust.
* src/searchutils.c: Include verify.h.
(word_start): New static var.
(wordchar): Move here from dfasearch.c and kwsearch.c.
(wordinit, wordchars_count, wordchar_next, wordchar_prev):
New functions.
(mb_prev_wc, mb_next_wc): Remove.
All callers changed to use the new functions instead.
---
 src/dfasearch.c   | 11 ++-----
 src/grep.c        |  1 +
 src/kwsearch.c    | 11 ++-----
 src/search.h      |  5 +--
 src/searchutils.c | 91 +++++++++++++++++++++++++++++++++++++++++++------------
 5 files changed, 80 insertions(+), 39 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 24a36cd..87e1f7e 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -26,13 +26,6 @@
 
 struct localeinfo localeinfo;
 
-/* Whether -w considers WC to be a word constituent.  */
-static bool
-wordchar (wint_t wc)
-{
-  return wc == L'_' || iswalnum (wc);
-}
-
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
    any string matching the regexp. */
@@ -394,8 +387,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
                 while (match <= best_match)
                   {
                     regoff_t shorter_len = 0;
-                    if (!wordchar (mb_prev_wc (beg, match, end - 1))
-                        && !wordchar (mb_next_wc (match + len, end - 1)))
+                    if (! wordchar_next (match + len, end - 1)
+                        && ! wordchar_prev (beg, match, end - 1))
                       goto assess_pattern_match;
                     if (len > 0)
                       {
diff --git a/src/grep.c b/src/grep.c
index 3729ae0..f9d1d86 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2651,6 +2651,7 @@ main (int argc, char **argv)
         break;
 
       case 'w':
+        wordinit ();
         match_words = true;
         break;
 
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5596ebd..b30dfd0 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -21,13 +21,6 @@
 #include <config.h>
 #include "search.h"
 
-/* Whether -w considers WC to be a word constituent.  */
-static bool
-wordchar (wint_t wc)
-{
-  return wc == L'_' || iswalnum (wc);
-}
-
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
    any string matching the regexp. */
@@ -140,10 +133,10 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
       char const *bol = memrchr (mb_start, eol, beg - mb_start);
       if (bol)
         mb_start = bol + 1;
-      if (! wordchar (mb_prev_wc (mb_start, beg, buf + size)))
+      if (! wordchar_prev (mb_start, beg, buf + size))
         for (;;)
           {
-            if (! wordchar (mb_next_wc (beg + len, buf + size)))
+            if (! wordchar_next (beg + len, buf + size))
               {
                 if (start_ptr)
                   goto success_in_beg_and_len;
diff --git a/src/search.h b/src/search.h
index 1ff5be2..6fe1797 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,10 +46,11 @@ _GL_INLINE_HEADER_BEGIN
 typedef signed char mb_len_map_t;
 
 /* searchutils.c */
+extern void wordinit (void);
 extern kwset_t kwsinit (bool);
+extern size_t wordchar_next (char const *, char const *);
+extern bool wordchar_prev (char const *, char const *, char const *);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
-extern wint_t mb_prev_wc (char const *, char const *, char const *);
-extern wint_t mb_next_wc (char const *, char const *);
 
 /* dfasearch.c */
 extern struct localeinfo localeinfo;
diff --git a/src/searchutils.c b/src/searchutils.c
index deaab60..e0a1db3 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,6 +22,30 @@
 #define SYSTEM_INLINE _GL_EXTERN_INLINE
 #include "search.h"
 
+#include <verify.h>
+
+/* For each byte B, word_start[B] is 1 if B is a single-byte character
+   that is a word constituent, 0 if B cannot start a word constituent,
+   and WEOF if B might be or might not be the start of a word
+   constituent.  */
+static wint_t word_start[NCHAR];
+verify (WEOF != 0 && WEOF != 1);
+
+/* Whether -w considers WC to be a word constituent.  */
+static bool
+wordchar (wint_t wc)
+{
+  return wc == L'_' || iswalnum (wc);
+}
+
+void
+wordinit (void)
+{
+  for (int i = 0; i < NCHAR; i++)
+    word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
+                     : wordchar (localeinfo.sbctowc[i]));
+}
+
 kwset_t
 kwsinit (bool mb_trans)
 {
@@ -93,27 +117,56 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
   return p == cur ? 0 : cur - p0;
 }
 
-/* In the buffer BUF, return the wide character that is encoded just
-   before CUR.  The buffer ends at END.  Return WEOF if there is no
-   wide character just before CUR.  */
-wint_t
-mb_prev_wc (char const *buf, char const *cur, char const *end)
+/* Examine the start of BUF (which ends at END) for word constituents.
+   If COUNTALL, examine as many as possible; otherwise, examine at most one.
+   Return the total number of bytes in the examined characters.  */
+static size_t
+wordchars_count (char const *buf, char const *end, bool countall)
 {
-  if (cur == buf)
-    return WEOF;
-  char const *p = buf;
-  cur--;
-  cur -= mb_goback (&p, cur, end);
-  return mb_next_wc (cur, end);
+  size_t n = 0;
+  mbstate_t mbs = { 0 };
+  while (n < end - buf)
+    {
+      wint_t ws = word_start[to_uchar (buf[n])];
+      if (ws == 0)
+        break;
+      else if (ws == 1)
+        n++;
+      else
+        {
+          wchar_t wc = 0;
+          size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs);
+          if (!wordchar (wc))
+            break;
+          n += wcbytes + !wcbytes;
+        }
+      if (!countall)
+        break;
+    }
+  return n;
 }
 
-/* Return the wide character that is encoded at CUR.  The buffer ends
-   at END.  Return WEOF if there is no wide character encoded at CUR.  */
-wint_t
-mb_next_wc (char const *cur, char const *end)
+/* If BUF starts with a word constituent, return the number of bytes
+   used to represent it; otherwise, return zero.  The buffer ends at END.  */
+size_t
+wordchar_next (char const *buf, char const *end)
 {
-  wchar_t wc;
-  mbstate_t mbs = { 0 };
-  return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2
-          ? wc : WEOF);
+  return wordchars_count (buf, end, false);
+}
+
+/* In the buffer BUF, return true if the character whose encoding
+   contains the byte before CUR is a word constituent.  The buffer
+   ends at END.  */
+bool
+wordchar_prev (char const *buf, char const *cur, char const *end)
+{
+  if (buf == cur)
+    return false;
+  cur--;
+  wint_t ws = word_start[to_uchar (*cur)];
+  if (! localeinfo.multibyte)
+    return ws == 1;
+  char const *p = buf;
+  cur -= mb_goback (&p, cur, end);
+  return wordchar_next (cur, end) != 0;
 }
-- 
2.7.4