From 740048e66e7c55a8e42f4f7e4c24256a61506f70 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Fri, 23 Dec 2016 12:25:24 -0800 Subject: [PATCH 4/8] grep: specialize word-finding functions This improves performance a bit. * src/dfasearch.c, src/kwsearch.c (wordchar): Remove; now in searchutils.c. * src/grep.c (main): Call wordinit if -w. * src/search.h: Adjust. * src/searchutils.c: Include verify.h. (word_start): New static var. (wordchar): Move here from dfasearch.c and kwsearch.c. (wordinit, wordchars_count, wordchar_next, wordchar_prev): New functions. (mb_prev_wc, mb_next_wc): Remove. All callers changed to use the new functions instead. --- src/dfasearch.c | 11 ++----- src/grep.c | 1 + src/kwsearch.c | 11 ++----- src/search.h | 5 +-- src/searchutils.c | 91 +++++++++++++++++++++++++++++++++++++++++++------------ 5 files changed, 80 insertions(+), 39 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 24a36cd..87e1f7e 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -26,13 +26,6 @@ struct localeinfo localeinfo; -/* Whether -w considers WC to be a word constituent. */ -static bool -wordchar (wint_t wc) -{ - return wc == L'_' || iswalnum (wc); -} - /* KWset compiled pattern. For Ecompile and Gcompile, we compile a list of strings, at least one of which is known to occur in any string matching the regexp. */ @@ -394,8 +387,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size, while (match <= best_match) { regoff_t shorter_len = 0; - if (!wordchar (mb_prev_wc (beg, match, end - 1)) - && !wordchar (mb_next_wc (match + len, end - 1))) + if (! wordchar_next (match + len, end - 1) + && ! 
wordchar_prev (beg, match, end - 1)) goto assess_pattern_match; if (len > 0) { diff --git a/src/grep.c b/src/grep.c index 3729ae0..f9d1d86 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2651,6 +2651,7 @@ main (int argc, char **argv) break; case 'w': + wordinit (); match_words = true; break; diff --git a/src/kwsearch.c b/src/kwsearch.c index 5596ebd..b30dfd0 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -21,13 +21,6 @@ #include #include "search.h" -/* Whether -w considers WC to be a word constituent. */ -static bool -wordchar (wint_t wc) -{ - return wc == L'_' || iswalnum (wc); -} - /* KWset compiled pattern. For Ecompile and Gcompile, we compile a list of strings, at least one of which is known to occur in any string matching the regexp. */ @@ -140,10 +133,10 @@ Fexecute (char const *buf, size_t size, size_t *match_size, char const *bol = memrchr (mb_start, eol, beg - mb_start); if (bol) mb_start = bol + 1; - if (! wordchar (mb_prev_wc (mb_start, beg, buf + size))) + if (! wordchar_prev (mb_start, beg, buf + size)) for (;;) { - if (! wordchar (mb_next_wc (beg + len, buf + size))) + if (! 
wordchar_next (beg + len, buf + size)) { if (start_ptr) goto success_in_beg_and_len; diff --git a/src/search.h b/src/search.h index 1ff5be2..6fe1797 100644 --- a/src/search.h +++ b/src/search.h @@ -46,10 +46,11 @@ _GL_INLINE_HEADER_BEGIN typedef signed char mb_len_map_t; /* searchutils.c */ +extern void wordinit (void); extern kwset_t kwsinit (bool); +extern size_t wordchar_next (char const *, char const *); +extern bool wordchar_prev (char const *, char const *, char const *); extern ptrdiff_t mb_goback (char const **, char const *, char const *); -extern wint_t mb_prev_wc (char const *, char const *, char const *); -extern wint_t mb_next_wc (char const *, char const *); /* dfasearch.c */ extern struct localeinfo localeinfo; diff --git a/src/searchutils.c b/src/searchutils.c index deaab60..e0a1db3 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -22,6 +22,30 @@ #define SYSTEM_INLINE _GL_EXTERN_INLINE #include "search.h" +#include <verify.h> + +/* For each byte B, word_start[B] is 1 if B is a single-byte character + that is a word constituent, 0 if B cannot start a word constituent, + and -1 if B might be or might not be the start of a word + constituent. */ +static wint_t word_start[NCHAR]; +verify (WEOF != 0 && WEOF != 1); + +/* Whether -w considers WC to be a word constituent. */ +static bool +wordchar (wint_t wc) +{ + return wc == L'_' || iswalnum (wc); +} + +void +wordinit (void) +{ + for (int i = 0; i < NCHAR; i++) + word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF + : wordchar (localeinfo.sbctowc[i])); +} + kwset_t kwsinit (bool mb_trans) { @@ -93,27 +117,56 @@ mb_goback (char const **mb_start, char const *cur, char const *end) return p == cur ? 0 : cur - p0; } -/* In the buffer BUF, return the wide character that is encoded just - before CUR. The buffer ends at END. Return WEOF if there is no - wide character just before CUR. 
*/ -wint_t -mb_prev_wc (char const *buf, char const *cur, char const *end) +/* Examine the start of BUF, which ends at END, for word constituents. + If COUNTALL, examine as many as possible; otherwise, examine at most one. + Return the total number of bytes in the examined characters. */ +static size_t +wordchars_count (char const *buf, char const *end, bool countall) { - if (cur == buf) - return WEOF; - char const *p = buf; - cur--; - cur -= mb_goback (&p, cur, end); - return mb_next_wc (cur, end); + size_t n = 0; + mbstate_t mbs = { 0 }; + while (n < end - buf) + { + wint_t ws = word_start[to_uchar (buf[n])]; + if (ws == 0) + break; + else if (ws == 1) + n++; + else + { + wchar_t wc = 0; + size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs); + if (!wordchar (wc)) + break; + n += wcbytes + !wcbytes; + } + if (!countall) + break; + } + return n; } -/* Return the wide character that is encoded at CUR. The buffer ends - at END. Return WEOF if there is no wide character encoded at CUR. */ -wint_t -mb_next_wc (char const *cur, char const *end) +/* If BUF starts with a word constituent, return the number of bytes + used to represent it; otherwise, return zero. The buffer ends at END. */ +size_t +wordchar_next (char const *buf, char const *end) { - wchar_t wc; - mbstate_t mbs = { 0 }; - return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 - ? wc : WEOF); + return wordchars_count (buf, end, false); +} + +/* In the buffer BUF, return true if the character whose encoding + contains the byte before CUR is a word constituent. The buffer + ends at END. */ +bool +wordchar_prev (char const *buf, char const *cur, char const *end) +{ + if (buf == cur) + return false; + cur--; + wint_t ws = word_start[to_uchar (*cur)]; + if (! localeinfo.multibyte) + return ws == 1; + char const *p = buf; + cur -= mb_goback (&p, cur, end); + return wordchar_next (cur, end) != 0; } -- 2.7.4