From fe6fe68f0098704846da9e64f56073a5d5171ce5 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sun, 12 Jun 2016 15:09:40 +0900 Subject: [PATCH] grep: try fgrep matcher for case insensitive matching by grep -F in multibyte locale In multibyte locale, if a pattern is composed of only single byte characters and their all counterparts are also single byte characters and the pattern does not have invalid sequences, grep -F uses fgrep matcher same as single byte locale. * NEWS: Mention it. * src/dfa.c (mbrtowc_cache): Change into global variable. (build_mbrtowc_cache): New function. (dfasyntax): Use it. * src/dfa.h (mbrtowc_cache, build_mbrtowc_cache): New prototype. * src/dfasearch.c (kwsinit): Update for kwsinit. * src/grep.c (lonesome_lower): New variable. (fgrep_icase_available): New function. (fgrep_to_grep_pattern): Simplify it. (main): Use them. * src/kwsearch.c (kwsinit): Update for kwsinit. * src/search.h (kwsinit): Update prototype. * src/searchutils.c (kwsinit): Try fgrep matcher for case insensitive matching by grep -F in multibyte locale. --- NEWS | 6 +++ src/dfa.c | 31 +++++++++++--- src/dfa.h | 9 ++++ src/dfasearch.c | 2 +- src/grep.c | 118 +++++++++++++++++++++++++++++++++++++++++++++-------- src/kwsearch.c | 2 +- src/search.h | 2 +- src/searchutils.c | 26 ++++++++++-- 8 files changed, 165 insertions(+), 31 deletions(-) diff --git a/NEWS b/NEWS index c3e6000..cd06153 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,12 @@ GNU grep NEWS -*- outline -*- as it now uses the Aho-Corasick algorithm instead of the Commentz-Walter algorithm in that case. + grep -F now tries fgrep matcher for case insensitive matching in + multibyte locale. It improves performance for a case which a pattern + is composed of only single byte characters and their all counterparts + are also single byte characters and the pattern does not have invalid + sequences. + * Noteworthy changes in release 2.25 (2016-04-21) [stable] diff --git a/src/dfa.c b/src/dfa.c index 19363ce..b79d464 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -430,7 +430,30 @@ static void regexp (void); /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is not a valid single-byte character. */ -static wint_t mbrtowc_cache[NOTCHAR]; +wint_t mbrtowc_cache[NOTCHAR]; + +/* Initialize a cache of wide characters for each of its 1-byte inputs. */ +void +build_mbrtowc_cache (void) +{ + static bool cached = false; + int i; + + if (cached) + return; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + char c = i; + unsigned char uc = i; + mbstate_t s = { 0 }; + wchar_t wc; + size_t ret = mbrtowc (&wc, &c, 1, &s); + mbrtowc_cache[uc] = (ret == (size_t)-1 || ret == (size_t) -2) ? WEOF : wc; + } + + cached = true; +} /* Store into *PWC the result of converting the leading bytes of the multibyte buffer S of length N bytes, using the mbrtowc_cache in *D @@ -698,16 +721,12 @@ dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol) syntax_bits = bits; case_fold = fold; eolbyte = eol; + build_mbrtowc_cache (); for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { - char c = i; unsigned char uc = i; - mbstate_t s = { 0 }; - wchar_t wc; - mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. */ sbit[uc] = char_context (uc); switch (sbit[uc]) { diff --git a/src/dfa.h b/src/dfa.h index 60da0e4..bd165e8 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "xalloc.h" /* for _GL_ATTRIBUTE_MALLOC */ @@ -100,4 +101,12 @@ extern void dfawarn (const char *); The user must supply a dfaerror. */ extern _Noreturn void dfaerror (const char *); +/* Others. */ + extern bool using_utf8 (void); + +/* build_mbrtowc_cache () initializes a cache of wide characters for each of + its 1-byte inputs. */ +extern void build_mbrtowc_cache (void); + +extern wint_t mbrtowc_cache[]; diff --git a/src/dfasearch.c b/src/dfasearch.c index 8052ef0..fa412bc 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -89,7 +89,7 @@ kwsmusts (void) struct dfamust *dm = dfamust (dfa); if (!dm) return; - kwsinit (&kwset); + kwsinit (&kwset, false); if (dm->exact) { /* Prepare a substring whose presence implies a match. diff --git a/src/grep.c b/src/grep.c index 302e4d7..581e90a 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2187,14 +2187,93 @@ contains_encoding_error (char const *pat, size_t patlen) return false; } +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE + }; + +static bool +fgrep_icase_available (char const *pat, size_t patlen) +{ + for (size_t i = 0; i < patlen; ++i) + { + unsigned char c = pat[i]; + if (mbclen_cache[c] > 1) + return false; + } + + build_mbrtowc_cache (); + + for (size_t i = 0; i < patlen; ++i) + { + unsigned char c = pat[i]; + + wint_t wc = mbrtowc_cache[c]; + if (wc == WEOF) + return false; + + wint_t uwc = towupper (wc); + if (uwc != wc) + { + char s[MB_LEN_MAX]; + mbstate_t mb_state = { 0 }; + size_t len = wcrtomb (s, uwc, &mb_state); + if (len > 1 && len != (size_t) -1) + return false; + } + + wint_t lwc = towlower (uwc); + if (lwc != uwc && lwc != wc && towupper (lwc) == uwc) + { + char s[MB_LEN_MAX]; + mbstate_t mb_state = { 0 }; + size_t len = wcrtomb (s, lwc, &mb_state); + if (len > 1 && len != (size_t) -1) + return false; + } + + for (size_t j = 0; lonesome_lower[j]; j++) + { + wint_t li = lonesome_lower[j]; + if (li != lwc && li != uwc && li != wc && towupper (li) == uwc) + { + char s[MB_LEN_MAX]; + mbstate_t mb_state = { 0 }; + size_t len = wcrtomb (s, li, &mb_state); + if (len > 1 && len != (size_t) -1) + return false; + } + } + } + + return true; +} + /* Change a pattern for fgrep into grep. */ static void -fgrep_to_grep_pattern (size_t len, char const *keys, - size_t *new_len, char **new_keys) +fgrep_to_grep_pattern (char **keys_p, size_t *len_p) { - char *p = *new_keys = xnmalloc (len + 1, 2); + char *keys, *new_keys, *p; mbstate_t mb_state = { 0 }; - size_t n; + size_t len, n; + + len = *len_p; + keys = *keys_p; + + new_keys = xnmalloc (len + 1, 2); + p = new_keys; for (; len; keys += n, len -= n) { @@ -2222,7 +2301,13 @@ fgrep_to_grep_pattern (size_t len, char const *keys, } } - *new_len = p - *new_keys; + free (*keys_p); + *keys_p = new_keys; + *len_p = p - new_keys; + + matcher = "grep"; + compile = Gcompile; + execute = EGexecute; } int @@ -2657,20 +2742,17 @@ main (int argc, char **argv) In a multibyte locale, switch from fgrep to grep if either (1) case is ignored (where grep is typically faster), or (2) the pattern has an encoding error (where fgrep might not work). */ - if (compile == Fcompile - && (MB_CUR_MAX <= 1 - ? match_words - : match_icase || contains_encoding_error (keys, keycc))) + if (compile == Fcompile) { - size_t new_keycc; - char *new_keys; - fgrep_to_grep_pattern (keycc, keys, &new_keycc, &new_keys); - free (keys); - keys = new_keys; - keycc = new_keycc; - matcher = "grep"; - compile = Gcompile; - execute = EGexecute; + if (MB_CUR_MAX > 1) + { + if (contains_encoding_error (keys, keycc)) + fgrep_to_grep_pattern (&keys, &keycc); + else if (match_icase && !fgrep_icase_available (keys, keycc)) + fgrep_to_grep_pattern (&keys, &keycc); + } + else if (match_words) + fgrep_to_grep_pattern (&keys, &keycc); } compile (keys, keycc); diff --git a/src/kwsearch.c b/src/kwsearch.c index d2afa40..ec86a09 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size) { size_t total = size; - kwsinit (&kwset); + kwsinit (&kwset, true); char const *p = pattern; do diff --git a/src/search.h b/src/search.h index 7dc1940..801d768 100644 --- a/src/search.h +++ b/src/search.h @@ -46,7 +46,7 @@ _GL_INLINE_HEADER_BEGIN typedef signed char mb_len_map_t; /* searchutils.c */ -extern void kwsinit (kwset_t *); +extern void kwsinit (kwset_t *, bool); extern void build_mbclen_cache (void); extern size_t mbclen_cache[]; diff --git a/src/searchutils.c b/src/searchutils.c index d25e5f8..fbfc9a1 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -27,15 +27,33 @@ size_t mbclen_cache[NCHAR]; void -kwsinit (kwset_t *kwset) +kwsinit (kwset_t *kwset, bool mb_trans) { static char trans[NCHAR]; int i; - if (match_icase && MB_CUR_MAX == 1) + if (match_icase && (MB_CUR_MAX == 1 || mb_trans)) { - for (i = 0; i < NCHAR; ++i) - trans[i] = toupper (i); + if (MB_CUR_MAX == 1) + for (i = 0; i < NCHAR; ++i) + trans[i] = toupper (i); + else + for (i = 0; i < NCHAR; ++i) + { + wint_t wc = mbrtowc_cache[i]; + wint_t uwc = towupper (wc); + if (uwc != wc) + { + char s[MB_LEN_MAX]; + mbstate_t mbs = { 0 }; + size_t len = wcrtomb (s, uwc, &mbs); + if (len > 1) + abort (); + trans[i] = s[0]; + } + else + trans[i] = i; + } *kwset = kwsalloc (trans, false); } -- 1.7.1