From 0d77720671a9ea8bb54f9402a7f558b798b9381a Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Wed, 31 Aug 2016 23:44:08 -0700 Subject: [PATCH 1/2] grep: speed up -iF in multibyte locales In a multibyte locale, if a pattern is composed of only single byte characters and their all counterparts are also single byte characters and the pattern does not have invalid sequences, grep -iF uses the fgrep matcher, the same as in a single byte locale (Bug#23752). * NEWS: Mention it. * src/grep.c (lonesome_lower): New constant. (fgrep_icase_available): New function. (fgrep_to_grep_pattern): Simplify it. (main): Use them. * src/searchutils.c (kwsinit): New arg MB_TRANS; all uses changed. Try fgrep matcher for case insensitive matching by grep -F in multibyte locale. --- NEWS | 6 +++ src/dfasearch.c | 2 +- src/grep.c | 116 +++++++++++++++++++++++++++++++++++++++++++++--------- src/kwsearch.c | 2 +- src/search.h | 2 +- src/searchutils.c | 26 ++++++++++-- 6 files changed, 129 insertions(+), 25 deletions(-) diff --git a/NEWS b/NEWS index 21db87a..65e663d 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,12 @@ GNU grep NEWS -*- outline -*- grep now prints a "FILENAME:LINENO: " prefix when diagnosing an invalid regular expression that was read from an '-f'-specified file. + grep -F now tries fgrep matcher for case insensitive matching in + multibyte locale. It improves performance for a case which a pattern + is composed of only single byte characters and their all counterparts + are also single byte characters and the pattern does not have invalid + sequences. + * Noteworthy changes in release 2.25 (2016-04-21) [stable] diff --git a/src/dfasearch.c b/src/dfasearch.c index c2e0177..548ef08 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -89,7 +89,7 @@ kwsmusts (void) struct dfamust *dm = dfamust (dfa); if (!dm) return; - kwsinit (&kwset); + kwsinit (&kwset, false); if (dm->exact) { /* Prepare a substring whose presence implies a match. diff --git a/src/grep.c b/src/grep.c index fc22c7b..ae3b6e7 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2265,14 +2265,91 @@ contains_encoding_error (char const *pat, size_t patlen) return false; } +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE + }; + +static bool +fgrep_icase_available (char const *pat, size_t patlen) +{ + for (size_t i = 0; i < patlen; ++i) + { + unsigned char c = pat[i]; + if (localeinfo.sbclen[c] > 1) + return false; + } + + for (size_t i = 0; i < patlen; ++i) + { + unsigned char c = pat[i]; + + wint_t wc = localeinfo.sbctowc[c]; + if (wc == WEOF) + return false; + + wint_t uwc = towupper (wc); + if (uwc != wc) + { + char s[MB_LEN_MAX]; + mbstate_t mb_state = { 0 }; + size_t len = wcrtomb (s, uwc, &mb_state); + if (len > 1 && len != (size_t) -1) + return false; + } + + wint_t lwc = towlower (uwc); + if (lwc != uwc && lwc != wc && towupper (lwc) == uwc) + { + char s[MB_LEN_MAX]; + mbstate_t mb_state = { 0 }; + size_t len = wcrtomb (s, lwc, &mb_state); + if (len > 1 && len != (size_t) -1) + return false; + } + + for (size_t j = 0; lonesome_lower[j]; j++) + { + wint_t li = lonesome_lower[j]; + if (li != lwc && li != uwc && li != wc && towupper (li) == uwc) + { + char s[MB_LEN_MAX]; + mbstate_t mb_state = { 0 }; + size_t len = wcrtomb (s, li, &mb_state); + if (len > 1 && len != (size_t) -1) + return false; + } + } + } + + return true; +} + /* Change a pattern for fgrep into grep. */ static void -fgrep_to_grep_pattern (size_t len, char const *keys, - size_t *new_len, char **new_keys) +fgrep_to_grep_pattern (char **keys_p, size_t *len_p) { - char *p = *new_keys = xnmalloc (len + 1, 2); + char *keys, *new_keys, *p; mbstate_t mb_state = { 0 }; - size_t n; + size_t len, n; + + len = *len_p; + keys = *keys_p; + + new_keys = xnmalloc (len + 1, 2); + p = new_keys; for (; len; keys += n, len -= n) { @@ -2300,7 +2377,13 @@ fgrep_to_grep_pattern (size_t len, char const *keys, } } - *new_len = p - *new_keys; + free (*keys_p); + *keys_p = new_keys; + *len_p = p - new_keys; + + matcher = "grep"; + compile = Gcompile; + execute = EGexecute; } int @@ -2733,20 +2816,17 @@ main (int argc, char **argv) In a multibyte locale, switch from fgrep to grep if either (1) case is ignored (where grep is typically faster), or (2) the pattern has an encoding error (where fgrep might not work). */ - if (compile == Fcompile - && (MB_CUR_MAX <= 1 - ? match_words - : match_icase || contains_encoding_error (keys, keycc))) + if (compile == Fcompile) { - size_t new_keycc; - char *new_keys; - fgrep_to_grep_pattern (keycc, keys, &new_keycc, &new_keys); - free (keys); - keys = new_keys; - keycc = new_keycc; - matcher = "grep"; - compile = Gcompile; - execute = EGexecute; + if (MB_CUR_MAX > 1) + { + if (contains_encoding_error (keys, keycc)) + fgrep_to_grep_pattern (&keys, &keycc); + else if (match_icase && !fgrep_icase_available (keys, keycc)) + fgrep_to_grep_pattern (&keys, &keycc); + } + else if (match_words) + fgrep_to_grep_pattern (&keys, &keycc); } compile (keys, keycc); diff --git a/src/kwsearch.c b/src/kwsearch.c index 508ebc5..7fe08aa 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size) { size_t total = size; - kwsinit (&kwset); + kwsinit (&kwset, true); char const *p = pattern; do diff --git a/src/search.h b/src/search.h index 431a67d..534a49e 100644 --- a/src/search.h +++ b/src/search.h @@ -47,7 +47,7 @@ _GL_INLINE_HEADER_BEGIN typedef signed char mb_len_map_t; /* searchutils.c */ -extern void kwsinit (kwset_t *); +extern void kwsinit (kwset_t *, bool); extern ptrdiff_t mb_goback (char const **, char const *, char const *); extern wint_t mb_prev_wc (char const *, char const *, char const *); extern wint_t mb_next_wc (char const *, char const *); diff --git a/src/searchutils.c b/src/searchutils.c index 8081d41..87f51a4 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -25,15 +25,33 @@ #define NCHAR (UCHAR_MAX + 1) void -kwsinit (kwset_t *kwset) +kwsinit (kwset_t *kwset, bool mb_trans) { static char trans[NCHAR]; int i; - if (match_icase && MB_CUR_MAX == 1) + if (match_icase && (MB_CUR_MAX == 1 || mb_trans)) { - for (i = 0; i < NCHAR; ++i) - trans[i] = toupper (i); + if (MB_CUR_MAX == 1) + for (i = 0; i < NCHAR; ++i) + trans[i] = toupper (i); + else + for (i = 0; i < NCHAR; ++i) + { + wint_t wc = localeinfo.sbctowc[i]; + wint_t uwc = towupper (wc); + if (uwc != wc) + { + char s[MB_LEN_MAX]; + mbstate_t mbs = { 0 }; + size_t len = wcrtomb (s, uwc, &mbs); + if (len > 1) + abort (); + trans[i] = s[0]; + } + else + trans[i] = i; + } *kwset = kwsalloc (trans, false); } -- 2.7.4