From 1a3676d1a47da68ebc894a0569f19bb947211dfa Mon Sep 17 00:00:00 2001 From: Paul Eggert
Date: Mon, 16 Jan 2017 12:13:50 -0800 Subject: [PATCH 2/4] Improve -i performance in typical UTF-8 searches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ‘grep -i i’ is slow in a UTF-8 locale, because ‘i’ in the pattern matches the two-byte character 'ı' (U+0131, LATIN SMALL LETTER DOTLESS I) in data, and kwset handles only single-byte character translations, so grep falls back on a slower DFA-based search for all searches. Improve -i performance in the typical case by using kwset when data are free of troublesome characters like 'ı', falling back on the DFA only when data contain troublesome characters. * src/dfasearch.c (GEAcompile): * src/grep.c (compile_fp_t): * src/kwsearch.c (Fcompile): * src/pcresearch.c (Pcompile): Pattern arg is now char *, not char const *, since Fcompile now reallocates it sometimes. * src/grep.c (all_single_byte_after_folding): Remove. All callers removed. (fgrep_icase_charlen): New function. (fgrep_icase_available, try_fgrep_pattern): Use it, for more-generous semantics. (fgrep_to_grep_pattern): Now extern. (main): Do not free keys, since Fexecute may use them. * src/kwsearch.c (struct kwsearch): New struct. (Fcompile): Return it. If -i, be more generous about patterns. (Fexecute): Use it. Fall back on DFA when the data contain troublesome characters; this should be rare in practice. * src/kwset.c, src/kwset.h (kwswords): New function. --- src/dfasearch.c | 2 +- src/grep.c | 93 +++++++++++++++++++++++++++++++------------------------- src/kwsearch.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- src/kwset.c | 6 ++++ src/kwset.h | 1 + src/pcresearch.c | 2 +- src/search.h | 7 +++-- 7 files changed, 153 insertions(+), 50 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 9b32d48..542e7f1 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -104,7 +104,7 @@ kwsmusts (struct dfa_comp *dc) } void * -GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) +GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits) { char *motif; struct dfa_comp *dc = xcalloc (1, sizeof (*dc)); diff --git a/src/grep.c b/src/grep.c index 0a674ec..81654c3 100644 --- a/src/grep.c +++ b/src/grep.c @@ -575,7 +575,7 @@ static bool seek_failed; static bool seek_data_failed; /* Functions we'll use to search. */ -typedef void *(*compile_fp_t) (char const *, size_t, reg_syntax_t); +typedef void *(*compile_fp_t) (char *, size_t, reg_syntax_t); typedef size_t (*execute_fp_t) (void *, char const *, size_t, size_t *, char const *); static execute_fp_t execute; @@ -2254,54 +2254,58 @@ contains_encoding_error (char const *pat, size_t patlen) return false; } -/* Return true if the set of single-byte characters USED contains only - characters whose case counterparts are also single-byte. */ +/* Return the number of bytes in the initial character of PAT, of size + PATLEN, if Fcompile can handle that character. Return -1 if + Fcompile cannot handle it. MBS is the multibyte conversion state. -static bool -all_single_byte_after_folding (bool used[UCHAR_MAX + 1]) -{ - for (int c = 0; c <= UCHAR_MAX; c++) - if (used[c]) - { - wint_t wc = localeinfo.sbctowc[c]; - wchar_t folded[CASE_FOLDED_BUFSIZE]; - size_t nfolded = case_folded_counterparts (wc, folded); - - for (size_t i = 0; i < nfolded; i++) - { - char s[MB_LEN_MAX]; - mbstate_t mb_state = { 0 }; - if (1 < wcrtomb (s, folded[i], &mb_state)) - return false; - } - } + Fcompile can handle a character C if C is single-byte, or if C has no + case folded counterparts and toupper translates none of its bytes. */ - return true; +static int +fgrep_icase_charlen (char const *pat, size_t patlen, mbstate_t *mbs) +{ + int n = localeinfo.sbclen[to_uchar (*pat)]; + if (n < 0) + { + wchar_t wc; + wchar_t folded[CASE_FOLDED_BUFSIZE]; + size_t wn = mbrtowc (&wc, pat, patlen, mbs); + if (MB_LEN_MAX < wn || case_folded_counterparts (wc, folded)) + return -1; + for (int i = wn; 0 < --i; ) + { + unsigned char c = pat[i]; + if (toupper (c) != c) + return -1; + } + n = wn; + } + return n; } -/* Return true if the -F patterns PAT, of size PATLEN, match only - single-byte characters including case folding, and so can be - processed by the -F matcher even if -i is given. */ +/* Return true if the -F patterns PAT, of size PATLEN, contain only + single-byte characters or characters not subject to case folding, + and so can be processed by Fcompile. */ static bool fgrep_icase_available (char const *pat, size_t patlen) { - bool used[UCHAR_MAX + 1] = { 0, }; + mbstate_t mbs = {0,}; - for (size_t i = 0; i < patlen; i++) + for (size_t i = 0; i < patlen; ) { - unsigned char c = pat[i]; - if (localeinfo.sbctowc[c] == WEOF) + int n = fgrep_icase_charlen (pat + i, patlen - i, &mbs); + if (n < 0) return false; - used[c] = true; + i += n; } - return all_single_byte_after_folding (used); + return true; } /* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style. */ -static void +void fgrep_to_grep_pattern (char **keys_p, size_t *len_p) { size_t len = *len_p; @@ -2358,8 +2362,6 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p) char *new_keys = xmalloc (len + 1); char *p = new_keys; mbstate_t mb_state = { 0 }; - size_t mblen_lim = match_icase ? 1 : -3; - bool used[UCHAR_MAX + 1] = { 0, }; while (len != 0) { @@ -2396,19 +2398,27 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p) } { - size_t n = mb_clen (keys, len, &mb_state); - if (mblen_lim < n) - goto fail; - used[to_uchar (keys[0])] = true; + size_t n; + if (match_icase) + { + int ni = fgrep_icase_charlen (keys, len, &mb_state); + if (ni < 0) + goto fail; + n = ni; + } + else + { + n = mb_clen (keys, len, &mb_state); + if (MB_LEN_MAX < n) + goto fail; + } + p = mempcpy (p, keys, n); keys += n; len -= n; } } - if (mblen_lim == 1 && !all_single_byte_after_folding (used)) - goto fail; - if (*len_p != p - new_keys) { *len_p = p - new_keys; @@ -2878,7 +2888,6 @@ main (int argc, char **argv) execute = matchers[matcher].execute; compiled_pattern = matchers[matcher].compile (keys, keycc, matchers[matcher].syntax); - free (keys); /* We need one byte prior and one after. */ char eolbytes[3] = { 0, eolbyte, 0 }; size_t match_size; diff --git a/src/kwsearch.c b/src/kwsearch.c index 5e9df16..30a027c 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -21,8 +21,33 @@ #include