From 1a3676d1a47da68ebc894a0569f19bb947211dfa Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 16 Jan 2017 12:13:50 -0800 Subject: [PATCH 2/4] Improve -i performance in typical UTF-8 searches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ‘grep -i i’ is slow in a UTF-8 locale, because ‘i’ in the pattern matches the two-byte character 'ı' (U+0131, LATIN SMALL LETTER DOTLESS I) in data, and kwset handles only single-byte character translations, so grep falls back on a slower DFA-based search for all searches. Improve -i performance in the typical case by using kwset when data are free of troublesome characters like 'ı', falling back on the DFA only when data contain troublesome characters. * src/dfasearch.c (GEAcompile): * src/grep.c (compile_fp_t): * src/kwsearch.c (Fcompile): * src/pcresearch.c (Pcompile): Pattern arg is now char *, not char const *, since Fcompile now reallocates it sometimes. * src/grep.c (all_single_byte_after_folding): Remove. All callers removed. (fgrep_icase_charlen): New function. (fgrep_icase_available, try_fgrep_pattern): Use it, for more-generous semantics. (fgrep_to_grep_pattern): Now extern. (main): Do not free keys, since Fexecute may use them. * src/kwsearch.c (struct kwsearch): New struct. (Fcompile): Return it. If -i, be more generous about patterns. (Fexecute): Use it. Fall back on DFA when the data contain troublesome characters; this should be rare in practice. * src/kwset.c, src/kwset.h (kwswords): New function. --- src/dfasearch.c | 2 +- src/grep.c | 93 +++++++++++++++++++++++++++++++------------------------- src/kwsearch.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- src/kwset.c | 6 ++++ src/kwset.h | 1 + src/pcresearch.c | 2 +- src/search.h | 7 +++-- 7 files changed, 153 insertions(+), 50 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 9b32d48..542e7f1 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -104,7 +104,7 @@ kwsmusts (struct dfa_comp *dc) } void * -GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) +GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits) { char *motif; struct dfa_comp *dc = xcalloc (1, sizeof (*dc)); diff --git a/src/grep.c b/src/grep.c index 0a674ec..81654c3 100644 --- a/src/grep.c +++ b/src/grep.c @@ -575,7 +575,7 @@ static bool seek_failed; static bool seek_data_failed; /* Functions we'll use to search. */ -typedef void *(*compile_fp_t) (char const *, size_t, reg_syntax_t); +typedef void *(*compile_fp_t) (char *, size_t, reg_syntax_t); typedef size_t (*execute_fp_t) (void *, char const *, size_t, size_t *, char const *); static execute_fp_t execute; @@ -2254,54 +2254,58 @@ contains_encoding_error (char const *pat, size_t patlen) return false; } -/* Return true if the set of single-byte characters USED contains only - characters whose case counterparts are also single-byte. */ +/* Return the number of bytes in the initial character of PAT, of size + PATLEN, if Fcompile can handle that character. Return -1 if + Fcompile cannot handle it. MBS is the multibyte conversion state. -static bool -all_single_byte_after_folding (bool used[UCHAR_MAX + 1]) -{ - for (int c = 0; c <= UCHAR_MAX; c++) - if (used[c]) - { - wint_t wc = localeinfo.sbctowc[c]; - wchar_t folded[CASE_FOLDED_BUFSIZE]; - size_t nfolded = case_folded_counterparts (wc, folded); - - for (size_t i = 0; i < nfolded; i++) - { - char s[MB_LEN_MAX]; - mbstate_t mb_state = { 0 }; - if (1 < wcrtomb (s, folded[i], &mb_state)) - return false; - } - } + Fcompile can handle a character C if C is single-byte, or if C has no + case folded counterparts and toupper translates none of its bytes. */ - return true; +static int +fgrep_icase_charlen (char const *pat, size_t patlen, mbstate_t *mbs) +{ + int n = localeinfo.sbclen[to_uchar (*pat)]; + if (n < 0) + { + wchar_t wc; + wchar_t folded[CASE_FOLDED_BUFSIZE]; + size_t wn = mbrtowc (&wc, pat, patlen, mbs); + if (MB_LEN_MAX < wn || case_folded_counterparts (wc, folded)) + return -1; + for (int i = wn; 0 < --i; ) + { + unsigned char c = pat[i]; + if (toupper (c) != c) + return -1; + } + n = wn; + } + return n; } -/* Return true if the -F patterns PAT, of size PATLEN, match only - single-byte characters including case folding, and so can be - processed by the -F matcher even if -i is given. */ +/* Return true if the -F patterns PAT, of size PATLEN, contain only + single-byte characters or characters not subject to case folding, + and so can be processed by Fcompile. */ static bool fgrep_icase_available (char const *pat, size_t patlen) { - bool used[UCHAR_MAX + 1] = { 0, }; + mbstate_t mbs = {0,}; - for (size_t i = 0; i < patlen; i++) + for (size_t i = 0; i < patlen; ) { - unsigned char c = pat[i]; - if (localeinfo.sbctowc[c] == WEOF) + int n = fgrep_icase_charlen (pat + i, patlen - i, &mbs); + if (n < 0) return false; - used[c] = true; + i += n; } - return all_single_byte_after_folding (used); + return true; } /* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style. */ -static void +void fgrep_to_grep_pattern (char **keys_p, size_t *len_p) { size_t len = *len_p; @@ -2358,8 +2362,6 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p) char *new_keys = xmalloc (len + 1); char *p = new_keys; mbstate_t mb_state = { 0 }; - size_t mblen_lim = match_icase ? 1 : -3; - bool used[UCHAR_MAX + 1] = { 0, }; while (len != 0) { @@ -2396,19 +2398,27 @@ try_fgrep_pattern (int matcher, char *keys, size_t *len_p) } { - size_t n = mb_clen (keys, len, &mb_state); - if (mblen_lim < n) - goto fail; - used[to_uchar (keys[0])] = true; + size_t n; + if (match_icase) + { + int ni = fgrep_icase_charlen (keys, len, &mb_state); + if (ni < 0) + goto fail; + n = ni; + } + else + { + n = mb_clen (keys, len, &mb_state); + if (MB_LEN_MAX < n) + goto fail; + } + p = mempcpy (p, keys, n); keys += n; len -= n; } } - if (mblen_lim == 1 && !all_single_byte_after_folding (used)) - goto fail; - if (*len_p != p - new_keys) { *len_p = p - new_keys; @@ -2878,7 +2888,6 @@ main (int argc, char **argv) execute = matchers[matcher].execute; compiled_pattern = matchers[matcher].compile (keys, keycc, matchers[matcher].syntax); - free (keys); /* We need one byte prior and one after. */ char eolbytes[3] = { 0, eolbyte, 0 }; size_t match_size; diff --git a/src/kwsearch.c b/src/kwsearch.c index 5e9df16..30a027c 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -21,8 +21,33 @@ #include #include "search.h" +/* A compiled -F pattern list. */ + +struct kwsearch +{ + /* The kwset for this pattern list. */ + kwset_t kwset; + + /* The number of user-specified patterns. This is less than + 'kwswords (kwset)' when some extra one-character words have been + appended, one for each troublesome character that will require a + DFA search. */ + ptrdiff_t words; + + /* The user's pattern and its size in bytes. */ + char *pattern; + size_t size; + + /* The user's pattern compiled as a regular expression, + or null if it has not been compiled. */ + void *re; +}; + +/* Compile the -F style PATTERN, containing SIZE bytes. Return a + description of the compiled pattern. */ + void * -Fcompile (char const *pattern, size_t size, reg_syntax_t ignored) +Fcompile (char *pattern, size_t size, reg_syntax_t ignored) { kwset_t kwset; ptrdiff_t total = size; @@ -74,11 +99,55 @@ Fcompile (char const *pattern, size_t size, reg_syntax_t ignored) while (p); free (buf); + ptrdiff_t words = kwswords (kwset); + + if (match_icase) + { + /* For each pattern character C that has a case folded + counterpart F that is multibyte and so cannot easily be + implemented via translating a single byte, append a pattern + containing just F. That way, if the data contains F, the + matcher can fall back on DFA. For example, if C is 'i' and + the locale is en_US.utf8, append a pattern containing just + the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that + Fexecute will use a DFA if the data contain U+0131. */ + mbstate_t mbs = { 0 }; + char checked[NCHAR] = {0,}; + for (p = pattern; p < pattern + size; p++) + { + unsigned char c = *p; + if (checked[c]) + continue; + checked[c] = true; + + wint_t wc = localeinfo.sbctowc[c]; + wchar_t folded[CASE_FOLDED_BUFSIZE]; + + for (int i = case_folded_counterparts (wc, folded); 0 <= --i; ) + { + char s[MB_LEN_MAX]; + int nbytes = wcrtomb (s, folded[i], &mbs); + if (1 < nbytes) + kwsincr (kwset, s, nbytes); + } + } + } + kwsprep (kwset); - return kwset; + struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch); + kwsearch->kwset = kwset; + kwsearch->words = words; + kwsearch->pattern = pattern; + kwsearch->size = size; + kwsearch->re = NULL; + return kwsearch; } +/* Use the compiled pattern VCP to search the buffer BUF of size SIZE. + If found, return the offset of the first match and store its + size into *MATCH_SIZE. If not found, return SIZE_MAX. + If START_PTR is nonnull, start searching there. */ size_t Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, char const *start_ptr) @@ -90,7 +159,8 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, size_t ret_val; bool mb_check; bool longest; - kwset_t kwset = vcp; + struct kwsearch *kwsearch = vcp; + kwset_t kwset = kwsearch->kwset; if (match_lines) mb_check = longest = false; @@ -108,6 +178,22 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, if (offset < 0) break; len = kwsmatch.size[0] - 2 * match_lines; + + if (kwsearch->words <= kwsmatch.index) + { + /* The data contain a multibyte character that matches + some pattern character that is a case folded counterpart. + Since the kwset code cannot handle this case, fall back + on the DFA code, which can. */ + if (! kwsearch->re) + { + fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size); + kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size, + RE_SYNTAX_GREP); + } + return EGexecute (kwsearch->re, buf, size, match_size, start_ptr); + } + if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0) { /* We have matched a single byte that is not at the beginning of a diff --git a/src/kwset.c b/src/kwset.c index 0b6fab4..77ef7c7 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -328,6 +328,12 @@ kwsincr (kwset_t kwset, char const *text, ptrdiff_t len) kwset->maxd = trie->depth; } +ptrdiff_t +kwswords (kwset_t kwset) +{ + return kwset->words; +} + /* Enqueue the trie nodes referenced from the given tree in the given queue. */ static void diff --git a/src/kwset.h b/src/kwset.h index ef78db3..5c78e54 100644 --- a/src/kwset.h +++ b/src/kwset.h @@ -36,6 +36,7 @@ typedef struct kwset *kwset_t; extern kwset_t kwsalloc (char const *, bool); extern void kwsincr (kwset_t, char const *, ptrdiff_t); +extern ptrdiff_t kwswords (kwset_t) _GL_ATTRIBUTE_PURE; extern void kwsprep (kwset_t); extern ptrdiff_t kwsexec (kwset_t, char const *, ptrdiff_t, struct kwsmatch *, bool) diff --git a/src/pcresearch.c b/src/pcresearch.c index 28144b4..703498c 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -91,7 +91,7 @@ jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes, #endif void * -Pcompile (char const *pattern, size_t size, reg_syntax_t ignored) +Pcompile (char *pattern, size_t size, reg_syntax_t ignored) { #if !HAVE_LIBPCRE die (EXIT_TROUBLE, 0, diff --git a/src/search.h b/src/search.h index c8bd5f2..8533d59 100644 --- a/src/search.h +++ b/src/search.h @@ -55,19 +55,20 @@ extern size_t wordchar_prev (char const *, char const *, char const *) extern ptrdiff_t mb_goback (char const **, char const *, char const *); /* dfasearch.c */ -extern void *GEAcompile (char const *, size_t, reg_syntax_t); +extern void *GEAcompile (char *, size_t, reg_syntax_t); extern size_t EGexecute (void *, char const *, size_t, size_t *, char const *); /* kwsearch.c */ -extern void *Fcompile (char const *, size_t, reg_syntax_t); +extern void *Fcompile (char *, size_t, reg_syntax_t); extern size_t Fexecute (void *, char const *, size_t, size_t *, char const *); /* pcresearch.c */ -extern void *Pcompile (char const *, size_t, reg_syntax_t); +extern void *Pcompile (char *, size_t, reg_syntax_t); extern size_t Pexecute (void *, char const *, size_t, size_t *, char const *); /* grep.c */ extern struct localeinfo localeinfo; +extern void fgrep_to_grep_pattern (char **, size_t *); /* Return the number of bytes in the character at the start of S, which is of size N. N must be positive. MBS is the conversion state. -- 2.9.3