From f804d7afbf75f08fd209ad08358bf76c18cc1d56 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 25 Jan 2014 16:19:37 -0800 Subject: [PATCH 1/5] dfa: remove GREP-ifdef'd code in favor of code used by gawk For many years, gawk and grep have used different #ifdef'd bits of code relating to how the DFA matcher matches multibyte characters. Remove the GREP-specific code in favor of the code gawk uses. This permits us to avoid still more cases in which grep must resort to the expensive process of copying/case-converting each input line before matching against a case-converted regexp. * src/dfa.c (parse_bracket_exp, atom): As above. --- src/dfa.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 5e3140d..b79c604 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1094,7 +1094,6 @@ parse_bracket_exp (void) work_mbc->range_ends[work_mbc->nranges++] = case_fold ? towlower (wc2) : (wchar_t) wc2; -#ifndef GREP if (case_fold && (iswalpha (wc) || iswalpha (wc2))) { REALLOC_IF_NECESSARY (work_mbc->range_sts, @@ -1104,7 +1103,6 @@ parse_bracket_exp (void) range_ends_al, work_mbc->nranges + 1); work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); } -#endif } else { @@ -1140,11 +1138,7 @@ parse_bracket_exp (void) work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = wc; } -#ifdef GREP - continue; -#else wc = towupper (wc); -#endif } if (!setbit_wc (wc, ccl)) { @@ -1738,13 +1732,11 @@ atom (void) else if (MBS_SUPPORT && tok == WCHAR) { addtok_wc (case_fold ? towlower (wctok) : wctok); -#ifndef GREP if (case_fold && iswalpha (wctok)) { addtok_wc (towupper (wctok)); addtok (OR); } -#endif tok = lex (); } -- 1.8.5.3.321.g14598b9 From ada1ba7a047db001432cc3d72707c099828ed338 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 13 Jan 2014 08:42:25 -0800 Subject: [PATCH 2/5] dfasearch: skip kwset optimization when multi-byte+case-insensitive Now that DFA searching works with multi-byte locales, the only remaining reason to case-convert the searched input is the kwset optimization. But multi-byte case-conversion is so expensive that it's not worthwhile even to attempt that optimization. * src/dfasearch.c (kwsmusts): Skip this function in ignore-case mode when the locale is multi-byte. (EGexecute): Now that this code need not handle multi-byte case-ignoring matches, remove the expensive copy/case-conversion code. --- src/dfasearch.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 46581ff..0b0f506 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -95,6 +95,12 @@ kwsmusts (void) struct dfamust const *dm; char const *err; + /* With case-insensitive matching in a multi-byte locale, do not + use kwsearch, because in that case, it would be too expensive, + requiring that we case-convert all searched input. */ + if (MB_CUR_MAX > 1 && match_icase) + return; + dm = dfamusts (dfa); if (dm) { @@ -220,19 +226,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size, size_t i, ret_val; mb_len_map_t *map = NULL; - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - /* mbtolower adds a NUL byte at the end. That will provide - space for the sentinel byte dfaexec may add. */ - char *case_buf = mbtolower (buf, &size, &map); - if (start_ptr) - start_ptr = case_buf + (start_ptr - buf); - buf = case_buf; - } - } - mb_start = buf; buflim = buf + size; -- 1.8.5.3.321.g14598b9 From 3893ea4a026399e942c4af5b0a2c71a071296686 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Tue, 14 Jan 2014 07:36:09 -0800 Subject: [PATCH 3/5] also remove call to mb_case_map_apply * src/dfasearch.c (EGexecute): With no case-converted buffer, there is no longer any need to call this length/offset mapping function. --- src/dfasearch.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 0b0f506..f640b9b 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -223,8 +223,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, regoff_t start; size_t len, best_len; struct kwsmatch kwsm; - size_t i, ret_val; - mb_len_map_t *map = NULL; + size_t i; mb_start = buf; buflim = buf + size; @@ -411,8 +410,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size, len = end - beg; success_in_len:; size_t off = beg - buf; - mb_case_map_apply (map, &off, &len); *match_size = len; - ret_val = off; - return ret_val; + return off; } -- 1.8.5.3.321.g14598b9 From 26531c30f83e6277d7a8ec9de249f41d03f64e5b Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Tue, 14 Jan 2014 07:53:50 -0800 Subject: [PATCH 4/5] test more * tests/turkish-eyes: Test all of -E, -F and -G. --- tests/turkish-eyes | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/turkish-eyes b/tests/turkish-eyes index 68301e7..453e30f 100755 --- a/tests/turkish-eyes +++ b/tests/turkish-eyes @@ -38,7 +38,9 @@ i=$(printf '\304\261') # lowercase dotless i search_str="$i:i I:$I" printf "$data\n" > in || framework_failure_ -LC_ALL=$L grep -i "^$search_str\$" in > out || fail=1 -compare out in || fail=1 +for opt in -E -F -G; do + LC_ALL=$L grep $opt -i "^$search_str\$" in > out || fail=1 + compare out in || fail=1 +done Exit $fail -- 1.8.5.3.321.g14598b9 From e269b64fd335eb73e28a5a2e61bf67e9459cce5a Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Sat, 25 Jan 2014 16:47:17 -0800 Subject: [PATCH 5/5] maint: move two local variable declarations * src/dfasearch.c (kwsmusts): Move one declaration down to the point of definition. Move another into the sole scope where it is used. --- src/dfasearch.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index f640b9b..634a082 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -92,18 +92,16 @@ kwsincr_case (const char *must) static void kwsmusts (void) { - struct dfamust const *dm; - char const *err; - /* With case-insensitive matching in a multi-byte locale, do not use kwsearch, because in that case, it would be too expensive, requiring that we case-convert all searched input. */ if (MB_CUR_MAX > 1 && match_icase) return; - dm = dfamusts (dfa); + struct dfamust const *dm = dfamusts (dfa); if (dm) { + char const *err; kwsinit (&kwset); /* First, we compile in the substrings known to be exact matches. The kwset matcher will return the index -- 1.8.5.3.321.g14598b9