From 0b5084286c23e75139cc09e02c1ad8495059eb38 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Wed, 30 Apr 2014 11:22:27 +0900 Subject: [PATCH] grep: fix the different behaviour for an invalid sequence between KWset and DFA * src/dfa.c (ctok): Define new global variable. (dfambcache): Don't cache invalid sequences, because they can't be expressed with any wide character. (mbs_to_wchar): Return WEOF for invalid sequences. (parse_bracket_exp): Fix it. (lex): Set `ctok'. (atom, match_anychar, match_mb_charset): Fix it. * src/searchutils.c (is_mb_middle): Fix it. * tests/prefix-of-multibyte: Fix it. --- src/dfa.c | 101 ++++++++++++++++++++++++++++------------------ src/kwsearch.c | 7 +--- src/searchutils.c | 2 +- tests/prefix-of-multibyte | 12 ++++-- 4 files changed, 72 insertions(+), 50 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 362de2c..c83a940 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -456,9 +456,13 @@ dfambcache (struct dfa *d) wint_t wi; switch (mbrtowc (&wc, &c, 1, &s)) { - default: wi = wc; break; - case (size_t) -2: wi = WEOF; break; - case (size_t) -1: wi = uc; break; + default: + wi = wc; + break; + case (size_t) -1: + case (size_t) -2: + wi = WEOF; + break; } d->mbrtowc_cache[uc] = wi; } @@ -492,7 +496,6 @@ mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d) if (0 < nbytes && nbytes < (size_t) -2) return nbytes; memset (&d->mbs, 0, sizeof d->mbs); - wc = uc; } *pwc = wc; @@ -847,6 +850,8 @@ static int cur_mb_len = 1; /* Length of the multibyte representation of /* These variables are used only if (MB_CUR_MAX > 1). */ static wchar_t wctok; /* Wide character representation of the current multibyte character. */ +static unsigned int ctok; /* Single character representation of the current + multibyte character. */ /* Note that characters become unsigned here. */ @@ -1128,19 +1133,22 @@ parse_bracket_exp (void) to the pair of ranges, [m-z] [M-Z]. Although this code is wrong in multiple ways, it's never used in practice. 
FIXME: Remove this (and related) unused code. */ - work_mbc->ranges - = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2, - &ranges_al, sizeof *work_mbc->ranges); - work_mbc->ranges[work_mbc->nranges].beg - = case_fold ? towlower (wc) : wc; - work_mbc->ranges[work_mbc->nranges++].end - = case_fold ? towlower (wc2) : wc2; - - if (case_fold && (iswalpha (wc) || iswalpha (wc2))) + if (wc != WEOF && wc2 != WEOF) { - work_mbc->ranges[work_mbc->nranges].beg = towupper (wc); + work_mbc->ranges + = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2, + &ranges_al, sizeof *work_mbc->ranges); + work_mbc->ranges[work_mbc->nranges].beg + = case_fold ? towlower (wc) : wc; work_mbc->ranges[work_mbc->nranges++].end - = towupper (wc2); + = case_fold ? towlower (wc2) : wc2; + + if (case_fold && (iswalpha (wc) || iswalpha (wc2))) + { + work_mbc->ranges[work_mbc->nranges].beg = towupper (wc); + work_mbc->ranges[work_mbc->nranges++].end + = towupper (wc2); + } } } else if (using_simple_locale ()) @@ -1184,23 +1192,28 @@ parse_bracket_exp (void) continue; } - if (case_fold) - { - wchar_t folded[CASE_FOLDED_BUFSIZE]; - int i, n = case_folded_counterparts (wc, folded); - work_mbc->chars = maybe_realloc (work_mbc->chars, - work_mbc->nchars + n, &chars_al, - sizeof *work_mbc->chars); - for (i = 0; i < n; i++) - if (!setbit_wc (folded[i], ccl)) - work_mbc->chars[work_mbc->nchars++] = folded[i]; - } - if (!setbit_wc (wc, ccl)) + if (wc != WEOF) { - work_mbc->chars = maybe_realloc (work_mbc->chars, work_mbc->nchars, - &chars_al, sizeof *work_mbc->chars); - work_mbc->chars[work_mbc->nchars++] = wc; + if (case_fold) + { + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wc, folded); + work_mbc->chars = maybe_realloc (work_mbc->chars, + work_mbc->nchars + n, &chars_al, + sizeof *work_mbc->chars); + for (i = 0; i < n; i++) + if (!setbit_wc (folded[i], ccl)) + work_mbc->chars[work_mbc->nchars++] = folded[i]; + } + else if (!setbit_wc (wc, ccl)) + { + 
work_mbc->chars = maybe_realloc (work_mbc->chars, work_mbc->nchars, + &chars_al, sizeof *work_mbc->chars); + work_mbc->chars[work_mbc->nchars++] = wc; + } } + else + setbit (c, ccl); } while ((wc = wc1, (c = c1) != ']')); @@ -1245,7 +1258,8 @@ lex (void) "if (backslash) ...". */ for (i = 0; i < 2; ++i) { - FETCH_WC (c, wctok, NULL); + FETCH_WC (ctok, wctok, NULL); + c = ctok; if (c == (unsigned int) EOF) goto normal_char; @@ -1776,18 +1790,23 @@ atom (void) { if (tok == WCHAR) { - addtok_wc (wctok); - - if (case_fold) + if (wctok != WEOF) { - wchar_t folded[CASE_FOLDED_BUFSIZE]; - int i, n = case_folded_counterparts (wctok, folded); - for (i = 0; i < n; i++) + addtok_wc (wctok); + + if (case_fold) { - addtok_wc (folded[i]); - addtok (OR); + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wctok, folded); + for (i = 0; i < n; i++) + { + addtok_wc (folded[i]); + addtok (OR); + } } } + else + addtok_mb (ctok, 3); tok = lex (); } @@ -2949,6 +2968,8 @@ match_anychar (struct dfa *d, state_num s, position pos, if (syntax_bits & RE_DOT_NOT_NULL) return 0; } + else if (wc == WEOF) + return 0; context = wchar_context (wc); if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context)) @@ -2985,6 +3006,8 @@ match_mb_charset (struct dfa *d, state_num s, position pos, if (syntax_bits & RE_DOT_NOT_NULL) return 0; } + else if (wc == WEOF) + return 0; context = wchar_context (wc); if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context)) diff --git a/src/kwsearch.c b/src/kwsearch.c index 7c64c86..46569e9 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -131,12 +131,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size, { /* The match was a part of multibyte character, advance at least one byte to ensure no infinite loop happens. 
*/ - mbstate_t s; - memset (&s, 0, sizeof s); - size_t mb_len = mbrlen (mb_start, (buf + size) - (beg + offset), &s); - if (mb_len == (size_t) -2 || mb_len == (size_t) -1) - goto failure; - beg = mb_start + mb_len - 1; + beg = mb_start; continue; } beg += offset; diff --git a/src/searchutils.c b/src/searchutils.c index 6440f07..ea26a70 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -270,5 +270,5 @@ is_mb_middle (const char **good, const char *buf, const char *end, return true; /* P == BUF here. */ - return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state); + return false; } diff --git a/tests/prefix-of-multibyte b/tests/prefix-of-multibyte index b15fa9b..70a924e 100755 --- a/tests/prefix-of-multibyte +++ b/tests/prefix-of-multibyte @@ -1,5 +1,5 @@ #!/bin/sh -# This would mistakenly print a line prior to grep-2.6.2. +# This would mistakenly print a line prior to grep-2.18. . "${srcdir=.}/init.sh"; path_prepend_ ../src require_en_utf8_locale_ @@ -7,14 +7,18 @@ require_compiled_in_MB_support encode() { echo "$1" | tr ABC '\357\274\241'; } +encode ABC >exp1 +encode aABC >exp2 + fail=0 for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do for opt in '' '-F'; do out=out-$opt-$LOC - encode ABC | LC_ALL=$LOC grep $opt "$(encode A)" > $out 2>&1 - test $? = 1 || fail=1 - compare /dev/null $out || fail=1 + LC_ALL=$LOC grep $opt "$(encode A)" exp1 >$out || fail=1 + compare exp1 $out || fail=1 + LC_ALL=$LOC grep $opt "$(encode aA)" exp2 >$out || fail=1 + compare exp2 $out || fail=1 done done -- 1.9.2