From 3ba4fad7c591fd482ac6bfee1f7a2533aef1f611 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 11 Oct 2014 11:38:09 +0900 Subject: [PATCH] dfa: treat a multibyte character even with constraints correctly * src/dfa.c (struct dfa): Add new members `min_trcount', `initstate_letter' and `initstate_others'. (dfaanalyze): Build initial states not only for the newline context but also for the others. (build_state): Don't release initial states. (dfaexec_main): If multiple initial states exist, transition from one to another after skipping a position in the middle of a multibyte character. * tests/euc-mb: Add a new test. * NEWS (Bug fixes): Mention it. --- NEWS | 4 ++++ src/dfa.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------------ tests/euc-mb | 1 + 3 files changed, 60 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index 07a5d54..f454068 100644 --- a/NEWS +++ b/NEWS @@ -38,6 +38,10 @@ GNU grep NEWS -*- outline -*- implying that the match, "10" was on line 1. [bug introduced in grep-2.19] + grep would mistakenly match in the middle of a multibyte character when + using '^' in a pattern in non-UTF8 multibyte locales, leading it to + print lines that did not match. + grep -E rejected unmatched ')', instead of treating it like '\)'. [bug present since "the beginning"] diff --git a/src/dfa.c b/src/dfa.c index 58a4b83..9899749 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -405,6 +405,10 @@ struct dfa slots so far, not counting trans[-1]. */ int trcount; /* Number of transition tables that have actually been built. */ + int min_trcount; /* Minimum number of transition tables. + Always keep this many, even when releasing + transition tables. It is also the number of + initial states. */ state_num **trans; /* Transition tables for states that can never accept. If the transitions for a state have not yet been computed, or the @@ -423,6 +427,8 @@ struct dfa newline is stored separately and handled as a special case. Newline is also used as a sentinel at the end of the buffer. 
*/ + state_num initstate_letter; /* Initial state for letter context. */ + state_num initstate_others; /* Initial state for other contexts. */ struct dfamust *musts; /* List of strings, at least one of which is known to appear in any r.e. matching the dfa. */ @@ -2517,9 +2523,16 @@ dfaanalyze (struct dfa *d, int searchflag) /* Build the initial state. */ separate_contexts = state_separate_contexts (&merged); - state_index (d, &merged, - (separate_contexts & CTX_NEWLINE - ? CTX_NEWLINE : separate_contexts ^ CTX_ANY)); + if (separate_contexts & CTX_NEWLINE) + state_index (d, &merged, CTX_NEWLINE); + d->initstate_others = d->min_trcount + = state_index (d, &merged, separate_contexts ^ CTX_ANY); + if (separate_contexts & CTX_LETTER) + d->initstate_letter = d->min_trcount + = state_index (d, &merged, CTX_LETTER); + else + d->initstate_letter = d->initstate_others; + d->min_trcount++; free (posalloc); free (stkalloc); @@ -2859,13 +2872,13 @@ build_state (state_num s, struct dfa *d) not clear the initial state, as it's always used. */ if (d->trcount >= 1024) { - for (i = 1; i < d->tralloc; ++i) + for (i = d->min_trcount; i < d->tralloc; ++i) { free (d->trans[i]); free (d->fails[i]); d->trans[i] = d->fails[i] = NULL; } - d->trcount = 1; + d->trcount = d->min_trcount; } ++d->trcount; @@ -3316,20 +3329,49 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, { s1 = s; - if (s == 0) + if (s < d->min_trcount) { - if (d->states[s].mbps.nelem == 0) + if (d->min_trcount == 1) { - do + if (d->states[s].mbps.nelem == 0) { - while (t[*p] == 0) - p++; - p = mbp = skip_remains_mb (d, p, mbp, end); + do + { + while (t[*p] == 0) + p++; + p = mbp = skip_remains_mb (d, p, mbp, end); + } + while (t[*p] == 0); } - while (t[*p] == 0); + else + p = mbp = skip_remains_mb (d, p, mbp, end); } else - p = mbp = skip_remains_mb (d, p, mbp, end); + { + mbp = skip_remains_mb (d, p, mbp, end); + + /* If d->min_trcount is greater than 1, we may need to + transition to another initial state after the skip. 
*/ + if (p < mbp) + { + if (*p == eol) + s = 0; + else if (d->initstate_letter == d->initstate_others) + s = d->initstate_others; + else + { + wint_t wc; + mbs_to_wchar (&wc, (char const *) p, + (unsigned char *) end - p, d); + if (wchar_context (wc)) + s = d->initstate_letter; + else + s = d->initstate_others; + } + p = mbp; + s1 = s; + } + } } if (d->states[s].mbps.nelem == 0) diff --git a/tests/euc-mb b/tests/euc-mb index 6a9a845..b625046 100755 --- a/tests/euc-mb +++ b/tests/euc-mb @@ -39,6 +39,7 @@ make_input BABAAB |euc_grep AB > out || fail=1 make_input BABAAB > exp || framework_failure_ compare exp out || fail=1 make_input BABABA |euc_grep AB; test $? = 1 || fail=1 +make_input BABABA |euc_grep '^x\|AB'; test $? = 1 || fail=1 # -P supports only unibyte and UTF-8 locales. LC_ALL=$locale grep -P x /dev/null -- 2.1.1