From a5540fa9f5e5b9339afe59b3d8e1b3b4791397e4 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 27 Mar 2014 21:34:42 +0900 Subject: [PATCH] grep: perform the kwset-helping DFA match in narrower range When kwsexec gives us the offset of a potential match, we compute line begin/end and then run the DFA matcher to see if there really is a match on that line. When the beginning of the line, BEG, is not on a multibyte character boundary, start the DFA search from the next such boundary instead of from BEG. * src/dfasearch.c (EGexecute): As above. Add a comment. --- src/dfasearch.c | 12 ++++++++++-- tests/euc-mb | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 0b56960..4301fe7 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -247,9 +247,17 @@ EGexecute (char const *buf, size_t size, size_t *match_size, || !is_mb_middle (&mb_start, match, buflim, kwsm.size[0])) goto success; + /* The matched line starts in the middle of a multibyte + character. Then we start DFA search from the + beginning of the next character. */ + if (dfaexec (dfa, mb_start, (char *) end, 0, NULL, &backref) == NULL) + continue; + } + else + { + if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) + continue; } - if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) - continue; } else { diff --git a/tests/euc-mb b/tests/euc-mb index c0af220..90111d1 100755 --- a/tests/euc-mb +++ b/tests/euc-mb @@ -30,7 +30,10 @@ fail=0 # Does EUC-JP work at all? make_input BABA |euc_grep AB && fail=1 -# Whole line rejected after matching in the middle of a multibyte char? +# After matching in the middle of a multibyte char by KWSet, +# whole line doesn't be rejected immediately and boundaries of multibyte +# chars following it are correctly recognized by DFA matcher? make_input BABAAB |euc_grep AB || fail=1 +make_input BABABA |euc_grep AB && fail=1 Exit $fail -- 1.9.1