From a5540fa9f5e5b9339afe59b3d8e1b3b4791397e4 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <address@hidden>
Date: Thu, 27 Mar 2014 21:34:42 +0900
Subject: [PATCH] grep: perform the kwset-helping DFA match in narrower range

When kwsexec gives us the offset of a potential match, we compute
line begin/end and then run the DFA matcher to see if there really
is a match on that line.  When the beginning of the line, BEG, is
not on a multibyte character boundary, start the DFA search from the
next such boundary instead.
* src/dfasearch.c (EGexecute): As above.  Add a comment.
---
 src/dfasearch.c | 12 ++++++++++--
 tests/euc-mb    |  5 ++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 0b56960..4301fe7 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -247,9 +247,17 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
                       || !is_mb_middle (&mb_start, match, buflim,
                                         kwsm.size[0]))
                     goto success;
+                  /* The matched line starts in the middle of a multibyte
+                     character.  Then we start DFA search from the
+                     beginning of the next character.  */
+                  if (dfaexec (dfa, mb_start, (char *) end, 0, NULL, &backref) == NULL)
+                    continue;
+                }
+              else
+                {
+                  if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
+                    continue;
                 }
-              if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
-                continue;
             }
           else
             {
diff --git a/tests/euc-mb b/tests/euc-mb
index c0af220..90111d1 100755
--- a/tests/euc-mb
+++ b/tests/euc-mb
@@ -30,7 +30,10 @@ fail=0
 # Does EUC-JP work at all?
 make_input BABA |euc_grep AB && fail=1
 
-# Whole line rejected after matching in the middle of a multibyte char?
+# After KWSet matches in the middle of a multibyte char, is the whole
+# line not rejected immediately, and are the boundaries of the multibyte
+# chars following it correctly recognized by the DFA matcher?
 make_input BABAAB |euc_grep AB || fail=1
+make_input BABABA |euc_grep AB && fail=1
 
 Exit $fail
-- 
1.9.1