grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.12-16-g7aa698d


From: Jim Meyering
Subject: grep branch, master, updated. v2.12-16-g7aa698d
Date: Sat, 02 Jun 2012 09:06:27 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  7aa698d36b5b2eeb8e90e7a327eb7ebe46d59e87 (commit)
      from  2665746b756bd372ba856e165388dc98032362fd (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=7aa698d36b5b2eeb8e90e7a327eb7ebe46d59e87


commit 7aa698d36b5b2eeb8e90e7a327eb7ebe46d59e87
Author: Jim Meyering <address@hidden>
Date:   Fri Jun 1 21:18:00 2012 +0200

    grep: fix how -i works with a match containing the Turkish I-with-dot
    
    Fix a long-standing problem in the way grep's -i interacts with
    data whose byte count changes when we convert it to lower case.
    For example, the UTF-8 Turkish I-with-dot (Ä°) occupies two bytes,
    but its lower case analog, i, occupies just one byte.  The code
    converts both search string and the haystack data to lower case,
    and then searches for the modified string in the modified buffer.
    The trouble arose when using a lowercase buffer <offset,length>
    pair to manipulate the original (longer) buffer.
    
    The solution is to change mbtolower to return additional information:
    a malloc'd mapping vector.  With that, the caller maps the lowercase-
    relative <offset,length> to numbers that refer to the original buffer.
    This mapping is used only when lengths actually differ, so the cost
    in general should be small.
    
    * src/searchutils.c (mbtolower): Add the new map parameter.
    * src/search.h (mb_case_map_apply): New function.
    * src/kwsearch.c (Fexecute): Update mbtolower caller, and upon
    success, apply the new map.
    * src/dfasearch.c (EGexecute): Likewise.
    * tests/Makefile.am (XFAIL_TESTS): Remove turkish-I from this list;
    that test is no longer expected to fail.
    * NEWS (Bug fixes): Mention it.
    Reported by Ilya Basin in
    http://thread.gmane.org/gmane.comp.gnu.grep.bugs/3413 and later
    by Strahinja Kustudic in http://savannah.gnu.org/bugs/?36567

diff --git a/NEWS b/NEWS
index 6926276..d0ea60a 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,11 @@ GNU grep NEWS                                    -*- outline 
-*-
 
 ** Bug fixes
 
+  grep -i, in a multi-byte locale, when matching a line containing a character
+  like the UTF-8 Turkish I-with-dot (U+0130) (whose lower-case representation
+  occupies fewer bytes), would print an incomplete output line.
+  [bug introduced in grep-2.6]
+
   --include and --exclude can again be combined, and again apply to
   the command line, e.g., "grep --include='*.[ch]' --exclude='system.h'
   PATTERN *" again reads all *.c and *.h files except for system.h.
diff --git a/THANKS b/THANKS
index 80df5ec..1720232 100644
--- a/THANKS
+++ b/THANKS
@@ -84,6 +84,7 @@ Shannon Hill               <address@hidden>
 Sotiris Vassilopoulos      <address@hidden>
 Standish Parsley           <address@hidden>
 Stewart Levin              <address@hidden>
+Strahinja Kustudic         <address@hidden>
 Sven Joachim               <address@hidden>
 Sydoruk Stepan             <address@hidden>
 Tapani Tarvainen           <address@hidden>
diff --git a/src/dfasearch.c b/src/dfasearch.c
index bd09aa6..a48333d 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -78,8 +78,9 @@ static char const *
 kwsincr_case (const char *must)
 {
   size_t n = strlen (must);
+  unsigned char *map = NULL;
   const char *buf = (match_icase && MB_CUR_MAX > 1
-                     ? mbtolower (must, &n)
+                     ? mbtolower (must, &n, &map)
                      : must);
   return kwsincr (kwset, buf, n);
 }
@@ -217,13 +218,15 @@ EGexecute (char const *buf, size_t size, size_t 
*match_size,
   ptrdiff_t len, best_len;
   struct kwsmatch kwsm;
   size_t i, ret_val;
+  unsigned char *map = NULL;
+
   if (MB_CUR_MAX > 1)
     {
       if (match_icase)
         {
           /* mbtolower adds a NUL byte at the end.  That will provide
              space for the sentinel byte dfaexec may add.  */
-          char *case_buf = mbtolower (buf, &size);
+          char *case_buf = mbtolower (buf, &size, &map);
           if (start_ptr)
             start_ptr = case_buf + (start_ptr - buf);
           buf = case_buf;
@@ -408,9 +411,11 @@ EGexecute (char const *buf, size_t size, size_t 
*match_size,
 
  success:
   len = end - beg;
- success_in_len:
+ success_in_len:;
+  size_t off = beg - buf;
+  mb_case_map_apply (map, &off, &len);
   *match_size = len;
-  ret_val = beg - buf;
+  ret_val = off;
  out:
   return ret_val;
 }
diff --git a/src/kwsearch.c b/src/kwsearch.c
index f1a802e..d0bb201 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -34,8 +34,9 @@ Fcompile (char const *pattern, size_t size)
 {
   char const *err;
   size_t psize = size;
+  unsigned char *map = NULL;
   char const *pat = (match_icase && MB_CUR_MAX > 1
-                     ? mbtolower (pattern, &psize)
+                     ? mbtolower (pattern, &psize, &map)
                      : pattern);
 
   kwsinit (&kwset);
@@ -83,11 +84,13 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
   char eol = eolbyte;
   struct kwsmatch kwsmatch;
   size_t ret_val;
+  unsigned char *map = NULL;
+
   if (MB_CUR_MAX > 1)
     {
       if (match_icase)
         {
-          char *case_buf = mbtolower (buf, &size);
+          char *case_buf = mbtolower (buf, &size, &map);
           if (start_ptr)
             start_ptr = case_buf + (start_ptr - buf);
           buf = case_buf;
@@ -162,9 +165,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
   while (buf < beg && beg[-1] != eol)
     --beg;
   len = end - beg;
- success_in_beg_and_len:
+ success_in_beg_and_len:;
+  size_t off = beg - buf;
+  mb_case_map_apply (map, &off, &len);
+
   *match_size = len;
-  ret_val = beg - buf;
+  ret_val = off;
  out:
   return ret_val;
 }
diff --git a/src/search.h b/src/search.h
index 3074407..529e7e2 100644
--- a/src/search.h
+++ b/src/search.h
@@ -38,7 +38,7 @@
 /* searchutils.c */
 extern void kwsinit (kwset_t *);
 
-extern char *mbtolower (const char *, size_t *);
+extern char *mbtolower (const char *, size_t *, unsigned char **);
 extern bool is_mb_middle (const char **, const char *, const char *, size_t);
 
 /* dfasearch.c */
@@ -53,4 +53,23 @@ extern size_t Fexecute (char const *, size_t, size_t *, char 
const *);
 extern void Pcompile (char const *, size_t);
 extern size_t Pexecute (char const *, size_t, size_t *, char const *);
 
+/* Apply a non-NULL MAP from mbtolower to the lowercase-buffer-relative
+   *OFF and *LEN, converting them to be relative to the original buffer.  */
+static inline void
+mb_case_map_apply (unsigned char const *map, size_t *off, size_t *len)
+{
+  if (map)
+    {
+      size_t off_incr = 0;
+      size_t len_incr = 0;
+      size_t k;
+      for (k = 0; k < *off; k++)
+        off_incr += map[k];
+      for (k = *off; k < *off + *len; k++)
+        len_incr += map[k];
+      *off += off_incr;
+      *len += len_incr;
+    }
+}
+
 #endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
index b787fe6..4942c51 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -53,25 +53,38 @@ kwsinit (kwset_t *kwset)
    Note that while this function returns a pointer to malloc'd storage,
    the caller must not free it, since this function retains a pointer
    to the buffer and reuses it on any subsequent call.  As a consequence,
-   this function is not thread-safe.  */
+   this function is not thread-safe.
+
+   When the lowercase result string has the same length as the input string,
+   set *LEN_MAP_P to NULL.  Otherwise, set it to a malloc'd buffer (like the
+   returned buffer, this must not be freed by caller) of the same length as
+   the result string.  (*LEN_MAP_P)[J] is one less than the length-in-bytes
+   of the character in BEG that formed byte J of the result.  This map is
+   used by the caller to convert offset,length pairs that reference the
+   lowercase result to numbers that refer to the corresponding parts of
+   the original buffer.  */
 char *
-mbtolower (const char *beg, size_t *n)
+mbtolower (const char *beg, size_t *n, unsigned char **len_map_p)
 {
   static char *out;
+  static unsigned char *len_map;
   static size_t outalloc;
   size_t outlen, mb_cur_max;
   mbstate_t is, os;
   const char *end;
   char *p;
+  unsigned char *m;
 
   if (*n > outalloc || outalloc == 0)
     {
       outalloc = MAX(1, *n);
       out = xrealloc (out, outalloc);
+      len_map = xrealloc (len_map, outalloc);
     }
 
   /* appease clang-2.6 */
   assert (out);
+  assert (len_map);
   if (*n == 0)
     return out;
 
@@ -81,6 +94,7 @@ mbtolower (const char *beg, size_t *n)
 
   mb_cur_max = MB_CUR_MAX;
   p = out;
+  m = len_map;
   outlen = 0;
   while (beg < end)
     {
@@ -88,14 +102,18 @@ mbtolower (const char *beg, size_t *n)
       size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
       if (outlen + mb_cur_max >= outalloc)
         {
+          size_t dm = m - len_map;
           out = x2nrealloc (out, &outalloc, 1);
+          len_map = xrealloc (len_map, outalloc);
           p = out + outlen;
+          m = len_map + dm;
         }
 
       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
         {
           /* An invalid sequence, or a truncated multi-octet character.
              We treat it as a single-octet character.  */
+          *m++ = 0;
           *p++ = *beg++;
           outlen++;
           memset (&is, 0, sizeof (is));
@@ -103,6 +121,7 @@ mbtolower (const char *beg, size_t *n)
         }
       else
         {
+          *m++ = mbclen - 1;
           beg += mbclen;
           mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
           p += mbclen;
@@ -110,6 +129,8 @@ mbtolower (const char *beg, size_t *n)
         }
     }
 
+  /* If the new length differs from the original, give caller the map.  */
+  *len_map_p = p - out == *n ? NULL : len_map;
   *n = p - out;
   *p = 0;
   return out;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 7be788c..167e318 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -32,7 +32,6 @@ XFAIL_TESTS = \
 if USE_INCLUDED_REGEX
 XFAIL_TESTS += equiv-classes
 endif
-XFAIL_TESTS += turkish-I
 
 TESTS =                                                \
   backref                                      \

-----------------------------------------------------------------------

Summary of changes:
 NEWS              |    5 +++++
 THANKS            |    1 +
 src/dfasearch.c   |   13 +++++++++----
 src/kwsearch.c    |   14 ++++++++++----
 src/search.h      |   21 ++++++++++++++++++++-
 src/searchutils.c |   25 +++++++++++++++++++++++--
 tests/Makefile.am |    1 -
 7 files changed, 68 insertions(+), 12 deletions(-)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]