From 767b4eaec498b212303973929effb877093c0fcf Mon Sep 17 00:00:00 2001
From: =?utf-8?q?P=C3=A1draig=20Brady?= <address@hidden>
Date: Mon, 28 Jun 2010 01:01:54 +0100
Subject: [PATCH] join: support multi-byte character encodings

* bootstrap.conf: Include various multi-byte support modules,
and the support module for the external libunistring.
* configure.ac: Use libunistring if available.
* src/Makefile.am: Link join and printf with libunistring.
Note printf needs to link as u8_uctomb_aux() is moved from
a header file to the shared library, and this introduces
a 16% startup overhead for the printf binary.
* src/join.c (u8_str_from_locale): A wrapper for
u8_strconv_from_locale which only allocates memory if required.
(u8_base_chars): A function to count the number of characters
excluding combining characters.
(mbmemstr): A function to search for a string in a multi-byte
area of memory, that may contain NULs.
(xfields): Search for a specified tab character as efficiently
and robustly as possible depending on the current encoding
and the particular character chosen.  Expand the blanks matching,
even though it already worked for any encoding, so as to support
matching extended blank characters.
(keycmp): Support sophisticated case insensitive comparison,
honoring rules for particular locales.  If libunistring is
unavailable, we resort to more general multi-byte case folding.
(output_separator): A new function to deal with the now possible
multi-byte tab separator.
(prjoin): Call output_separator() to support a possible multi-byte
tab delimiter.
(main): Cache whether we have a non C LC_CTYPE or UTF-8 encoding.
Parse and validate multi-byte tab characters, while restricting
them to single characters, accounting for combining chars.
* tests/misc/join-i18n: Test various multi-byte operations.
* tests/Makefile.am (TESTS): Add misc/join-i18n.
---
 bootstrap.conf       |    5 +-
 configure.ac         |    2 +
 src/Makefile.am      |    4 +
 src/join.c           |  312 +++++++++++++++++++++++++++++++++++++++++++-------
 tests/Makefile.am    |    1 +
 tests/misc/join-i18n |  104 +++++++++++++++++
 6 files changed, 387 insertions(+), 41 deletions(-)
 create mode 100755 tests/misc/join-i18n

diff --git a/bootstrap.conf b/bootstrap.conf
index a0004ac..3f91916 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -137,6 +137,7 @@ gnulib_modules="
   lchmod
   lchown
   lib-ignore
+  libunistring
   linebuffer
   link
   link-follow
@@ -146,10 +147,12 @@ gnulib_modules="
   maintainer-makefile
   malloc
   manywarnings
-  mbrtowc
   mbsalign
   mbswidth
+  mbmemcasecmp
+  mbmemcasecoll
   memcasecmp
+  memmem
   memchr
   memcmp2
   mempcpy
diff --git a/configure.ac b/configure.ac
index acd397e..b9f2e5d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -136,6 +136,8 @@ AC_CHECK_FUNCS([gethostid],
 
 gl_WINSIZE_IN_PTEM
 
+gl_LIBUNISTRING
+
 AC_MSG_CHECKING([whether localtime caches TZ])
 AC_CACHE_VAL([utils_cv_localtime_cache],
 [if test x$ac_cv_func_tzset = xyes; then
diff --git a/src/Makefile.am b/src/Makefile.am
index 00c7ff7..4b17a03 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -374,6 +374,10 @@ stdbuf_LDADD += $(LIBICONV)
 timeout_LDADD += $(LIBICONV)
 truncate_LDADD += $(LIBICONV)
 
+# for libunistring
+printf_LDADD += $(LIBUNISTRING)
+join_LDADD += $(LIBUNISTRING)
+
 # for canon_host
 pinky_LDADD += $(GETADDRINFO_LIB)
 who_LDADD += $(GETADDRINFO_LIB)
diff --git a/src/join.c b/src/join.c
index 6eaad65..f516387 100644
--- a/src/join.c
+++ b/src/join.c
@@ -21,18 +21,27 @@
 #include <assert.h>
 #include <sys/types.h>
 #include <getopt.h>
+#include <string.h>
 
 #include "system.h"
 #include "error.h"
 #include "fadvise.h"
 #include "hard-locale.h"
 #include "linebuffer.h"
+#if HAVE_LIBUNISTRING
+# include <unicase.h>
+# include <unistr.h>
+# include <uniconv.h>
+# include <unictype.h>
+# include <unistring/localcharset.h>
+#endif
+#include "mbiter.h"
+#include "mbmemcasecoll.h"
 #include "memcasecmp.h"
 #include "quote.h"
 #include "stdio--.h"
 #include "xmemcoll.h"
 #include "xstrtol.h"
-#include "argmatch.h"
 
 /* The official name of this program (e.g., no `g' prefix).  */
 #define PROGRAM_NAME "join"
@@ -94,9 +103,15 @@ static struct line *prevline[2] = {NULL, NULL};
    want to overwrite the previous buffer before we check order. */
 static struct line *spareline[2] = {NULL, NULL};
 
+/* True if the LC_CTYPE locale is hard.  */
+static bool hard_LC_CTYPE;
+
 /* True if the LC_COLLATE locale is hard.  */
 static bool hard_LC_COLLATE;
 
+/* True if the LC_CTYPE locale's character encoding is UTF-8.  */
+static bool utf8_LC_CTYPE;
+
 /* If nonzero, print unpairable lines in file 1 or 2.  */
 static bool print_unpairables_1, print_unpairables_2;
 
@@ -122,10 +137,11 @@ static struct outlist outlist_head;
 /* Last element in `outlist', where a new element can be added.  */
 static struct outlist *outlist_end = &outlist_head;
 
-/* Tab character separating fields.  If negative, fields are separated
-   by any nonempty string of blanks, otherwise by exactly one
-   tab character whose value (when cast to unsigned char) equals TAB.  */
-static int tab = -1;
+/* Tab character separating fields.  If NULL, fields are separated
+   by any nonempty string of blanks, if "\n" the whole line is one field,
+   otherwise by exactly one (possibly multi-byte, possibly NUL) tab.  */
+static char const *tab;
+static size_t tabsize;
 
 /* If nonzero, check that the input is correctly ordered. */
 static enum
@@ -164,6 +180,59 @@ static bool ignore_case;
    join them without checking for ordering */
 static bool join_header_lines;
 
+#if HAVE_LIBUNISTRING
+
+/* Convert STRING from the locale encoding to UTF-8, like
+   u8_strconv_from_locale(), but return STRING itself (uncopied) when
+   the locale is already UTF-8.  Free the result only if it is not STRING.  */
+
+static uint8_t *
+u8_str_from_locale (const char *string)
+{
+  /* The global utf8_LC_CTYPE could be replaced by a direct
+     STREQ (locale_charset(), "UTF-8") test to make this self-contained.  */
+
+  uint8_t const *u8 = (uint8_t const *) string;
+
+  if (!utf8_LC_CTYPE)
+    return u8_strconv_from_locale (string);
+
+  /* Validate in place, terminating NUL included.  */
+  if (u8_check (u8, strlen (string) + 1))
+    {
+      errno = EILSEQ;  /* Report malformed UTF-8 to the caller.  */
+      return NULL;
+    }
+  return (uint8_t *) string;
+}
+
+/* Count the base (non-combining) characters of NUL-terminated UTF-8 S.  */
+
+static size_t
+u8_base_chars (const uint8_t *s)
+{
+  /* Note Normalization + u8_mbsnlen is not enough to detect single characters
+     as some don't compose to a single unicode char. For example
+     "e\xcc\x81\xcc\x82" doesn't compose to a single character, while
+     "e\xcc\x82\xcc\x81" does.  In general characters can have many
+     combining chars appended.  */
+
+  size_t chars = 0;
+  const uint8_t *s_end = s + u8_strlen (s);  /* S is uint8_t, not char.  */
+
+  while (*s)
+    {
+      ucs4_t uc;
+      s += u8_mbtouc_unsafe (&uc, s, s_end - s);
+      chars += !uc_is_property (uc, UC_PROPERTY_COMBINING);
+      /* FIXME: Do we want to exclude other characters like LTR marks etc. */
+    }
+
+  return chars;
+}
+
+#endif
+
 void
 usage (int status)
 {
@@ -238,42 +307,148 @@ extract_field (struct line *line, char *field, size_t len)
   ++(line->nfields);
 }
 
+/* Find NEEDLE in the HAY_SIZE bytes at HAY, which may contain NULs but must
+   end with one.  Return the first match, or NULL if there is none.  */
+static char *
+mbmemstr (char const *hay, size_t hay_size, char const *needle)
+{
+  char *ret = NULL;  /* Defined result even when the loop never runs.  */
+
+  while (hay_size && !(ret = mbsstr (hay, needle)))
+    {
+      size_t bale_size = strlen (hay) + 1;  /* Skip this NUL-terminated run.  */
+      hay += bale_size;
+      hay_size -= bale_size;
+    }
+
+  return ret;
+}
+
 /* Fill in the `fields' structure in LINE.  */
 
 static void
 xfields (struct line *line)
 {
   char *ptr = line->buf.buffer;
-  char const *lim = ptr + line->buf.length - 1;
+  char *lim = ptr + line->buf.length - 1;
 
   if (ptr == lim)
     return;
 
-  if (0 <= tab && tab != '\n')
+  if (tab && *tab != '\n')
     {
+     /* FIXME: "case mappings of substrings" in libunistring info says:
+        "Case mapping of a substring cannot simply be performed by extracting
+        the substring and then applying the case mapping function to it."
+        Therefore when splitting on non blanks, Surrounding chars might be
+        needed for context, so there are _ct_ variants in libunistring for this.
+        We assume for the moment that chosen delimiters will have no effect on
+        case mapping.  */
       char *sep;
-      for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
-        extract_field (line, ptr, sep - ptr);
+      if (MB_CUR_MAX == 1 || utf8_LC_CTYPE || *tab < 0x30)
+        {
+          for (; (sep = memmem (ptr, lim - ptr, tab, tabsize - 1)) != NULL;
+               ptr = sep + tabsize - 1)
+            extract_field (line, ptr, sep - ptr);
+        }
+      else
+        {
+          /* We can't simply search memory, and must resort
+             to multi-byte iteration to find the delimiter.  */
+          *lim = '\0'; lim++;
+          for (; (sep = mbmemstr (ptr, lim - ptr, tab)) != NULL;
+               ptr = sep + tabsize - 1)
+            extract_field (line, ptr, sep - ptr);
+          *lim = '\n'; lim--;
+        }
     }
-  else if (tab < 0)
+  else if (!tab)
     {
-      /* Skip leading blanks before the first field.  */
-      while (isblank (to_uchar (*ptr)))
-        if (++ptr == lim)
-          return;
 
-      do
+      /* In this case we must do multi-byte character
+         iteration in order to find these blank characters:
+            1680 OGHAM SPACE MARK
+            180E MONGOLIAN VOWEL SEPARATOR
+            2000 EN QUAD
+            2001 EM QUAD
+            2002 EN SPACE
+            2003 EM SPACE
+            2004 THREE-PER-EM SPACE
+            2005 FOUR-PER-EM SPACE
+            2006 SIX-PER-EM SPACE
+            2008 PUNCTUATION SPACE
+            2009 THIN SPACE
+            200A HAIR SPACE
+            205F MEDIUM MATHEMATICAL SPACE
+            3000 IDEOGRAPHIC SPACE
+         Note 00A0 (NBSP) or 2007 (FIGURE SPACE) are not blank (or space). */
+      if (MB_CUR_MAX > 1)
         {
-          char *sep;
-          for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++)
-            continue;
-          extract_field (line, ptr, sep - ptr);
-          if (sep == lim)
-            return;
-          for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++)
-            continue;
+          /* Skip leading blanks before the first field.  */
+          mbi_iterator_t iter;
+          for (mbi_init (iter, ptr, lim - ptr);; mbi_advance (iter))
+            {
+              if (!mbi_avail (iter))
+                return;
+              if (!mb_isblank (mbi_cur (iter)))
+                break;
+            }
+          ptr = (char *) mbi_cur_ptr (iter);
+
+          do
+            {
+              char *sep;
+
+              /* Find start of next field.  */
+              for (;mbi_avail (iter); mbi_advance (iter))
+                if (mb_isblank (mbi_cur (iter)))
+                  break;
+              sep = (char *) mbi_cur_ptr (iter);
+
+              extract_field (line, ptr, sep - ptr);
+              if (sep == lim)
+                return;
+
+              /* Skip leading blanks for next field.  */
+              for (;mbi_avail (iter); mbi_advance (iter))
+                if (!mb_isblank (mbi_cur (iter)))
+                  break;
+              ptr = (char *) mbi_cur_ptr (iter);
+            }
+          while (ptr != lim);
+        }
+      else
+        {
+          /* This single byte matching actually supports all character
+             encodings, as it's just searching for [ \t] characters,
+             which are unique in all encodings.  However it doesn't
+             find the blank characters mentioned above.  */
+
+          /* Skip leading blanks before the first field.  */
+          while (isblank (to_uchar (*ptr)))
+            if (++ptr == lim)
+              return;
+
+          do
+            {
+              char *sep;
+
+              /* Find start of next field.  */
+              for (sep = ptr + 1; sep != lim; sep++)
+                if (isblank (to_uchar (*sep)))
+                   break;
+
+              extract_field (line, ptr, sep - ptr);
+              if (sep == lim)
+                return;
+
+              /* Skip leading blanks for next field.  */
+              for (ptr = sep + 1; ptr != lim; ptr++)
+                if (!isblank (to_uchar (*ptr)))
+                   break;
+            }
+          while (ptr != lim);
         }
-      while (ptr != lim);
     }
 
   extract_field (line, ptr, lim - ptr);
@@ -336,8 +511,31 @@ keycmp (struct line const *line1, struct line const *line2,
 
   if (ignore_case)
     {
-      /* FIXME: ignore_case does not work with NLS (in particular,
-         with multibyte chars).  */
+      /* Do a quick check first to see if the data is the same.  */
+      if (len1 == len2 && memcmp (beg1, beg2, len1) == 0)
+        return 0;
+
+      if (hard_LC_CTYPE)
+        {
+#if HAVE_LIBUNISTRING
+          /* The ulc_casecoll function handles not only multibyte characters
+             correctly, but also the German sharp s, the Greek final sigma,
+             the Turkish dotless i, etc.  Note normalization is disabled
+             currently, so as to be consistent with the case sensitive
+             comparison.  Note we use the "coll" rather than "cmp" variants
+             here to report disorder.  */
+          if (ulc_casecoll (beg1, len1, beg2, len2, uc_locale_language (),
+                            NULL, &diff) >= 0)
+            return diff;
+          if (errno == ENOMEM)
+            xalloc_die ();
+#endif
+          /* If ulc_casecoll failed due to some conversion error, fall back to
+             a comparison that at least handles multibyte characters and the
+             Turkish dotless i correctly.  This adds around 1383 bytes to .text
+             and 1028 bytes to .bss  */
+          return mbmemcasecoll (beg1, len1, beg2, len2, hard_LC_COLLATE);
+        }
       diff = memcasecmp (beg1, beg2, MIN (len1, len2));
     }
   else
@@ -527,13 +725,25 @@ prfield (size_t n, struct line const *line)
     fputs (empty_filler, stdout);
 }
 
+/* Write the output field separator: a space when no tab was given,
+   a NUL byte for an empty tab string, else the tab string itself.  */
+static void
+output_separator (void)
+{
+  if (tab && *tab)
+    {
+      fputs (tab, stdout);
+      return;
+    }
+  putchar (tab ? '\0' : ' ');
+}
+
 /* Print the join of LINE1 and LINE2.  */
 
 static void
 prjoin (struct line const *line1, struct line const *line2)
 {
   const struct outlist *outlist;
-  char output_separator = tab < 0 ? ' ' : tab;
 
   outlist = outlist_head.next;
   if (outlist)
@@ -568,7 +778,7 @@ prjoin (struct line const *line1, struct line const *line2)
           o = o->next;
           if (o == NULL)
             break;
-          putchar (output_separator);
+          output_separator();
         }
       putchar ('\n');
     }
@@ -586,23 +796,23 @@ prjoin (struct line const *line1, struct line const *line2)
       prfield (join_field_1, line1);
       for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
         {
-          putchar (output_separator);
+          output_separator();
           prfield (i, line1);
         }
       for (i = join_field_1 + 1; i < line1->nfields; ++i)
         {
-          putchar (output_separator);
+          output_separator();
           prfield (i, line1);
         }
 
       for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
         {
-          putchar (output_separator);
+          output_separator();
           prfield (i, line2);
         }
       for (i = join_field_2 + 1; i < line2->nfields; ++i)
         {
-          putchar (output_separator);
+          output_separator();
           prfield (i, line2);
         }
       putchar ('\n');
@@ -965,7 +1175,9 @@ main (int argc, char **argv)
   setlocale (LC_ALL, "");
   bindtextdomain (PACKAGE, LOCALEDIR);
   textdomain (PACKAGE);
+  hard_LC_CTYPE = hard_locale (LC_CTYPE);
   hard_LC_COLLATE = hard_locale (LC_COLLATE);
+  utf8_LC_CTYPE = STREQ (locale_charset(), "UTF-8");
 
   atexit (close_stdout);
   atexit (free_spareline);
@@ -1043,20 +1255,40 @@ main (int argc, char **argv)
 
         case 't':
           {
-            unsigned char newtab = optarg[0];
-            if (! newtab)
-              newtab = '\n'; /* '' => process the whole line.  */
-            else if (optarg[1])
+            char const *newtab = optarg;
+            uint8_t *u8tab = NULL;
+            if (! *optarg)
+              newtab = "\n"; /* '' => process the whole line.  */
+#if HAVE_LIBUNISTRING
+            else
+              {
+                if (!(u8tab = u8_str_from_locale (optarg)))
+                  error (EXIT_FAILURE, errno, _("error converting tab %s"),
+                         quote (optarg));
+              }
+#endif
+            if (*optarg && optarg[1]) /* multiple bytes */
               {
                 if (STREQ (optarg, "\\0"))
-                  newtab = '\0';
+                  newtab = "\0";
                 else
-                  error (EXIT_FAILURE, 0, _("multi-character tab %s"),
-                         quote (optarg));
+                  {
+#if HAVE_LIBUNISTRING
+                    /* We support only single characters to support existing
+                       documentation, and to restrict possible future character
+                       combinations, like the current "\0".  */
+                    if (u8_base_chars (u8tab) > 1)
+#endif
+                      error (EXIT_FAILURE, 0, _("multi-character tab %s"),
+                             quote (optarg));
+                  }
               }
-            if (0 <= tab && tab != newtab)
+            if (u8tab != (uint8_t *) optarg)
+              free (u8tab);
+            if (tab && !STREQ (tab, newtab))
               error (EXIT_FAILURE, 0, _("incompatible tabs"));
             tab = newtab;
+            tabsize = MAX (2, strlen (tab) + 1);
           }
           break;
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 5619d0b..1afe271 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -153,6 +153,7 @@ TESTS =						\
   misc/arch					\
   misc/pr					\
   misc/join					\
+  misc/join-i18n				\
   pr/pr-tests					\
   misc/df-P					\
   misc/pwd-option				\
diff --git a/tests/misc/join-i18n b/tests/misc/join-i18n
new file mode 100755
index 0000000..3192e70
--- /dev/null
+++ b/tests/misc/join-i18n
@@ -0,0 +1,104 @@
+#!/bin/sh
+# Test multi-byte operations in join
+
+# Copyright (C) 2010 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+: ${LOCALE_FR_UTF8=none}
+if test "$LOCALE_FR_UTF8" != "none"; then
+  (
+    #Note should be using printf \u.... below.
+    export LC_ALL=$LOCALE_FR_UTF8
+    # allow multi-byte tab
+    join -t "$(printf '\xc3\x89')" /dev/null /dev/null || fail=1
+
+    # allow tab with combining chars
+    join -t "$(printf '\x65\xcc\x81')" /dev/null /dev/null || fail=1
+    join -t "$(printf 'e\xcc\x81\xcc\x82')" /dev/null /dev/null || fail=1
+    join -t "$(printf 'e\xcc\x82\xcc\x81')" /dev/null /dev/null || fail=1
+
+    # ensure multi-byte tab matched
+    printf "a»1»a\n" > f1; printf "¼»1»b\n" > f2
+    join -j2 -t '»' f1 f2 > out || fail=1
+    echo "1»a»a»¼»b" > exp
+    compare out exp || fail=1
+
+    # multi-byte case folding
+    printf "á 1\n" > f1; printf "Á 2\n" > f2
+    join -i f1 f2 > out || fail=1
+    echo "á 1 2" > exp
+    compare out exp || fail=1
+
+    # locale aware case folding
+    printf "ß 1\n" > f1; printf "SS 2\n" > f2
+    join -i f1 f2 > out || fail=1
+    echo "ß 1 2" > exp
+    compare out exp || fail=1
+
+    # NUL support
+    printf "1\x002\n1\x003\n" > f1
+    printf "1\x002\n1\x004" > f2
+    join -t '' f1 f2 > out || fail=1
+    printf "1\x002\n" > exp
+    compare out exp || fail=1
+
+    # locale aware case folding NUL support
+    printf "1\x002\n1\x00ß\n" > f1
+    printf "1\x002\n1\x00SS\n" > f2
+    join -i -t '' f1 f2 > out || fail=1
+    printf "1\x002\n1\x00ß\n" > exp
+    compare out exp || fail=1
+
+    # Validate specified tab
+    join -t "$(printf '\xA1')" /dev/null /dev/null && fail=1
+
+    # Support input with invalid encoding
+    printf "\xa1\n" > f1; printf "\xa1 foo\n"> f2
+    join f1 f2 > out || fail=1
+    printf "\xa1 foo\n" > exp
+    compare out exp || fail=1
+
+    # check blank matching in the presence of NULs
+    printf " one \x001\n" > f1; printf " one \x002\n" > f2
+    join f1 f2 > out || fail=1
+    printf "one \x001 \x002\n" > exp
+    compare out exp || fail=1
+
+    # check multi-byte blank matching
+    env printf "1\u2003a\n" > f1
+    env printf "1\u2003b\n" > f2
+    join f1 f2 > out || fail=1
+    echo "1 a b" > exp # note conversion of em space to normal space
+    compare out exp || fail=1
+    test "$fail" != 1 # subshell assignments are lost; report via exit status
+  ) || fail=1
+fi
+
+# One must manually create this locale on GNU/Linux due to it
+# being ASCII and ISO C incompatible. This works on Fedora:
+# sudo localedef -i ja_JP -c -f SHIFT_JIS /usr/lib/locale/ja_JP.sjis
+if test "$(LC_ALL=ja_JP.SJIS locale charmap 2>/dev/null)" = SHIFT_JIS; then
+  # '|' (0x7C) is also the second byte of \x81\x7C; it must not split there
+  export LC_ALL=ja_JP.SJIS
+  printf '1''\x81\x7C''2|3' > f1
+  printf '1''\x81\x7C''4|3' > f2
+  join -t '|' -j2 f1 f2 > out || fail=1
+  printf "3|1\x81\x7c2|1\x81\x7c4\n" > exp
+  compare out exp || fail=1
+fi
+
+Exit $fail
-- 
1.6.2.5