[Guile-commits] GNU Guile branch, master, updated. release_1-9-6-60-gedb7bb4

From: Julian Graham
Subject: [Guile-commits] GNU Guile branch, master, updated. release_1-9-6-60-gedb7bb4
Date: Sun, 03 Jan 2010 06:09:21 +0000

- Log -----------------------------------------------------------------
commit edb7bb4766773cffa8262b4cd8bb980888913d65
Author: Julian Graham <address@hidden>
Date:   Sun Jan 3 01:06:05 2010 -0500

    Support for Unicode string normalization functions
    * libguile/strings.c, libguile/strings.h (normalize_str,
      scm_string_normalize_nfc, scm_string_normalize_nfd, scm_string_normalize_nfkc,
      scm_string_normalize_nfkd): New functions.
    * test-suite/tests/strings.test: Unit tests for `string-normalize-nfc',
      `string-normalize-nfd', `string-normalize-nfkc', and
      `string-normalize-nfkd'.
    * doc/ref/api-data.texi (String Comparison): Documentation for normalization
      functions.


Summary of changes:
 doc/ref/api-data.texi         |   64 ++++++++++++++++++++++++++++++++++++
 libguile/strings.c            |   73 +++++++++++++++++++++++++++++++++++++++++
 libguile/strings.h            |    5 +++
 test-suite/tests/strings.test |   40 ++++++++++++++++++++++
 4 files changed, 182 insertions(+), 0 deletions(-)

diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi
index e847c9c..8e797ac 100755
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@@ -3273,6 +3273,70 @@ Compute a hash value for @var{S}.  the optional argument 
@var{bound} is a non-ne
 Compute a hash value for @var{S}.  the optional argument @var{bound} is a 
non-negative exact integer specifying the range of the hash function. A 
positive value restricts the return value to the range [0,bound).
 @end deffn
+Because the same visual appearance of an abstract Unicode character can 
+be obtained via multiple sequences of Unicode characters, even the 
+case-insensitive string comparison functions described above may return
+@code{#f} when presented with strings containing different 
+representations of the same character.  For example, the Unicode 
+character ``LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE'' can be 
+represented with a single character (U+1E69) or by the character ``LATIN
+SMALL LETTER S'' (U+0073) followed by the combining marks ``COMBINING 
+DOT BELOW'' (U+0323) and ``COMBINING DOT ABOVE'' (U+0307).
+For this reason, it is often desirable to ensure that the strings
+to be compared are using a mutually consistent representation for every 
+character.  The Unicode standard defines two methods of normalizing the
+contents of strings: Decomposition, which breaks composite characters 
+into a set of constituent characters with an ordering defined by the
+Unicode Standard; and composition, which performs the converse.
+There are two decomposition operations.  ``Canonical decomposition'' 
+produces character sequences that share the same visual appearance as
+the original characters, while ``compatibility decomposition'' produces
+ones whose visual appearances may differ from the originals but which
+represent the same abstract character.
+These operations are encapsulated in the following set of normalization
+forms:
+@table @dfn
+@item NFD
+Characters are decomposed to their canonical forms.
+@item NFKD
+Characters are decomposed to their compatibility forms.
+@item NFC
+Characters are decomposed to their canonical forms, then composed.
+@item NFKC
+Characters are decomposed to their compatibility forms, then composed.
+@end table
+The functions below put their arguments into one of the forms described
+above.
+@deffn {Scheme Procedure} string-normalize-nfd s
+@deffnx {C Function} scm_string_normalize_nfd (s)
+Return the @code{NFD} normalized form of @var{s}.
+@end deffn
+@deffn {Scheme Procedure} string-normalize-nfkd s
+@deffnx {C Function} scm_string_normalize_nfkd (s)
+Return the @code{NFKD} normalized form of @var{s}.
+@end deffn
+@deffn {Scheme Procedure} string-normalize-nfc s
+@deffnx {C Function} scm_string_normalize_nfc (s)
+Return the @code{NFC} normalized form of @var{s}.
+@end deffn
+@deffn {Scheme Procedure} string-normalize-nfkc s
+@deffnx {C Function} scm_string_normalize_nfkc (s)
+Return the @code{NFKC} normalized form of @var{s}.
+@end deffn
 @node String Searching
 @subsubsection String Searching
diff --git a/libguile/strings.c b/libguile/strings.c
index 3151bbe..0cbab3e 100644
--- a/libguile/strings.c
+++ b/libguile/strings.c
@@ -25,6 +25,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <ctype.h>
+#include <uninorm.h>
 #include <unistr.h>
 #include <uniconv.h>
@@ -1736,6 +1737,78 @@ scm_to_locale_stringbuf (SCM str, char *buf, size_t 
   return len;
+/* This function is a partial clone of SCM_STRING_TO_U32_BUF from 
+   libguile/i18n.c.  It would be useful to have this factored out into a more
+   convenient location, but its use of alloca makes that tricky to do.
+
+   Normalize STRING to the normalization form FORM and return the result
+   as a new string made with scm_i_make_wide_string.
+
+   A narrow STRING is first widened, one element per char, into a stack
+   buffer obtained from alloca; a wide STRING is handed to u32_normalize
+   as is.  The normalized buffer returned by u32_normalize is copied into
+   the new string and then freed.
+
+   NOTE(review): the alloca request grows with the length of STRING, so
+   a very long narrow string may exhaust the stack -- confirm whether
+   callers bound the length.
+   NOTE(review): the value returned by u32_normalize is used without a
+   NULL check -- confirm against the libunistring documentation whether
+   it can fail (e.g. on memory exhaustion) and leave RLEN unset.  */
+static SCM 
+normalize_str (SCM string, uninorm_t form)
+  SCM ret;
+  scm_t_uint32 *w_str;
+  scm_t_wchar *cbuf;
+  size_t rlen, len = scm_i_string_length (string);
+  if (scm_i_is_narrow_string (string))
+    {
+      size_t i;
+      const char *buf = scm_i_string_chars (string);
+      w_str = alloca (sizeof (scm_t_wchar) * (len + 1)); /* len elements plus a terminating 0 */
+      for (i = 0; i < len; i ++)
+       w_str[i] = (unsigned char) buf[i]; /* via unsigned char: no sign extension of bytes above 0x7f */
+      w_str[len] = 0;
+    }
+  else w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
+  w_str = u32_normalize (form, w_str, len, NULL, &rlen);  /* w_str now refers to the normalized buffer, not the input */
+  ret = scm_i_make_wide_string (rlen, &cbuf);
+  u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
+  free (w_str); /* releases what u32_normalize returned; the alloca'd input is never freed here */
+  return ret;
+SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0, /* C entry point behind the name "string-normalize-nfc" */
+           (SCM string),
+           "Returns the NFC normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfc
+  SCM_VALIDATE_STRING (1, string); /* checks argument 1; presumably signals a wrong-type error for non-strings -- confirm */
+  return normalize_str (string, UNINORM_NFC); /* shared helper does the work; only the form constant differs per entry point */
+#undef FUNC_NAME
+SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0, /* C entry point behind the name "string-normalize-nfd" */
+           (SCM string),
+           "Returns the NFD normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfd
+  SCM_VALIDATE_STRING (1, string); /* checks argument 1; presumably signals a wrong-type error for non-strings -- confirm */
+  return normalize_str (string, UNINORM_NFD); /* shared helper does the work; only the form constant differs per entry point */
+#undef FUNC_NAME
+SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0, /* C entry point behind the name "string-normalize-nfkc" */
+           (SCM string),
+           "Returns the NFKC normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfkc
+  SCM_VALIDATE_STRING (1, string); /* checks argument 1; presumably signals a wrong-type error for non-strings -- confirm */
+  return normalize_str (string, UNINORM_NFKC); /* shared helper does the work; only the form constant differs per entry point */
+#undef FUNC_NAME
+SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0, /* C entry point behind the name "string-normalize-nfkd" */
+           (SCM string),
+           "Returns the NFKD normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfkd
+  SCM_VALIDATE_STRING (1, string); /* checks argument 1; presumably signals a wrong-type error for non-strings -- confirm */
+  return normalize_str (string, UNINORM_NFKD); /* shared helper does the work; only the form constant differs per entry point */
+#undef FUNC_NAME
 /* converts C scm_array of strings to SCM scm_list of strings. */
 /* If argc < 0, a null terminated scm_array is assumed. */
diff --git a/libguile/strings.h b/libguile/strings.h
index edff0f8..6eafafa 100644
--- a/libguile/strings.h
+++ b/libguile/strings.h
@@ -142,6 +142,11 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp,
 SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str);
 SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
+SCM_API SCM scm_string_normalize_nfd (SCM str);
+SCM_API SCM scm_string_normalize_nfkd (SCM str);
+SCM_API SCM scm_string_normalize_nfc (SCM str);
+SCM_API SCM scm_string_normalize_nfkc (SCM str);
 SCM_API SCM scm_makfromstrs (int argc, char **argv);
diff --git a/test-suite/tests/strings.test b/test-suite/tests/strings.test
index 013c1a8..984178d 100644
--- a/test-suite/tests/strings.test
+++ b/test-suite/tests/strings.test
@@ -386,6 +386,46 @@
         (string-ci>=? (string-ints 0) (string-ints 255)))))
+;; Unicode string normalization forms
+;; string-normalize-nfd: the single character U+00E9 must come back as U+0065 followed by U+0301
+(with-test-prefix "string-normalize-nfd"
+  (pass-if "canonical decomposition is equal?"
+    (equal? (string-normalize-nfd "\xe9") "\x65\u0301")))
+;; string-normalize-nfkd: U+1E9B followed by U+0323 must come back as "s", U+0323, U+0307
+(with-test-prefix "string-normalize-nfkd"
+  (pass-if "compatibility decomposition is equal?"
+    (equal? (string-normalize-nfkd "\u1e9b\u0323") "s\u0323\u0307")))
+;; string-normalize-nfc: U+0065 followed by U+0301 must come back as the single character U+00E9
+(with-test-prefix "string-normalize-nfc"
+  (pass-if "canonical composition is equal?"
+    (equal? (string-normalize-nfc "\x65\u0301") "\xe9")))
+;; string-normalize-nfkc: U+1E9B followed by U+0323 must come back as the single character U+1E69
+(with-test-prefix "string-normalize-nfkc"
+  (pass-if "compatibility composition is equal?"
+    (equal? (string-normalize-nfkc "\u1e9b\u0323") "\u1e69")))
 ;; string-ref

