bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings


From: Michal Nazarewicz
Subject: bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings
Date: Tue, 4 Oct 2016 03:10:33 +0200

Implement part of Unicode special handling rules for Azeri and Turkish
languages, namely ‘i’ is paired with ‘İ’ while ‘ı’ is paired with ‘I’.

* src/casefiddle.c (struct casing_context, prepare_casing_context): Add
flag for handling of Turkic i.
(case_character_impl): Extract flag normalisation to a new function:
(normalise_flag): New function.
(case_single_character): Update after above changes.
(case_character): Rename to:
(case_characters): Add handling of Turkic i.
(do_casify_multibyte_string, do_casify_multibyte_region): Update to use
renamed case_characters.

* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test
cases for dotless and dotted i’s.
---
 src/casefiddle.c             | 251 ++++++++++++++++++++++++++++++-------------
 test/src/casefiddle-tests.el |  37 ++++++-
 2 files changed, 213 insertions(+), 75 deletions(-)

diff --git a/src/casefiddle.c b/src/casefiddle.c
index ace589c..2a7aa64 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -35,7 +35,8 @@ struct casing_str_buf {
   unsigned char len_bytes;
 };
 
-enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
+enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP,
+                 /* Only for internal use: */ CASE_NO_ACTION};
 
 #include "special-casing.h"
 
@@ -53,6 +54,8 @@ struct casing_context {
   bool inbuffer;
   /* Whether we are inside of a word. */
   bool inword;
+  /* Whether to apply Azeri/Turkish rules for dotted and dotless i. */
+  bool treat_turkic_i;
 };
 
 /* Initialise CTX structure and prepares related global data for casing
@@ -61,6 +64,8 @@ static void
 prepare_casing_context (struct casing_context *ctx,
                        enum case_action flag, bool inbuffer)
 {
+  Lisp_Object lang, l, tr, az;
+
   ctx->flag = flag;
   ctx->inbuffer = inbuffer;
   ctx->inword = false;
@@ -68,42 +73,77 @@ prepare_casing_context (struct casing_context *ctx,
     ? uniprop_table (intern_c_string ("titlecase"))
     : Qnil;
 
+  ctx->treat_turkic_i = false;
+
   /* If the case table is flagged as modified, rescan it.  */
   if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
     Fset_case_table (BVAR (current_buffer, downcase_table));
 
   if (inbuffer && (int) flag >= (int) CASE_CAPITALIZE)
     SETUP_BUFFER_SYNTAX_TABLE ();      /* For syntax_prefix_flag_p.  */
+
+  /* FIXME: Is current-iso639-language the best source of that information? */
+  lang = Vcurrent_iso639_language;
+  tr = intern_c_string ("tr");
+  az = intern_c_string ("az");
+  if (SYMBOLP (lang))
+    {
+      l = lang;
+      goto check_language;
+    }
+  while (CONSP (lang))
+    {
+      l = XCAR (lang);
+      lang = XCDR (lang);
+    check_language:
+      if (EQ (l, tr) || EQ (l, az))
+       {
+         ctx->treat_turkic_i = true;
+         break;
+       }
+    }
+}
+
+/* Normalise CTX->flag and return CASE_UP, CASE_DOWN, CASE_CAPITALIZE or
+   CASE_NO_ACTION.  The latter if CTX->flag is CASE_CAPITALIZE_UP and we are
+   inside of a word. */
+static enum case_action
+normalise_flag (struct casing_context *ctx)
+{
+  /* Normalise flag so it’s one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
+  switch (ctx->flag) {
+  case CASE_CAPITALIZE:
+    return (enum case_action)((int)ctx->flag - ctx->inword);
+  case CASE_CAPITALIZE_UP:
+    return ctx->inword ? CASE_NO_ACTION : CASE_CAPITALIZE;
+  default:
+    return ctx->flag;
+  }
 }
 
-/* Based on CTX, case character CH.  If BUF is NULL, return cased character.
-   Otherwise, if BUF is non-NULL, save result in it and return whether the
-   character has been changed.
+/* Based on CTX and FLAG, case character CH.  If BUF is NULL, return cased
+   character.  Otherwise, if BUF is non-NULL, save result in it and return 0 if
+   the character changed or -1 if it didn’t.
+
+   FLAG may be one of CASE_UP, CASE_DOWN, CASE_CAPITALIZE (title-case if
+   possible, upper-case otherwise) or CASE_NO_ACTION.  CTX->inword is not taken
+   into account when interpreting FLAG (it may be taken into account for other
+   decisions though).
 
    Since meaning of return value depends on arguments, it’s more convenient to
-   use case_single_character or case_character instead. */
+   use case_single_character or case_characters instead. */
 static int
 case_character_impl (struct casing_str_buf *buf,
-                    struct casing_context *ctx, int ch)
+                    struct casing_context *ctx, enum case_action flag, int ch)
 {
-  enum case_action flag;
   Lisp_Object prop;
-  bool was_inword;
   int cased;
 
   /* Update inword state */
-  was_inword = ctx->inword;
   ctx->inword = SYNTAX (ch) == Sword &&
-    (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+    (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
 
-  /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
-  if (ctx->flag == CASE_CAPITALIZE)
-    flag = (enum case_action)((int)ctx->flag - was_inword);
-  else if (ctx->flag != CASE_CAPITALIZE_UP)
-    flag = ctx->flag;
-  else if (!was_inword)
-    flag = CASE_CAPITALIZE;
-  else
+  if (flag == CASE_NO_ACTION)
     {
       cased = ch;
       goto done;
@@ -120,7 +160,7 @@ case_character_impl (struct casing_str_buf *buf,
              ((it - special_casing_code_points) * 3 + (int)flag);
            memcpy (buf, entry, sizeof *buf);
            buf->len_chars &= ~SPECIAL_CASING_NO_CHANGE_BIT;
-           return !(entry->len_chars & SPECIAL_CASING_NO_CHANGE_BIT);
+           return entry->len_chars & SPECIAL_CASING_NO_CHANGE_BIT ? -1 : 0;
          }
     }
 
@@ -139,7 +179,7 @@ case_character_impl (struct casing_str_buf *buf,
     return cased;
   buf->len_chars = 1;
   buf->len_bytes = CHAR_STRING (cased, buf->data);
-  return cased != ch;
+  return cased == ch ? -1 : 0;
 }
 
 /* In Greek, lower case sigma has two forms: one when used in the middle and 
one
@@ -152,6 +192,13 @@ case_character_impl (struct casing_str_buf *buf,
 #define CAPITAL_SIGMA     0x03A3
 #define SMALL_SIGMA       0x03C3
 #define SMALL_FINAL_SIGMA 0x03C2
+
+/* Azeri and Turkish have dotless and dotted i.  An upper case of i is
+   İ while lower case of I is ı. */
+
+#define CAPITAL_DOTTED_I    0x130
+#define SMALL_DOTLESS_I     0x131
+#define COMBINING_DOT_ABOVE 0x307
 
 /* Based on CTX, case character CH accordingly.  Update CTX as necessary.
    Return cased character.
@@ -162,38 +209,88 @@ case_character_impl (struct casing_str_buf *buf,
 static inline int
 case_single_character (struct casing_context *ctx, int ch)
 {
-  return case_character_impl (NULL, ctx, ch);
+  enum case_action flag = normalise_flag (ctx);
+  return case_character_impl (NULL, ctx, flag, ch);
 }
 
-/* Save in BUF result of casing character CH.  Return whether casing changed 
the
-   character.
+/* Save in BUF result of casing character CH.
 
    If not-NULL, NEXT points to the next character in the cased string.  If 
NULL,
    it is assumed current character is the last one being cased.  This is used 
to
    apply some rules which depend on proceeding state.
 
-   This is like case_single_character but also handles one-to-many casing
-   rules. */
-static bool
-case_character (struct casing_str_buf *buf, struct casing_context *ctx,
-               int ch, const unsigned char *next)
+   Return:
+   - -1 if character has not been changed,
+   - 0 if the character has changed or
+   - a positive number if the character CH and the one following it (pointed by
+     NEXT) map to character saved in BUF.  Returned value is the length in 
bytes
+     of the next character.
+
+   This is like case_single_character but also handles one-to-many as well as
+   many-to-one and many-to-many casing rules. */
+static int
+case_characters (struct casing_str_buf *buf, struct casing_context *ctx,
+                int ch, const unsigned char *next)
 {
-  bool changed, was_inword;
+  enum case_action flag = normalise_flag (ctx);
 
-  was_inword = ctx->inword;
-  changed = case_character_impl (buf, ctx, ch);
+  if (flag != CASE_NO_ACTION && __builtin_expect(ctx->treat_turkic_i, false))
+    {
+      bool dot_above = false;
+      int cased = ch;
 
-  /* If we have just down-cased a capital sigma and the next character no 
longer
-     has a word syntax (i.e. current character is end of word), use final
-     sigma. */
-  if (was_inword && ch == CAPITAL_SIGMA && changed &&
-      (!next || SYNTAX (STRING_CHAR (next)) != Sword))
+      switch (ch) {
+      case 'I':
+       if (flag == CASE_DOWN)
+         {
+           dot_above = next && STRING_CHAR (next) == COMBINING_DOT_ABOVE;
+           cased = dot_above ? 'i' : SMALL_DOTLESS_I;
+         }
+       break;
+
+      case 'i':
+       if (flag == CASE_UP || flag == CASE_CAPITALIZE)
+         cased = CAPITAL_DOTTED_I;
+       break;
+
+      case CAPITAL_DOTTED_I:
+       if (flag == CASE_DOWN)
+         cased = 'i';
+       break;
+
+      case SMALL_DOTLESS_I:
+       if (flag == CASE_UP || flag == CASE_CAPITALIZE)
+         cased = 'I';
+       break;
+
+      default:
+       goto not_turkic_i;
+      }
+
+      ctx->inword = true;
+      buf->len_chars = 1;
+      buf->len_bytes = CHAR_STRING (cased, buf->data);
+      if (dot_above)
+       return CHAR_BYTES (COMBINING_DOT_ABOVE);
+      else
+       return ch == cased ? -1 : 0;
+    }
+
+ not_turkic_i:
+  /* Capital sigma down-cases differently based on whether it’s last
+     letter of a word or not. */
+  if (flag == CASE_DOWN && ch == CAPITAL_SIGMA)
     {
-      buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data);
+      ch = (ctx->inword && (!next || SYNTAX (STRING_CHAR (next)) != Sword))
+       ? SMALL_FINAL_SIGMA : SMALL_SIGMA;
+      buf->len_bytes = CHAR_STRING (ch, buf->data);
       buf->len_chars = 1;
+      ctx->inword = true;
+      return 0;
     }
 
-  return changed;
+  /* Do the casing. */
+  return case_character_impl (buf, ctx, flag, ch);
 }
 
 static Lisp_Object
@@ -240,7 +337,7 @@ do_casify_multibyte_string (struct casing_context *ctx, 
Lisp_Object obj)
   typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 
1];
 
   ptrdiff_t size = SCHARS (obj), n;
-  int ch;
+  int ch, len_bytes;
   USE_SAFE_ALLOCA;
   if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
       INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
@@ -250,12 +347,16 @@ do_casify_multibyte_string (struct casing_context *ctx, 
Lisp_Object obj)
 
   const unsigned char *src = SDATA (obj);
 
-  for (n = 0; size; --size)
+  n = 0;
+  while (size)
     {
       if (dst_end - o < sizeof(struct casing_str_buf))
        string_overflow ();
       ch = STRING_CHAR_ADVANCE (src);
-      case_character ((void *)o, ctx, ch, size > 1 ? src : NULL);
+      len_bytes = case_characters ((void *)o, ctx, ch, size > 1 ? src : NULL);
+      if (len_bytes > 0)
+       src += len_bytes;
+      size -= len_bytes > 0 ? 2 : 1;
       n += ((struct casing_str_buf *)o)->len_chars;
       o += ((struct casing_str_buf *)o)->len_bytes;
     }
@@ -397,44 +498,50 @@ do_casify_multibyte_region (struct casing_context *ctx,
   ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
   ptrdiff_t opoint = PT, added;
   struct casing_str_buf buf;
-  bool changed;
-  int ch, len;
+  int ch, len_bytes, len_chars, ret;
 
-  for (; size; --size)
+  while (size)
     {
-      ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
-      changed = case_character (
+      ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len_bytes);
+      ret = case_characters (
          &buf, ctx, ch,
-         size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL);
+         size > 1 ? BYTE_POS_ADDR (pos_byte + len_bytes) : NULL);
+      len_chars = 1;
 
-      if (!changed)
-       {
-         pos_byte += len;
-         ++pos;
-         continue;
-       }
+      switch (ret) {
+      default:
+       len_chars += 1;
+       /* FALL THROUGH */
 
-      last = pos + buf.len_chars;
-      if (first < 0)
-       first = pos;
+      case 0:
+       len_bytes += ret;
+       len_chars = ret ? 2 : 1;
 
-      if (buf.len_chars == 1 && buf.len_bytes == len)
-       memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
-      else
-       {
-         /* Replace one character with the other(s), keeping text
-            properties the same.  */
-         replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
-                          (const char *) buf.data, buf.len_chars,
-                          buf.len_bytes,
-                          0);
-         added += buf.len_chars - 1;
-         if (opoint > pos)
-           opoint += buf.len_chars - 1;
-       }
+       last = pos + buf.len_chars;
+       if (first < 0)
+         first = pos;
+
+       if (ret == 0 && buf.len_chars == 1 && buf.len_bytes == len_bytes)
+         memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len_bytes);
+       else
+         {
+           /* Replace one character with the other(s), keeping text
+              properties the same.  */
+           replace_range_2 (pos, pos_byte, pos + len_chars, pos_byte + 
len_bytes,
+                            (const char *) buf.data, buf.len_chars,
+                            buf.len_bytes,
+                            0);
+           added += buf.len_chars - len_chars;
+           if (opoint > pos)
+             opoint += buf.len_chars - len_chars;
+         }
 
-      pos_byte += buf.len_bytes;
-      pos += buf.len_chars;
+       /* FALL THROUGH */
+      case -1:
+       size -= len_chars;
+       pos += buf.len_chars;
+       pos_byte += buf.len_bytes;
+      }
     }
 
   if (PT != opoint)
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 1bd745e..9f5e43f 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -134,7 +134,7 @@ casefiddle-tests--characters
      (with-temp-buffer
        (dolist
            (test
-            ;; input  upcase  downcase  capitalize  upcase-initials
+            ;; input  upcase  downcase  capitalize  upcase-initials [locale]
             '(("Foo baR" "FOO BAR" "foo bar" "Foo Bar" "Foo BaR")
               ("Ⅷ ⅷ" "Ⅷ Ⅷ" "ⅷ ⅷ" "Ⅷ Ⅷ" "Ⅷ Ⅷ")
               ;; "DžUNGLA" is an unfortunate result but it’s really best we can
@@ -155,10 +155,41 @@ casefiddle-tests--characters
               ("Σ Σ" "Σ Σ" "σ σ" "Σ Σ" "Σ Σ")
               ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
               ;; If sigma is already lower case, we don’t want to change it.
-              ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ"))
+              ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ")
+
+              ;; There is a language-independent special casing rule which
+              ;; converts İ into i followed by combining dot above that’s why 
we
+              ;; get the weird \u0307.  Conceptually, it converts i with
+              ;; a soft-dot into an i with a hard-dot so it makes some dose of
+              ;; sense.
+              ("İstanbul" "İSTANBUL" "i\u0307stanbul" "İstanbul" "İstanbul")
+              ("İstanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'tr)
+              ("İstanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'az)
+              ("istanbul" "ISTANBUL" "istanbul" "Istanbul" "Istanbul")
+              ("istanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'tr)
+              ("istanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'az)
+              ("Irmak" "IRMAK" "irmak" "Irmak" "Irmak")
+              ("Irmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'tr)
+              ("Irmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'az)
+              ;; FIXME: We explicitly exclude ı→I mapping from the case tables
+              ;; in characters.el which is why instead of:
+              ;;("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak")
+              ;; we actually get:
+              ("ırmak" "ıRMAK" "ırmak" "Irmak" "Irmak")
+              ;; ‘But wait,’ you ask, ‘why do the capitalise examples work?’
+              ;; This is because those bypass the case table and use the
+              ;; character’s Unicode titlecase property.
+              ("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'tr)
+              ("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'az)
+              ;; And for some combining dot above removal.
+              ("I\u0307si\u0307s" "I\u0307Sİ\u0307S" "isi\u0307s"
+                                  "I\u0307si\u0307s" "I\u0307si\u0307s" 'tr)
+              ("I\u0307sI\u0307s" "I\u0307SI\u0307S" "isis"
+                                  "I\u0307sis" "I\u0307sI\u0307s" 'tr))
             (nreverse errors))
-         (let* ((input (car test))
+         (let* ((input (string-to-multibyte (car test)))
                 (expected (cdr test))
+                (current-iso639-language (or (nth 5 test) 'en))
                 (check (lambda (func got)
                          (unless (string-equal got (car expected))
                            (let ((fmt (length (symbol-name func))))
-- 
2.8.0.rc3.226.g39d4020






reply via email to

[Prev in Thread] Current Thread [Next in Thread]