bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v2 08/10] Update to Unicode 6.2.0


From: Daiki Ueno
Subject: [PATCH v2 08/10] Update to Unicode 6.2.0
Date: Thu, 23 Oct 2014 17:01:39 +0900

* lib/unilbrk/lbrktables.h (LBP_RI): New enumeration value.
(unilbrk_table): Adjust table size.
* lib/unilbrk/lbrktables.c (unilbrk_table): Add a row and column
for LBP_RI.

* lib/uniwbrk.in.h (WBP_RI): New enumeration value.
* lib/uniwbrk/u-wordbreaks.h (FUNC): Support rule WB13c.
Normalize table index skipping ignored properties.
* lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI.  Remove
WBP_EXTEND and WBP_FORMAT, which are now computed without using
the table.
* lib/uniwbrk/wbrktable.h: Adjust table size.
* tests/uniwbrk/test-uc-wordbreaks.c
(wordbreakproperty_to_string): Support WBP_RI.

* lib/unigbrk.in.h (GBP_RI): New enumeration value.
* lib/unigbrk/uc-is-grapheme-break.c (UC_IS_GRAPHEME_BREAK):
Support rule GB8a.
(UC_GRAPHEME_BREAKS_FOR, gb_table): Support GBP_RI.
* tests/unigbrk/test-uc-is-grapheme-break.c
(graphemebreakproperty_to_string): Support GBP_RI.
* tests/unigbrk/test-uc-gbrk-prop.c
(graphemebreakproperty_to_string): Likewise.

* lib/gen-uni-tables.c (LBP_RI): New enumeration value.
(get_lbp, debug_output_lbp, fill_org_lbp, debug_output_org_lbp)
(output_lbp): Support LBP_RI.
(WBP_RI): New enumeration value.
(get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp)
(output_wbp): Support WBP_RI.
(GBP_RI): New enumeration value.
(output_gbp_test, fill_org_gbp): Support GBP_RI.
---
 lib/gen-uni-tables.c                      | 49 ++++++++++++++++++--------
 lib/unigbrk.in.h                          |  3 +-
 lib/unigbrk/uc-is-grapheme-break.c        |  9 +++--
 lib/unilbrk/lbrktables.c                  | 57 ++++++++++++++++---------------
 lib/unilbrk/lbrktables.h                  | 21 ++++++------
 lib/uniwbrk.in.h                          |  3 +-
 lib/uniwbrk/u-wordbreaks.h                | 36 +++++++++++++------
 lib/uniwbrk/wbrktable.c                   | 24 ++++++-------
 lib/uniwbrk/wbrktable.h                   |  2 +-
 tests/unigbrk/test-uc-gbrk-prop.c         |  1 +
 tests/unigbrk/test-uc-is-grapheme-break.c |  1 +
 tests/uniwbrk/test-uc-wordbreaks.c        |  1 +
 12 files changed, 127 insertions(+), 80 deletions(-)

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index ec1aba5..f833777 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
                       /usr/local/share/Unidata/CompositionExclusions.txt \
                       /usr/local/share/Unidata/SpecialCasing.txt \
                       /usr/local/share/Unidata/CaseFolding.txt \
-                      6.1.0
+                      6.2.0
  */
 
 #include <stdbool.h>
@@ -6213,22 +6213,22 @@ output_width_property_test (const char *filename)
 
 enum
 {
-  /* Values >= 26 are resolved at run time. */
-  LBP_BK = 26, /* mandatory break */
+  /* Values >= 27 are resolved at run time. */
+  LBP_BK = 27, /* mandatory break */
 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 /*LBP_LF,         line feed - not used here because it's a DOSism */
-  LBP_CM = 27, /* attached characters and combining marks */
+  LBP_CM = 28, /* attached characters and combining marks */
 /*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
 /*LBP_SG,         surrogates - not used here because they are not characters */
   LBP_WJ =  0, /* word joiner */
-  LBP_ZW = 28, /* zero width space */
+  LBP_ZW = 29, /* zero width space */
   LBP_GL =  1, /* non-breaking (glue) */
-  LBP_SP = 29, /* space */
+  LBP_SP = 30, /* space */
   LBP_B2 =  2, /* break opportunity before and after */
   LBP_BA =  3, /* break opportunity after */
   LBP_BB =  4, /* break opportunity before */
   LBP_HY =  5, /* hyphen */
-  LBP_CB = 30, /* contingent break opportunity */
+  LBP_CB = 31, /* contingent break opportunity */
   LBP_CL =  6, /* closing punctuation */
   LBP_CP =  7, /* closing parenthesis */
   LBP_EX =  8, /* exclamation/interrogation */
@@ -6241,7 +6241,7 @@ enum
   LBP_PO = 15, /* postfix (numeric) */
   LBP_PR = 16, /* prefix (numeric) */
   LBP_SY = 17, /* symbols allowing breaks */
-  LBP_AI = 31, /* ambiguous (alphabetic or ideograph) */
+  LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
   LBP_AL = 18, /* ordinary alphabetic and symbol characters */
 /*LBP_CJ,         conditional Japanese starter, resolved to NS */
   LBP_H2 = 19, /* Hangul LV syllable */
@@ -6251,8 +6251,9 @@ enum
   LBP_JL = 22, /* Hangul L Jamo */
   LBP_JV = 23, /* Hangul V Jamo */
   LBP_JT = 24, /* Hangul T Jamo */
-  LBP_SA = 32, /* complex context (South East Asian) */
-  LBP_XX = 33  /* unknown */
+  LBP_RI = 26, /* regional indicator */
+  LBP_SA = 33, /* complex context (South East Asian) */
+  LBP_XX = 34  /* unknown */
 };
 
 /* Returns the line breaking classification for ch, as a bit mask.  */
@@ -6710,6 +6711,10 @@ get_lbp (unsigned int ch)
       if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
         attr |= (int64_t) 1 << LBP_JT;
 
+      /* regional indicator */
+      if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
+        attr |= (int64_t) 1 << LBP_RI;
+
       /* complex context (South East Asian) */
       if (((unicode_attributes[ch].category[0] == 'C'
             && unicode_attributes[ch].category[1] == 'f')
@@ -6862,7 +6867,7 @@ get_lbp (unsigned int ch)
           || ch == 0x2064 /* INVISIBLE PLUS */
           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
           || ch == 0x110BD /* KAITHI NUMBER SIGN */)
-        if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | 
((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | 
((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | 
((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | 
((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | 
((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | 
((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | 
((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | 
((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+        if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | 
((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | 
((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | 
((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | 
((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | 
((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | 
((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | 
((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | 
((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | 
((int64_t) 1 << LBP_ID))))
           {
             /* ambiguous (alphabetic) ? */
             if ((unicode_width[ch] != NULL
@@ -6987,6 +6992,7 @@ debug_output_lbp (FILE *stream)
           PRINT_BIT(attr,LBP_JL);
           PRINT_BIT(attr,LBP_JV);
           PRINT_BIT(attr,LBP_JT);
+          PRINT_BIT(attr,LBP_RI);
           PRINT_BIT(attr,LBP_SA);
           PRINT_BIT(attr,LBP_XX);
 #undef PRINT_BIT
@@ -7102,6 +7108,7 @@ fill_org_lbp (const char *linebreak_filename)
       TRY(LBP_JL)
       TRY(LBP_JV)
       TRY(LBP_JT)
+      TRY(LBP_RI)
       TRY(LBP_SA)
       TRY(LBP_XX)
 #undef TRY
@@ -7184,6 +7191,7 @@ debug_output_org_lbp (FILE *stream)
           PRINT_BIT(attr,LBP_JL);
           PRINT_BIT(attr,LBP_JV);
           PRINT_BIT(attr,LBP_JT);
+          PRINT_BIT(attr,LBP_RI);
           PRINT_BIT(attr,LBP_SA);
           PRINT_BIT(attr,LBP_XX);
 #undef PRINT_BIT
@@ -7358,6 +7366,7 @@ output_lbp (FILE *stream1, FILE *stream2)
           CASE(LBP_JL);
           CASE(LBP_JV);
           CASE(LBP_JT);
+          CASE(LBP_RI);
           CASE(LBP_SA);
           CASE(LBP_XX);
 #undef CASE
@@ -7457,7 +7466,8 @@ enum
   WBP_MIDLETTER    = 4,
   WBP_MIDNUM       = 5,
   WBP_NUMERIC      = 6,
-  WBP_EXTENDNUMLET = 7
+  WBP_EXTENDNUMLET = 7,
+  WBP_RI           = 13
 };
 
 /* Returns the word breaking property for ch, as a bit mask.  */
@@ -7525,6 +7535,9 @@ get_wbp (unsigned int ch)
       if (unicode_attributes[ch].category != NULL
           && strcmp (unicode_attributes[ch].category, "Pc") == 0)
         attr |= 1 << WBP_EXTENDNUMLET;
+
+      if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
+        attr |= 1 << WBP_RI;
     }
 
   if (attr == 0)
@@ -7570,7 +7583,9 @@ debug_output_wbp (FILE *stream)
             fprintf (stream, " Numeric");
           if (attr & (1 << WBP_EXTENDNUMLET))
             fprintf (stream, " ExtendNumLet");
-          fprintf (stream, "\n");
+          if (attr & (1 << WBP_RI))
+            fprintf (stream, " Regional_Indicator");
+          fprintf (stream, "\n");
         }
     }
 }
@@ -7655,6 +7670,7 @@ fill_org_wbp (const char *wordbreakproperty_filename)
       PROP ("MidNum", WBP_MIDNUM)
       PROP ("Numeric", WBP_NUMERIC)
       PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+      PROP ("Regional_Indicator", WBP_RI)
 #undef PROP
         {
           fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -7701,6 +7717,7 @@ debug_output_org_wbp (FILE *stream)
           PROP ("MidNum", WBP_MIDNUM)
           PROP ("Numeric", WBP_NUMERIC)
           PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+          PROP ("Regional_Indicator", WBP_RI)
 #undef PROP
           fprintf (stream, " ??");
           fprintf (stream, "\n");
@@ -7853,6 +7870,7 @@ output_wbp (FILE *stream)
           CASE(WBP_MIDNUM);
           CASE(WBP_NUMERIC);
           CASE(WBP_EXTENDNUMLET);
+          CASE(WBP_RI);
 #undef CASE
           default:
             abort ();
@@ -7933,7 +7951,8 @@ enum
   GBP_V            = 8,
   GBP_T            = 9,
   GBP_LV           = 10,
-  GBP_LVT          = 11
+  GBP_LVT          = 11,
+  GBP_RI           = 12
 };
 
 /* Construction of sparse 3-level tables.  */
@@ -8004,6 +8023,7 @@ output_gbp_test (const char *filename)
       CASE (GBP_T)
       CASE (GBP_LV)
       CASE (GBP_LVT)
+      CASE (GBP_RI)
 #undef CASE
         default:
           abort ();
@@ -8201,6 +8221,7 @@ fill_org_gbp (const char *graphemebreakproperty_filename)
       PROP ("T", GBP_T)
       PROP ("LV", GBP_LV)
       PROP ("LVT", GBP_LVT)
+      PROP ("Regional_Indicator", GBP_RI)
 #undef PROP
         {
           fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
diff --git a/lib/unigbrk.in.h b/lib/unigbrk.in.h
index 8335e5a..a708a8c 100644
--- a/lib/unigbrk.in.h
+++ b/lib/unigbrk.in.h
@@ -51,7 +51,8 @@ enum
   GBP_V            = 8,
   GBP_T            = 9,
   GBP_LV           = 10,
-  GBP_LVT          = 11
+  GBP_LVT          = 11,
+  GBP_RI           = 12
 };
 
 /* Return the Grapheme_Cluster_Break property of a Unicode character. */
diff --git a/lib/unigbrk/uc-is-grapheme-break.c b/lib/unigbrk/uc-is-grapheme-break.c
index 0e61e79..7d1759c 100644
--- a/lib/unigbrk/uc-is-grapheme-break.c
+++ b/lib/unigbrk/uc-is-grapheme-break.c
@@ -47,6 +47,9 @@
    /* GB8 */                                                            \
    ((A) == GBP_LVT || (A) == GBP_T) && (B) == GBP_T ? false :           \
                                                                         \
+   /* GB8a */                                                          \
+   (A) == GBP_RI && (B) == GBP_RI ? false :                            \
+                                                                       \
    /* GB9 */                                                            \
    (B) == GBP_EXTEND ? false :                                          \
                                                                         \
@@ -71,9 +74,10 @@
    | (UC_IS_GRAPHEME_BREAK(A, GBP_V)           << GBP_V)                \
    | (UC_IS_GRAPHEME_BREAK(A, GBP_T)           << GBP_T)                \
    | (UC_IS_GRAPHEME_BREAK(A, GBP_LV)          << GBP_LV)               \
-   | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT)         << GBP_LVT))
+   | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT)         << GBP_LVT)              \
+   | (UC_IS_GRAPHEME_BREAK(A, GBP_RI)          << GBP_RI))
 
-static const unsigned short int gb_table[12] =
+static const unsigned short int gb_table[13] =
   {
     UC_GRAPHEME_BREAKS_FOR(0),  /* GBP_OTHER */
     UC_GRAPHEME_BREAKS_FOR(1),  /* GBP_CR */
@@ -87,6 +91,7 @@ static const unsigned short int gb_table[12] =
     UC_GRAPHEME_BREAKS_FOR(9),  /* GBP_T */
     UC_GRAPHEME_BREAKS_FOR(10), /* GBP_LV */
     UC_GRAPHEME_BREAKS_FOR(11), /* GBP_LVT */
+    UC_GRAPHEME_BREAKS_FOR(12), /* GBP_RI */
   };
 
 bool
diff --git a/lib/unilbrk/lbrktables.c b/lib/unilbrk/lbrktables.c
index d60321d..f4a55a3 100644
--- a/lib/unilbrk/lbrktables.c
+++ b/lib/unilbrk/lbrktables.c
@@ -23,36 +23,37 @@
 /* Define unilbrkprop, table of line breaking properties.  */
 #include "unilbrk/lbrkprop2.h"
 
-const unsigned char unilbrk_table[26][26] =
+const unsigned char unilbrk_table[27][27] =
 {
                                 /* after */
-        /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID 
JL JV JT HL */
-/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, },
-/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, },
-/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, },
-/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D, 
D, D, D, D, },
-/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D, 
D, D, D, I, },
-/* EX */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, 
P, P, P, P, },
-/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, },
-/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D, 
D, D, D, I, },
-/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D, 
D, D, D, I, },
-/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I, 
I, I, I, I, },
-/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, 
D, D, D, D, },
-/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, 
D, D, D, I, },
-/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, I, I, D, },
-/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, D, I, D, },
-/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, D, D, D, },
-/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D, 
I, I, D, D, },
-/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, I, I, D, },
-/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, D, I, D, },
-/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, 
D, D, D, I, },
+        /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID 
JL JV JT HL RI */
+/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, I, },
+/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, I, },
+/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, I, },
+/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D, 
D, D, D, D, D, },
+/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D, 
D, D, D, I, D, },
+/* EX */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, 
P, P, P, P, P, },
+/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I, 
I, I, I, I, I, },
+/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D, 
D, D, D, I, D, },
+/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D, 
D, D, D, I, D, },
+/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I, 
I, I, I, I, D, },
+/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, 
D, D, D, I, D, },
+/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, I, I, D, D, },
+/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, D, I, D, D, },
+/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, D, D, D, D, },
+/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D, 
I, I, D, D, D, },
+/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, I, I, D, D, },
+/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, 
D, D, I, D, D, },
+/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, 
D, D, D, I, D, },
+/* RI */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, 
D, D, I, D, I, },
 /* "" */
 /* before */
 };
diff --git a/lib/unilbrk/lbrktables.h b/lib/unilbrk/lbrktables.h
index 95bb502..9c76ad7 100644
--- a/lib/unilbrk/lbrktables.h
+++ b/lib/unilbrk/lbrktables.h
@@ -21,22 +21,22 @@
 
 enum
 {
-  /* Values >= 26 are resolved at run time. */
-  LBP_BK = 26, /* mandatory break */
+  /* Values >= 27 are resolved at run time. */
+  LBP_BK = 27, /* mandatory break */
 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 /*LBP_LF,         line feed - not used here because it's a DOSism */
-  LBP_CM = 27, /* attached characters and combining marks */
+  LBP_CM = 28, /* attached characters and combining marks */
 /*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
 /*LBP_SG,         surrogates - not used here because they are not characters */
   LBP_WJ =  0, /* word joiner */
-  LBP_ZW = 28, /* zero width space */
+  LBP_ZW = 29, /* zero width space */
   LBP_GL =  1, /* non-breaking (glue) */
-  LBP_SP = 29, /* space */
+  LBP_SP = 30, /* space */
   LBP_B2 =  2, /* break opportunity before and after */
   LBP_BA =  3, /* break opportunity after */
   LBP_BB =  4, /* break opportunity before */
   LBP_HY =  5, /* hyphen */
-  LBP_CB = 30, /* contingent break opportunity */
+  LBP_CB = 31, /* contingent break opportunity */
   LBP_CL =  6, /* closing punctuation */
   LBP_CP =  7, /* closing parenthesis */
   LBP_EX =  8, /* exclamation/interrogation */
@@ -49,7 +49,7 @@ enum
   LBP_PO = 15, /* postfix (numeric) */
   LBP_PR = 16, /* prefix (numeric) */
   LBP_SY = 17, /* symbols allowing breaks */
-  LBP_AI = 31, /* ambiguous (alphabetic or ideograph) */
+  LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
   LBP_AL = 18, /* ordinary alphabetic and symbol characters */
 /*LBP_CJ,         conditional Japanese starters, resolved to NS */
   LBP_H2 = 19, /* Hangul LV syllable */
@@ -59,8 +59,9 @@ enum
   LBP_JL = 22, /* Hangul L Jamo */
   LBP_JV = 23, /* Hangul V Jamo */
   LBP_JT = 24, /* Hangul T Jamo */
-  LBP_SA = 32, /* complex context (South East Asian) */
-  LBP_XX = 33  /* unknown */
+  LBP_RI = 26, /* regional indicator */
+  LBP_SA = 33, /* complex context (South East Asian) */
+  LBP_XX = 34  /* unknown */
 };
 
 #include "lbrkprop1.h"
@@ -91,7 +92,7 @@ unilbrkprop_lookup (ucs4_t uc)
 #define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
 #define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
 
-extern const unsigned char unilbrk_table[26][26];
+extern const unsigned char unilbrk_table[27][27];
 
 /* We don't support line breaking of complex-context dependent characters
    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h
index ab4b532..c272d48 100644
--- a/lib/uniwbrk.in.h
+++ b/lib/uniwbrk.in.h
@@ -49,7 +49,8 @@ enum
   WBP_MIDLETTER    = 4,
   WBP_MIDNUM       = 5,
   WBP_NUMERIC      = 6,
-  WBP_EXTENDNUMLET = 7
+  WBP_EXTENDNUMLET = 7,
+  WBP_RI           = 13
 };
 
 /* Return the Word_Break property of a Unicode character.  */
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 33ca7eb..04d2738 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -55,16 +55,12 @@ FUNC (const UNIT *s, size_t n, char *p)
               if (last_char_prop == WBP_CR && prop == WBP_LF)
                 /* *p = 0 */;
               /* Break before and after newlines.  */
-              else if (last_char_prop >= WBP_NEWLINE
-                       /* same as:
-                          last_char_prop == WBP_CR
-                          || last_char_prop == WBP_LF
-                          || last_char_prop == WBP_NEWLINE */
-                       || prop >= WBP_NEWLINE
-                          /* same as:
-                             prop == WBP_CR
-                             || prop == WBP_LF
-                             || prop == WBP_NEWLINE */)
+              else if ((last_char_prop == WBP_CR
+                        || last_char_prop == WBP_LF
+                        || last_char_prop == WBP_NEWLINE)
+                       || (prop == WBP_CR
+                           || prop == WBP_LF
+                           || prop == WBP_NEWLINE))
                 *p = 1;
               /* Ignore Format and Extend characters.  */
               else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
@@ -85,6 +81,7 @@ FUNC (const UNIT *s, size_t n, char *p)
                           (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
                                             ExtendNumLet × ExtendNumLet (WB13a)
                          ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)
+                               Regional_Indicator × Regional_Indicator  (WB13c)
                    */
                   /* No break across certain punctuation.  Also, disable word
                      breaks that were recognized earlier (due to lookahead of
@@ -101,10 +98,27 @@ FUNC (const UNIT *s, size_t n, char *p)
                       *last_compchar_ptr = 0;
                       /* *p = 0; */
                     }
+                  /* Break after Format and Extend characters.  */
+                  else if (last_compchar_prop == WBP_EXTEND
+                           || last_compchar_prop == WBP_FORMAT)
+                    *p = 1;
                   else
                     {
+                      /* Normalize property value to table index,
+                         skipping 5 properties: WBP_EXTEND,
+                         WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
+                         WBP_LF.  */
+                      int last_compchar_prop_index = last_compchar_prop;
+                      int prop_index = prop;
+
+                      if (last_compchar_prop_index >= WBP_EXTEND)
+                        last_compchar_prop_index -= 5;
+
+                      if (prop_index >= WBP_EXTEND)
+                        prop_index -= 5;
+
                       /* Perform a single table lookup.  */
-                      if (uniwbrk_table[last_compchar_prop][prop])
+                      if (uniwbrk_table[last_compchar_prop_index][prop_index])
                         *p = 1;
                       /* else *p = 0; */
                     }
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 7cbe4d6..04bd0e5 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -32,21 +32,21 @@
   (ALetter | Numeric | Katakana) × ExtendNumLet                    (WB13a)
                     ExtendNumLet × ExtendNumLet                    (WB13a)
                     ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)
+              Regional_Indicator × Regional_Indicator              (WB13c)
  */
 
-const unsigned char uniwbrk_table[10][8] =
+const unsigned char uniwbrk_table[9][9] =
 {        /* current:      OTHER            MIDNUMLET         NUMERIC         */
          /*                   KATAKANA           MIDLETTER      EXTENDNUMLET */
-         /*                          ALETTER            MIDNUM               */
+         /*                          ALETTER            MIDNUM           RI  */
   /* last */
-  /* WBP_OTHER */        {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_KATAKANA */     {  1,    0,    1,    1,    1,    1,    1,    0 },
-  /* WBP_ALETTER */      {  1,    1,    0,    1,    1,    1,    0,    0 },
-  /* WBP_MIDNUMLET */    {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_MIDLETTER */    {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_MIDNUM */       {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_NUMERIC */      {  1,    1,    0,    1,    1,    1,    0,    0 },
-  /* WBP_EXTENDNUMLET */ {  1,    0,    0,    1,    1,    1,    0,    0 },
-  /* WBP_EXTEND */       {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_FORMAT */       {  1,    1,    1,    1,    1,    1,    1,    1 }
+  /* WBP_OTHER */        {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_KATAKANA */     {  1,    0,    1,    1,    1,    1,    1,    0,    1 },
+  /* WBP_ALETTER */      {  1,    1,    0,    1,    1,    1,    0,    0,    1 },
+  /* WBP_MIDNUMLET */    {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_MIDLETTER */    {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_MIDNUM */       {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_NUMERIC */      {  1,    1,    0,    1,    1,    1,    0,    0,    1 },
+  /* WBP_EXTENDNUMLET */ {  1,    0,    0,    1,    1,    1,    0,    0,    1 },
+  /* WBP_RI */           {  1,    1,    1,    1,    1,    1,    1,    1,    0 }
 };
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 1b48adf..50b7823 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -15,4 +15,4 @@
    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
-extern const unsigned char uniwbrk_table[10][8];
+extern const unsigned char uniwbrk_table[9][9];
diff --git a/tests/unigbrk/test-uc-gbrk-prop.c b/tests/unigbrk/test-uc-gbrk-prop.c
index 1c71280..4bfbdba 100644
--- a/tests/unigbrk/test-uc-gbrk-prop.c
+++ b/tests/unigbrk/test-uc-gbrk-prop.c
@@ -50,6 +50,7 @@ graphemebreakproperty_to_string (int gbp)
       CASE(T)
       CASE(LV)
       CASE(LVT)
+      CASE(RI)
     }
   abort ();
 }
diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c
index a93f6f2..dbaf3dc 100644
--- a/tests/unigbrk/test-uc-is-grapheme-break.c
+++ b/tests/unigbrk/test-uc-is-grapheme-break.c
@@ -44,6 +44,7 @@ graphemebreakproperty_to_string (int gbp)
       CASE(T)
       CASE(LV)
       CASE(LVT)
+      CASE(RI)
     }
   abort ();
 }
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c
index 736cdba..41585f7 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -47,6 +47,7 @@ wordbreakproperty_to_string (int wbp)
       CASE(MIDNUM)
       CASE(NUMERIC)
       CASE(EXTENDNUMLET)
+      CASE(RI)
     }
   abort ();
 }
-- 
1.9.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]