[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[bug-libunistring] [PATCH 7/8] Update to Unicode 6.3.0
From: |
Daiki Ueno |
Subject: |
[bug-libunistring] [PATCH 7/8] Update to Unicode 6.3.0 |
Date: |
Fri, 10 Oct 2014 22:59:52 +0900 |
* tests/uniwbrk/test-uc-wordbreaks.c
(wordbreakproperty_to_string): Support WBP_DQ, WBP_SQ, and WBP_HL.
* lib/uniwbrk.in.h (WBP_DQ, WBP_SQ, WBP_HL): New enumeration values.
* lib/uniwbrk/u-wordbreaks.h (FUNC): Support WB7a, WB7b, and WB7c.
Update WB5, WB6, WB7, WB9, WB10, WB11, WB12, WB13a, and WB13b.
* lib/uniwbrk/wbrktable.h (uniwbrk_table): Adjust table size.
* lib/uniwbrk/wbrktable.c (uniwbrk_table): Support rule WB7a.
Update WB5, WB9, WB10, WB13a, and WB13b.
* lib/gen-uni-tables.c (UC_BIDI_LRI, UC_BIDI_RLI, UC_BIDI_FSI)
(UC_BIDI_PDI): New enumeration values.
(bidi_category_byname): Support those enum values.
(is_WBP_MIDNUMLET): Exclude 0x0027 (APOSTROPHE), which is now assigned
a dedicated property (Single_Quote).
(is_property_case_ignorable): Check 0x0027 explicitly, since
is_WBP_MIDNUMLET no longer covers it.
(WBP_DQ, WBP_SQ, WBP_HL): New enumeration values.
(get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp)
(output_wbp): Support those enum values.
* lib/unictype.in.h (UC_BIDI_LRI, UC_BIDI_RLI, UC_BIDI_FSI)
(UC_BIDI_PDI): New enumeration values.
* lib/unictype/bidi_byname.gperf: Add those property names.
---
lib/gen-uni-tables.c | 76 ++++++++++++++++++++++++++++++++++----
lib/unictype.in.h | 6 ++-
lib/unictype/bidi_byname.gperf | 12 ++++++
lib/uniwbrk.in.h | 5 ++-
lib/uniwbrk/u-wordbreaks.h | 38 ++++++++++++-------
lib/uniwbrk/wbrktable.c | 52 ++++++++++++++------------
lib/uniwbrk/wbrktable.h | 2 +-
tests/uniwbrk/test-uc-wordbreaks.c | 3 ++
8 files changed, 145 insertions(+), 49 deletions(-)
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index ce63ae4..a887947 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
/usr/local/share/Unidata/CompositionExclusions.txt \
/usr/local/share/Unidata/SpecialCasing.txt \
/usr/local/share/Unidata/CaseFolding.txt \
- 6.2.0
+ 6.3.0
*/
#include <stdbool.h>
@@ -1307,7 +1307,11 @@ enum
UC_BIDI_B, /* Paragraph Separator */
UC_BIDI_S, /* Segment Separator */
UC_BIDI_WS, /* Whitespace */
- UC_BIDI_ON /* Other Neutral */
+ UC_BIDI_ON, /* Other Neutral */
+ UC_BIDI_LRI, /* Left-to-Right Isolate */
+ UC_BIDI_RLI, /* Right-to-Left Isolate */
+ UC_BIDI_FSI, /* First Strong Isolate */
+ UC_BIDI_PDI /* Pop Directional Isolate */
};
static int
@@ -1365,7 +1369,20 @@ bidi_category_byname (const char *category_name)
break;
}
break;
- case 'L':
+ case 'F':
+ switch (category_name[1])
+ {
+ case 'S':
+ switch (category_name[2])
+ {
+ case 'I':
+ if (category_name[3] == '\0')
+ return UC_BIDI_FSI;
+ break;
+ }
+ }
+ break;
+ case 'L':
switch (category_name[1])
{
case '\0':
@@ -1381,7 +1398,11 @@ bidi_category_byname (const char *category_name)
if (category_name[3] == '\0')
return UC_BIDI_LRO;
break;
- }
+ case 'I':
+ if (category_name[3] == '\0')
+ return UC_BIDI_LRI;
+ break;
+ }
break;
}
break;
@@ -1418,6 +1439,10 @@ bidi_category_byname (const char *category_name)
if (category_name[3] == '\0')
return UC_BIDI_PDF;
break;
+ case 'I':
+ if (category_name[3] == '\0')
+ return UC_BIDI_PDI;
+ break;
}
break;
}
@@ -1438,7 +1463,11 @@ bidi_category_byname (const char *category_name)
if (category_name[3] == '\0')
return UC_BIDI_RLO;
break;
- }
+ case 'I':
+ if (category_name[3] == '\0')
+ return UC_BIDI_RLI;
+ break;
+ }
break;
}
break;
@@ -2518,7 +2547,7 @@ output_mirror (const char *filename, const char *version)
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
- return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
+ return (ch == 0x002E || ch == 0x2018 || ch == 0x2019
|| ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
}
@@ -2999,6 +3028,7 @@ static bool
is_property_case_ignorable (unsigned int ch)
{
bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
+ || ch == 0x0027
|| is_category_Mn (ch)
|| is_category_Me (ch)
|| is_category_Cf (ch)
@@ -7465,7 +7495,10 @@ enum
WBP_MIDNUM = 5,
WBP_NUMERIC = 6,
WBP_EXTENDNUMLET = 7,
- WBP_RI = 13
+ WBP_RI = 13,
+ WBP_DQ = 14,
+ WBP_SQ = 15,
+ WBP_HL = 16
};
/* Returns the word breaking property for ch, as a bit mask. */
@@ -7504,6 +7537,11 @@ get_wbp (unsigned int ch)
|| ch == 0xFF70)
attr |= 1 << WBP_KATAKANA;
+ if ((unicode_scripts[ch] < numscripts
+ && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
+ && strcmp (unicode_attributes[ch].category, "Lo") == 0)
+ attr |= 1 << WBP_HL;
+
if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
|| ch == 0x05F3)
&& ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
@@ -7511,7 +7549,8 @@ get_wbp (unsigned int ch)
&& ((get_lbp (ch) >> LBP_SA) & 1) == 0
&& !(unicode_scripts[ch] < numscripts
&& strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
- && (attr & (1 << WBP_EXTEND)) == 0)
+ && (attr & (1 << WBP_EXTEND)) == 0
+ && (attr & (1 << WBP_HL)) == 0)
attr |= 1 << WBP_ALETTER;
if (is_WBP_MIDNUMLET (ch))
@@ -7536,6 +7575,12 @@ get_wbp (unsigned int ch)
if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
attr |= 1 << WBP_RI;
+
+ if (ch == 0x0022)
+ attr |= 1 << WBP_DQ;
+
+ if (ch == 0x0027)
+ attr |= 1 << WBP_SQ;
}
if (attr == 0)
@@ -7583,6 +7628,12 @@ debug_output_wbp (FILE *stream)
fprintf (stream, " ExtendNumLet");
if (attr & (1 << WBP_RI))
fprintf (stream, " Regional_Indicator");
+ if (attr & (1 << WBP_DQ))
+ fprintf (stream, " Double_Quote");
+ if (attr & (1 << WBP_SQ))
+ fprintf (stream, " Single_Quote");
+ if (attr & (1 << WBP_HL))
+ fprintf (stream, " Hebrew_Letter");
fprintf (stream, "\n");
}
}
@@ -7669,6 +7720,9 @@ fill_org_wbp (const char *wordbreakproperty_filename)
PROP ("Numeric", WBP_NUMERIC)
PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
PROP ("Regional_Indicator", WBP_RI)
+ PROP ("Double_Quote", WBP_DQ)
+ PROP ("Single_Quote", WBP_SQ)
+ PROP ("Hebrew_Letter", WBP_HL)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -7716,6 +7770,9 @@ debug_output_org_wbp (FILE *stream)
PROP ("Numeric", WBP_NUMERIC)
PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
PROP ("Regional_Indicator", WBP_RI)
+ PROP ("Double_Quote", WBP_DQ)
+ PROP ("Single_Quote", WBP_SQ)
+ PROP ("Hebrew_Letter", WBP_HL)
#undef PROP
fprintf (stream, " ??");
fprintf (stream, "\n");
@@ -7869,6 +7926,9 @@ output_wbp (FILE *stream)
CASE(WBP_NUMERIC);
CASE(WBP_EXTENDNUMLET);
CASE(WBP_RI);
+ CASE(WBP_DQ);
+ CASE(WBP_SQ);
+ CASE(WBP_HL);
#undef CASE
default:
abort ();
diff --git a/lib/unictype.in.h b/lib/unictype.in.h
index 30c71aa..c31d9e5 100644
--- a/lib/unictype.in.h
+++ b/lib/unictype.in.h
@@ -312,7 +312,11 @@ enum
UC_BIDI_B, /* Paragraph Separator */
UC_BIDI_S, /* Segment Separator */
UC_BIDI_WS, /* Whitespace */
- UC_BIDI_ON /* Other Neutral */
+ UC_BIDI_ON, /* Other Neutral */
+ UC_BIDI_LRI, /* Left-to-Right Isolate */
+ UC_BIDI_RLI, /* Right-to-Left Isolate */
+ UC_BIDI_FSI, /* First Strong Isolate */
+ UC_BIDI_PDI /* Pop Directional Isolate */
};
/* Return the name of a bidi class. */
diff --git a/lib/unictype/bidi_byname.gperf b/lib/unictype/bidi_byname.gperf
index 9cacacf..5bb0faa 100644
--- a/lib/unictype/bidi_byname.gperf
+++ b/lib/unictype/bidi_byname.gperf
@@ -19,14 +19,18 @@ CS, UC_BIDI_CS
EN, UC_BIDI_EN
ES, UC_BIDI_ES
ET, UC_BIDI_ET
+FSI, UC_BIDI_FSI
L, UC_BIDI_L
LRE, UC_BIDI_LRE
+LRI, UC_BIDI_LRI
LRO, UC_BIDI_LRO
NSM, UC_BIDI_NSM
ON, UC_BIDI_ON
PDF, UC_BIDI_PDF
+PDI, UC_BIDI_PDI
R, UC_BIDI_R
RLE, UC_BIDI_RLE
+RLI, UC_BIDI_RLI
RLO, UC_BIDI_RLO
S, UC_BIDI_S
WS, UC_BIDI_WS
@@ -46,10 +50,14 @@ European Separator, UC_BIDI_ES
EuropeanSeparator, UC_BIDI_ES
European Terminator, UC_BIDI_ET
EuropeanTerminator, UC_BIDI_ET
+First Strong Isolate, UC_BIDI_FSI
+FirstStrongIsolate, UC_BIDI_FSI
Left To Right, UC_BIDI_L
LeftToRight, UC_BIDI_L
Left To Right Embedding, UC_BIDI_LRE
LeftToRightEmbedding, UC_BIDI_LRE
+Left To Right Isolate, UC_BIDI_LRI
+LeftToRightIsolate, UC_BIDI_LRI
Left To Right Override, UC_BIDI_LRO
LeftToRightOverride, UC_BIDI_LRO
Nonspacing Mark, UC_BIDI_NSM
@@ -58,10 +66,14 @@ Other Neutral, UC_BIDI_ON
OtherNeutral, UC_BIDI_ON
Pop Directional Format, UC_BIDI_PDF
PopDirectionalFormat, UC_BIDI_PDF
+Pop Directional Isolate, UC_BIDI_PDI
+PopDirectionalIsolate, UC_BIDI_PDI
Right To Left, UC_BIDI_R
RightToLeft, UC_BIDI_R
Right To Left Embedding, UC_BIDI_RLE
RightToLeftEmbedding, UC_BIDI_RLE
+Right To Left Isolate, UC_BIDI_RLI
+RightToLeftIsolate, UC_BIDI_RLI
Right To Left Override, UC_BIDI_RLO
RightToLeftOverride, UC_BIDI_RLO
Segment Separator, UC_BIDI_S
diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h
index c272d48..9abea42 100644
--- a/lib/uniwbrk.in.h
+++ b/lib/uniwbrk.in.h
@@ -50,7 +50,10 @@ enum
WBP_MIDNUM = 5,
WBP_NUMERIC = 6,
WBP_EXTENDNUMLET = 7,
- WBP_RI = 13
+ WBP_RI = 13,
+ WBP_DQ = 14,
+ WBP_SQ = 15,
+ WBP_HL = 16
};
/* Return the Word_Break property of a Unicode character. */
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 1d7f951..65f00e9 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -69,31 +69,41 @@ FUNC (const UNIT *s, size_t n, char *p)
secondlast last current
- ALetter (MidLetter | MidNumLet) × ALetter (WB7)
- ALetter × (MidLetter | MidNumLet) ALetter (WB6)
- Numeric (MidNum | MidNumLet) × Numeric (WB11)
- Numeric × (MidNum | MidNumLet) Numeric (WB12)
- ALetter × ALetter (WB5)
- ALetter × Numeric (WB9)
- Numeric × ALetter (WB10)
+ (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7)
+ (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6)
+ Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11)
+ Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12)
+ HL × DQ HL (WB7b)
+ HL DQ × HL (WB7c)
+ (ALetter | HL) × (ALetter | HL) (WB5)
+ (ALetter | HL) × Numeric (WB9)
+ Numeric × (ALetter | HL) (WB10)
Numeric × Numeric (WB8)
+ HL × SQ (WB7a)
Katakana × Katakana (WB13)
- (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
+ (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
ExtendNumLet × ExtendNumLet (WB13a)
- ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
Regional_Indicator × Regional_Indicator (WB13c)
*/
/* No break across certain punctuation. Also, disable word
breaks that were recognized earlier (due to lookahead of
only one complex character). */
- if ((prop == WBP_ALETTER
+ if (((prop == WBP_ALETTER
+ || prop == WBP_HL)
&& (last_compchar_prop == WBP_MIDLETTER
- || last_compchar_prop == WBP_MIDNUMLET)
- && secondlast_compchar_prop == WBP_ALETTER)
+ || last_compchar_prop == WBP_MIDNUMLET
+ || last_compchar_prop == WBP_SQ)
+ && (secondlast_compchar_prop == WBP_ALETTER
+ || secondlast_compchar_prop == WBP_HL))
|| (prop == WBP_NUMERIC
&& (last_compchar_prop == WBP_MIDNUM
- || last_compchar_prop == WBP_MIDNUMLET)
- && secondlast_compchar_prop == WBP_NUMERIC))
+ || last_compchar_prop == WBP_MIDNUMLET
+ || last_compchar_prop == WBP_SQ)
+ && secondlast_compchar_prop == WBP_NUMERIC)
+ || (prop == WBP_HL
+ && last_compchar_prop == WBP_DQ
+ && secondlast_compchar_prop == WBP_HL))
{
*last_compchar_ptr = 0;
/* *p = 0; */
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 04bd0e5..baeed58 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -22,31 +22,35 @@
/* This table contains the following rules (see UAX #29):
- last current
-
- ALetter × ALetter (WB5)
- ALetter × Numeric (WB9)
- Numeric × ALetter (WB10)
- Numeric × Numeric (WB8)
- Katakana × Katakana (WB13)
- (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
- ExtendNumLet × ExtendNumLet (WB13a)
- ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
- Regional_Indicator × Regional_Indicator (WB13c)
+ last current
+
+ (ALetter | HL) × (ALetter | HL) (WB5)
+ (ALetter | HL) × Numeric (WB9)
+ HL × SQ (WB7a)
+ Numeric × (ALetter | HL) (WB10)
+ Numeric × Numeric (WB8)
+ Katakana × Katakana (WB13)
+(ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
+ ExtendNumLet × ExtendNumLet (WB13a)
+ ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
+ Regional_Indicator × Regional_Indicator (WB13c)
*/
-const unsigned char uniwbrk_table[9][9] =
-{ /* current: OTHER MIDNUMLET NUMERIC */
- /* KATAKANA MIDLETTER EXTENDNUMLET */
- /* ALETTER MIDNUM RI */
+const unsigned char uniwbrk_table[12][12] =
+{ /* current: OTHER MIDNUMLET NUMERIC DQ */
+ /* KATAKANA MIDLETTER EXTENDNUMLET SQ */
+ /* ALETTER MIDNUM RI HL */
/* last */
- /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1 },
- /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 },
- /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 },
- /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1 },
- /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0 }
+ /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 },
+ /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 },
+ /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 },
+ /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 },
+ /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1 },
+ /* WBP_DQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_SQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_HL */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0 }
};
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 50b7823..567a031 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -15,4 +15,4 @@
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
-extern const unsigned char uniwbrk_table[9][9];
+extern const unsigned char uniwbrk_table[12][12];
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c
index 710f583..2dc06f6 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -45,6 +45,9 @@ wordbreakproperty_to_string (int wbp)
CASE(NUMERIC)
CASE(EXTENDNUMLET)
CASE(RI)
+ CASE(DQ)
+ CASE(SQ)
+ CASE(HL)
}
abort ();
}
--
2.1.1
- [bug-libunistring] [PATCH 0/8] Update libunistring-related modules to Unicode 7.0.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 1/8] gen-uni-tables: Check out-of-range values added to 3-level tables, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 2/8] unictype/joininggroup-of: Switch to 3-level table, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 4/8] uniwbrk/u32-wordbreaks-tests: Test using WordBreakTest.txt from UCD, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 3/8] uniwbrk: Ignore Extended/Format at the beginning of the line, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 5/8] Update to Unicode 6.1.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 7/8] Update to Unicode 6.3.0,
Daiki Ueno <=
- [bug-libunistring] [PATCH 6/8] Update to Unicode 6.2.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 8/8] Update to Unicode 7.0.0, Daiki Ueno, 2014/10/10