[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#24603: [PATCHv5 11/11] Implement Irish casing rules (bug#24603)
From: |
Michal Nazarewicz |
Subject: |
bug#24603: [PATCHv5 11/11] Implement Irish casing rules (bug#24603) |
Date: |
Thu, 9 Mar 2017 22:51:50 +0100 |
Add rules for casing Irish words.  The rules are quite complicated:
they require some letters to remain lower case when upper-casing, and
involve insertion of dashes and various other transformations.
* src/casefiddle.c (struct casing_context): Add flags denoting that
Irish casing rules are in effect.
(prepare_casing_context): Detect Irish language and set ctx->special
accordingly.
(irish_upcase, irish_downcase): New functions for upcasing and
downcasing Irish letters.
(is_irish_vowel, is_irish_uc_vowel, is_irish_lc_vowel): New functions
detecting whether a letter is an Irish vowel.
(maybe_case_irish): New function implementing Irish casing rules.
(case_characters): Make use of maybe_case_irish.
* test/src/casefiddle-resources/irish-lowercase-1.txt,
test/src/casefiddle-resources/irish-lowercase-1-ref.txt,
test/src/casefiddle-resources/irish-uppercase-1.txt,
test/src/casefiddle-resources/irish-uppercase-1-ref.txt: New files
with test cases for Irish capitalisation. The files are copied from
Mozilla’s test suite.
* test/src/casefiddle-tests.el (casefiddle-tests--resources-dir): New
variable to point to aforementioned test case files.
(casefiddle-tests--test-casing): Support missing expected strings.
(casefiddle--read-lines): New helper function for reading lines from
a file.
(casefiddle-test-irish): Apply test cases read from the Irish test
case files.
---
src/casefiddle.c | 313 +++++++++++++++++++++
.../casefiddle-resources/irish-lowercase-1-ref.txt | 211 ++++++++++++++
.../src/casefiddle-resources/irish-lowercase-1.txt | 211 ++++++++++++++
.../casefiddle-resources/irish-uppercase-1-ref.txt | 105 +++++++
.../src/casefiddle-resources/irish-uppercase-1.txt | 105 +++++++
test/src/casefiddle-tests.el | 58 +++-
6 files changed, 992 insertions(+), 11 deletions(-)
create mode 100644 test/src/casefiddle-resources/irish-lowercase-1-ref.txt
create mode 100644 test/src/casefiddle-resources/irish-lowercase-1.txt
create mode 100644 test/src/casefiddle-resources/irish-uppercase-1-ref.txt
create mode 100644 test/src/casefiddle-resources/irish-uppercase-1.txt
diff --git a/src/casefiddle.c b/src/casefiddle.c
index a33bac7d21e..3352fb6795a 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -86,6 +86,24 @@ struct casing_context {
/* As above plus look out for diacritics combining above because
we may need to inject dot above before them. */
SPECIAL_LT_INS_DOT_ABOVE,
+
+ /* Flags for Irish word capitalising rules. Those are insane, see
+ https://bugzilla.mozilla.org/show_bug.cgi?id=1018805 and
+ https://bugzilla.mozilla.org/show_bug.cgi?id=1014639 for reference. */
+ /* Irish handling enabled; we are lower-casing words. */
+ SPECIAL_GA_LC,
+ /* Irish handling enabled; we are upper-casing words or capitalising. */
+ SPECIAL_GA_UC,
+ /* Upper-case next character. */
+ SPECIAL_GA_UC_NEXT,
+ /* We’re in the middle of a potential ‘bhf’ or ‘bhF’ triplet. */
+ SPECIAL_GA_TRIPLET_B = 'b',
+ /* We’re in the middle of a potential ‘n-{v}’ triplet. */
+ SPECIAL_GA_TRIPLET_N = 'n',
+ /* We’re in the middle of a potential triplet starting with ‘ts’ or ‘tS’. */
+ SPECIAL_GA_TRIPLET_T = 't'
+ /* Yes, it matters that the last three flags equal the characters for b,
+ n or t. */
} special;
};
@@ -127,6 +145,9 @@ prepare_casing_context (struct casing_context *ctx,
break;
case ('l' << 8) | 't': /* Lithuania */
ctx->special = SPECIAL_LT;
+ break;
+ case ('g' << 8) | 'a': /* Irish */
+ ctx->special = flag == CASE_DOWN ? SPECIAL_GA_LC : SPECIAL_GA_UC;
}
}
@@ -509,6 +530,289 @@ maybe_case_lithuanian (struct casing_str_buf *buf, struct casing_context *ctx,
return RES_NOT_TOUCHED;
}
+/* The naïve method works for ASCII letters but also non-ASCII Irish
+ vowels and second byte in UTF-8 representation of those vowels.
+
+ 'a' ^ 'A' is the single bit (0x20) which distinguishes the lower-case
+ form from the upper-case one in all of those ranges: irish_upcase
+ clears that bit and irish_downcase sets it. Neither function checks
+ that CH is a letter, so the result is only meaningful for the
+ characters listed above (for example irish_upcase ('-') yields 0x0D,
+ not '-'). */
+static int irish_upcase(int ch) { return ch & ~('a' ^ 'A'); }
+static int irish_downcase(int ch) { return ch | ('a' ^ 'A'); }
+
+/* A byte starting two-byte sequence of non-ASCII Irish vowels. */
+#define GA_UTF8_FIRST_BYTE 0xC3
+
+/* Classifies UTF-8 encoded character at *CH as Irish vowel or not.
+
+ Returns GA_UC_VOWEL for A, E, I, O, U and their acute-accented forms
+ (Á, É, Í, Ó, Ú), GA_LC_VOWEL for the corresponding lower-case letters
+ and GA_NOT_VOWEL otherwise. GA_NOT_VOWEL is zero, so the result can
+ also be tested as a plain truth value.
+
+ When *CH equals GA_UTF8_FIRST_BYTE the following byte CH[1] is read as
+ well, so CH must point at a complete UTF-8 sequence. */
+static enum {
+ GA_NOT_VOWEL = 0,
+ GA_UC_VOWEL = 1,
+ GA_LC_VOWEL = 2
+} is_irish_vowel(const unsigned char *ch) {
+ switch (*ch) {
+ case 'A': case 'E': case 'I': case 'O': case 'U':
+ return GA_UC_VOWEL;
+ case 'a': case 'e': case 'i': case 'o': case 'u':
+ return GA_LC_VOWEL;
+ case GA_UTF8_FIRST_BYTE:
+ switch (ch[1]) {
+ case 0x81: /* C3 81 → U+00C1 → Á */
+ case 0x89: /* C3 89 → U+00C9 → É */
+ case 0x8D: /* C3 8D → U+00CD → Í */
+ case 0x93: /* C3 93 → U+00D3 → Ó */
+ case 0x9A: /* C3 9A → U+00DA → Ú */
+ return GA_UC_VOWEL;
+ case 0xA1: /* C3 A1 → U+00E1 → á */
+ case 0xA9: /* C3 A9 → U+00E9 → é */
+ case 0xAD: /* C3 AD → U+00ED → í */
+ case 0xB3: /* C3 B3 → U+00F3 → ó */
+ case 0xBA: /* C3 BA → U+00FA → ú */
+ return GA_LC_VOWEL;
+ }
+ }
+ return GA_NOT_VOWEL;
+}
+
+/* Return whether *CH is an upper-case Irish vowel encoded in UTF-8.
+
+ CH is a pointer to unsigned char so that it matches both is_irish_vowel,
+ to which it is forwarded, and the unsigned byte pointers this function
+ is called with; a plain ‘char *’ parameter would make both conversions
+ differ in pointer signedness. For non-ASCII vowels two bytes are read
+ from CH. */
+static bool
+is_irish_uc_vowel(const unsigned char *ch) {
+ return is_irish_vowel(ch) == GA_UC_VOWEL;
+}
+
+/* Return whether *CH is a lower-case Irish vowel encoded in UTF-8.
+
+ CH is a pointer to unsigned char so that it matches both is_irish_vowel,
+ to which it is forwarded, and the unsigned byte pointers this function
+ is called with; a plain ‘char *’ parameter would make both conversions
+ differ in pointer signedness. For non-ASCII vowels two bytes are read
+ from CH. */
+static bool
+is_irish_lc_vowel(const unsigned char *ch) {
+ return is_irish_vowel(ch) == GA_LC_VOWEL;
+}
+
+/* Save in BUF result of casing character CH if Irish casing rules apply.
+
+ If not-NULL, NEXT points to the next character in the cased string. If NULL,
+ it is assumed current character is the last one being cased. This is used to
+ apply some rules which depend on the character that follows.
+
+ FLAG is a normalised flag (as returned by normalise_flag function).
+
+ Return -2 (RES_NOT_TOUCHED) if Irish rules did not apply, no changes
+ were made and other casing rules should be tried. Otherwise, meaning of
+ return values is the same as in case_characters function. */
+static int
+maybe_case_irish (struct casing_str_buf *buf, struct casing_context *ctx,
+ enum case_action flag, int ch, const unsigned char *next) {
+ unsigned n;
+
+ switch (ctx->special) {
+ case SPECIAL_GA_LC:
+ /* We are lower-casing words and Irish rules are enabled. See
+ https://bugzilla.mozilla.org/show_bug.cgi?id=1018805 for reference. The
+ rules are:
+
+ t{V} → t-{v}
+ n{V} → n-{v}
+
+ {V} denotes upper-case Irish vowel and {v} denotes a lower-case one. */
+
+ if (!next ||
+ ctx->inword ||
+ (ch != 't' && ch != 'n') ||
+ !is_irish_uc_vowel(next))
+ return RES_NOT_TOUCHED;
+
+ ctx->inword = true;
+ buf->len_chars = 3;
+ buf->data[0] = ch;
+ buf->data[1] = '-';
+ if (*next < 0x80) {
+ buf->data[2] = irish_downcase(next[0]);
+ buf->len_bytes = 3;
+ return 1;
+ } else {
+ /* Two-byte vowel: only the second byte differs between cases. */
+ buf->data[2] = next[0];
+ buf->data[3] = irish_downcase(next[1]);
+ buf->len_bytes = 4;
+ return 2;
+ }
+
+ case SPECIAL_GA_UC:
+ /* We are upper-casing or capitalising words and Irish rules are enabled.
+ See https://bugzilla.mozilla.org/show_bug.cgi?id=1014639 for
+ reference. The rules are:
+
+ h{V} → h{V}
+ n{V} → n{V}
+ t{V} → t{V}
+
+ bp → bP
+ bP → bP
+ dt → dT
+ dT → dT
+ gc → gC
+ gC → gC
+ mb → mB
+ mB → mB
+ nd → nD
+ nD → nD
+ ng → nG
+ nG → nG
+
+ bhf → bhF
+ bhF → bhF
+
+ n-{v} → n{V}
+ t-{v} → t{V}
+
+ tSL → tSL
+ tSl → tSL
+ tsl → tSL
+ tSN → tSN
+ tsn → tSN
+ tSn → tSN
+ tSR → tSR
+ tSr → tSR
+ tsr → tSR
+ tS{V} → tS{V}
+ tS{v} → tS{V}
+ ts{v} → tS{V}
+
+ {V} denotes upper-case Irish vowel and {v} denotes a lower-case one. */
+
+ if (!next || ctx->inword || ch < 'a' || ch > 'z')
+ return RES_NOT_TOUCHED;
+
+ n = irish_upcase(*next);
+ if (((ch == 'h' || ch == 'n' || ch == 't') && is_irish_uc_vowel(next)) ||
+ (ch == 'b' && n == 'P') ||
+ (ch == 'd' && n == 'T') ||
+ (ch == 'g' && n == 'C') ||
+ (ch == 'm' && n == 'B') ||
+ (ch == 'n' && n == 'D') ||
+ (ch == 'n' && n == 'G'))
+ {
+ ctx->inword = true;
+ ctx->special = SPECIAL_GA_UC_NEXT;
+ buf->data[0] = ch;
+ buf->len_chars = 1;
+ buf->len_bytes = 1;
+ return RES_NO_CHANGE;
+ }
+
+ if ((ch == 'b' && *next == 'h') || (ch == 't' && n == 'S') ||
+ (ch == 't' && *next == '-') || (ch == 'n' && *next == '-'))
+ {
+ /* We can only look at two characters at a time but here we need to make
+ a decision based on a 3-character sequence.
+
+ Let’s return empty string for now, remember the current character and
+ when we’ll be dealing with the next character we’ll be able to see
+ three characters.
+
+ Downside of this approach is that we cannot always correctly mark
+ buffer as changed. Namely, ‘bhF’ triplet does not need to be changed
+ but sadly we'll mark this as modified. */
+ ctx->inword = true;
+ ctx->special = ch; /* 'b', 'n' or 't', i.e. the matching SPECIAL_GA_TRIPLET_* value. */
+ buf->len_chars = 0;
+ buf->len_bytes = 0;
+ return RES_CHANGED;
+ }
+
+ return RES_NOT_TOUCHED;
+
+ case SPECIAL_GA_UC_NEXT:
+ ctx->special = SPECIAL_GA_UC;
+ n = irish_upcase(ch);
+ buf->len_bytes = CHAR_STRING (n, buf->data);
+ buf->len_chars = 1;
+ return n == ch ? RES_NO_CHANGE : RES_CHANGED;
+
+ case SPECIAL_GA_TRIPLET_B:
+ case SPECIAL_GA_TRIPLET_N:
+ case SPECIAL_GA_TRIPLET_T:
+ /* We’re here after encountering a possible beginning of a three-character
+ sequence that needs to be handled. Those are:
+
+ bhf → bhF
+ bhF → bhF
+
+ n-{v} → n{V}
+ t-{v} → t{V}
+
+ tSL → tSL
+ tSl → tSL
+ tsl → tSL
+ tSN → tSN
+ tsn → tSN
+ tSn → tSN
+ tSR → tSR
+ tSr → tSR
+ tsr → tSR
+ tS{V} → tS{V}
+ tS{v} → tS{V}
+ ts{v} → tS{V} */
+
+ if (next && *next) /* NEXT is NULL when CH is the last character being cased. */
+ switch (ch) {
+ case '-': /* ‘n-’ or ‘t-’ prefix. */
+ if (is_irish_lc_vowel(next))
+ {
+ buf->data[0] = ctx->special; /* The remembered 'n' or 't'; the dash is dropped. */
+ buf->len_chars = 1;
+ buf->len_bytes = 1;
+ ctx->special = SPECIAL_GA_UC_NEXT;
+ return RES_CHANGED;
+ }
+ break;
+
+ case 'h': /* ‘bh’ prefix */
+ if (irish_upcase(*next) == 'F') {
+ ctx->special = SPECIAL_GA_UC;
+ buf->data[0] = 'b';
+ buf->data[1] = 'h';
+ buf->data[2] = 'F';
+ buf->len_chars = 3;
+ buf->len_bytes = 3;
+ return 1;
+ }
+ break;
+
+ case 's': /* ‘ts’ prefix. */
+ if (*next == 'l' || *next == 'n' || *next == 'r' ||
+ is_irish_lc_vowel(next))
+ goto tSU;
+ break;
+
+ case 'S': /* ‘tS’ prefix. */
+ if (*next == 'l' || *next == 'n' || *next == 'r' ||
+ *next == 'L' || *next == 'N' || *next == 'R' ||
+ is_irish_vowel(next))
+ {
+ tSU:
+ /* t{s}{x} → tS{X} */
+ ctx->special = SPECIAL_GA_UC_NEXT;
+ buf->data[0] = 't';
+ buf->data[1] = 'S';
+ buf->len_chars = 2;
+ buf->len_bytes = 2;
+ return RES_CHANGED;
+ }
+ break;
+ }
+
+ /* Recover from an incorrect guess that it was a triplet. This is also
+ the path taken when the input ended right after the second character. */
+ if (ch == '-') {
+ ctx->inword = false;
+ } else if (ctx->flag == CASE_UP) {
+ ch = irish_upcase(ch);
+ } else if (ctx->flag == CASE_CAPITALIZE) {
+ ch = irish_downcase(ch);
+ }
+
+ buf->data[0] = irish_upcase(ctx->special);
+ buf->data[1] = ch;
+ buf->len_chars = 2;
+ buf->len_bytes = 2;
+ ctx->special = SPECIAL_GA_UC;
+ return RES_CHANGED;
+ }
+
+ return RES_NOT_TOUCHED;
+}
+
/* Save in BUF result of casing character CH.
If not-NULL, NEXT points to the next character in the cased string. If NULL,
@@ -543,6 +847,15 @@ case_characters (struct casing_str_buf *buf, struct casing_context *ctx,
/* case SPECIAL_LT_DEL_DOT_ABOVE: */
/* case SPECIAL_LT_INS_DOT_ABOVE: */
ret = maybe_case_lithuanian (buf, ctx, flag, ch);
+ break;
+
+ case SPECIAL_GA_LC:
+ case SPECIAL_GA_UC:
+ case SPECIAL_GA_UC_NEXT:
+ case SPECIAL_GA_TRIPLET_B:
+ case SPECIAL_GA_TRIPLET_N:
+ case SPECIAL_GA_TRIPLET_T:
+ ret = maybe_case_irish (buf, ctx, flag, ch, next);
}
if (ret == RES_NOT_TOUCHED)
diff --git a/test/src/casefiddle-resources/irish-lowercase-1-ref.txt b/test/src/casefiddle-resources/irish-lowercase-1-ref.txt
new file mode 100644
index 00000000000..cbe9f601e9c
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-lowercase-1-ref.txt
@@ -0,0 +1,211 @@
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár n-acmhainní uisce
+ár nathair
+ár nathair
+ár nathair
+n-a shaighdiúir
+gan dul as aca ach le n-a chabhair
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+eolaíocht na n-ábhar
+amhrán náisiúnta
+amhrán náisiúnta
+amhrán náisiúnta
+lucht na n-ealaíon
+lucht na n-ealaíon
+lucht na n-ealaíon
+lucht na n-ealaíon
+lucht na n-ealaíon
+neart daoine
+neart daoine
+neart daoine
+ceol na n-éan
+ceol na n-éan
+ceol na n-éan
+ceol na n-éan
+ceol na n-éan
+sa néal
+sa néal
+sa néal
+ord na n-imeachtaí
+ord na n-imeachtaí
+ord na n-imeachtaí
+ord na n-imeachtaí
+ord na n-imeachtaí
+nathair nimhe
+nathair nimhe
+nathair nimhe
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+lucht adhartha na n-íomhánna
+níos measa
+níos measa
+níos measa
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+gnéithe dár n-oidhreacht
+duine nochta
+duine nochta
+duine nochta
+cultúr na n-óg
+cultúr na n-óg
+cultúr na n-óg
+cultúr na n-óg
+cultúr na n-óg
+dhá nóiméad
+dhá nóiméad
+dhá nóiméad
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+ocht n-uaire sa lá
+gúna nua
+gúna nua
+gúna nua
+formhór na n-údarás
+formhór na n-údarás
+formhór na n-údarás
+formhór na n-údarás
+formhór na n-údarás
+imoibreoir núicléach
+imoibreoir núicléach
+imoibreoir núicléach
+sean-airteagal
+seanairteagal
+bunioncaim
+bun-ioncaim
+buanorduithe
+buan-orduithe
+ár n-athair
+ár n-athair
+clár na n-ábhar
+clár na n-ábhar
+ceol na ndaoine
+ceol na ndaoine
+táim i ngrá leat
+táim i ngrá leat
+cén t-am é?
+cén t-am é?
+cén t-am é?
+cén t-am é?
+cén t-am é?
+tar ar ais!
+tar ar ais!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+tá an t-ádh orm inniu!
+rud tábhachtach
+rud tábhachtach
+rud tábhachtach
+den obair an t-eolas
+den obair an t-eolas
+den obair an t-eolas
+den obair an t-eolas
+den obair an t-eolas
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-éileamh a íoc
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-inneall cuardaigh is fearr
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+an t-íochtar a chur in uachtar
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tabhair an t-ordú seo dó!
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+tá an t-ór buí aige.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-uisce beatha ar an tábla.
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+an t-úrscéal is deireanaí
+dréacht-acht
+dréachtphlean
+dréacht-phlean
+dréacht-íocaíocht
+áitainmneacha
+áit-ainmneacha
+státurraithe
+stát-urraithe
+ar aon tslí
+ar aon tslí
+amach ón tsnáthaid
+amach ón tsnáthaid
+ar an tsráid
+ar an tsráid
+caint an tsráidbhaile
+caint an tsráidbhaile
+cora crua an tsaoil
+cora crua an tsaoil
+bholadh an tsáile
+bholadh an tsáile
+uair sa tseachtain
+uair sa tseachtain
+deireadh an tséasúir
+deireadh an tséasúir
+fear an tsiopa
+fear an tsiopa
+an tsíocháin a choimeád
+an tsíocháin a choimeád
+an tsochaí faisnéise
+an tsochaí faisnéise
+gaoth an tsóláis
+gaoth an tsóláis
+is beag an tsuim iad
+is beag an tsuim iad
+infheicthe ag an tsúil
+infheicthe ag an tsúil
+scríobhfaidh
+scríobhfaidh
+preabphas
+preabphas
+úsáidtear
+úsáidtear
+snagcheol
+snagcheol
+in-athnuaite agatsa
+in-athnuaite agatsa
+teanga dhomhanda
+teanga dhomhanda
+réaltsruth
+réaltsruth
+na hataí
+na hataí
+t-léine
+t-léine
+t-léine
+t-léine
+torc allta
+torc allta
+tsk tsk tsk a chara
+tsk tsk tsk a chara
diff --git a/test/src/casefiddle-resources/irish-lowercase-1.txt b/test/src/casefiddle-resources/irish-lowercase-1.txt
new file mode 100644
index 00000000000..dcb3454b96d
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-lowercase-1.txt
@@ -0,0 +1,211 @@
+ÁR nACMHAINNÍ UISCE
+ÁR N-ACMHAINNÍ UISCE
+Ár nAcmhainní Uisce
+Ár n-Acmhainní Uisce
+ár n-acmhainní uisce
+Ár nathair
+ÁR NATHAIR
+Ár Nathair
+N-a shaighdiúir
+gan dul as aca ach le nA chabhair
+EOLAÍOCHT NA nÁBHAR
+EOLAÍOCHT NA n-ÁBHAR
+Eolaíocht na nÁbhar
+Eolaíocht na n-Ábhar
+eolaíocht na n-ábhar
+Amhrán náisiúnta
+Amhrán Náisiúnta
+AMHRÁN NÁISIÚNTA
+LUCHT NA nEALAÍON
+LUCHT NA n-EALAÍON
+Lucht na nEalaíon
+Lucht na n-Ealaíon
+lucht na n-ealaíon
+Neart Daoine
+neart daoine
+NEART DAOINE
+CEOL NA nÉAN
+CEOL NA n-ÉAN
+Ceol na nÉan
+Ceol na n-Éan
+ceol na n-éan
+Sa Néal
+Sa néal
+SA NÉAL
+ORD NA nIMEACHTAÍ
+ORD NA n-IMEACHTAÍ
+Ord na nImeachtaí
+Ord na n-Imeachtaí
+ord na n-imeachtaí
+Nathair Nimhe
+Nathair nimhe
+NATHAIR NIMHE
+LUCHT ADHARTHA NA nÍOMHÁNNA
+LUCHT ADHARTHA NA n-ÍOMHÁNNA
+Lucht Adhartha na nÍomhánna
+Lucht Adhartha na n-Íomhánna
+lucht adhartha na n-íomhánna
+Níos Measa
+níos measa
+NÍOS MEASA
+GNÉITHE DÁR nOIDHREACHT
+GNÉITHE DÁR n-OIDHREACHT
+Gnéithe Dár nOidhreacht
+Gnéithe Dár n-Oidhreacht
+gnéithe dár n-oidhreacht
+Duine Nochta
+Duine nochta
+DUINE NOCHTA
+CULTÚR NA nÓG
+CULTÚR NA n-ÓG
+Cultúr na nÓg
+Cultúr na n-Óg
+cultúr na n-óg
+Dhá Nóiméad
+Dhá nóiméad
+DHÁ NÓIMÉAD
+OCHT nUAIRE SA LÁ
+OCHT n-UAIRE SA LÁ
+Ocht nUaire Sa Lá
+Ocht n-Uaire Sa Lá
+ocht n-uaire sa lá
+Gúna Nua
+gúna nua
+GÚNA NUA
+FORMHÓR NA nÚDARÁS
+FORMHÓR NA n-ÚDARÁS
+Formhór na nÚdarás
+Formhór na n-Údarás
+formhór na n-údarás
+Imoibreoir Núicléach
+Imoibreoir núicléach
+IMOIBREOIR NÚICLÉACH
+sean-Airteagal
+SeanAirteagal
+BunIoncaim
+Bun-Ioncaim
+BuanOrduithe
+Buan-Orduithe
+ÁR nATHAIR
+Ár nAthair
+CLÁR NA nÁBHAR
+Clár na nÁbhar
+CEOL NA nDAOINE
+Ceol na nDaoine
+TÁIM I nGRÁ LEAT
+Táim i nGrá Leat
+CÉN tAM É?
+CÉN t-AM É?
+Cén tAm É?
+Cén t-Am É?
+cén t-am é?
+Tar Ar Ais!
+tar ar ais!
+TÁ AN tÁDH ORM INNIU!
+TÁ AN t-ÁDH ORM INNIU!
+Tá An tÁdh Orm Inniu!
+Tá An t-Ádh Orm Inniu!
+tá an t-ádh orm inniu!
+Rud Tábhachtach
+Rud tábhachtach
+rud tábhachtach
+DEN OBAIR AN tEOLAS
+DEN OBAIR AN t-EOLAS
+Den Obair an tEolas
+Den Obair an t-Eolas
+den obair an t-eolas
+AN tÉILEAMH A ÍOC
+AN t-ÉILEAMH A ÍOC
+An tÉileamh a Íoc
+An t-Éileamh a Íoc
+an t-éileamh a íoc
+AN tINNEALL CUARDAIGH IS FEARR
+AN t-INNEALL CUARDAIGH IS FEARR
+An tInneall Cuardaigh Is Fearr
+An t-Inneall Cuardaigh Is Fearr
+an t-inneall cuardaigh is fearr
+AN tÍOCHTAR A CHUR IN UACHTAR
+AN t-ÍOCHTAR A CHUR IN UACHTAR
+An tÍochtar a Chur In Uachtar
+An t-Íochtar a Chur In Uachtar
+an t-íochtar a chur in uachtar
+TABHAIR AN tORDÚ SEO DÓ!
+TABHAIR AN t-ORDÚ SEO DÓ!
+Tabhair An tOrdú Seo Dó!
+Tabhair An t-Ordú Seo Dó!
+tabhair an t-ordú seo dó!
+TÁ AN tÓR BUÍ AIGE.
+TÁ AN t-ÓR BUÍ AIGE.
+Tá An tÓr Buí Aige.
+Tá An t-Ór Buí Aige.
+tá an t-ór buí aige.
+AN tUISCE BEATHA AR AN TÁBLA.
+AN t-UISCE BEATHA AR AN TÁBLA.
+An tUisce Beatha Ar An Tábla.
+An t-Uisce Beatha Ar An Tábla.
+an t-uisce beatha ar an tábla.
+AN tÚRSCÉAL IS DEIREANAÍ
+AN t-ÚRSCÉAL IS DEIREANAÍ
+An tÚrscéal Is Deireanaí
+An t-Úrscéal Is Deireanaí
+an t-úrscéal is deireanaí
+Dréacht-Acht
+DréachtPhlean
+Dréacht-Phlean
+Dréacht-Íocaíocht
+ÁitAinmneacha
+Áit-Ainmneacha
+StátUrraithe
+Stát-Urraithe
+AR AON tSLÍ
+Ar Aon tSlí
+AMACH ÓN tSNÁTHAID
+Amach Ón tSnáthaid
+AR AN tSRÁID
+Ar An tSráid
+CAINT AN tSRÁIDBHAILE
+Caint An tSráidbhaile
+CORA CRUA AN tSAOIL
+Cora Crua An tSaoil
+BHOLADH AN tSÁILE
+Bholadh An tSáile
+UAIR SA tSEACHTAIN
+Uair Sa tSeachtain
+DEIREADH AN tSÉASÚIR
+Deireadh An tSéasúir
+FEAR AN tSIOPA
+Fear an tSiopa
+AN tSÍOCHÁIN A CHOIMEÁD
+An tSíocháin a Choimeád
+AN tSOCHAÍ FAISNÉISE
+An tSochaí Faisnéise
+GAOTH AN tSÓLÁIS
+Gaoth aN tSóláis
+IS BEAG AN tSUIM IAD
+Is Beag An tSuim Iad
+INFHEICTHE AG AN tSÚIL
+Infheicthe Ag An tSúil
+SCRÍOBHFAIDH
+Scríobhfaidh
+PREABPHAS
+Preabphas
+ÚSÁIDTEAR
+Úsáidtear
+SNAGCHEOL
+Snagcheol
+IN-ATHNUAITE AGATSA
+In-Athnuaite AGATSA
+TEANGA DHOMHANDA
+Teanga Dhomhanda
+RÉALTSRUTH
+Réaltsruth
+NA HATAÍ
+Na Hataí
+T-LÉINE
+T-Léine
+t-Léine
+t-léine
+TORC ALLTA
+Torc Allta
+TSK TSK TSK A CHARA
+Tsk Tsk Tsk a Chara
diff --git a/test/src/casefiddle-resources/irish-uppercase-1-ref.txt b/test/src/casefiddle-resources/irish-uppercase-1-ref.txt
new file mode 100644
index 00000000000..21d3e4a6126
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-uppercase-1-ref.txt
@@ -0,0 +1,105 @@
+ORD NA bhFOCAL
+COSÁN NA bhFILÍ
+ÁR bPOBAL
+NÓRA NA bPORTACH
+I dTOSACH BÁIRE
+AN GHAEILGE I dTUAISCEART NA hÉIREANN
+AS AN gCEANTAR SIN
+I gCONTAE NA MÍ AGUS I gCONAMARA
+DÉ hAOINE
+OIRTHEAR NA hÁISE
+PARLAIMINT NA hEORPA
+POBLACHT NA hÉIREANN
+EALAÍN NA hIODÁILE
+NA hÍOSÁNAIGH
+ACADAMH NA hOLLSCOLAÍOCHTA
+TÍR NA hÓIGE
+TOGHCHÁN NA hUACHTARÁNACHTA
+NA hÚDARÁIS CHÁNACH
+I mBUN MO MHACHNAMH
+I mBÉAL FEIRSTE AGUS I mBAILE ÁTHA CLIATH
+ÁR nACMHAINNÍ UISCE
+EOLAÍOCHT NA nÁBHAR
+LUCHT NA nEALAÍON
+CEOL NA nÉAN
+ORD NA nIMEACHTAÍ
+LUCHT ADHARTHA NA nÍOMHÁNNA
+GNÉITHE DÁR nOIDHREACHT
+CULTÚR NA nÓG
+OCHT nUAIRE SA LÁ
+FORMHÓR NA nÚDARÁS
+ÁR nATHAIR
+CLÁR NA nÁBHAR
+LOCH nEATHACH
+CUMANN NA nÉIREANNACH AONTAITHE
+GRÉASÁN NA nIONTAS
+NÓIBHÍSEACHT NA nÍOSÁNACH
+I gCEANTAR NA nOILEÁN
+TÍR NA nÓG
+BAILE NA nULTACH
+GORT NA nÚLL
+CEOL NA nDAOINE
+I nDÚN NA nGALL
+TÁIM I nGRÁ LEAT
+LABHAIR SÉ I nGAEILGE!
+CÉN tAM É?
+TÁ AN tÁDH ORM INNIU!
+DEN OBAIR AN tEOLAS
+AN tÉILEAMH A ÍOC
+AN tINNEALL CUARDAIGH IS FEARR
+AN tÍOCHTAR A CHUR IN UACHTAR
+TABHAIR AN tORDÚ SEO DÓ!
+TÁ AN tÓR BUÍ AIGE.
+AN tUISCE BEATHA AR AN TÁBLA.
+AN tÚRSCÉAL IS DEIREANAÍ
+AN tACHT OIDEACHAIS
+AN tÁIVÉ MÁIRIA
+AN tEARRACH ARABACH
+AN tÉIRÍ AMACH
+AN tIMEALL
+AN tÍOSÁNACH PEADAR CANISIUS
+AN tOILEÁNACH
+AN tÓR MUIRE
+AN tUASAL ÉAMON Ó CUÍV
+AN tÚDARÁS UM BÓITHRE NÁISIÚNTA
+AR AON tSLÍ
+BÉAL ÁTHA AN tSLÉIBHE
+AMACH ÓN tSNÁTHAID
+BANRÍON AN tSNEACHTA
+AR AN tSRÁID
+CAINT AN tSRÁIDBHAILE
+CORA CRUA AN tSAOIL
+BHOLADH AN tSÁILE
+UAIR SA tSEACHTAIN
+DEIREADH AN tSÉASÚIR
+FEAR AN tSIOPA
+AN tSÍOCHÁIN A CHOIMEÁD
+AN tSOCHAÍ FAISNÉISE
+GAOTH AN tSÓLÁIS
+IS BEAG AN tSUIM IAD
+INFHEICTHE AG AN tSÚIL
+CNOC AN tSAMHRAIDH
+CIONN tSÁILE
+AN tSEIRBHÍS PHOIBLÍ
+BAILE AN tSÉIPÉIL
+AN tSIRIA
+AN tSÍN
+OIFIG AN tSOLÁTHAIR
+POLL AN tSÓMAIS
+EOLAIRE AN tSUÍMH
+CASADH AN tSÚGÁIN
+SCRÍOBHFAIDH
+PREABPHAS
+ÚSÁIDTEAR
+SNAGCHEOL
+STÁITSE IMBOLC
+IN-ATHNUAITE AGATSA
+TEANGA DHOMHANDA
+RÉALTSRUTH
+NA HATAÍ
+NA HATAÍ
+ÁR NATHAIR
+ÁR NATHAIR
+T-LÉINE
+TORC ALLTA
+TSK TSK TSK A CHARA
diff --git a/test/src/casefiddle-resources/irish-uppercase-1.txt b/test/src/casefiddle-resources/irish-uppercase-1.txt
new file mode 100644
index 00000000000..b95e0aa04df
--- /dev/null
+++ b/test/src/casefiddle-resources/irish-uppercase-1.txt
@@ -0,0 +1,105 @@
+ord na bhfocal
+Cosán na bhFilí
+ár bpobal
+Nóra na bPortach
+i dtosach báire
+An Ghaeilge i dTuaisceart na hÉireann
+as an gceantar sin
+I gContae na Mí agus i gConamara
+Dé hAoine
+Oirthear na hÁise
+Parlaimint na hEorpa
+Poblacht na hÉireann
+Ealaín na hIodáile
+na hÍosánaigh
+Acadamh na hOllscolaíochta
+Tír na hÓige
+toghchán na hUachtaránachta
+na hÚdaráis Chánach
+I mbun mo mhachnamh
+I mBéal Feirste agus i mBaile Átha Cliath
+ár n-acmhainní uisce
+eolaíocht na n-ábhar
+lucht na n-ealaíon
+ceol na n-éan
+ord na n-imeachtaí
+lucht adhartha na n-íomhánna
+gnéithe dár n-oidhreacht
+cultúr na n-óg
+ocht n-uaire sa lá
+formhór na n-údarás
+Ár nAthair
+Clár na nÁbhar
+Loch nEathach
+Cumann na nÉireannach Aontaithe
+Gréasán na nIontas
+nóibhíseacht na nÍosánach
+i gCeantar na nOileán
+Tír na nÓg
+Baile na nUltach
+Gort na nÚll
+ceol na ndaoine
+i nDún na nGall
+táim i ngrá leat
+labhair sé i nGaeilge!
+cén t-am é?
+tá an t-ádh orm inniu!
+Den obair an t-eolas
+An t-éileamh a íoc
+an t-inneall cuardaigh is fearr
+an t-íochtar a chur in uachtar
+Tabhair an t-ordú seo dó!
+Tá an t-ór buí aige.
+an t-uisce beatha ar an tábla.
+an t-úrscéal is deireanaí
+An tAcht Oideachais
+an tÁivé Máiria
+An tEarrach Arabach
+An tÉirí Amach
+An tImeall
+An tÍosánach Peadar Canisius
+An tOileánach
+An tÓr Muire
+an tUasal Éamon Ó Cuív
+An tÚdarás um Bóithre Náisiúnta
+ar aon tslí
+Béal Átha an tSléibhe
+Amach ón tsnáthaid
+Banríon an tSneachta
+ar an tsráid
+Caint an tSráidbhaile
+cora crua an tsaoil
+bholadh an tsáile
+uair sa tseachtain
+deireadh an tséasúir
+fear an tsiopa
+an tsíocháin a choimeád
+an tsochaí faisnéise
+gaoth an tsóláis
+Is beag an tsuim iad
+infheicthe ag an tsúil
+Cnoc an tSamhraidh
+Cionn tSáile
+an tSeirbhís Phoiblí
+Baile an tSéipéil
+An tSiria
+An tSín
+Oifig an tSoláthair
+Poll an tSómais
+Eolaire an tSuímh
+Casadh an tSúgáin
+scríobhfaidh
+preabphas
+úsáidtear
+snagcheol
+Stáitse Imbolc
+in-athnuaite agatsa
+Teanga Dhomhanda
+Réaltsruth
+na hataí
+Na Hataí
+ár nathair
+Ár Nathair
+t-léine
+torc allta
+tsk tsk tsk a chara
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index f7b0da41029..e5309066c9c 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -22,6 +22,11 @@
(require 'case-table)
(require 'ert)
+(defvar casefiddle-tests--resources-dir
+  (file-name-as-directory
+   (expand-file-name "casefiddle-resources"
+                     (file-name-directory
+                      (or load-file-name buffer-file-name))))
+  "Directory holding the casing test-case files.
+This is the \"casefiddle-resources\" directory next to the
+\"casefiddle-tests.el\" file.  The value ends in a directory separator,
+so a file name can be appended to it directly.")
+
(ert-deftest casefiddle-tests-char-properties ()
"Sanity check of character Unicode properties."
(should-not
@@ -161,17 +166,18 @@ casefiddle-tests--test-casing
(while (and func-pairs expected)
(setq funcs (car func-pairs)
getters (list get-string get-region))
- (while (and funcs getters)
- (let ((got (funcall (car getters) (car funcs))))
- (unless (string-equal got (car expected))
- (let ((fmt (length (symbol-name (car funcs)))))
- (setq fmt (format "\n%%%ds: %%s" (max fmt 8)))
- (push (format (concat fmt fmt fmt)
- (car funcs) (funcall fmt-str input)
- "expected" (funcall fmt-str (car expected))
- "but got" (funcall fmt-str got))
- errors))))
- (setq funcs (cdr funcs) getters (cdr getters)))
+ (when (car expected)
+ (while (and funcs getters)
+ (let ((got (funcall (car getters) (car funcs))))
+ (unless (string-equal got (car expected))
+ (let ((fmt (length (symbol-name (car funcs)))))
+ (setq fmt (format "\n%%%ds: %%s" (max fmt 8)))
+ (push (format (concat fmt fmt fmt)
+ (car funcs) (funcall fmt-str input)
+ "expected" (funcall fmt-str (car expected))
+ "but got" (funcall fmt-str got))
+ errors))))
+ (setq funcs (cdr funcs) getters (cdr getters))))
(setq func-pairs (cdr func-pairs) expected (cdr expected))))
errors)
(cons () tests))))
@@ -268,6 +274,36 @@ casefiddle-tests--test-casing
("į\u0307" ; i-ogonek + dot above
"Į" "į\u0307" "Į" "Į" "lt"))))))
+
+(defun casefiddle--read-lines (test-file)
+ "Return the contents of TEST-FILE as a list of lines.
+TEST-FILE is a file name relative to `casefiddle-tests--resources-dir'.
+The text is split at newlines; runs of spaces at the beginning and end
+of each line are trimmed and empty lines are kept in the result."
+ (with-temp-buffer
+ (insert-file-contents (concat casefiddle-tests--resources-dir test-file))
+ (split-string (buffer-string) "\n" nil " +")))
+
+(ert-deftest casefiddle-test-irish ()
+ "Check casing with the \"ga\" language against the Irish resource files.
+Each input file is paired line by line with its \"-ref\" file."
+ (let (tests)
+ ;; Read upcase test cases. Each line becomes an entry whose second
+ ;; element is the matching line of the upper-case reference file; the
+ ;; remaining expectations are left nil.
+ (let ((input (casefiddle--read-lines "irish-uppercase-1.txt"))
+ (expected (casefiddle--read-lines "irish-uppercase-1-ref.txt")))
+ (while (and input expected)
+ (push (list (car input) (car expected) nil nil nil "ga") tests)
+ (setq input (cdr input) expected (cdr expected)))
+ ;; Both files must have the same number of lines.
+ (should-not (or input expected)))
+
+ ;; Read downcase test cases. If the same input string already has an
+ ;; entry from the upcase files, fill in its third element; otherwise
+ ;; add a new entry with only the third element set.
+ (let ((input (casefiddle--read-lines "irish-lowercase-1.txt"))
+ (expected (casefiddle--read-lines "irish-lowercase-1-ref.txt")))
+ (while (and input expected)
+ (let ((test (assoc (car input) tests)))
+ (if test
+ (setcar (cddr test) (car expected))
+ (push (list (car input) nil (car expected) nil nil "ga") tests)))
+ (setq input (cdr input) expected (cdr expected)))
+ ;; Both files must have the same number of lines.
+ (should-not (or input expected)))
+
+ (should-not (with-temp-buffer (casefiddle-tests--test-casing tests)))))
+
+
(ert-deftest casefiddle-tests-casing-byte8 ()
(should-not
(with-temp-buffer
--
2.12.0.246.ga2ecc84866-goog
bug#24603: [PATCHv5 05/11] Support casing characters which map into multiple code points (bug#24603), Michal Nazarewicz, 2017/03/09
bug#24603: [PATCHv5 00/11] Casing improvements, Eli Zaretskii, 2017/03/11
bug#24603: [PATCHv6 0/6] Casing improvements, language-independent part, Michal Nazarewicz, 2017/03/20