Line data Source code
1 : ;;; ucs-normalize.el --- Unicode normalization NFC/NFD/NFKD/NFKC
2 :
3 : ;; Copyright (C) 2009-2017 Free Software Foundation, Inc.
4 :
5 : ;; Author: Taichi Kawabata <kawabata.taichi@gmail.com>
6 : ;; Keywords: unicode, normalization
7 :
8 : ;; This file is part of GNU Emacs.
9 :
10 : ;; GNU Emacs is free software: you can redistribute it and/or modify
11 : ;; it under the terms of the GNU General Public License as published by
12 : ;; the Free Software Foundation, either version 3 of the License, or
13 : ;; (at your option) any later version.
14 :
15 : ;; GNU Emacs is distributed in the hope that it will be useful,
16 : ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 : ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 : ;; GNU General Public License for more details.
19 :
20 : ;; You should have received a copy of the GNU General Public License
21 : ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22 :
23 : ;;; Commentary:
24 : ;;
25 : ;; This program has passed the NormalizationTest-5.2.0.txt.
26 : ;;
27 : ;; References:
28 : ;; http://www.unicode.org/reports/tr15/
29 : ;; http://www.unicode.org/review/pr-29.html
30 : ;;
31 : ;; HFS-Normalization:
32 : ;; Reference:
33 : ;; http://developer.apple.com/technotes/tn/tn1150.html
34 : ;;
35 : ;; HFS Normalization excludes following area for decomposition.
36 : ;;
37 : ;; U+02000 .. U+02FFF :: Punctuation, symbols, dingbats, arrows, etc.
38 : ;; (Characters in this region will be composed.)
39 : ;; U+0F900 .. U+0FAFF :: CJK compatibility Ideographs.
40 : ;; U+2F800 .. U+2FFFF :: CJK compatibility Ideographs.
41 : ;;
42 : ;; HFS-Normalization is useful for normalizing text involving CJK Ideographs.
43 : ;;
44 : ;;;
45 : ;;; Implementation Notes on NFC/HFS-NFC.
46 : ;;;
47 : ;;
48 : ;; <Stages> Decomposition Composition
49 : ;; NFD: 'nfd nil
50 : ;; NFC: 'nfd t
51 : ;; NFKD: 'nfkd nil
52 : ;; NFKC: 'nfkd t
53 : ;; HFS-NFD: 'hfs-nfd 'hfs-nfd-comp-p
54 : ;; HFS-NFC: 'hfs-nfd t
55 : ;;
56 : ;; Algorithm for Normalization
57 : ;;
58 : ;; Before normalization, following data will be prepared.
59 : ;;
60 : ;; 1. quick-check-list
61 : ;;
62 : ;; `quick-check-list' consists of characters that will be decomposed
63 : ;; during normalization. It includes composition-exclusions,
64 : ;; singletons, non-starter-decompositions and decomposable
65 : ;; characters.
66 : ;;
67 : ;; `quick-check-regexp' will search the above characters plus
68 : ;; combining characters.
69 : ;;
70 : ;; 2. decomposition-translation
71 : ;;
72 : ;; `decomposition-translation' is a translation table that will be
73 : ;; used to decompose the characters.
74 : ;;
75 : ;;
76 : ;; Normalization Process
77 : ;;
78 : ;; A. Searching (`ucs-normalize-region')
79 : ;;
80 : ;; Region is searched for `quick-check-regexp' to find possibly
81 : ;; normalizable point.
82 : ;;
83 : ;; B. Identification of Normalization Block
84 : ;;
85 : ;; (1) start of the block
86 : ;; If the searched character is a starter and not combining
87 : ;; with previous character, then the beginning of the block is
88 : ;; the searched character. If searched character is combining
89 : ;; character, then previous character will be the target
90 : ;; character
91 : ;; (2) end of the block
92 : ;; Block ends at non-composable starter character.
93 : ;;
94 : ;; C. Decomposition (`ucs-normalize-block')
95 : ;;
96 : ;; The entire block will be decomposed by
97 : ;; `decomposition-translation' table.
98 : ;;
99 : ;; D. Sorting and Composition of Smaller Blocks (`ucs-normalize-block-compose-chars')
100 : ;;
101 : ;; The block will be split to multiple samller blocks by starter
102 : ;; characters. Each block is sorted, and composed if necessary.
103 : ;;
104 : ;; E. Composition of Entire Block (`ucs-normalize-compose-chars')
105 : ;;
106 : ;; Composed blocks are collected and again composed.
107 :
108 : ;;; Code:
109 :
110 : (defconst ucs-normalize-version "1.2")
111 :
112 : (eval-when-compile (require 'cl-lib))
113 :
114 : (declare-function nfd "ucs-normalize" (char))
115 :
116 : (eval-when-compile
117 :
118 : (defconst ucs-normalize-composition-exclusions
119 : '(#x0958 #x0959 #x095A #x095B #x095C #x095D #x095E #x095F
120 : #x09DC #x09DD #x09DF #x0A33 #x0A36 #x0A59 #x0A5A #x0A5B
121 : #x0A5E #x0B5C #x0B5D #x0F43 #x0F4D #x0F52 #x0F57 #x0F5C
122 : #x0F69 #x0F76 #x0F78 #x0F93 #x0F9D #x0FA2 #x0FA7 #x0FAC
123 : #x0FB9 #xFB1D #xFB1F #xFB2A #xFB2B #xFB2C #xFB2D #xFB2E
124 : #xFB2F #xFB30 #xFB31 #xFB32 #xFB33 #xFB34 #xFB35 #xFB36
125 : #xFB38 #xFB39 #xFB3A #xFB3B #xFB3C #xFB3E #xFB40 #xFB41
126 : #xFB43 #xFB44 #xFB46 #xFB47 #xFB48 #xFB49 #xFB4A #xFB4B
127 : #xFB4C #xFB4D #xFB4E #x2ADC #x1D15E #x1D15F #x1D160 #x1D161
128 : #x1D162 #x1D163 #x1D164 #x1D1BB #x1D1BC #x1D1BD #x1D1BE
129 : #x1D1BF #x1D1C0)
130 : "Composition Exclusion List.
131 : This list is taken from
132 : http://www.unicode.org/Public/UNIDATA/5.2/CompositionExclusions.txt")
133 :
134 : ;; Unicode ranges that decompositions & combining characters are defined.
135 : (defvar check-range nil)
136 : (setq check-range '((#x00a0 . #x3400) (#xA600 . #xAC00) (#xF900 . #x110ff) (#x1d000 . #x1dfff) (#x1f100 . #x1f2ff) (#x2f800 . #x2faff)))
137 :
138 : ;; Basic normalization functions
139 : (defun nfd (char)
140 : (let ((decomposition
141 : (get-char-code-property char 'decomposition)))
142 : (if (and decomposition (numberp (car decomposition))
143 : (or (> (length decomposition) 1)
144 : (/= (car decomposition) char)))
145 : decomposition)))
146 :
147 : (defun nfkd (char)
148 : (let ((decomposition
149 : (get-char-code-property char 'decomposition)))
150 : (if (symbolp (car decomposition)) (cdr decomposition)
151 : (if (or (> (length decomposition) 1)
152 : (/= (car decomposition) char)) decomposition))))
153 :
154 : (defun hfs-nfd (char)
155 : (when (or (and (>= char 0) (< char #x2000))
156 : (and (>= char #x3000) (< char #xf900))
157 : (and (>= char #xfb00) (< char #x2f800))
158 : (>= char #x30000))
159 : (nfd char))))
160 :
161 : (eval-and-compile
162 : (defun ucs-normalize-hfs-nfd-comp-p (char)
163 : (and (>= char #x2000) (< char #x3000)))
164 :
165 : (defsubst ucs-normalize-ccc (char)
166 : (get-char-code-property char 'canonical-combining-class))
167 : )
168 :
169 : ;; Data common to all normalizations
170 :
171 : (eval-when-compile
172 :
173 : (defvar combining-chars nil)
174 : (setq combining-chars nil)
175 : (defvar decomposition-pair-to-composition nil)
176 : (setq decomposition-pair-to-composition nil)
177 : (defvar non-starter-decompositions nil)
178 : (setq non-starter-decompositions nil)
179 : ;; This file needs to access these 2 Unicode properties, but when we
180 : ;; compile it during bootstrap, charprop.el was not built yet, and
181 : ;; therefore is not yet loaded into bootstrap-emacs, so
182 : ;; char-code-property-alist is nil, and get-char-code-property
183 : ;; always returns nil, something the code here doesn't like.
184 : (define-char-code-property 'decomposition "uni-decomposition.el")
185 : (define-char-code-property 'canonical-combining-class "uni-combining.el")
186 : (let ((char 0) ccc decomposition)
187 : (mapc
188 : (lambda (start-end)
189 : (cl-do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
190 : (setq ccc (ucs-normalize-ccc char))
191 : (setq decomposition (get-char-code-property
192 : char 'decomposition))
193 : (if (and (= (length decomposition) 1)
194 : (= (car decomposition) char))
195 : (setq decomposition nil))
196 : (if (and ccc (/= 0 ccc)) (add-to-list 'combining-chars char))
197 : (if (and (numberp (car decomposition))
198 : (/= (ucs-normalize-ccc (car decomposition))
199 : 0))
200 : (add-to-list 'non-starter-decompositions char))
201 : (when (numberp (car decomposition))
202 : (if (and (= 2 (length decomposition))
203 : (null (memq char ucs-normalize-composition-exclusions))
204 : (null (memq char non-starter-decompositions)))
205 : (setq decomposition-pair-to-composition
206 : (cons (cons decomposition char)
207 : decomposition-pair-to-composition)))
208 : ;; If not singleton decomposition, second and later characters in
209 : ;; decomposition will be the subject of combining characters.
210 : (if (cdr decomposition)
211 : (dolist (char (cdr decomposition))
212 : (add-to-list 'combining-chars char))))))
213 : check-range))
214 :
215 : (setq combining-chars
216 : (append combining-chars
217 : '(?ᅡ ?ᅢ ?ᅣ ?ᅤ ?ᅥ ?ᅦ ?ᅧ ?ᅨ ?ᅩ ?ᅪ
218 : ?ᅫ ?ᅬ ?ᅭ ?ᅮ ?ᅯ ?ᅰ ?ᅱ ?ᅲ ?ᅳ ?ᅴ ?ᅵ
219 : ?ᆨ ?ᆩ ?ᆪ ?ᆫ ?ᆬ ?ᆭ ?ᆮ ?ᆯ ?ᆰ ?ᆱ ?ᆲ ?ᆳ ?ᆴ
220 : ?ᆵ ?ᆶ ?ᆷ ?ᆸ ?ᆹ ?ᆺ ?ᆻ ?ᆼ ?ᆽ ?ᆾ ?ᆿ ?ᇀ ?ᇁ ?ᇂ)))
221 : )
222 :
223 : (eval-and-compile
224 : (defun ucs-normalize-make-hash-table-from-alist (alist)
225 : (let ((table (make-hash-table :test 'equal :size 2000)))
226 : (mapc (lambda (x) (puthash (car x) (cdr x) table)) alist)
227 : table))
228 :
229 : (defvar ucs-normalize-decomposition-pair-to-primary-composite nil
230 : "Hash table of decomposed pair to primary composite.
231 : Note that Hangul are excluded.")
232 : (setq ucs-normalize-decomposition-pair-to-primary-composite
233 : (ucs-normalize-make-hash-table-from-alist
234 : (eval-when-compile decomposition-pair-to-composition)))
235 :
236 : (defun ucs-normalize-primary-composite (decomposition-pair composition-predicate)
237 : "Convert DECOMPOSITION-PAIR to primary composite using COMPOSITION-PREDICATE."
238 : (let ((char (or (gethash decomposition-pair
239 : ucs-normalize-decomposition-pair-to-primary-composite)
240 : (and (<= #x1100 (car decomposition-pair))
241 : (< (car decomposition-pair) #x1113)
242 : (<= #x1161 (cadr decomposition-pair))
243 : (< (car decomposition-pair) #x1176)
244 : (let ((lindex (- (car decomposition-pair) #x1100))
245 : (vindex (- (cadr decomposition-pair) #x1161)))
246 : (+ #xAC00 (* (+ (* lindex 21) vindex) 28))))
247 : (and (<= #xac00 (car decomposition-pair))
248 : (< (car decomposition-pair) #xd7a4)
249 : (<= #x11a7 (cadr decomposition-pair))
250 : (< (cadr decomposition-pair) #x11c3)
251 : (= 0 (% (- (car decomposition-pair) #xac00) 28))
252 : (let ((tindex (- (cadr decomposition-pair) #x11a7)))
253 : (+ (car decomposition-pair) tindex))))))
254 : (if (and char
255 : (functionp composition-predicate)
256 : (null (funcall composition-predicate char)))
257 : nil char)))
258 : )
259 :
260 : (defvar ucs-normalize-combining-chars nil)
261 : (setq ucs-normalize-combining-chars (eval-when-compile combining-chars))
262 :
263 : (defvar ucs-normalize-combining-chars-regexp nil
264 : "Regular expression to match sequence of combining characters.")
265 : (setq ucs-normalize-combining-chars-regexp
266 : (eval-when-compile (concat (regexp-opt-charset combining-chars) "+")))
267 :
268 : (declare-function decomposition-translation-alist "ucs-normalize"
269 : (decomposition-function))
270 : (declare-function decomposition-char-recursively "ucs-normalize"
271 : (char decomposition-function))
272 : (declare-function alist-list-to-vector "ucs-normalize" (alist))
273 :
274 : (eval-when-compile
275 :
276 : (defun decomposition-translation-alist (decomposition-function)
277 : (let (decomposition alist)
278 : (mapc
279 : (lambda (start-end)
280 : (cl-do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
281 : (setq decomposition (funcall decomposition-function char))
282 : (if decomposition
283 : (setq alist (cons (cons char
284 : (apply 'append
285 : (mapcar (lambda (x)
286 : (decomposition-char-recursively
287 : x decomposition-function))
288 : decomposition)))
289 : alist)))))
290 : check-range)
291 : alist))
292 :
293 : (defun decomposition-char-recursively (char decomposition-function)
294 : (let ((decomposition (funcall decomposition-function char)))
295 : (if decomposition
296 : (apply 'append
297 : (mapcar (lambda (x)
298 : (decomposition-char-recursively x decomposition-function))
299 : decomposition))
300 : (list char))))
301 :
302 : (defun alist-list-to-vector (alist)
303 : (mapcar (lambda (x) (cons (car x) (apply 'vector (cdr x)))) alist))
304 :
305 : (defvar nfd-alist nil)
306 : (setq nfd-alist (alist-list-to-vector (decomposition-translation-alist 'nfd)))
307 : (defvar nfkd-alist nil)
308 : (setq nfkd-alist (alist-list-to-vector (decomposition-translation-alist 'nfkd)))
309 : (defvar hfs-nfd-alist nil)
310 : (setq hfs-nfd-alist (alist-list-to-vector (decomposition-translation-alist 'hfs-nfd)))
311 : )
312 :
313 : (eval-and-compile
314 : (defvar ucs-normalize-hangul-translation-alist nil)
315 : (setq ucs-normalize-hangul-translation-alist
316 : (let ((i 0) entries)
317 : (while (< i 11172)
318 : (setq entries
319 : (cons (cons (+ #xac00 i)
320 : (if (= 0 (% i 28))
321 : (vector (+ #x1100 (/ i 588))
322 : (+ #x1161 (/ (% i 588) 28)))
323 : (vector (+ #x1100 (/ i 588))
324 : (+ #x1161 (/ (% i 588) 28))
325 : (+ #x11a7 (% i 28)))))
326 : entries)
327 : i (1+ i))) entries))
328 :
329 : (defun ucs-normalize-make-translation-table-from-alist (alist)
330 : (make-translation-table-from-alist
331 : (append alist ucs-normalize-hangul-translation-alist)))
332 :
333 : (define-translation-table 'ucs-normalize-nfd-table
334 : (ucs-normalize-make-translation-table-from-alist (eval-when-compile nfd-alist)))
335 : (define-translation-table 'ucs-normalize-nfkd-table
336 : (ucs-normalize-make-translation-table-from-alist (eval-when-compile nfkd-alist)))
337 : (define-translation-table 'ucs-normalize-hfs-nfd-table
338 : (ucs-normalize-make-translation-table-from-alist (eval-when-compile hfs-nfd-alist)))
339 :
340 : (defun ucs-normalize-sort (chars)
341 : "Sort by canonical combining class of CHARS."
342 : (sort chars
343 : (lambda (ch1 ch2)
344 : (< (ucs-normalize-ccc ch1) (ucs-normalize-ccc ch2)))))
345 :
346 : (defun ucs-normalize-compose-chars (chars composition-predicate)
347 : "Compose CHARS by COMPOSITION-PREDICATE.
348 : CHARS must be sorted and normalized in starter-combining pairs."
349 : (if composition-predicate
350 : (let* ((starter (car chars))
351 : remain result prev-ccc
352 : (target-chars (cdr chars))
353 : target target-ccc
354 : primary-composite)
355 : (while target-chars
356 : (setq target (car target-chars)
357 : target-ccc (ucs-normalize-ccc target))
358 : (if (and (or (null prev-ccc)
359 : (< prev-ccc target-ccc))
360 : (setq primary-composite
361 : (ucs-normalize-primary-composite (list starter target)
362 : composition-predicate)))
363 : ;; case 1: composable
364 : (setq starter primary-composite
365 : prev-ccc nil)
366 : (if (= 0 target-ccc)
367 : ;; case 2: move starter
368 : (setq result (nconc result (cons starter (nreverse remain)))
369 : starter target
370 : remain nil)
371 : ;; case 3: move target
372 : (setq prev-ccc target-ccc
373 : remain (cons target remain))))
374 : (setq target-chars (cdr target-chars)))
375 : (nconc result (cons starter (nreverse remain))))
376 : chars))
377 :
378 : (defun ucs-normalize-block-compose-chars (chars composition-predicate)
379 : "Try composing CHARS by COMPOSITION-PREDICATE.
380 : If COMPOSITION-PREDICATE is not given, then do nothing."
381 : (let ((chars (ucs-normalize-sort chars)))
382 : (if composition-predicate
383 : (ucs-normalize-compose-chars chars composition-predicate)
384 : chars)))
385 : )
386 :
387 : (declare-function quick-check-list "ucs-normalize"
388 : (decomposition-translation &optional composition-predicate))
389 : (declare-function quick-check-list-to-regexp "ucs-normalize" (quick-check-list))
390 :
391 : (eval-when-compile
392 :
393 : (defun quick-check-list (decomposition-translation
394 : &optional composition-predicate)
395 : "Quick-Check List for DECOMPOSITION-TRANSLATION and COMPOSITION-PREDICATE.
396 : It includes Singletons, CompositionExclusions, and Non-Starter
397 : decomposition."
398 : (let (entries decomposition composition)
399 : (with-temp-buffer
400 : (mapc
401 : (lambda (start-end)
402 : (cl-do ((i (car start-end) (+ i 1))) ((> i (cdr start-end)))
403 : (setq decomposition
404 : (string-to-list
405 : (progn
406 : (erase-buffer)
407 : (insert i)
408 : (translate-region 1 2 decomposition-translation)
409 : (buffer-string))))
410 : (setq composition
411 : (ucs-normalize-block-compose-chars decomposition composition-predicate))
412 : (when (not (equal composition (list i)))
413 : (setq entries (cons i entries)))))
414 : check-range))
415 : ;;(remove-duplicates
416 : (append entries
417 : ucs-normalize-composition-exclusions
418 : non-starter-decompositions)))
419 : ;;)
420 :
421 : (defvar nfd-quick-check-list nil)
422 : (setq nfd-quick-check-list (quick-check-list 'ucs-normalize-nfd-table ))
423 : (defvar nfc-quick-check-list nil)
424 : (setq nfc-quick-check-list (quick-check-list 'ucs-normalize-nfd-table t ))
425 : (defvar nfkd-quick-check-list nil)
426 : (setq nfkd-quick-check-list (quick-check-list 'ucs-normalize-nfkd-table ))
427 : (defvar nfkc-quick-check-list nil)
428 : (setq nfkc-quick-check-list (quick-check-list 'ucs-normalize-nfkd-table t ))
429 : (defvar hfs-nfd-quick-check-list nil)
430 : (setq hfs-nfd-quick-check-list (quick-check-list 'ucs-normalize-hfs-nfd-table
431 : 'ucs-normalize-hfs-nfd-comp-p))
432 : (defvar hfs-nfc-quick-check-list nil)
433 : (setq hfs-nfc-quick-check-list (quick-check-list 'ucs-normalize-hfs-nfd-table t ))
434 :
435 : (defun quick-check-list-to-regexp (quick-check-list)
436 : (regexp-opt-charset (append quick-check-list combining-chars)))
437 :
438 : (defun quick-check-decomposition-list-to-regexp (quick-check-list)
439 : (concat (quick-check-list-to-regexp quick-check-list) "\\|[가-힣]"))
440 :
441 : (defun quick-check-composition-list-to-regexp (quick-check-list)
442 : (concat (quick-check-list-to-regexp quick-check-list) "\\|[ᅡ-ᅵᆨ-ᇂ]"))
443 : )
444 :
445 :
446 : ;; NFD/NFC
447 : (defvar ucs-normalize-nfd-quick-check-regexp nil)
448 : (setq ucs-normalize-nfd-quick-check-regexp
449 : (eval-when-compile (quick-check-decomposition-list-to-regexp nfd-quick-check-list)))
450 : (defvar ucs-normalize-nfc-quick-check-regexp nil)
451 : (setq ucs-normalize-nfc-quick-check-regexp
452 : (eval-when-compile (quick-check-composition-list-to-regexp nfc-quick-check-list)))
453 :
454 : ;; NFKD/NFKC
455 : (defvar ucs-normalize-nfkd-quick-check-regexp nil)
456 : (setq ucs-normalize-nfkd-quick-check-regexp
457 : (eval-when-compile (quick-check-decomposition-list-to-regexp nfkd-quick-check-list)))
458 : (defvar ucs-normalize-nfkc-quick-check-regexp nil)
459 : (setq ucs-normalize-nfkc-quick-check-regexp
460 : (eval-when-compile (quick-check-composition-list-to-regexp nfkc-quick-check-list)))
461 :
462 : ;; HFS-NFD/HFS-NFC
463 : (defvar ucs-normalize-hfs-nfd-quick-check-regexp nil)
464 : (setq ucs-normalize-hfs-nfd-quick-check-regexp
465 : (eval-when-compile (concat (quick-check-decomposition-list-to-regexp hfs-nfd-quick-check-list))))
466 : (defvar ucs-normalize-hfs-nfc-quick-check-regexp nil)
467 : (setq ucs-normalize-hfs-nfc-quick-check-regexp
468 : (eval-when-compile (quick-check-composition-list-to-regexp hfs-nfc-quick-check-list)))
469 :
470 : ;;------------------------------------------------------------------------------------------
471 :
472 : ;; Normalize local region.
473 :
474 : (defun ucs-normalize-block
475 : (from to &optional decomposition-translation-table composition-predicate)
476 : "Normalize region FROM TO, by sorting the region with canonical-cc.
477 : If DECOMPOSITION-TRANSLATION-TABLE is given, translate region
478 : before sorting. If COMPOSITION-PREDICATE is given, then compose
479 : the region by using it."
480 0 : (save-restriction
481 0 : (narrow-to-region from to)
482 0 : (goto-char (point-min))
483 0 : (if decomposition-translation-table
484 0 : (translate-region from to decomposition-translation-table))
485 0 : (goto-char (point-min))
486 0 : (let ((start (point)) chars); ccc)
487 0 : (while (not (eobp))
488 0 : (forward-char)
489 0 : (when (or (eobp)
490 0 : (= 0 (ucs-normalize-ccc (char-after (point)))))
491 0 : (setq chars
492 0 : (nconc chars
493 0 : (ucs-normalize-block-compose-chars
494 0 : (string-to-list (buffer-substring start (point)))
495 0 : composition-predicate))
496 0 : start (point)))
497 : ;;(unless ccc (error "Undefined character can not be normalized!"))
498 0 : )
499 0 : (delete-region (point-min) (point-max))
500 0 : (apply 'insert
501 0 : (ucs-normalize-compose-chars
502 0 : chars composition-predicate)))))
503 :
504 : (defun ucs-normalize-region
505 : (from to quick-check-regexp translation-table composition-predicate)
506 : "Normalize region from FROM to TO.
507 : QUICK-CHECK-REGEXP is applied for searching the region.
508 : TRANSLATION-TABLE will be used to decompose region.
509 : COMPOSITION-PREDICATE will be used to compose region."
510 0 : (save-excursion
511 0 : (save-restriction
512 0 : (narrow-to-region from to)
513 0 : (goto-char (point-min))
514 0 : (let (start-pos starter)
515 0 : (while (re-search-forward quick-check-regexp nil t)
516 0 : (setq starter (string-to-char (match-string 0)))
517 0 : (setq start-pos (match-beginning 0))
518 0 : (ucs-normalize-block
519 : ;; from
520 0 : (if (or (= start-pos (point-min))
521 0 : (and (= 0 (ucs-normalize-ccc starter))
522 0 : (not (memq starter ucs-normalize-combining-chars))))
523 0 : start-pos (1- start-pos))
524 : ;; to
525 0 : (if (looking-at ucs-normalize-combining-chars-regexp)
526 0 : (match-end 0) (1+ start-pos))
527 0 : translation-table composition-predicate))))))
528 :
529 : ;; --------------------------------------------------------------------------------
530 :
531 : (defmacro ucs-normalize-string (ucs-normalize-region)
532 6 : `(with-temp-buffer
533 : (insert str)
534 6 : (,ucs-normalize-region (point-min) (point-max))
535 6 : (buffer-string)))
536 :
537 : ;;;###autoload
538 : (defun ucs-normalize-NFD-region (from to)
539 : "Normalize the current region by the Unicode NFD."
540 : (interactive "r")
541 0 : (ucs-normalize-region from to
542 0 : ucs-normalize-nfd-quick-check-regexp
543 0 : 'ucs-normalize-nfd-table nil))
544 : ;;;###autoload
545 : (defun ucs-normalize-NFD-string (str)
546 : "Normalize the string STR by the Unicode NFD."
547 0 : (ucs-normalize-string ucs-normalize-NFD-region))
548 :
549 : ;;;###autoload
550 : (defun ucs-normalize-NFC-region (from to)
551 : "Normalize the current region by the Unicode NFC."
552 : (interactive "r")
553 0 : (ucs-normalize-region from to
554 0 : ucs-normalize-nfc-quick-check-regexp
555 0 : 'ucs-normalize-nfd-table t))
556 : ;;;###autoload
557 : (defun ucs-normalize-NFC-string (str)
558 : "Normalize the string STR by the Unicode NFC."
559 0 : (ucs-normalize-string ucs-normalize-NFC-region))
560 :
561 : ;;;###autoload
562 : (defun ucs-normalize-NFKD-region (from to)
563 : "Normalize the current region by the Unicode NFKD."
564 : (interactive "r")
565 0 : (ucs-normalize-region from to
566 0 : ucs-normalize-nfkd-quick-check-regexp
567 0 : 'ucs-normalize-nfkd-table nil))
568 : ;;;###autoload
569 : (defun ucs-normalize-NFKD-string (str)
570 : "Normalize the string STR by the Unicode NFKD."
571 0 : (ucs-normalize-string ucs-normalize-NFKD-region))
572 :
573 : ;;;###autoload
574 : (defun ucs-normalize-NFKC-region (from to)
575 : "Normalize the current region by the Unicode NFKC."
576 : (interactive "r")
577 0 : (ucs-normalize-region from to
578 0 : ucs-normalize-nfkc-quick-check-regexp
579 0 : 'ucs-normalize-nfkd-table t))
580 : ;;;###autoload
581 : (defun ucs-normalize-NFKC-string (str)
582 : "Normalize the string STR by the Unicode NFKC."
583 0 : (ucs-normalize-string ucs-normalize-NFKC-region))
584 :
585 : ;;;###autoload
586 : (defun ucs-normalize-HFS-NFD-region (from to)
587 : "Normalize the current region by the Unicode NFD and Mac OS's HFS Plus."
588 : (interactive "r")
589 0 : (ucs-normalize-region from to
590 0 : ucs-normalize-hfs-nfd-quick-check-regexp
591 : 'ucs-normalize-hfs-nfd-table
592 0 : 'ucs-normalize-hfs-nfd-comp-p))
593 : ;;;###autoload
594 : (defun ucs-normalize-HFS-NFD-string (str)
595 : "Normalize the string STR by the Unicode NFD and Mac OS's HFS Plus."
596 0 : (ucs-normalize-string ucs-normalize-HFS-NFD-region))
597 : ;;;###autoload
598 : (defun ucs-normalize-HFS-NFC-region (from to)
599 : "Normalize the current region by the Unicode NFC and Mac OS's HFS Plus."
600 : (interactive "r")
601 0 : (ucs-normalize-region from to
602 0 : ucs-normalize-hfs-nfc-quick-check-regexp
603 0 : 'ucs-normalize-hfs-nfd-table t))
604 : ;;;###autoload
605 : (defun ucs-normalize-HFS-NFC-string (str)
606 : "Normalize the string STR by the Unicode NFC and Mac OS's HFS Plus."
607 0 : (ucs-normalize-string ucs-normalize-HFS-NFC-region))
608 :
609 : ;; Post-read-conversion function for `utf-8-hfs'.
610 : (defun ucs-normalize-hfs-nfd-post-read-conversion (len)
611 0 : (save-excursion
612 0 : (save-restriction
613 0 : (narrow-to-region (point) (+ (point) len))
614 0 : (ucs-normalize-HFS-NFC-region (point-min) (point-max))
615 0 : (- (point-max) (point-min)))))
616 :
617 : ;; Pre-write conversion for `utf-8-hfs'.
618 : ;; _from and _to are legacy arguments (see `define-coding-system').
619 : (defun ucs-normalize-hfs-nfd-pre-write-conversion (_from _to)
620 0 : (ucs-normalize-HFS-NFD-region (point-min) (point-max)))
621 :
622 : ;;; coding-system definition
623 : (define-coding-system 'utf-8-hfs
624 : "UTF-8 based coding system for macOS HFS file names.
625 : The singleton characters in HFS normalization exclusion will not
626 : be decomposed."
627 : :coding-type 'utf-8
628 : :mnemonic ?U
629 : :charset-list '(unicode)
630 : :post-read-conversion 'ucs-normalize-hfs-nfd-post-read-conversion
631 : :pre-write-conversion 'ucs-normalize-hfs-nfd-pre-write-conversion
632 : )
633 :
634 : ;; This is tested in dired.c:file_name_completion in order to reject
635 : ;; false positives due to comparison of encoded file names.
636 : (coding-system-put 'utf-8-hfs 'decomposed-characters 't)
637 :
638 : (provide 'ucs-normalize)
639 :
640 : ;; Local Variables:
641 : ;; coding: utf-8
642 : ;; End:
643 :
644 : ;;; ucs-normalize.el ends here
|