LCOV - code coverage report
Current view: top level - lisp/international - ucs-normalize.el (source / functions) Hit Total Coverage
Test: tramp-tests.info Lines: 3 71 4.2 %
Date: 2017-08-27 09:44:50 Functions: 1 17 5.9 %

          Line data    Source code
       1             : ;;; ucs-normalize.el --- Unicode normalization NFC/NFD/NFKD/NFKC
       2             : 
       3             : ;; Copyright (C) 2009-2017 Free Software Foundation, Inc.
       4             : 
       5             : ;; Author: Taichi Kawabata <kawabata.taichi@gmail.com>
       6             : ;; Keywords: unicode, normalization
       7             : 
       8             : ;; This file is part of GNU Emacs.
       9             : 
      10             : ;; GNU Emacs is free software: you can redistribute it and/or modify
      11             : ;; it under the terms of the GNU General Public License as published by
      12             : ;; the Free Software Foundation, either version 3 of the License, or
      13             : ;; (at your option) any later version.
      14             : 
      15             : ;; GNU Emacs is distributed in the hope that it will be useful,
      16             : ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
      17             : ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      18             : ;; GNU General Public License for more details.
      19             : 
      20             : ;; You should have received a copy of the GNU General Public License
      21             : ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
      22             : 
      23             : ;;; Commentary:
      24             : ;;
      25             : ;; This program has passed the NormalizationTest-5.2.0.txt.
      26             : ;;
      27             : ;; References:
      28             : ;; http://www.unicode.org/reports/tr15/
      29             : ;; http://www.unicode.org/review/pr-29.html
      30             : ;;
      31             : ;; HFS-Normalization:
      32             : ;; Reference:
      33             : ;; http://developer.apple.com/technotes/tn/tn1150.html
      34             : ;;
      35             : ;; HFS Normalization excludes following area for decomposition.
      36             : ;;
      37             : ;;  U+02000 .. U+02FFF :: Punctuation, symbols, dingbats, arrows, etc.
      38             : ;;                        (Characters in this region will be composed.)
      39             : ;;  U+0F900 .. U+0FAFF :: CJK compatibility Ideographs.
      40             : ;;  U+2F800 .. U+2FFFF :: CJK compatibility Ideographs.
      41             : ;;
      42             : ;; HFS-Normalization is useful for normalizing text involving CJK Ideographs.
      43             : ;;
      44             : ;;;
      45             : ;;; Implementation Notes on NFC/HFS-NFC.
      46             : ;;;
      47             : ;;
      48             : ;;    <Stages>   Decomposition    Composition
      49             : ;;   NFD:        'nfd             nil
      50             : ;;   NFC:        'nfd             t
      51             : ;;   NFKD:       'nfkd            nil
      52             : ;;   NFKC:       'nfkd            t
      53             : ;;   HFS-NFD:    'hfs-nfd         'hfs-nfd-comp-p
      54             : ;;   HFS-NFC:    'hfs-nfd         t
      55             : ;;
      56             : ;; Algorithm for Normalization
      57             : ;;
      58             : ;; Before normalization, following data will be prepared.
      59             : ;;
      60             : ;; 1. quick-check-list
      61             : ;;
      62             : ;;  `quick-check-list' consists of characters that will be decomposed
      63             : ;;  during normalization.  It includes composition-exclusions,
      64             : ;;  singletons, non-starter-decompositions and decomposable
      65             : ;;  characters.
      66             : ;;
      67             : ;;  `quick-check-regexp' will search the above characters plus
      68             : ;;  combining characters.
      69             : ;;
      70             : ;; 2. decomposition-translation
      71             : ;;
      72             : ;;  `decomposition-translation' is a translation table that will be
      73             : ;;  used to decompose the characters.
      74             : ;;
      75             : ;;
      76             : ;; Normalization Process
      77             : ;;
      78             : ;; A. Searching (`ucs-normalize-region')
      79             : ;;
      80             : ;;    Region is searched for `quick-check-regexp' to find possibly
      81             : ;;    normalizable point.
      82             : ;;
      83             : ;; B. Identification of Normalization Block
      84             : ;;
      85             : ;;    (1) start of the block
      86             : ;;        If the searched character is a starter and not combining
      87             : ;;        with previous character, then the beginning of the block is
      88             : ;;        the searched character.  If searched character is combining
      89             : ;;        character, then previous character will be the target
      90             : ;;        character
      91             : ;;    (2) end of the block
      92             : ;;        Block ends at non-composable starter character.
      93             : ;;
      94             : ;; C. Decomposition  (`ucs-normalize-block')
      95             : ;;
      96             : ;;    The entire block will be decomposed by
      97             : ;;    `decomposition-translation' table.
      98             : ;;
      99             : ;; D. Sorting and Composition  of Smaller Blocks (`ucs-normalize-block-compose-chars')
     100             : ;;
     101             : ;;    The block will be split to multiple samller blocks by starter
     102             : ;;    characters.  Each block is sorted, and composed if necessary.
     103             : ;;
     104             : ;; E. Composition of Entire Block (`ucs-normalize-compose-chars')
     105             : ;;
     106             : ;;   Composed blocks are collected and again composed.
     107             : 
     108             : ;;; Code:
     109             : 
     110             : (defconst ucs-normalize-version "1.2")
     111             : 
     112             : (eval-when-compile (require 'cl-lib))
     113             : 
     114             : (declare-function nfd "ucs-normalize" (char))
     115             : 
     116             : (eval-when-compile
     117             : 
     118             :   (defconst ucs-normalize-composition-exclusions
     119             :     '(#x0958 #x0959 #x095A #x095B #x095C #x095D #x095E #x095F
     120             :       #x09DC #x09DD #x09DF #x0A33 #x0A36 #x0A59 #x0A5A #x0A5B
     121             :       #x0A5E #x0B5C #x0B5D #x0F43 #x0F4D #x0F52 #x0F57 #x0F5C
     122             :       #x0F69 #x0F76 #x0F78 #x0F93 #x0F9D #x0FA2 #x0FA7 #x0FAC
     123             :       #x0FB9 #xFB1D #xFB1F #xFB2A #xFB2B #xFB2C #xFB2D #xFB2E
     124             :       #xFB2F #xFB30 #xFB31 #xFB32 #xFB33 #xFB34 #xFB35 #xFB36
     125             :       #xFB38 #xFB39 #xFB3A #xFB3B #xFB3C #xFB3E #xFB40 #xFB41
     126             :       #xFB43 #xFB44 #xFB46 #xFB47 #xFB48 #xFB49 #xFB4A #xFB4B
     127             :       #xFB4C #xFB4D #xFB4E #x2ADC #x1D15E #x1D15F #x1D160 #x1D161
     128             :       #x1D162 #x1D163 #x1D164 #x1D1BB #x1D1BC #x1D1BD #x1D1BE
     129             :       #x1D1BF #x1D1C0)
     130             :    "Composition Exclusion List.
     131             :   This list is taken from
     132             :     http://www.unicode.org/Public/UNIDATA/5.2/CompositionExclusions.txt")
     133             : 
     134             :   ;; Unicode ranges that decompositions & combining characters are defined.
     135             :   (defvar check-range nil)
     136             :     (setq check-range '((#x00a0 . #x3400) (#xA600 . #xAC00) (#xF900 . #x110ff) (#x1d000 . #x1dfff) (#x1f100 . #x1f2ff) (#x2f800 . #x2faff)))
     137             : 
     138             :   ;; Basic normalization functions
     139             :   (defun nfd (char)
     140             :     (let ((decomposition
     141             :            (get-char-code-property char 'decomposition)))
     142             :       (if (and decomposition (numberp (car decomposition))
     143             :                (or (> (length decomposition) 1)
     144             :                    (/= (car decomposition) char)))
     145             :           decomposition)))
     146             : 
     147             :   (defun nfkd (char)
     148             :     (let ((decomposition
     149             :            (get-char-code-property char 'decomposition)))
     150             :       (if (symbolp (car decomposition)) (cdr decomposition)
     151             :         (if (or (> (length decomposition) 1)
     152             :                 (/= (car decomposition) char)) decomposition))))
     153             : 
     154             :   (defun hfs-nfd (char)
     155             :     (when (or (and (>= char 0) (< char #x2000))
     156             :               (and (>= char #x3000) (< char #xf900))
     157             :               (and (>= char #xfb00) (< char #x2f800))
     158             :               (>= char #x30000))
     159             :       (nfd char))))
     160             : 
     161             : (eval-and-compile
     162             : (defun ucs-normalize-hfs-nfd-comp-p (char)
     163             :   (and (>= char #x2000) (< char #x3000)))
     164             : 
     165             : (defsubst ucs-normalize-ccc (char)
     166             :   (get-char-code-property char 'canonical-combining-class))
     167             : )
     168             : 
     169             : ;; Data common to all normalizations
     170             : 
     171             : (eval-when-compile
     172             : 
     173             :   (defvar combining-chars nil)
     174             :     (setq combining-chars nil)
     175             :   (defvar decomposition-pair-to-composition nil)
     176             :     (setq decomposition-pair-to-composition nil)
     177             :   (defvar non-starter-decompositions nil)
     178             :     (setq non-starter-decompositions nil)
     179             :   ;; This file needs to access these 2 Unicode properties, but when we
     180             :   ;; compile it during bootstrap, charprop.el was not built yet, and
     181             :   ;; therefore is not yet loaded into bootstrap-emacs, so
     182             :   ;; char-code-property-alist is nil, and get-char-code-property
     183             :   ;; always returns nil, something the code here doesn't like.
     184             :   (define-char-code-property 'decomposition "uni-decomposition.el")
     185             :   (define-char-code-property 'canonical-combining-class "uni-combining.el")
     186             :   (let ((char 0) ccc decomposition)
     187             :     (mapc
     188             :      (lambda (start-end)
     189             :        (cl-do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
     190             :          (setq ccc (ucs-normalize-ccc char))
     191             :          (setq decomposition (get-char-code-property
     192             :                               char 'decomposition))
     193             :          (if (and (= (length decomposition) 1)
     194             :                   (= (car decomposition) char))
     195             :              (setq decomposition nil))
     196             :          (if (and ccc (/= 0 ccc)) (add-to-list 'combining-chars char))
     197             :          (if (and (numberp (car decomposition))
     198             :                   (/= (ucs-normalize-ccc (car decomposition))
     199             :                       0))
     200             :              (add-to-list 'non-starter-decompositions char))
     201             :          (when (numberp (car decomposition))
     202             :            (if (and (= 2 (length decomposition))
     203             :                     (null (memq char ucs-normalize-composition-exclusions))
     204             :                     (null (memq char non-starter-decompositions)))
     205             :                (setq decomposition-pair-to-composition
     206             :                      (cons (cons decomposition char)
     207             :                            decomposition-pair-to-composition)))
     208             :            ;; If not singleton decomposition, second and later characters in
     209             :            ;; decomposition will be the subject of combining characters.
     210             :            (if (cdr decomposition)
     211             :                (dolist (char (cdr decomposition))
     212             :                  (add-to-list 'combining-chars char))))))
     213             :      check-range))
     214             : 
     215             :   (setq combining-chars
     216             :         (append combining-chars
     217             :                 '(?ᅡ ?ᅢ ?ᅣ ?ᅤ ?ᅥ ?ᅦ ?ᅧ ?ᅨ ?ᅩ ?ᅪ
     218             :                 ?ᅫ ?ᅬ ?ᅭ ?ᅮ ?ᅯ ?ᅰ ?ᅱ ?ᅲ ?ᅳ ?ᅴ ?ᅵ
     219             :                 ?ᆨ ?ᆩ ?ᆪ ?ᆫ ?ᆬ ?ᆭ ?ᆮ ?ᆯ ?ᆰ ?ᆱ ?ᆲ ?ᆳ ?ᆴ
     220             :                 ?ᆵ ?ᆶ ?ᆷ ?ᆸ ?ᆹ ?ᆺ ?ᆻ ?ᆼ ?ᆽ ?ᆾ ?ᆿ ?ᇀ ?ᇁ ?ᇂ)))
     221             :   )
     222             : 
     223             : (eval-and-compile
     224             : (defun ucs-normalize-make-hash-table-from-alist (alist)
     225             :   (let ((table (make-hash-table :test 'equal :size 2000)))
     226             :     (mapc (lambda (x) (puthash (car x) (cdr x) table)) alist)
     227             :     table))
     228             : 
     229             : (defvar ucs-normalize-decomposition-pair-to-primary-composite nil
     230             :   "Hash table of decomposed pair to primary composite.
     231             : Note that Hangul are excluded.")
     232             :   (setq ucs-normalize-decomposition-pair-to-primary-composite
     233             :         (ucs-normalize-make-hash-table-from-alist
     234             :          (eval-when-compile decomposition-pair-to-composition)))
     235             : 
     236             : (defun ucs-normalize-primary-composite (decomposition-pair composition-predicate)
     237             :   "Convert DECOMPOSITION-PAIR to primary composite using COMPOSITION-PREDICATE."
     238             :   (let ((char (or (gethash decomposition-pair
     239             :                            ucs-normalize-decomposition-pair-to-primary-composite)
     240             :                   (and (<= #x1100 (car decomposition-pair))
     241             :                        (< (car decomposition-pair) #x1113)
     242             :                        (<= #x1161 (cadr decomposition-pair))
     243             :                        (< (car decomposition-pair) #x1176)
     244             :                        (let ((lindex (- (car decomposition-pair) #x1100))
     245             :                              (vindex (- (cadr decomposition-pair) #x1161)))
     246             :                          (+ #xAC00 (* (+ (* lindex  21) vindex) 28))))
     247             :                   (and (<= #xac00 (car decomposition-pair))
     248             :                        (< (car decomposition-pair) #xd7a4)
     249             :                        (<= #x11a7 (cadr decomposition-pair))
     250             :                        (< (cadr decomposition-pair) #x11c3)
     251             :                        (= 0 (% (- (car decomposition-pair) #xac00) 28))
     252             :                        (let ((tindex (- (cadr decomposition-pair) #x11a7)))
     253             :                          (+ (car decomposition-pair) tindex))))))
     254             :     (if (and char
     255             :              (functionp composition-predicate)
     256             :              (null (funcall composition-predicate char)))
     257             :         nil char)))
     258             : )
     259             : 
     260             : (defvar ucs-normalize-combining-chars nil)
     261             :   (setq ucs-normalize-combining-chars (eval-when-compile combining-chars))
     262             : 
     263             : (defvar ucs-normalize-combining-chars-regexp nil
     264             :   "Regular expression to match sequence of combining characters.")
     265             :   (setq ucs-normalize-combining-chars-regexp
     266             :         (eval-when-compile (concat (regexp-opt-charset combining-chars) "+")))
     267             : 
     268             : (declare-function decomposition-translation-alist "ucs-normalize"
     269             :                   (decomposition-function))
     270             : (declare-function decomposition-char-recursively "ucs-normalize"
     271             :                   (char decomposition-function))
     272             : (declare-function alist-list-to-vector "ucs-normalize" (alist))
     273             : 
     274             : (eval-when-compile
     275             : 
     276             :   (defun decomposition-translation-alist (decomposition-function)
     277             :     (let (decomposition alist)
     278             :       (mapc
     279             :        (lambda (start-end)
     280             :          (cl-do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
     281             :            (setq decomposition (funcall decomposition-function char))
     282             :            (if decomposition
     283             :                (setq alist (cons (cons char
     284             :                                        (apply 'append
     285             :                                               (mapcar (lambda (x)
     286             :                                                         (decomposition-char-recursively
     287             :                                                          x decomposition-function))
     288             :                                                       decomposition)))
     289             :                                  alist)))))
     290             :        check-range)
     291             :       alist))
     292             : 
     293             :   (defun decomposition-char-recursively (char decomposition-function)
     294             :     (let ((decomposition (funcall decomposition-function char)))
     295             :       (if decomposition
     296             :           (apply 'append
     297             :                  (mapcar (lambda (x)
     298             :                            (decomposition-char-recursively x decomposition-function))
     299             :                          decomposition))
     300             :         (list char))))
     301             : 
     302             :   (defun alist-list-to-vector (alist)
     303             :     (mapcar (lambda (x) (cons (car x) (apply 'vector (cdr x)))) alist))
     304             : 
     305             :   (defvar nfd-alist nil)
     306             :     (setq nfd-alist (alist-list-to-vector (decomposition-translation-alist 'nfd)))
     307             :   (defvar nfkd-alist nil)
     308             :     (setq nfkd-alist (alist-list-to-vector (decomposition-translation-alist 'nfkd)))
     309             :   (defvar hfs-nfd-alist nil)
     310             :     (setq hfs-nfd-alist (alist-list-to-vector (decomposition-translation-alist 'hfs-nfd)))
     311             :   )
     312             : 
     313             : (eval-and-compile
     314             : (defvar ucs-normalize-hangul-translation-alist nil)
     315             :   (setq ucs-normalize-hangul-translation-alist
     316             :         (let ((i 0) entries)
     317             :           (while (< i 11172)
     318             :             (setq entries
     319             :                   (cons (cons (+ #xac00 i)
     320             :                               (if (= 0 (% i 28))
     321             :                                   (vector (+ #x1100 (/ i 588))
     322             :                                           (+ #x1161 (/ (% i 588) 28)))
     323             :                                 (vector (+ #x1100 (/ i 588))
     324             :                                         (+ #x1161 (/ (% i 588) 28))
     325             :                                         (+ #x11a7 (% i 28)))))
     326             :                         entries)
     327             :                   i (1+ i))) entries))
     328             : 
     329             : (defun ucs-normalize-make-translation-table-from-alist (alist)
     330             :   (make-translation-table-from-alist
     331             :      (append alist ucs-normalize-hangul-translation-alist)))
     332             : 
     333             : (define-translation-table 'ucs-normalize-nfd-table
     334             :   (ucs-normalize-make-translation-table-from-alist (eval-when-compile nfd-alist)))
     335             : (define-translation-table 'ucs-normalize-nfkd-table
     336             :   (ucs-normalize-make-translation-table-from-alist (eval-when-compile nfkd-alist)))
     337             : (define-translation-table 'ucs-normalize-hfs-nfd-table
     338             :   (ucs-normalize-make-translation-table-from-alist (eval-when-compile hfs-nfd-alist)))
     339             : 
     340             : (defun ucs-normalize-sort (chars)
     341             :   "Sort by canonical combining class of CHARS."
     342             :   (sort chars
     343             :         (lambda (ch1 ch2)
     344             :           (< (ucs-normalize-ccc ch1) (ucs-normalize-ccc ch2)))))
     345             : 
     346             : (defun ucs-normalize-compose-chars (chars composition-predicate)
     347             :   "Compose CHARS by COMPOSITION-PREDICATE.
     348             : CHARS must be sorted and normalized in starter-combining pairs."
     349             :   (if composition-predicate
     350             :       (let* ((starter (car chars))
     351             :              remain result prev-ccc
     352             :              (target-chars (cdr chars))
     353             :              target target-ccc
     354             :              primary-composite)
     355             :      (while target-chars
     356             :        (setq target     (car target-chars)
     357             :              target-ccc (ucs-normalize-ccc target))
     358             :        (if (and (or (null prev-ccc)
     359             :                     (< prev-ccc target-ccc))
     360             :                 (setq primary-composite
     361             :                       (ucs-normalize-primary-composite (list starter target)
     362             :                                                        composition-predicate)))
     363             :            ;; case 1: composable
     364             :            (setq starter primary-composite
     365             :                  prev-ccc nil)
     366             :          (if (= 0 target-ccc)
     367             :              ;; case 2: move starter
     368             :              (setq result (nconc result (cons starter (nreverse remain)))
     369             :                    starter target
     370             :                    remain nil)
     371             :            ;; case 3: move target
     372             :            (setq prev-ccc target-ccc
     373             :                  remain (cons target remain))))
     374             :        (setq target-chars (cdr target-chars)))
     375             :      (nconc result (cons starter (nreverse remain))))
     376             :     chars))
     377             : 
     378             : (defun ucs-normalize-block-compose-chars (chars composition-predicate)
     379             :   "Try composing CHARS by COMPOSITION-PREDICATE.
     380             : If COMPOSITION-PREDICATE is not given, then do nothing."
     381             :   (let ((chars (ucs-normalize-sort chars)))
     382             :     (if composition-predicate
     383             :         (ucs-normalize-compose-chars chars composition-predicate)
     384             :       chars)))
     385             : )
     386             : 
     387             : (declare-function quick-check-list "ucs-normalize"
     388             :                   (decomposition-translation &optional composition-predicate))
     389             : (declare-function quick-check-list-to-regexp "ucs-normalize" (quick-check-list))
     390             : 
     391             : (eval-when-compile
     392             : 
     393             :   (defun quick-check-list (decomposition-translation
     394             :                            &optional composition-predicate)
     395             :     "Quick-Check List for DECOMPOSITION-TRANSLATION and COMPOSITION-PREDICATE.
     396             : It includes Singletons, CompositionExclusions, and Non-Starter
     397             : decomposition."
     398             :     (let (entries decomposition composition)
     399             :       (with-temp-buffer
     400             :         (mapc
     401             :          (lambda (start-end)
     402             :            (cl-do ((i (car start-end) (+ i 1))) ((> i (cdr start-end)))
     403             :              (setq decomposition
     404             :                    (string-to-list
     405             :                     (progn
     406             :                       (erase-buffer)
     407             :                       (insert i)
     408             :                       (translate-region 1 2 decomposition-translation)
     409             :                       (buffer-string))))
     410             :              (setq composition
     411             :                    (ucs-normalize-block-compose-chars decomposition composition-predicate))
     412             :              (when (not (equal composition (list i)))
     413             :                (setq entries (cons i entries)))))
     414             :          check-range))
     415             :       ;;(remove-duplicates
     416             :        (append entries
     417             :                ucs-normalize-composition-exclusions
     418             :                non-starter-decompositions)))
     419             :   ;;)
     420             : 
     421             :   (defvar nfd-quick-check-list nil)
     422             :     (setq nfd-quick-check-list     (quick-check-list 'ucs-normalize-nfd-table      ))
     423             :   (defvar nfc-quick-check-list nil)
     424             :     (setq nfc-quick-check-list     (quick-check-list 'ucs-normalize-nfd-table     t ))
     425             :   (defvar nfkd-quick-check-list nil)
     426             :     (setq nfkd-quick-check-list    (quick-check-list 'ucs-normalize-nfkd-table      ))
     427             :   (defvar nfkc-quick-check-list nil)
     428             :     (setq nfkc-quick-check-list    (quick-check-list 'ucs-normalize-nfkd-table    t ))
     429             :   (defvar hfs-nfd-quick-check-list nil)
     430             :     (setq hfs-nfd-quick-check-list (quick-check-list 'ucs-normalize-hfs-nfd-table
     431             :                                                      'ucs-normalize-hfs-nfd-comp-p))
     432             :   (defvar hfs-nfc-quick-check-list nil)
     433             :     (setq hfs-nfc-quick-check-list (quick-check-list 'ucs-normalize-hfs-nfd-table t ))
     434             : 
     435             :   (defun quick-check-list-to-regexp (quick-check-list)
     436             :     (regexp-opt-charset (append quick-check-list combining-chars)))
     437             : 
     438             :   (defun quick-check-decomposition-list-to-regexp (quick-check-list)
     439             :     (concat (quick-check-list-to-regexp quick-check-list) "\\|[가-힣]"))
     440             : 
     441             :   (defun quick-check-composition-list-to-regexp (quick-check-list)
     442             :     (concat (quick-check-list-to-regexp quick-check-list) "\\|[ᅡ-ᅵᆨ-ᇂ]"))
     443             : )
     444             : 
     445             : 
     446             : ;; NFD/NFC
     447             : (defvar ucs-normalize-nfd-quick-check-regexp nil)
     448             :   (setq ucs-normalize-nfd-quick-check-regexp
     449             :   (eval-when-compile (quick-check-decomposition-list-to-regexp nfd-quick-check-list)))
     450             : (defvar ucs-normalize-nfc-quick-check-regexp nil)
     451             :   (setq ucs-normalize-nfc-quick-check-regexp
     452             :   (eval-when-compile (quick-check-composition-list-to-regexp nfc-quick-check-list)))
     453             : 
     454             : ;; NFKD/NFKC
     455             : (defvar ucs-normalize-nfkd-quick-check-regexp nil)
     456             :   (setq ucs-normalize-nfkd-quick-check-regexp
     457             :   (eval-when-compile (quick-check-decomposition-list-to-regexp nfkd-quick-check-list)))
     458             : (defvar ucs-normalize-nfkc-quick-check-regexp nil)
     459             :   (setq ucs-normalize-nfkc-quick-check-regexp
     460             :   (eval-when-compile (quick-check-composition-list-to-regexp nfkc-quick-check-list)))
     461             : 
     462             : ;; HFS-NFD/HFS-NFC
     463             : (defvar ucs-normalize-hfs-nfd-quick-check-regexp nil)
     464             :   (setq ucs-normalize-hfs-nfd-quick-check-regexp
     465             :   (eval-when-compile (concat (quick-check-decomposition-list-to-regexp hfs-nfd-quick-check-list))))
     466             : (defvar ucs-normalize-hfs-nfc-quick-check-regexp nil)
     467             :   (setq ucs-normalize-hfs-nfc-quick-check-regexp
     468             :   (eval-when-compile (quick-check-composition-list-to-regexp hfs-nfc-quick-check-list)))
     469             : 
     470             : ;;------------------------------------------------------------------------------------------
     471             : 
     472             : ;; Normalize local region.
     473             : 
     474             : (defun ucs-normalize-block
     475             :   (from to &optional decomposition-translation-table composition-predicate)
     476             :   "Normalize region FROM TO, by sorting the region with canonical-cc.
     477             : If DECOMPOSITION-TRANSLATION-TABLE is given, translate region
     478             : before sorting.  If COMPOSITION-PREDICATE is given, then compose
     479             : the region by using it."
     480           0 :   (save-restriction
     481           0 :     (narrow-to-region from to)
     482           0 :     (goto-char (point-min))
     483           0 :     (if decomposition-translation-table
     484           0 :         (translate-region from to decomposition-translation-table))
     485           0 :     (goto-char (point-min))
     486           0 :     (let ((start (point)) chars); ccc)
     487           0 :       (while (not (eobp))
     488           0 :         (forward-char)
     489           0 :         (when (or (eobp)
     490           0 :                   (= 0 (ucs-normalize-ccc (char-after (point)))))
     491           0 :           (setq chars
     492           0 :                 (nconc chars
     493           0 :                        (ucs-normalize-block-compose-chars
     494           0 :                         (string-to-list (buffer-substring start (point)))
     495           0 :                         composition-predicate))
     496           0 :                 start (point)))
     497             :         ;;(unless ccc (error "Undefined character can not be normalized!"))
     498           0 :         )
     499           0 :       (delete-region (point-min) (point-max))
     500           0 :       (apply 'insert
     501           0 :              (ucs-normalize-compose-chars
     502           0 :               chars composition-predicate)))))
     503             : 
     504             : (defun ucs-normalize-region
     505             :   (from to quick-check-regexp translation-table composition-predicate)
     506             :   "Normalize region from FROM to TO.
     507             : QUICK-CHECK-REGEXP is applied for searching the region.
     508             : TRANSLATION-TABLE will be used to decompose region.
     509             : COMPOSITION-PREDICATE will be used to compose region."
     510           0 :   (save-excursion
     511           0 :     (save-restriction
     512           0 :       (narrow-to-region from to)
     513           0 :       (goto-char (point-min))
     514           0 :       (let (start-pos starter)
     515           0 :         (while (re-search-forward quick-check-regexp nil t)
     516           0 :           (setq starter (string-to-char (match-string 0)))
     517           0 :           (setq start-pos (match-beginning 0))
     518           0 :           (ucs-normalize-block
     519             :            ;; from
     520           0 :            (if (or (= start-pos (point-min))
     521           0 :                    (and (= 0 (ucs-normalize-ccc starter))
     522           0 :                         (not (memq starter ucs-normalize-combining-chars))))
     523           0 :                start-pos (1- start-pos))
     524             :            ;; to
     525           0 :            (if (looking-at ucs-normalize-combining-chars-regexp)
     526           0 :                (match-end 0) (1+ start-pos))
     527           0 :            translation-table composition-predicate))))))
     528             : 
     529             : ;; --------------------------------------------------------------------------------
     530             : 
     531             : (defmacro ucs-normalize-string (ucs-normalize-region)
     532           6 :   `(with-temp-buffer
     533             :      (insert str)
     534           6 :      (,ucs-normalize-region (point-min) (point-max))
     535           6 :      (buffer-string)))
     536             : 
     537             : ;;;###autoload
     538             : (defun ucs-normalize-NFD-region (from to)
     539             :   "Normalize the current region by the Unicode NFD."
     540             :   (interactive "r")
     541           0 :   (ucs-normalize-region from to
     542           0 :                         ucs-normalize-nfd-quick-check-regexp
     543           0 :                         'ucs-normalize-nfd-table nil))
     544             : ;;;###autoload
     545             : (defun ucs-normalize-NFD-string (str)
     546             :   "Normalize the string STR by the Unicode NFD."
     547           0 :   (ucs-normalize-string ucs-normalize-NFD-region))
     548             : 
     549             : ;;;###autoload
     550             : (defun ucs-normalize-NFC-region (from to)
     551             :   "Normalize the current region by the Unicode NFC."
     552             :   (interactive "r")
     553           0 :   (ucs-normalize-region from to
     554           0 :                         ucs-normalize-nfc-quick-check-regexp
     555           0 :                         'ucs-normalize-nfd-table t))
     556             : ;;;###autoload
     557             : (defun ucs-normalize-NFC-string (str)
     558             :   "Normalize the string STR by the Unicode NFC."
     559           0 :   (ucs-normalize-string ucs-normalize-NFC-region))
     560             : 
     561             : ;;;###autoload
     562             : (defun ucs-normalize-NFKD-region (from to)
     563             :   "Normalize the current region by the Unicode NFKD."
     564             :   (interactive "r")
     565           0 :   (ucs-normalize-region from to
     566           0 :                         ucs-normalize-nfkd-quick-check-regexp
     567           0 :                         'ucs-normalize-nfkd-table nil))
     568             : ;;;###autoload
     569             : (defun ucs-normalize-NFKD-string (str)
     570             :   "Normalize the string STR by the Unicode NFKD."
     571           0 :   (ucs-normalize-string ucs-normalize-NFKD-region))
     572             : 
     573             : ;;;###autoload
     574             : (defun ucs-normalize-NFKC-region (from to)
     575             :   "Normalize the current region by the Unicode NFKC."
     576             :   (interactive "r")
     577           0 :   (ucs-normalize-region from to
     578           0 :                         ucs-normalize-nfkc-quick-check-regexp
     579           0 :                         'ucs-normalize-nfkd-table t))
     580             : ;;;###autoload
     581             : (defun ucs-normalize-NFKC-string (str)
     582             :   "Normalize the string STR by the Unicode NFKC."
     583           0 :   (ucs-normalize-string ucs-normalize-NFKC-region))
     584             : 
     585             : ;;;###autoload
     586             : (defun ucs-normalize-HFS-NFD-region (from to)
     587             :   "Normalize the current region by the Unicode NFD and Mac OS's HFS Plus."
     588             :   (interactive "r")
     589           0 :   (ucs-normalize-region from to
     590           0 :                         ucs-normalize-hfs-nfd-quick-check-regexp
     591             :                         'ucs-normalize-hfs-nfd-table
     592           0 :                         'ucs-normalize-hfs-nfd-comp-p))
     593             : ;;;###autoload
     594             : (defun ucs-normalize-HFS-NFD-string (str)
     595             :   "Normalize the string STR by the Unicode NFD and Mac OS's HFS Plus."
     596           0 :   (ucs-normalize-string ucs-normalize-HFS-NFD-region))
     597             : ;;;###autoload
     598             : (defun ucs-normalize-HFS-NFC-region (from to)
     599             :   "Normalize the current region by the Unicode NFC and Mac OS's HFS Plus."
     600             :   (interactive "r")
     601           0 :   (ucs-normalize-region from to
     602           0 :                         ucs-normalize-hfs-nfc-quick-check-regexp
     603           0 :                         'ucs-normalize-hfs-nfd-table t))
     604             : ;;;###autoload
     605             : (defun ucs-normalize-HFS-NFC-string (str)
     606             :   "Normalize the string STR by the Unicode NFC and Mac OS's HFS Plus."
     607           0 :   (ucs-normalize-string ucs-normalize-HFS-NFC-region))
     608             : 
     609             : ;; Post-read-conversion function for `utf-8-hfs'.
     610             : (defun ucs-normalize-hfs-nfd-post-read-conversion (len)
     611           0 :   (save-excursion
     612           0 :     (save-restriction
     613           0 :       (narrow-to-region (point) (+ (point) len))
     614           0 :       (ucs-normalize-HFS-NFC-region (point-min) (point-max))
     615           0 :       (- (point-max) (point-min)))))
     616             : 
     617             : ;; Pre-write conversion for `utf-8-hfs'.
     618             : ;; _from and _to are legacy arguments (see `define-coding-system').
     619             : (defun ucs-normalize-hfs-nfd-pre-write-conversion (_from _to)
     620           0 :   (ucs-normalize-HFS-NFD-region (point-min) (point-max)))
     621             : 
     622             : ;;; coding-system definition
     623             : (define-coding-system 'utf-8-hfs
     624             :   "UTF-8 based coding system for macOS HFS file names.
     625             : The singleton characters in HFS normalization exclusion will not
     626             : be decomposed."
     627             :   :coding-type 'utf-8
     628             :   :mnemonic ?U
     629             :   :charset-list '(unicode)
     630             :   :post-read-conversion 'ucs-normalize-hfs-nfd-post-read-conversion
     631             :   :pre-write-conversion 'ucs-normalize-hfs-nfd-pre-write-conversion
     632             :   )
     633             : 
     634             : ;; This is tested in dired.c:file_name_completion in order to reject
     635             : ;; false positives due to comparison of encoded file names.
     636             : (coding-system-put 'utf-8-hfs 'decomposed-characters 't)
     637             : 
     638             : (provide 'ucs-normalize)
     639             : 
     640             : ;; Local Variables:
     641             : ;; coding: utf-8
     642             : ;; End:
     643             : 
     644             : ;;; ucs-normalize.el ends here

Generated by: LCOV version 1.12