;;; ucs-normalize.el --- Unicode normalization NFC/NFD/NFKD/NFKC
-;; Copyright (C) 2009
-;; Free Software Foundation, Inc.
+;; Copyright (C) 2009-2016 Free Software Foundation, Inc.
;; Author: Taichi Kawabata <kawabata.taichi@gmail.com>
;; Keywords: unicode, normalization
;;; Commentary:
;;
-;; This program has passed the NormalizationTest-5.1.0.txt.
+;; This program has passed the NormalizationTest-5.2.0.txt.
;;
;; References:
;; http://www.unicode.org/reports/tr15/
;; with previous character, then the beginning of the block is
;; the searched character. If searched character is combining
;; character, then previous character will be the target
-;; character
-;; (2) end of the block
+;; character
+;; (2) end of the block
;; Block ends at non-composable starter character.
;;
;; C. Decomposition (`ucs-normalize-block')
;; D. Sorting and Composition of Smaller Blocks (`ucs-normalize-block-compose-chars')
;;
;; The block will be split to multiple samller blocks by starter
-;; charcters. Each block is sorted, and composed if necessary.
+;; characters. Each block is sorted, and composed if necessary.
;;
;; E. Composition of Entire Block (`ucs-normalize-compose-chars')
;;
;;; Code:
-(defconst ucs-normalize-version "1.1")
+(defconst ucs-normalize-version "1.2")
-(eval-when-compile (require 'cl))
+(eval-when-compile (require 'cl-lib))
+
+(declare-function nfd "ucs-normalize" (char))
(eval-when-compile
#x1D1BF #x1D1C0)
"Composition Exclusion List.
This list is taken from
- http://www.unicode.org/Public/UNIDATA/CompositionExclusions-5.1.0.txt")
+ http://www.unicode.org/Public/UNIDATA/5.2/CompositionExclusions.txt")
- ;; Unicode ranges that decompositions & combinings are defined.
+ ;; Unicode ranges that decompositions & combining characters are defined.
(defvar check-range nil)
- (setq check-range '((#x00a0 . #x3400) (#xA600 . #xAC00) (#xF900 . #x10fff) (#x1d000 . #x1dfff) (#x2f800 . #x2faff)))
+ (setq check-range '((#x00a0 . #x3400) (#xA600 . #xAC00) (#xF900 . #x110ff) (#x1d000 . #x1dfff) (#x1f100 . #x1f2ff) (#x2f800 . #x2faff)))
;; Basic normalization functions
(defun nfd (char)
(let ((decomposition
(get-char-code-property char 'decomposition)))
- (if (and decomposition (numberp (car decomposition)))
+ (if (and decomposition (numberp (car decomposition))
+ (or (> (length decomposition) 1)
+ (/= (car decomposition) char)))
decomposition)))
(defun nfkd (char)
(let ((decomposition
(get-char-code-property char 'decomposition)))
(if (symbolp (car decomposition)) (cdr decomposition)
- decomposition)))
+ (if (or (> (length decomposition) 1)
+ (/= (car decomposition) char)) decomposition))))
(defun hfs-nfd (char)
(when (or (and (>= char 0) (< char #x2000))
(eval-when-compile
- (defvar combining-chars nil)
+ (defvar combining-chars nil)
(setq combining-chars nil)
(defvar decomposition-pair-to-composition nil)
(setq decomposition-pair-to-composition nil)
(defvar non-starter-decompositions nil)
(setq non-starter-decompositions nil)
+ ;; This file needs to access these 2 Unicode properties, but when we
+ ;; compile it during bootstrap, charprop.el was not built yet, and
+ ;; therefore is not yet loaded into bootstrap-emacs, so
+ ;; char-code-property-alist is nil, and get-char-code-property
+ ;; always returns nil, something the code here doesn't like.
+ (define-char-code-property 'decomposition "uni-decomposition.el")
+ (define-char-code-property 'canonical-combining-class "uni-combining.el")
(let ((char 0) ccc decomposition)
(mapc
(lambda (start-end)
- (do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
+ (cl-do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
(setq ccc (ucs-normalize-ccc char))
(setq decomposition (get-char-code-property
char 'decomposition))
+ (if (and (= (length decomposition) 1)
+ (= (car decomposition) char))
+ (setq decomposition nil))
(if (and ccc (/= 0 ccc)) (add-to-list 'combining-chars char))
(if (and (numberp (car decomposition))
(/= (ucs-normalize-ccc (car decomposition))
check-range))
(setq combining-chars
- (append combining-chars
+ (append combining-chars
'(?ᅡ ?ᅢ ?ᅣ ?ᅤ ?ᅥ ?ᅦ ?ᅧ ?ᅨ ?ᅩ ?ᅪ
- ?ᅫ ?ᅬ ?ᅭ ?ᅮ ?ᅯ ?ᅰ ?ᅱ ?ᅲ ?ᅳ ?ᅴ ?ᅵ
+ ?ᅫ ?ᅬ ?ᅭ ?ᅮ ?ᅯ ?ᅰ ?ᅱ ?ᅲ ?ᅳ ?ᅴ ?ᅵ
?ᆨ ?ᆩ ?ᆪ ?ᆫ ?ᆬ ?ᆭ ?ᆮ ?ᆯ ?ᆰ ?ᆱ ?ᆲ ?ᆳ ?ᆴ
?ᆵ ?ᆶ ?ᆷ ?ᆸ ?ᆹ ?ᆺ ?ᆻ ?ᆼ ?ᆽ ?ᆾ ?ᆿ ?ᇀ ?ᇁ ?ᇂ)))
)
(eval-when-compile decomposition-pair-to-composition)))
(defun ucs-normalize-primary-composite (decomposition-pair composition-predicate)
- "Convert DECOMPOSITION-PAIR to primay composite using COMPOSITION-PREDICATE."
+ "Convert DECOMPOSITION-PAIR to primary composite using COMPOSITION-PREDICATE."
(let ((char (or (gethash decomposition-pair
ucs-normalize-decomposition-pair-to-primary-composite)
(and (<= #x1100 (car decomposition-pair))
(defvar ucs-normalize-combining-chars-regexp nil
"Regular expression to match sequence of combining characters.")
(setq ucs-normalize-combining-chars-regexp
- (eval-when-compile (concat (regexp-opt (mapcar 'char-to-string combining-chars)) "+")))
+ (eval-when-compile (concat (regexp-opt-charset combining-chars) "+")))
+
+(declare-function decomposition-translation-alist "ucs-normalize"
+ (decomposition-function))
+(declare-function decomposition-char-recursively "ucs-normalize"
+ (char decomposition-function))
+(declare-function alist-list-to-vector "ucs-normalize" (alist))
(eval-when-compile
(let (decomposition alist)
(mapc
(lambda (start-end)
- (do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
+ (cl-do ((char (car start-end) (+ char 1))) ((> char (cdr start-end)))
(setq decomposition (funcall decomposition-function char))
(if decomposition
(setq alist (cons (cons char
(apply 'append
- (mapcar (lambda (x)
+ (mapcar (lambda (x)
(decomposition-char-recursively
x decomposition-function))
decomposition)))
(let ((decomposition (funcall decomposition-function char)))
(if decomposition
(apply 'append
- (mapcar (lambda (x)
+ (mapcar (lambda (x)
(decomposition-char-recursively x decomposition-function))
decomposition))
(list char))))
(setq ucs-normalize-hangul-translation-alist
(let ((i 0) entries)
(while (< i 11172)
- (setq entries
- (cons (cons (+ #xac00 i)
+ (setq entries
+ (cons (cons (+ #xac00 i)
(if (= 0 (% i 28))
(vector (+ #x1100 (/ i 588))
(+ #x1161 (/ (% i 588) 28)))
i (1+ i))) entries))
(defun ucs-normalize-make-translation-table-from-alist (alist)
- (make-translation-table-from-alist
+ (make-translation-table-from-alist
(append alist ucs-normalize-hangul-translation-alist)))
(define-translation-table 'ucs-normalize-nfd-table
(ucs-normalize-make-translation-table-from-alist (eval-when-compile hfs-nfd-alist)))
(defun ucs-normalize-sort (chars)
- "Sort by canonical combining class of chars."
+ "Sort by canonical combining class of CHARS."
(sort chars
(lambda (ch1 ch2)
(< (ucs-normalize-ccc ch1) (ucs-normalize-ccc ch2)))))
chars)))
)
+(declare-function quick-check-list "ucs-normalize"
+ (decomposition-translation &optional composition-predicate))
+(declare-function quick-check-list-to-regexp "ucs-normalize" (quick-check-list))
+
(eval-when-compile
(defun quick-check-list (decomposition-translation
&optional composition-predicate)
"Quick-Check List for DECOMPOSITION-TRANSLATION and COMPOSITION-PREDICATE.
It includes Singletons, CompositionExclusions, and Non-Starter
-decomposition. "
+decomposition."
(let (entries decomposition composition)
- (mapc
- (lambda (start-end)
- (do ((i (car start-end) (+ i 1))) ((> i (cdr start-end)))
- (setq decomposition
- (string-to-list
- (with-temp-buffer
- (insert i)
- (translate-region 1 2 decomposition-translation)
- (buffer-string))))
- (setq composition
- (ucs-normalize-block-compose-chars decomposition composition-predicate))
- (when (not (equal composition (list i)))
- (setq entries (cons i entries)))))
- check-range)
+ (with-temp-buffer
+ (mapc
+ (lambda (start-end)
+ (cl-do ((i (car start-end) (+ i 1))) ((> i (cdr start-end)))
+ (setq decomposition
+ (string-to-list
+ (progn
+ (erase-buffer)
+ (insert i)
+ (translate-region 1 2 decomposition-translation)
+ (buffer-string))))
+ (setq composition
+ (ucs-normalize-block-compose-chars decomposition composition-predicate))
+ (when (not (equal composition (list i)))
+ (setq entries (cons i entries)))))
+ check-range))
;;(remove-duplicates
(append entries
ucs-normalize-composition-exclusions
(setq hfs-nfc-quick-check-list (quick-check-list 'ucs-normalize-hfs-nfd-table t ))
(defun quick-check-list-to-regexp (quick-check-list)
- (regexp-opt (mapcar 'char-to-string (append quick-check-list combining-chars))))
+ (regexp-opt-charset (append quick-check-list combining-chars)))
(defun quick-check-decomposition-list-to-regexp (quick-check-list)
(concat (quick-check-list-to-regexp quick-check-list) "\\|[가-힣]"))
(- (point-max) (point-min)))))
;; Pre-write conversion for `utf-8-hfs'.
-(defun ucs-normalize-hfs-nfd-pre-write-conversion (from to)
- (let ((old-buf (current-buffer)))
- (set-buffer (generate-new-buffer " *temp*"))
- (if (stringp from)
- (insert from)
- (insert-buffer-substring old-buf from to))
- (ucs-normalize-HFS-NFD-region (point-min) (point-max))
- nil))
+;; _from and _to are legacy arguments (see `define-coding-system').
+(defun ucs-normalize-hfs-nfd-pre-write-conversion (_from _to)
+ (ucs-normalize-HFS-NFD-region (point-min) (point-max)))
;;; coding-system definition
(define-coding-system 'utf-8-hfs
:pre-write-conversion 'ucs-normalize-hfs-nfd-pre-write-conversion
)
+;; This is tested in dired.c:file_name_completion in order to reject
+;; false positives due to comparison of encoded file names.
+(coding-system-put 'utf-8-hfs 'decomposed-characters 't)
+
(provide 'ucs-normalize)
;; Local Variables:
;; coding: utf-8
;; End:
-;; arch-tag: cef65ae7-71ad-4e19-8da8-56ab4d42aaa4
;;; ucs-normalize.el ends here