]> code.delx.au - gnu-emacs/blobdiff - lisp/international/utf-8.el
Merged from miles@gnu.org--gnu-2005 (patch 45-55, 214-231)
[gnu-emacs] / lisp / international / utf-8.el
index 5a7acee0f0e929264320ca7d801731b0a0416882..863bb22ef0a8eae2c238b2c87972cc56433cae19 100644 (file)
@@ -186,7 +186,7 @@ Setting this variable outside customize has no effect."
                       ucs-mule-to-mule-unicode)
             (define-translation-table 'utf-translation-table-for-encode)))
         (set-default s v))
-  :version "21.4"
+  :version "22.1"
   :type 'boolean
   :group 'mule)
 
@@ -194,13 +194,88 @@ Setting this variable outside customize has no effect."
 (defconst utf-translate-cjk-charsets '(chinese-gb2312
                                       chinese-big5-1 chinese-big5-2
                                       japanese-jisx0208 japanese-jisx0212
+                                      katakana-jisx0201
                                       korean-ksc5601)
   "List of charsets supported by `utf-translate-cjk-mode'.")
 
-(defconst utf-translate-cjk-unicode-range
-  '((#x2e80 . #xd7a3)
-    (#xff00 . #xffef))
-  "List of Unicode code ranges supported by `utf-translate-cjk-mode'.")
+(defvar utf-translate-cjk-lang-env nil
+  "Language environment in which tables for `utf-translate-cjk-mode' is loaded.
+The value nil means that the tables are not yet loaded.")
+
+(defvar utf-translate-cjk-unicode-range)
+
+;; String generated from utf-translate-cjk-unicode-range.  It is
+;; suitable for an argument to skip-chars-forward.
+(defvar utf-translate-cjk-unicode-range-string nil)
+
+(defun utf-translate-cjk-set-unicode-range (range)
+  (setq utf-translate-cjk-unicode-range range)
+  (setq utf-translate-cjk-unicode-range-string
+       (let ((decode-char-no-trans
+              #'(lambda (x)
+                  (cond ((< x #x100) (make-char 'latin-iso8859-1 x))
+                        ((< x #x2500)
+                         (setq x (- x #x100))
+                         (make-char 'mule-unicode-0100-24ff
+                                    (+ (/ x 96) 32) (+ (% x 96) 32)))
+                        ((< x #x3400)
+                         (setq x (- x #x2500))
+                         (make-char 'mule-unicode-2500-33ff
+                                    (+ (/ x 96) 32) (+ (% x 96) 32)))
+                        (t
+                         (setq x (- x #xe000))
+                         (make-char 'mule-unicode-e000-ffff
+                                    (+ (/ x 96) 32) (+ (% x 96) 32))))))
+             ranges from to)
+         (dolist (elt range)
+           (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt)))
+           (if (and (>= to #x3400) (< to #xE000))
+               (setq to #x33FF))
+           (cond ((< from #x100)
+                  (if (>= to #xE000)
+                      (setq ranges (cons (cons #xE000 to) ranges)
+                            to #x33FF))
+                  (if (>= to #x2500)
+                      (setq ranges (cons (cons #x2500 to) ranges)
+                            to #x24FF))
+                  (if (>= to #x100)
+                      (setq ranges (cons (cons #x100 to) ranges)
+                            to #xFF)))
+                 ((< from #x2500)
+                  (if (>= to #xE000)
+                      (setq ranges (cons (cons #xE000 to) ranges)
+                            to #x33FF))
+                  (if (>= to #x2500)
+                      (setq ranges (cons (cons #x2500 to) ranges)
+                            to #x24FF)))
+                 ((< from #x3400)
+                  (if (>= to #xE000)
+                      (setq ranges (cons (cons #xE000 to) ranges)
+                            to #x33FF))))
+           (if (<= from to)
+               (setq ranges (cons (cons from to) ranges))))
+         (mapconcat #'(lambda (x)
+                        (format "%c-%c"
+                                (funcall decode-char-no-trans (car x))
+                                (funcall decode-char-no-trans (cdr x))))
+                    ranges "")))
+  ;; These forces loading and settting tables for
+  ;; utf-translate-cjk-mode.
+  (setq utf-translate-cjk-lang-env nil
+       ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
+       ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)))
+
+(defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3)
+                                            (#xff00 . #xffef))
+  "List of Unicode code ranges supported by `utf-translate-cjk-mode'.
+Setting this variable directly does not take effect;
+use either \\[customize] or the function
+`utf-translate-cjk-set-unicode-range'."
+  :version "22.1"
+  :type '(repeat (cons integer integer))
+  :set (lambda (symbol value)
+        (utf-translate-cjk-set-unicode-range value))
+  :group 'mule)
 
 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
 (defsubst utf-translate-cjk-substitutable-p (code-point)
@@ -213,10 +288,6 @@ Setting this variable outside customize has no effect."
        (setq elt nil)))
     elt))
 
-(defvar utf-translate-cjk-lang-env nil
-  "Language environment in which tables for `utf-translate-cjk-mode' is loaded.
-The value nil means that the tables are not yet loaded.")
-
 (defun utf-translate-cjk-load-tables ()
   "Load tables for `utf-translate-cjk-mode'."
   ;; Fixme: Allow the use of the CJK charsets to be
@@ -234,26 +305,30 @@ The value nil means that the tables are not yet loaded.")
     ;; Load the files explicitly, to avoid having to keep
     ;; around the large tables they contain (as well as the
     ;; ones which get built).
-    (cond ((string= "Korean" current-language-environment)
-          (load "subst-jis")
-          (load "subst-big5")
-          (load "subst-gb2312")
-          (load "subst-ksc"))
-         ((string= "Chinese-BIG5" current-language-environment)
-          (load "subst-jis")
-          (load "subst-ksc")
-          (load "subst-gb2312")
-          (load "subst-big5"))
-         ((string= "Chinese-GB" current-language-environment)
-          (load "subst-jis")
-          (load "subst-ksc")
-          (load "subst-big5")
-          (load "subst-gb2312"))
-         (t
-          (load "subst-ksc")
-          (load "subst-gb2312")
-          (load "subst-big5")
-          (load "subst-jis")))   ; jis covers as much as big5, gb2312
+    ;; Here we bind coding-system-for-read to nil so that coding tags
+    ;; in the files are respected even if the files are not yet
+    ;; byte-compiled
+    (let ((coding-system-for-read nil))
+      (cond ((string= "Korean" current-language-environment)
+            (load "subst-jis")
+            (load "subst-big5")
+            (load "subst-gb2312")
+            (load "subst-ksc"))
+           ((string= "Chinese-BIG5" current-language-environment)
+            (load "subst-jis")
+            (load "subst-ksc")
+            (load "subst-gb2312")
+            (load "subst-big5"))
+           ((string= "Chinese-GB" current-language-environment)
+            (load "subst-jis")
+            (load "subst-ksc")
+            (load "subst-big5")
+            (load "subst-gb2312"))
+           (t
+            (load "subst-ksc")
+            (load "subst-gb2312")
+            (load "subst-big5")
+            (load "subst-jis")))) ; jis covers as much as big5, gb2312
 
     (when redefined
       (define-translation-hash-table 'utf-subst-table-for-decode
@@ -303,7 +378,7 @@ characters and want to avoid some overhead on encoding/decoding
 by the above coding systems, you can customize the user option
 `utf-translate-cjk-mode' to nil."
   :init-value t
-  :version "21.4"
+  :version "22.1"
   :type 'boolean
   :group 'mule
   :global t
@@ -425,25 +500,32 @@ by the above coding systems, you can customize the user option
          ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
          ((r1 &= #x3F)
           (r1 |= ((r0 & #x1F) << 6))
-          ;; Now r2 holds scalar value.  We don't have to check
+          ;; Now r1 holds scalar value.  We don't have to check
           ;; `overlong sequence' because r0 >= 0xC2.
 
           (if (r1 >= 256)
               ;; mule-unicode-0100-24ff (< 0800)
-              ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-               (r1 -= #x0100)
-               (r2 = (((r1 / 96) + 32) << 7))
-               (r1 %= 96)
-               (r1 += (r2 + 32))
-               (translate-character
-                utf-translation-table-for-decode r0 r1)
+              ((r0 = r1)
+               (lookup-integer utf-subst-table-for-decode r0 r1)
+               (if (r7 == 0)
+                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+                    (r1 -= #x0100)
+                    (r2 = (((r1 / 96) + 32) << 7))
+                    (r1 %= 96)
+                    (r1 += (r2 + 32))
+                    (translate-character
+                     utf-translation-table-for-decode r0 r1)))
                (write-multibyte-character r0 r1)
                (read r0)
                (repeat))
             (if (r1 >= 160)
                 ;; latin-iso8859-1
-                ((r1 -= 128)
-                 (write-multibyte-character r6 r1)
+                ((r0 = r1)
+                 (lookup-integer utf-subst-table-for-decode r0 r1)
+                 (if (r7 == 0)
+                     ((r1 -= 128)
+                      (write-multibyte-character r6 r1))
+                   ((write-multibyte-character r0 r1)))
                  (read r0)
                  (repeat))
               ;; eight-bit-control
@@ -482,13 +564,16 @@ by the above coding systems, you can customize the user option
 
           (if (r3 < #x2500)
               ;; mule-unicode-0100-24ff (>= 0800)
-              ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-               (r3 -= #x0100)
-               (r3 //= 96)
-               (r1 = (r7 + 32))
-               (r1 += ((r3 + 32) << 7))
-               (translate-character
-                utf-translation-table-for-decode r0 r1)
+              ((r0 = r3)
+               (lookup-integer utf-subst-table-for-decode r0 r1)
+               (if (r7 == 0)
+                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+                    (r3 -= #x0100)
+                    (r3 //= 96)
+                    (r1 = (r7 + 32))
+                    (r1 += ((r3 + 32) << 7))
+                    (translate-character
+                     utf-translation-table-for-decode r0 r1)))
                (write-multibyte-character r0 r1)
                (read r0)
                (repeat)))
@@ -856,7 +941,7 @@ Move point to the end of the sequence."
 (defcustom utf-8-compose-scripts nil
   "*Non-nil means compose various scripts on decoding utf-8 text."
   :group 'mule
-  :version "21.4"
+  :version "22.1"
   :type 'boolean)
 
 (defun utf-8-post-read-conversion (length)
@@ -874,17 +959,17 @@ Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
            hash-table ch)
        (set-buffer-multibyte t)
        (when utf-translate-cjk-mode
-         (if (not utf-translate-cjk-lang-env)
-             ;; Check these characters:
-             ;;   "U+2e80-U+33ff", "U+ff00-U+ffef"
-             ;; We may have to translate them to CJK charsets.
-             (let ((range2 "\e$,29@\e(B-\e$,2G\7f\e$,3r`\e(B-\e$,3u/\e(B"))
-               (skip-chars-forward (concat range range2))
-               (unless (eobp)
-                 (utf-translate-cjk-load-tables)
-                 (setq range (concat range range2)))
-         (setq hash-table (get 'utf-subst-table-for-decode
-                               'translation-hash-table)))))
+         (unless utf-translate-cjk-lang-env
+           ;; Check these characters in utf-translate-cjk-range.
+           ;; We may have to translate them to CJK charsets.
+           (skip-chars-forward
+            (concat range utf-translate-cjk-unicode-range-string))
+           (unless (eobp)
+             (utf-translate-cjk-load-tables)
+             (setq range
+                   (concat range utf-translate-cjk-unicode-range-string)))
+           (setq hash-table (get 'utf-subst-table-for-decode
+                                 'translation-hash-table))))
        (while (and (skip-chars-forward range)
                    (not (eobp)))
          (setq ch (following-char))