code.delx.au - gnu-emacs/blob - lisp/international/utf-8.el

   1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
   2
   3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
   4 ;; Licensed to the Free Software Foundation.
   5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   6
   7 ;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
   8 ;; Maintainer: FSF
   9 ;; Keywords: multilingual, Unicode, UTF-8, i18n
  10
  11 ;; This file is part of GNU Emacs.
  12
  13 ;; GNU Emacs is free software; you can redistribute it and/or modify
  14 ;; it under the terms of the GNU General Public License as published by
  15 ;; the Free Software Foundation; either version 2, or (at your option)
  16 ;; any later version.
  17
  18 ;; GNU Emacs is distributed in the hope that it will be useful,
  19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 ;; GNU General Public License for more details.
  22
  23 ;; You should have received a copy of the GNU General Public License
  24 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  26 ;; Boston, MA 02111-1307, USA.
  27
  28 ;;; Commentary:
  29
  30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
  31 ;; of the following character sets to and from UTF-8:
  32 ;;
  33 ;;   ascii
  34 ;;   eight-bit-control
  35 ;;   latin-iso8859-1
  36 ;;   mule-unicode-0100-24ff
  37 ;;   mule-unicode-2500-33ff
  38 ;;   mule-unicode-e000-ffff
  39 ;;
  40 ;; On decoding, Unicode characters that do not fit into the above
  41 ;; character sets are handled as `eight-bit-control' or
  42 ;; `eight-bit-graphic' characters to retain the information about the
  43 ;; original byte sequence and text properties record the corresponding
  44 ;; unicode.
  45 ;;
  46 ;; Fixme: note that reading and writing invalid utf-8 may not be
  47 ;; idempotent -- fixing that needs a new charset to represent the bytes.
  48 ;;
  49 ;; Characters from other character sets can be encoded with
  50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
  51 ;; registering the translation with `register-char-codings'.  Hash
  52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
  53 ;; support encoding and decoding of about a quarter of the CJK space
  54 ;; between U+3400 and U+DFFF.
  55
  56 ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  57
  58 ;;        scalar       |               utf-8
  59 ;;        value        | 1st byte  | 2nd byte  | 3rd byte
  60 ;; --------------------+-----------+-----------+----------
  61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
  62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
  63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
  64
  65 ;;; Code:
  66
  67 (defvar ucs-mule-to-mule-unicode (make-translation-table)
  68   "Translation table consulted when encoding text into `mule-utf-8'.")
  69 ;; Register the table under its own name, unless ucs-tables got there first.
  70 (or (get 'ucs-mule-to-mule-unicode 'translation-table)
  71     (define-translation-table 'ucs-mule-to-mule-unicode
  72       ucs-mule-to-mule-unicode))
  73
  74 (defvar utf-8-subst-table (make-hash-table :test 'eq)) ; looked up by the decoder
  75 (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
  76 (defvar utf-8-subst-rev-table (make-hash-table :test 'eq)) ; looked up by the encoder
  77 (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
  78
  79 (defvar utf-8-translation-table-for-decode (make-translation-table)
  80   "Translation table consulted after utf-8 has been decoded to mule-unicode.
  81 The decoder only applies it to characters that would otherwise end up
  82 in the charset mule-unicode-0100-24ff.")
  83 (define-translation-table
  84   'utf-8-translation-table-for-decode utf-8-translation-table-for-decode)
  85
  86 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
  87 ;; space of mule-unicode.  For Latin scripts this isn't very
  88 ;; important.  Hebrew and Arabic might go here too when there's proper
  89 ;; support for them.
  90 (mapc
       ;; Store each pair in `utf-8-translation-table-for-decode': the car is
       ;; the character to translate from, the cdr the character to
       ;; translate to.
  91  (lambda (pair)
  92    (aset utf-8-translation-table-for-decode (car pair) (cdr pair)))
       ;; NOTE(review): judging by the ISO 2022 designators in the literals,
       ;; the first group is presumably Greek (iso8859-7) and the second
       ;; group Cyrillic (iso8859-5) -- confirm against the charset tables.
  93  '((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B)
  94    (?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B)
  95    (?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B)
  96    (?\e$,1&v\e(B . ?\e,FF\e(B) (?\e$,1&w\e(B . ?\e,FG\e(B) (?\e$,1&x\e(B . ?\e,FH\e(B) (?\e$,1&y\e(B . ?\e,FI\e(B) (?\e$,1&z\e(B . ?\e,FJ\e(B)
  97    (?\e$,1&{\e(B . ?\e,FK\e(B) (?\e$,1&|\e(B . ?\e,FL\e(B) (?\e$,1&}\e(B . ?\e,FM\e(B) (?\e$,1&~\e(B . ?\e,FN\e(B) (?\e$,1&\7f\e(B . ?\e,FO\e(B)
  98    (?\e$,1' \e(B . ?\e,FP\e(B) (?\e$,1'!\e(B . ?\e,FQ\e(B) (?\e$,1'#\e(B . ?\e,FS\e(B) (?\e$,1'$\e(B . ?\e,FT\e(B) (?\e$,1'%\e(B . ?\e,FU\e(B)
  99    (?\e$,1'&\e(B . ?\e,FV\e(B) (?\e$,1''\e(B . ?\e,FW\e(B) (?\e$,1'(\e(B . ?\e,FX\e(B) (?\e$,1')\e(B . ?\e,FY\e(B) (?\e$,1'*\e(B . ?\e,FZ\e(B)
 100    (?\e$,1'+\e(B . ?\e,F[\e(B) (?\e$,1',\e(B . ?\e,F\\e(B) (?\e$,1'-\e(B . ?\e,F]\e(B) (?\e$,1'.\e(B . ?\e,F^\e(B) (?\e$,1'/\e(B . ?\e,F_\e(B)
 101    (?\e$,1'0\e(B . ?\e,F`\e(B) (?\e$,1'1\e(B . ?\e,Fa\e(B) (?\e$,1'2\e(B . ?\e,Fb\e(B) (?\e$,1'3\e(B . ?\e,Fc\e(B) (?\e$,1'4\e(B . ?\e,Fd\e(B)
 102    (?\e$,1'5\e(B . ?\e,Fe\e(B) (?\e$,1'6\e(B . ?\e,Ff\e(B) (?\e$,1'7\e(B . ?\e,Fg\e(B) (?\e$,1'8\e(B . ?\e,Fh\e(B) (?\e$,1'9\e(B . ?\e,Fi\e(B)
 103    (?\e$,1':\e(B . ?\e,Fj\e(B) (?\e$,1';\e(B . ?\e,Fk\e(B) (?\e$,1'<\e(B . ?\e,Fl\e(B) (?\e$,1'=\e(B . ?\e,Fm\e(B) (?\e$,1'>\e(B . ?\e,Fn\e(B)
 104    (?\e$,1'?\e(B . ?\e,Fo\e(B) (?\e$,1'@\e(B . ?\e,Fp\e(B) (?\e$,1'A\e(B . ?\e,Fq\e(B) (?\e$,1'B\e(B . ?\e,Fr\e(B) (?\e$,1'C\e(B . ?\e,Fs\e(B)
 105    (?\e$,1'D\e(B . ?\e,Ft\e(B) (?\e$,1'E\e(B . ?\e,Fu\e(B) (?\e$,1'F\e(B . ?\e,Fv\e(B) (?\e$,1'G\e(B . ?\e,Fw\e(B) (?\e$,1'H\e(B . ?\e,Fx\e(B)
 106    (?\e$,1'I\e(B . ?\e,Fy\e(B) (?\e$,1'J\e(B . ?\e,Fz\e(B) (?\e$,1'K\e(B . ?\e,F{\e(B) (?\e$,1'L\e(B . ?\e,F|\e(B) (?\e$,1'M\e(B . ?\e,F}\e(B)
 107    (?\e$,1'N\e(B . ?\e,F~\e(B)
 108
 109    (?\e$,1(!\e(B . ?\e,L!\e(B) (?\e$,1("\e(B . ?\e,L"\e(B) (?\e$,1(#\e(B . ?\e,L#\e(B) (?\e$,1($\e(B . ?\e,L$\e(B)
 110    (?\e$,1(%\e(B . ?\e,L%\e(B) (?\e$,1(&\e(B . ?\e,L&\e(B) (?\e$,1('\e(B . ?\e,L'\e(B) (?\e$,1((\e(B . ?\e,L(\e(B) (?\e$,1()\e(B . ?\e,L)\e(B)
 111    (?\e$,1(*\e(B . ?\e,L*\e(B) (?\e$,1(+\e(B . ?\e,L+\e(B) (?\e$,1(,\e(B . ?\e,L,\e(B) (?\e$,1(.\e(B . ?\e,L.\e(B) (?\e$,1(/\e(B . ?\e,L/\e(B)
 112    (?\e$,1(0\e(B . ?\e,L0\e(B) (?\e$,1(1\e(B . ?\e,L1\e(B) (?\e$,1(2\e(B . ?\e,L2\e(B) (?\e$,1(3\e(B . ?\e,L3\e(B) (?\e$,1(4\e(B . ?\e,L4\e(B)
 113    (?\e$,1(5\e(B . ?\e,L5\e(B) (?\e$,1(6\e(B . ?\e,L6\e(B) (?\e$,1(7\e(B . ?\e,L7\e(B) (?\e$,1(8\e(B . ?\e,L8\e(B) (?\e$,1(9\e(B . ?\e,L9\e(B)
 114    (?\e$,1(:\e(B . ?\e,L:\e(B) (?\e$,1(;\e(B . ?\e,L;\e(B) (?\e$,1(<\e(B . ?\e,L<\e(B) (?\e$,1(=\e(B . ?\e,L=\e(B) (?\e$,1(>\e(B . ?\e,L>\e(B)
 115    (?\e$,1(?\e(B . ?\e,L?\e(B) (?\e$,1(@\e(B . ?\e,L@\e(B) (?\e$,1(A\e(B . ?\e,LA\e(B) (?\e$,1(B\e(B . ?\e,LB\e(B) (?\e$,1(C\e(B . ?\e,LC\e(B)
 116    (?\e$,1(D\e(B . ?\e,LD\e(B) (?\e$,1(E\e(B . ?\e,LE\e(B) (?\e$,1(F\e(B . ?\e,LF\e(B) (?\e$,1(G\e(B . ?\e,LG\e(B) (?\e$,1(H\e(B . ?\e,LH\e(B)
 117    (?\e$,1(I\e(B . ?\e,LI\e(B) (?\e$,1(J\e(B . ?\e,LJ\e(B) (?\e$,1(K\e(B . ?\e,LK\e(B) (?\e$,1(L\e(B . ?\e,LL\e(B) (?\e$,1(M\e(B . ?\e,LM\e(B)
 118    (?\e$,1(N\e(B . ?\e,LN\e(B) (?\e$,1(O\e(B . ?\e,LO\e(B) (?\e$,1(P\e(B . ?\e,LP\e(B) (?\e$,1(Q\e(B . ?\e,LQ\e(B) (?\e$,1(R\e(B . ?\e,LR\e(B)
 119    (?\e$,1(S\e(B . ?\e,LS\e(B) (?\e$,1(T\e(B . ?\e,LT\e(B) (?\e$,1(U\e(B . ?\e,LU\e(B) (?\e$,1(V\e(B . ?\e,LV\e(B) (?\e$,1(W\e(B . ?\e,LW\e(B)
 120    (?\e$,1(X\e(B . ?\e,LX\e(B) (?\e$,1(Y\e(B . ?\e,LY\e(B) (?\e$,1(Z\e(B . ?\e,LZ\e(B) (?\e$,1([\e(B . ?\e,L[\e(B) (?\e$,1(\\e(B . ?\e,L\\e(B)
 121    (?\e$,1(]\e(B . ?\e,L]\e(B) (?\e$,1(^\e(B . ?\e,L^\e(B) (?\e$,1(_\e(B . ?\e,L_\e(B) (?\e$,1(`\e(B . ?\e,L`\e(B) (?\e$,1(a\e(B . ?\e,La\e(B)
 122    (?\e$,1(b\e(B . ?\e,Lb\e(B) (?\e$,1(c\e(B . ?\e,Lc\e(B) (?\e$,1(d\e(B . ?\e,Ld\e(B) (?\e$,1(e\e(B . ?\e,Le\e(B) (?\e$,1(f\e(B . ?\e,Lf\e(B)
 123    (?\e$,1(g\e(B . ?\e,Lg\e(B) (?\e$,1(h\e(B . ?\e,Lh\e(B) (?\e$,1(i\e(B . ?\e,Li\e(B) (?\e$,1(j\e(B . ?\e,Lj\e(B) (?\e$,1(k\e(B . ?\e,Lk\e(B)
 124    (?\e$,1(l\e(B . ?\e,Ll\e(B) (?\e$,1(m\e(B . ?\e,Lm\e(B) (?\e$,1(n\e(B . ?\e,Ln\e(B) (?\e$,1(o\e(B . ?\e,Lo\e(B) (?\e$,1(q\e(B . ?\e,Lq\e(B)
 125    (?\e$,1(r\e(B . ?\e,Lr\e(B) (?\e$,1(s\e(B . ?\e,Ls\e(B) (?\e$,1(t\e(B . ?\e,Lt\e(B) (?\e$,1(u\e(B . ?\e,Lu\e(B) (?\e$,1(v\e(B . ?\e,Lv\e(B)
 126    (?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B)
 127    (?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B)))
 128
 129 (defcustom utf-8-fragment-on-decoding nil
 130   "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets.
 131 Setting this means that the relevant Cyrillic and Greek characters are
 132 decoded into the iso8859 charsets rather than into
 133 mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
 134 in the buffer, but using them may affect how the buffer can be re-encoded
 135 and may require a different input method to search for them, for instance.
 136 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
 137 for mechanisms to make this largely transparent.
 138
 139 Setting this variable outside customize has no effect."
 140   :group 'mule
 141   :type 'boolean
 142   :version "21.4"
 143   ;; Pass the populated table only when enabled; otherwise pass no table.
 144   :set (lambda (symbol enabled)
 145          (apply 'define-translation-table
 146                 'utf-8-translation-table-for-decode
 147                 (if enabled (list utf-8-translation-table-for-decode)))
 148          (set-default symbol enabled)))
 149
 150 (defcustom utf-8-translate-cjk nil
 151   "Whether the `mule-utf-8' coding system should encode many CJK characters.
 152
 153 Enabling this loads tables which enable the coding system to encode
 154 characters in the charsets `korean-ksc5601', `chinese-gb2312' and
 155 `japanese-jisx0208', and to decode the corresponding unicodes into
 156 such characters.  This works by loading the library `utf-8-subst'; see
 157 its commentary.  The tables are fairly large (about 33000 entries), so this
 158 option is not the default."
 159   :link '(emacs-commentary-link "utf-8-subst")
 160   :set (lambda (s v)
 161          (when v
 162            (require 'utf-8-subst)
 163            (let ((table (make-char-table 'translation-table))
 164                  (safe (coding-system-get 'mule-utf-8 'safe-charsets)))
 165              ;; Skip charsets already present, so re-setting adds no duplicates.
 166              (dolist (cs '(korean-ksc5601 chinese-gb2312 japanese-jisx0208))
 167                (unless (memq cs safe)
 168                  (setq safe (append safe (list cs)))))
 169              (coding-system-put 'mule-utf-8 'safe-charsets safe)
 170              (maphash (lambda (key val) (aset table key val))
 171                       utf-8-subst-rev-table)
 172              (register-char-codings 'mule-utf-8 table)))
 173          (set-default s v))
 174   :version "21.4"
 175   :type 'boolean
 176   :group 'mule)
 177
 178 (define-ccl-program ccl-decode-mule-utf-8
 179   ;;
 180   ;;        charset         | bytes in utf-8 | bytes in emacs
 181   ;; -----------------------+----------------+---------------
 182   ;;         ascii          |       1        |       1
 183   ;; -----------------------+----------------+---------------
 184   ;;    eight-bit-control   |       2        |       2
 185   ;;    eight-bit-graphic   |       2        |       1
 186   ;;     latin-iso8859-1    |       2        |       2
 187   ;; -----------------------+----------------+---------------
 188   ;; mule-unicode-0100-24ff |       2        |       4
 189   ;;        (< 0800)        |                |
 190   ;; -----------------------+----------------+---------------
 191   ;; mule-unicode-0100-24ff |       3        |       4
 192   ;;        (>= 0800)       |                |
 193   ;; mule-unicode-2500-33ff |       3        |       4
 194   ;; mule-unicode-e000-ffff |       3        |       4
 195   ;;
 196   ;; Thus magnification factor is two.
 197   ;;
       ;; Register use in the code below:
       ;;   r0..r3  bytes read from the input, then scratch values
       ;;   r4      invalid-sequence flag (3-byte case); lookup argument
       ;;   r5, r6  charset ids of eight-bit-control and eight-bit-graphic
       ;;   r7      read right after `//=' (remainder) and after
       ;;           `lookup-integer' (non-zero when a translation was found)
 198   `(2
 199     ((r5 = ,(charset-id 'eight-bit-control))
 200      (r6 = ,(charset-id 'eight-bit-graphic))
 201      (loop
 202       (read r0)
 203
 204       ;; 1byte encoding, i.e., ascii
 205       (if (r0 < #x80)
 206           (write r0)
 207         (if (r0 < #xc0)             ; continuation byte (invalid here)
 208             (if (r0 < #xa0)
 209                 (write-multibyte-character r5 r0)
 210               (write-multibyte-character r6 r0))
 211           ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
 212           (if (r0 < #xe0)
 213               ((read r1)
 214
 215                (if ((r1 & #b11000000) != #b10000000)
 216                    ;; Invalid 2-byte sequence
 217                    ((if (r0 < #xa0)
 218                         (write-multibyte-character r5 r0)
 219                       (write-multibyte-character r6 r0))
 220                     (if (r1 < #x80)
 221                         (write r1)
 222                       (if (r1 < #xa0)
 223                           (write-multibyte-character r5 r1)
 224                         (write-multibyte-character r6 r1))))
 225
 226                  ((r3 = r0)        ; save in case of overlong sequence
 227                   (r2 = r1)        ; (r2 is assigned the same value again below)
 228                   (r0 &= #x1f)
 229                   (r0 <<= 6)
 230                   (r2 = r1)        ; save in case of overlong sequence
 231                   (r1 &= #x3f)
 232                   (r1 += r0)
 233                   ;; Now r1 holds scalar value
 234
 235                   (if (r1 < 128)        ; `overlong sequence'
 236                       ((if (r3 < #xa0)
 237                            (write-multibyte-character r5 r3)
 238                          (write-multibyte-character r6 r3))
 239                        (if (r2 < #x80)
 240                            (write r2)
 241                          (if (r2 < #xa0)
 242                              (write-multibyte-character r5 r2)
 243                            (write-multibyte-character r6 r2))))
 244
 245                     ;; eight-bit-control
 246                     (if (r1 < 160)
 247                         ((write-multibyte-character r5 r1))
 248
 249                       ;; latin-iso8859-1
 250                       (if (r1 < 256)
 251                           ((r0 = ,(charset-id 'latin-iso8859-1))
 252                            (r1 -= 128)
 253                            (write-multibyte-character r0 r1))
 254
 255                         ;; mule-unicode-0100-24ff (< 0800)
 256                         ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 257                          (r1 -= #x0100)
 258                          (r2 = (((r1 / 96) + 32) << 7))
 259                          (r1 %= 96)
 260                          (r1 += (r2 + 32))
 261                          (translate-character
 262                           utf-8-translation-table-for-decode r0 r1)
 263                          (write-multibyte-character r0 r1))))))))
 264
 265             ;; 3byte encoding
 266             ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
 267             (if (r0 < #xf0)
 268                 ((read r1 r2)
 269
 270                  ;; This is set to 1 if the encoding is invalid.
 271                  (r4 = 0)
 272
                      ;; r3 gathers the top two bits of r1 and of r2; it equals
                      ;; #b10100000 exactly when both bytes are 10xxxxxx.
 273                  (r3 = (r1 & #b11000000))
 274                  (r3 |= ((r2 >> 2) & #b00110000))
 275                  (if (r3 != #b10100000)
 276                      (r4 = 1)
 277                    ((r3 = ((r0 & #x0f) << 12))
 278                     (r3 += ((r1 & #x3f) << 6))
 279                     (r3 += (r2 & #x3f))
                        ;; Values below #x0800 fit in fewer bytes, so treat
                        ;; such a 3-byte form as invalid too.
 280                     (if (r3 < #x0800)
 281                         (r4 = 1))))
 282
 283                  (if (r4 != 0)
 284                      ;; Invalid 3-byte sequence
 285                      ((if (r0 < #xa0)
 286                           (write-multibyte-character r5 r0)
 287                         (write-multibyte-character r6 r0))
 288                       (if (r1 < #x80)
 289                           (write r1)
 290                         (if (r1 < #xa0)
 291                             (write-multibyte-character r5 r1)
 292                           (write-multibyte-character r6 r1)))
 293                       (if (r2 < #x80)
 294                           (write r2)
 295                         (if (r2 < #xa0)
 296                             (write-multibyte-character r5 r2)
 297                           (write-multibyte-character r6 r2))))
 298
 299                    ;; mule-unicode-0100-24ff (>= 0800)
 300                    ((if (r3 < #x2500)
 301                         ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 302                          (r3 -= #x0100)
 303                          (r3 //= 96)      ; the remainder is picked up from r7
 304                          (r1 = (r7 + 32))
 305                          (r1 += ((r3 + 32) << 7))
 306                          (translate-character
 307                           utf-8-translation-table-for-decode r0 r1)
 308                          (write-multibyte-character r0 r1))
 309
 310                       ;; mule-unicode-2500-33ff
 311                       ;; Fixme: Perhaps allow translation via
 312                       ;; utf-8-subst-table for #x2e80 up, so that we use
 313                       ;; consistent charsets for all of CJK.  Would need
 314                       ;; corresponding change to encoding tables.
 315                       (if (r3 < #x3400)
 316                           ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
 317                            (r3 -= #x2500)
 318                            (r3 //= 96)
 319                            (r1 = (r7 + 32))
 320                            (r1 += ((r3 + 32) << 7))
 321                            (write-multibyte-character r0 r1))
 322
 323                         ;; U+3400 .. U+D7FF
 324                         ;; Try to convert to CJK chars, else keep
 325                         ;; them as eight-bit-{control|graphic}.
 326                         (if (r3 < #xd800)
 327                             ((r4 = r3)  ; don't zap r3
 328                              (lookup-integer utf-8-subst-table r4 r5)
 329                              (if r7
 330                                  ;; got a translation
 331                                  ((write-multibyte-character r4 r5)
 332                                   ;; Zapped through register starvation.
 333                                   (r5 = ,(charset-id 'eight-bit-control)))
 334                                ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
 335                                ((r3 = r6)
 336                                 (write-multibyte-character r3 r0)
 337                                 (if (r1 < #xa0)
 338                                     (r3 = r5))
 339                                 (write-multibyte-character r3 r1)
 340                                 (if (r2 < #xa0)
 341                                     (r3 = r5)
 342                                   (r3 = r6))
 343                                 (write-multibyte-character r3 r2))))
 344
 345                           ;; Surrogates, U+D800 .. U+DFFF
 346                           (if (r3 < #xe000)
 347                               ((r3 = r6)
 348                                (write-multibyte-character r3 r0) ; eight-bit-graphic
 349                                (if (r1 < #xa0)
 350                                    (r3 = r5))
 351                                (write-multibyte-character r3 r1)
 352                                (if (r2 < #xa0)
 353                                    (r3 = r5)
 354                                  (r3 = r6))
 355                                (write-multibyte-character r3 r2))
 356
 357                             ;; mule-unicode-e000-ffff
 358                             ;; Fixme: fffe and ffff are invalid.
 359                             ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
 360                              (r3 -= #xe000)
 361                              (r3 //= 96)
 362                              (r1 = (r7 + 32))
 363                              (r1 += ((r3 + 32) << 7))
 364                              (write-multibyte-character r0 r1)))))))))
 365
 366               (if (r0 < #xfe)
 367                   ;; 4byte encoding
 368                   ;; keep those bytes as eight-bit-{control|graphic}
 369                   ;; Fixme: allow lookup in utf-8-subst-table.
 370                   ((read r1 r2 r3)
 371                    ;; r0 >= #xf0, thus eight-bit-graphic
 372                    (write-multibyte-character r6 r0)
 373                    (if (r1 < #xa0)
 374                        (if (r1 < #x80)  ; invalid byte
 375                            (write r1)
 376                          (write-multibyte-character r5 r1))
 377                      (write-multibyte-character r6 r1))
 378                    (if (r2 < #xa0)
 379                        (if (r2 < #x80)  ; invalid byte
 380                            (write r2)
 381                          (write-multibyte-character r5 r2))
 382                      (write-multibyte-character r6 r2))
 383                    (if (r3 < #xa0)
 384                        (if (r3 < #x80)  ; invalid byte
 385                            (write r3)
 386                          (write-multibyte-character r5 r3))
 387                      (write-multibyte-character r6 r3))
 388                    (if (r0 >= #xf8)     ; 5- or 6-byte encoding
 389                        ((read r1)
 390                         (if (r1 < #xa0)
 391                             (if (r1 < #x80) ; invalid byte
 392                                 (write r1)
 393                               (write-multibyte-character r5 r1))
 394                           (write-multibyte-character r6 r1))
 395                         (if (r0 >= #xfc) ; 6-byte
 396                             ((read r1)
 397                              (if (r1 < #xa0)
 398                                  (if (r1 < #x80) ; invalid byte
 399                                      (write r1)
 400                                    (write-multibyte-character r5 r1))
 401                                (write-multibyte-character r6 r1)))))))
 402                 ;; else invalid byte >= #xfe
 403                 (write-multibyte-character r6 r0))))))
 404       (repeat))))
 405
 406   "CCL program to decode UTF-8.
 407 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
 408 mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
 409 `utf-8-subst-table'.
 410 Encodings of un-representable Unicode characters are decoded asis into
 411 eight-bit-control and eight-bit-graphic characters.")
 412
 413 (define-ccl-program ccl-encode-mule-utf-8
       ;; r5/r6 hold a character (charset id / code) that was read ahead
       ;; while handling eight-bit-graphic below; r5 is -1 when there is none.
 414   `(1
 415     ((r5 = -1)
 416      (loop
 417       (if (r5 < 0)
 418           ((r1 = -1)
 419            (read-multibyte-character r0 r1)
 420            (translate-character ucs-mule-to-mule-unicode r0 r1))
 421         (;; We have already done read-multibyte-character.
 422          (r0 = r5)
 423          (r1 = r6)
 424          (r5 = -1)))
 425
 426       (if (r0 == ,(charset-id 'ascii))
 427           (write r1)
 428
 429         (if (r0 == ,(charset-id 'latin-iso8859-1))
 430             ;; r1          scalar                  utf-8
 431             ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 432             ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
 433             ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
 434             ((r0 = (((r1 & #x40) >> 6) | #xc2))
 435              (r1 &= #x3f)
 436              (r1 |= #x80)
 437              (write r0 r1))
 438
 439           (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
 440               ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 441                ;; #x3f80 == (0011 1111 1000 0000)b
 442                (r1 &= #x7f)
 443                (r1 += (r0 + 224))       ; 224 == -32 + #x0100
 444                ;; now r1 holds scalar value
 445                (if (r1 < #x0800)
 446                    ;; 2byte encoding
 447                    ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
 448                     ;; #x07c0 == (0000 0111 1100 0000)b
 449                     (r1 &= #x3f)
 450                     (r1 |= #x80)
 451                     (write r0 r1))
 452                  ;; 3byte encoding
 453                  ((r0 = (((r1 & #xf000) >> 12) | #xe0))
 454                   (r2 = ((r1 & #x3f) | #x80))
 455                   (r1 &= #x0fc0)
 456                   (r1 >>= 6)
 457                   (r1 |= #x80)
 458                   (write r0 r1 r2))))
 459
 460             (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
 461                 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 462                  (r1 &= #x7f)
 463                  (r1 += (r0 + 9440))    ; 9440 == -32 + #x2500
 464                  (r0 = (((r1 & #xf000) >> 12) | #xe0))
 465                  (r2 = ((r1 & #x3f) | #x80))
 466                  (r1 &= #x0fc0)
 467                  (r1 >>= 6)
 468                  (r1 |= #x80)
 469                  (write r0 r1 r2))
 470
 471               (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
 472                   ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 473                    (r1 &= #x7f)
 474                    (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
 475                    (r0 = (((r1 & #xf000) >> 12) | #xe0))
 476                    (r2 = ((r1 & #x3f) | #x80))
 477                    (r1 &= #x0fc0)
 478                    (r1 >>= 6)
 479                    (r1 |= #x80)
 480                    (write r0 r1 r2))
 481
 482                 (if (r0 == ,(charset-id 'eight-bit-control))
 483                     ;; r1          scalar                  utf-8
 484                     ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 485                     ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
 486                     ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
 487                     ((write #xc2)
 488                      (write r1))
 489
 490                   (if (r0 == ,(charset-id 'eight-bit-graphic))
 491                       ;; r1          scalar                  utf-8
 492                       ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 493                       ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
 494                       ;; ff    0000 0000 1111 1111    1100 0011 1011 1111
                           ;; Note that the first statement below writes r1
                           ;; itself, not the 2-byte form tabulated above; the
                           ;; following characters are then read ahead.
 495                       ((write r1)
 496                        (r1 = -1)
 497                        (read-multibyte-character r0 r1)
 498                        (if (r0 != ,(charset-id 'eight-bit-graphic))
 499                            (if (r0 != ,(charset-id 'eight-bit-control))
 500                                ((r5 = r0)
 501                                 (r6 = r1))))
 502                        (if (r5 < 0)
 503                            ((read-multibyte-character r0 r2)
 504                             (if (r0 != ,(charset-id 'eight-bit-graphic))
 505                                 (if (r0 != ,(charset-id 'eight-bit-control))
 506                                     ((r5 = r0)
 507                                      (r6 = r2))))
 508                             (if (r5 < 0)
 509                                 (write r1 r2)
 510                               (if (r1 < #xa0)
 511                                   (write r1)
 512                                 ((write #xc2)
 513                                  (write r1)))))))
 514
 515                     ((lookup-character utf-8-subst-rev-table r0 r1)
 516                      (if r7             ; lookup succeeded
 517                          ((r1 = (((r0 & #xf000) >> 12) | #xe0))
 518                           (r2 = ((r0 & #x3f) | #x80))
 519                           (r0 &= #x0fc0)
 520                           (r0 >>= 6)
 521                           (r0 |= #x80)
 522                           (write r1 r0 r2))
 523                        ;; Unsupported character.
 524                        ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
 525                        ((write #xef)
 526                         (write #xbf)
 527                         (write #xbd)))))))))))
 528       (repeat)))
         ;; Third element of the program: per `define-ccl-program' this is
         ;; the block run at end of input.
 529     (if (r1 >= #xa0)
 530         (write r1)
 531       (if (r1 >= #x80)
 532           ((write #xc2)
 533            (write r1)))))
 534
 535   "CCL program to encode into UTF-8.")
 536
 537 ;; Dummy definition so that the CCL can be checked correctly; the
 538 ;; actual data are loaded on demand.  Test the `translation-table'
     ;; property, since that is what `define-translation-table' would replace.
 539 (unless (get 'ucs-mule-8859-to-mule-unicode 'translation-table) ; don't zap it
 540   (define-translation-table 'ucs-mule-8859-to-mule-unicode))
 541
 542 (define-ccl-program ccl-untranslated-to-ucs
       ;; Contains no `read' or `write': the bytes are taken from r0..r3 and
       ;; the result is left in r0.  r4 is scratch (validity flag / top bits).
 543   `(0
 544     (if (r0 < #xf0)                     ; 3-byte encoding, as above
 545         ((r4 = 0)
              ;; r3 gathers the top two bits of r1 and of r2; it equals
              ;; #b10100000 exactly when both bytes are 10xxxxxx.
 546          (r3 = (r1 & #b11000000))
 547          (r3 |= ((r2 >> 2) & #b00110000))
 548          (if (r3 != #b10100000)
 549              (r4 = 1)
 550            ((r3 = ((r0 & #x0f) << 12))
 551             (r3 += ((r1 & #x3f) << 6))
 552             (r3 += (r2 & #x3f))
 553             (if (r3 < #x0800)
 554                 (r4 = 1))))
 555          (if (r4 != 0)
 556              (r0 = 0)
 557            (r0 = r3)))
 558       (if (r0 < #xf8)                   ; 4-byte (Mule-UCS recipe)
              ;; Each of r1, r2, r3 must have top bits #b10, else give up.
 559           ((r4 = (r1 >> 6))
 560            (if (r4 != #b10)
 561                (r0 = 0)
 562              ((r4 = (r2 >> 6))
 563               (if (r4 != #b10)
 564                   (r0 = 0)
 565                 ((r4 = (r3 >> 6))
 566                  (if (r4 != #b10)
 567                      (r0 = 0)
 568                    ((r1 = ((r1  & #x3F) << 12))
 569                     (r2 = ((r2  & #x3F) << 6))
 570                     (r3 &= #x3F)
 571                     (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
 572         (r0 = 0))))
 573   "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
 574 r0 == 0 for invalid sequence.")
 575
 576 (defvar utf-8-ccl-regs (make-vector 8 0)) ; register vector for `ccl-execute'
 577
 578 (defsubst utf-8-untranslated-to-ucs ()
 579   "Return the UCS code for an untranslated sequence of raw bytes at point.
 580 Only for 3- or 4-byte sequences."
 581   (let ((pos (point)) (i 0))
 582     (while (< i 4)                    ; fill r0..r3; 0 where there is no char
 583       (aset utf-8-ccl-regs i (or (char-after (+ pos i)) 0))
 584       (setq i (1+ i))))
 585   (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
 586   (aref utf-8-ccl-regs 0))
 587
 588 (defun utf-8-help-echo (window object position)
       "Return help text naming the unicode recorded at POSITION in OBJECT."
 589   (let ((ucs (get-char-property position 'untranslated-utf-8 object)))
 590     (format "Untranslated Unicode U+%04X" ucs)))
 591
 592 ;; We compose the untranslatable sequences into a single character.
 593 ;; This is infelicitous for editing, because there's currently no
 594 ;; mechanism for treating compositions as atomic, but is OK for
 595 ;; display.  They are composed to U+FFFD with help-echo which
 596 ;; indicates the unicodes they represent.  This function GCs too much.
 597 (defsubst utf-8-compose ()
 598   "Put a suitable composition on an untranslatable sequence.
 599 Return the sequence's length, or nil if nothing was composed."
 600   (let ((ucs (utf-8-untranslated-to-ucs)))
 601     (unless (zerop ucs)
 602       ;; Codes from #x10000 up take 4 characters, all others 3.
 603       (let* ((len (if (< ucs #x10000) 3 4))
 604              (beg (point))
 605              (end (+ beg len))
 606              (prop-end (min (point-max) end)))
 607         (put-text-property beg prop-end 'untranslated-utf-8 ucs)
 608         (put-text-property beg prop-end 'help-echo 'utf-8-help-echo)
 609         ;; Show the whole sequence as one replacement character.
 610         (compose-region beg end ?\e$,3u=\e(B)
 611         len))))
 612
 613 (defcustom utf-8-compose-scripts nil
 614   "*Non-nil means compose various scripts on decoding utf-8 text."
 615   ;; Boolean user option, consulted by `utf-8-post-read-conversion'.
 616   :type 'boolean
 617   :version "21.4"
       :group 'mule)
 618
 619 (defun utf-8-post-read-conversion (length)
 620   "Compose untranslated utf-8 sequences into single characters.
 621 Also compose particular scripts if `utf-8-compose-scripts' is non-nil.
     LENGTH is the number of decoded characters, which start at point.
     Return the number of characters after these conversions."
 622   (save-excursion
         (save-restriction
           ;; Only the LENGTH characters just decoded are ours to process.
           ;; Without narrowing, the scan below would run on to the end of
           ;; the buffer and compose text outside the decoded region.
           (narrow-to-region (point) (min (point-max) (+ (point) length)))
 623       ;; Can't do eval-when-compile to insert a multibyte constant
 624       ;; version of the string in the loop, since it's always loaded as
 625       ;; unibyte from a byte-compiled file.
 626       (let ((range (string-as-multibyte "^\xe1-\xf7")))
             ;; `skip-chars-forward' always returns a number, so the `and'
             ;; merely sequences the skip before the end-of-buffer test.
 627         (while (and (skip-chars-forward range)
 628                     (not (eobp)))
               ;; `utf-8-compose' returns nil for an invalid sequence;
               ;; `forward-char' then steps over a single character.
 629           (forward-char (utf-8-compose))))))
 630   ;; Fixme: Takahashi-san implies it may not work this easily.  I
 631   ;; asked why but didn't get a reply. -- fx
 632   (when (and utf-8-compose-scripts (> length 1))
 633     ;; These currently have definitions which cover the relevant
 634     ;; unicodes.  We could avoid loading thai-util &c by checking
 635     ;; whether the region contains any characters with the appropriate
 636     ;; categories.  There aren't yet Unicode-based rules for Tibetan.
 637     (save-excursion (setq length (diacritic-post-read-conversion length)))
 638     (save-excursion (setq length (thai-post-read-conversion length)))
 639     (save-excursion (setq length (lao-post-read-conversion length)))
 640     (save-excursion
 641       (setq length (in-is13194-devanagari-post-read-conversion length))))
 642   length)
 643
 644 ;; ucs-tables is preloaded
 645 ;; (defun utf-8-pre-write-conversion (beg end)
 646 ;;   "Semi-dummy pre-write function effectively to autoload ucs-tables."
 647 ;;   ;; Ensure translation table is loaded.
 648 ;;   (require 'ucs-tables)
 649 ;;   ;; Don't do this again.
 650 ;;   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
 651 ;;   nil)
 652
 653 (make-coding-system
     ;; Define the `mule-utf-8' coding system.  The positional arguments
     ;; are its name, type code 4, the mnemonic character `u', the doc
     ;; string, a (DECODER . ENCODER) pair of CCL programs, and an alist
     ;; of properties.
 654  'mule-utf-8 4 ?u
 655  "UTF-8 encoding for Emacs-supported Unicode characters.
 656 The supported Emacs character sets are the following, plus any other
 657 characters included in the tables `ucs-mule-to-mule-unicode' and
 658 `utf-8-subst-rev-table':
 659  ascii
 660  eight-bit-control
 661  eight-bit-graphic
 662  latin-iso8859-1
 663  latin-iso8859-2
 664  latin-iso8859-3
 665  latin-iso8859-4
 666  cyrillic-iso8859-5
 667  greek-iso8859-7
 668  hebrew-iso8859-8
 669  latin-iso8859-9
 670  latin-iso8859-14
 671  latin-iso8859-15
 672  mule-unicode-0100-24ff
 673  mule-unicode-2500-33ff
 674  mule-unicode-e000-ffff
 675
 676 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
 677 may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
 678 \(see user option `utf-8-translate-cjk'); otherwise, sequences of
 679 eight-bit-control and eight-bit-graphic characters are used to
 680 preserve their byte sequences, and these are composed to display as a
 681 single character.  Emacs characters that otherwise can't be encoded
 682 are encoded as U+FFFD."
 683
      ;; CCL programs that perform the decoding and the encoding.
 684  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
      ;; Property alist.  `safe-charsets' names more charsets than the doc
      ;; string lists explicitly.
 685  '((safe-charsets
 686     ascii
 687     eight-bit-control
 688     eight-bit-graphic
 689     latin-iso8859-1
 690     latin-iso8859-15
 691     latin-iso8859-14
 692     latin-iso8859-9
 693     hebrew-iso8859-8
 694     greek-iso8859-7
 695     cyrillic-iso8859-5
 696     latin-iso8859-4
 697     latin-iso8859-3
 698     latin-iso8859-2
 699     vietnamese-viscii-lower
 700     vietnamese-viscii-upper
 701     thai-tis620
 702     ipa
 703     ethiopic
 704     indian-is13194
 705     katakana-jisx0201
 706     chinese-sisheng
 707     lao
 708     mule-unicode-0100-24ff
 709     mule-unicode-2500-33ff
 710     mule-unicode-e000-ffff)
 711    (mime-charset . utf-8)
 712    (coding-category . coding-category-utf-8)
 713    (valid-codes (0 . 255))
        ;; The pre-write hook stays disabled, matching the commented-out
        ;; `utf-8-pre-write-conversion'.  Decoded text is post-processed by
        ;; `utf-8-post-read-conversion'.
 714 ;;    (pre-write-conversion . utf-8-pre-write-conversion)
 715    (post-read-conversion . utf-8-post-read-conversion)))
 716
 717 (define-coding-system-alias 'utf-8 'mule-utf-8) ; `utf-8' is an alias for `mule-utf-8'.
 718
 719 ;; I think this needs special private charsets defined for the
 720 ;; untranslated sequences, if it's going to work well.
 721
 722 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
 723 ;;;   (let* ((prop (get-char-property pos 'composition string))
 724 ;;;      (l (and prop (- (cadr prop) (car prop)))))
 725 ;;;     (cond ((and l (> l (- to pos)))
 726 ;;;        (delete-region pos to))
 727 ;;;       ((and (> (char-after pos) 224)
 728 ;;;             (< (char-after pos) 256)
 729 ;;;             (save-restriction
 730 ;;;               (narrow-to-region pos to)
 731 ;;;               (utf-8-compose)))
 732 ;;;        t))))
 733
 734 ;;; (dotimes (i 96)
 735 ;;;   (aset composition-function-table
 736 ;;;     (+ 128 i)
 737 ;;;     `((,(string-as-multibyte "[\200-\237\240-\377]")
 738 ;;;        . utf-8-compose-function))))
 739
 740 ;;; utf-8.el ends here