1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
9 ;; Keywords: multilingual, Unicode, UTF-8, i18n
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 2, or (at your option)
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26 ;; Boston, MA 02111-1307, USA.
30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
31 ;; of the following character sets to and from UTF-8:
36 ;; mule-unicode-0100-24ff
37 ;; mule-unicode-2500-33ff
38 ;; mule-unicode-e000-ffff
40 ;; On decoding, Unicode characters that do not fit into the above
41 ;; character sets are handled as `eight-bit-control' or
42 ;; `eight-bit-graphic' characters to retain the information about the
43 ;; original byte sequence and text properties record the corresponding
46 ;; Fixme: note that reading and writing invalid utf-8 may not be
47 ;; idempotent -- fixing that needs a new charset to represent the raw bytes.
49 ;; Characters from other character sets can be encoded with
50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
51 ;; registering the translation with `register-char-codings'. Hash
52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
53 ;; support encoding and decoding of about a quarter of the CJK space
54 ;; between U+3400 and U+DFFF.
56 ;; UTF-8 is defined in RFC 2279 (since obsoleted by RFC 3629). A sketch of the encoding is:
59 ;; value | 1st byte | 2nd byte | 3rd byte
60 ;; --------------------+-----------+-----------+----------
61 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
62 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
63 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
;; Translation table consulted by the encoder (`ccl-encode-mule-utf-8' passes
;; every character it reads through this table by name).  It starts out empty
;; here; other code is expected to populate it.
67 (defvar ucs-mule-to-mule-unicode (make-translation-table)
68 "Translation table for encoding to `mule-utf-8'.")
69 ;; Could have been done by ucs-tables loaded before.
;; Only register the table under its symbol when the symbol has no
;; `translation-table' property yet, so an earlier registration is kept.
70 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
71 (define-translation-table 'ucs-mule-to-mule-unicode
72 ucs-mule-to-mule-unicode))
;; Hash tables (keys compared with `eq') used for the CJK substitution
;; described in the commentary above.  Both are created empty here.
;; `utf-8-subst-table' is looked up by the decoder and
;; `utf-8-subst-rev-table' by the encoder.
74 (defvar utf-8-subst-table (make-hash-table :test 'eq))
75 (defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
;; Register each hash table under its own symbol so the CCL programs can
;; refer to it by name.
76 (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
77 (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
;; Post-decoding translation table.  It is created empty here; later code in
;; this file stores character pairs into it with `aset'.
79 (defvar utf-8-translation-table-for-decode (make-translation-table)
80 "Translation table applied after decoding utf-8 to mule-unicode.
81 This is only actually applied to characters which would normally be
82 decoded into mule-unicode-0100-24ff.")
;; Register the table under the same symbol as the variable.
;; NOTE(review): `utf-8-fragment-on-decoding' appears to re-register this
;; symbol with or without the table -- confirm against the full defcustom.
83 (define-translation-table 'utf-8-translation-table-for-decode
84 utf-8-translation-table-for-decode)
86 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
87 ;; space of mule-unicode. For Latin scripts this isn't very
88 ;; important. Hebrew and Arabic might go here too when there's proper
92 (aset utf-8-translation-table-for-decode (car pair) (cdr pair)))
93 '((?
\e$,1&d
\e(B . ?
\e,F4
\e(B) (?
\e$,1&e
\e(B . ?
\e,F5
\e(B) (?
\e$,1&f
\e(B . ?
\e,F6
\e(B) (?
\e$,1&h
\e(B . ?
\e,F8
\e(B) (?
\e$,1&i
\e(B . ?
\e,F9
\e(B)
94 (?
\e$,1&j
\e(B . ?
\e,F:
\e(B) (?
\e$,1&l
\e(B . ?
\e,F<
\e(B) (?
\e$,1&n
\e(B . ?
\e,F>
\e(B) (?
\e$,1&o
\e(B . ?
\e,F?
\e(B) (?
\e$,1&p
\e(B . ?
\e,F@
\e(B)
95 (?
\e$,1&q
\e(B . ?
\e,FA
\e(B) (?
\e$,1&r
\e(B . ?
\e,FB
\e(B) (?
\e$,1&s
\e(B . ?
\e,FC
\e(B) (?
\e$,1&t
\e(B . ?
\e,FD
\e(B) (?
\e$,1&u
\e(B . ?
\e,FE
\e(B)
96 (?
\e$,1&v
\e(B . ?
\e,FF
\e(B) (?
\e$,1&w
\e(B . ?
\e,FG
\e(B) (?
\e$,1&x
\e(B . ?
\e,FH
\e(B) (?
\e$,1&y
\e(B . ?
\e,FI
\e(B) (?
\e$,1&z
\e(B . ?
\e,FJ
\e(B)
97 (?
\e$,1&{
\e(B . ?
\e,FK
\e(B) (?
\e$,1&|
\e(B . ?
\e,FL
\e(B) (?
\e$,1&}
\e(B . ?
\e,FM
\e(B) (?
\e$,1&~
\e(B . ?
\e,FN
\e(B) (?
\e$,1&
\7f\e(B . ?
\e,FO
\e(B)
98 (?
\e$,1'
\e(B . ?
\e,FP
\e(B) (?
\e$,1'!
\e(B . ?
\e,FQ
\e(B) (?
\e$,1'#
\e(B . ?
\e,FS
\e(B) (?
\e$,1'$
\e(B . ?
\e,FT
\e(B) (?
\e$,1'%
\e(B . ?
\e,FU
\e(B)
99 (?
\e$,1'&
\e(B . ?
\e,FV
\e(B) (?
\e$,1''
\e(B . ?
\e,FW
\e(B) (?
\e$,1'(
\e(B . ?
\e,FX
\e(B) (?
\e$,1')
\e(B . ?
\e,FY
\e(B) (?
\e$,1'*
\e(B . ?
\e,FZ
\e(B)
100 (?
\e$,1'+
\e(B . ?
\e,F[
\e(B) (?
\e$,1',
\e(B . ?
\e,F\
\e(B) (?
\e$,1'-
\e(B . ?
\e,F]
\e(B) (?
\e$,1'.
\e(B . ?
\e,F^
\e(B) (?
\e$,1'/
\e(B . ?
\e,F_
\e(B)
101 (?
\e$,1'0
\e(B . ?
\e,F`
\e(B) (?
\e$,1'1
\e(B . ?
\e,Fa
\e(B) (?
\e$,1'2
\e(B . ?
\e,Fb
\e(B) (?
\e$,1'3
\e(B . ?
\e,Fc
\e(B) (?
\e$,1'4
\e(B . ?
\e,Fd
\e(B)
102 (?
\e$,1'5
\e(B . ?
\e,Fe
\e(B) (?
\e$,1'6
\e(B . ?
\e,Ff
\e(B) (?
\e$,1'7
\e(B . ?
\e,Fg
\e(B) (?
\e$,1'8
\e(B . ?
\e,Fh
\e(B) (?
\e$,1'9
\e(B . ?
\e,Fi
\e(B)
103 (?
\e$,1':
\e(B . ?
\e,Fj
\e(B) (?
\e$,1';
\e(B . ?
\e,Fk
\e(B) (?
\e$,1'<
\e(B . ?
\e,Fl
\e(B) (?
\e$,1'=
\e(B . ?
\e,Fm
\e(B) (?
\e$,1'>
\e(B . ?
\e,Fn
\e(B)
104 (?
\e$,1'?
\e(B . ?
\e,Fo
\e(B) (?
\e$,1'@
\e(B . ?
\e,Fp
\e(B) (?
\e$,1'A
\e(B . ?
\e,Fq
\e(B) (?
\e$,1'B
\e(B . ?
\e,Fr
\e(B) (?
\e$,1'C
\e(B . ?
\e,Fs
\e(B)
105 (?
\e$,1'D
\e(B . ?
\e,Ft
\e(B) (?
\e$,1'E
\e(B . ?
\e,Fu
\e(B) (?
\e$,1'F
\e(B . ?
\e,Fv
\e(B) (?
\e$,1'G
\e(B . ?
\e,Fw
\e(B) (?
\e$,1'H
\e(B . ?
\e,Fx
\e(B)
106 (?
\e$,1'I
\e(B . ?
\e,Fy
\e(B) (?
\e$,1'J
\e(B . ?
\e,Fz
\e(B) (?
\e$,1'K
\e(B . ?
\e,F{
\e(B) (?
\e$,1'L
\e(B . ?
\e,F|
\e(B) (?
\e$,1'M
\e(B . ?
\e,F}
\e(B)
107 (?
\e$,1'N
\e(B . ?
\e,F~
\e(B)
109 (?
\e$,1(!
\e(B . ?
\e,L!
\e(B) (?
\e$,1("
\e(B . ?
\e,L"
\e(B) (?
\e$,1(#
\e(B . ?
\e,L#
\e(B) (?
\e$,1($
\e(B . ?
\e,L$
\e(B)
110 (?
\e$,1(%
\e(B . ?
\e,L%
\e(B) (?
\e$,1(&
\e(B . ?
\e,L&
\e(B) (?
\e$,1('
\e(B . ?
\e,L'
\e(B) (?
\e$,1((
\e(B . ?
\e,L(
\e(B) (?
\e$,1()
\e(B . ?
\e,L)
\e(B)
111 (?
\e$,1(*
\e(B . ?
\e,L*
\e(B) (?
\e$,1(+
\e(B . ?
\e,L+
\e(B) (?
\e$,1(,
\e(B . ?
\e,L,
\e(B) (?
\e$,1(.
\e(B . ?
\e,L.
\e(B) (?
\e$,1(/
\e(B . ?
\e,L/
\e(B)
112 (?
\e$,1(0
\e(B . ?
\e,L0
\e(B) (?
\e$,1(1
\e(B . ?
\e,L1
\e(B) (?
\e$,1(2
\e(B . ?
\e,L2
\e(B) (?
\e$,1(3
\e(B . ?
\e,L3
\e(B) (?
\e$,1(4
\e(B . ?
\e,L4
\e(B)
113 (?
\e$,1(5
\e(B . ?
\e,L5
\e(B) (?
\e$,1(6
\e(B . ?
\e,L6
\e(B) (?
\e$,1(7
\e(B . ?
\e,L7
\e(B) (?
\e$,1(8
\e(B . ?
\e,L8
\e(B) (?
\e$,1(9
\e(B . ?
\e,L9
\e(B)
114 (?
\e$,1(:
\e(B . ?
\e,L:
\e(B) (?
\e$,1(;
\e(B . ?
\e,L;
\e(B) (?
\e$,1(<
\e(B . ?
\e,L<
\e(B) (?
\e$,1(=
\e(B . ?
\e,L=
\e(B) (?
\e$,1(>
\e(B . ?
\e,L>
\e(B)
115 (?
\e$,1(?
\e(B . ?
\e,L?
\e(B) (?
\e$,1(@
\e(B . ?
\e,L@
\e(B) (?
\e$,1(A
\e(B . ?
\e,LA
\e(B) (?
\e$,1(B
\e(B . ?
\e,LB
\e(B) (?
\e$,1(C
\e(B . ?
\e,LC
\e(B)
116 (?
\e$,1(D
\e(B . ?
\e,LD
\e(B) (?
\e$,1(E
\e(B . ?
\e,LE
\e(B) (?
\e$,1(F
\e(B . ?
\e,LF
\e(B) (?
\e$,1(G
\e(B . ?
\e,LG
\e(B) (?
\e$,1(H
\e(B . ?
\e,LH
\e(B)
117 (?
\e$,1(I
\e(B . ?
\e,LI
\e(B) (?
\e$,1(J
\e(B . ?
\e,LJ
\e(B) (?
\e$,1(K
\e(B . ?
\e,LK
\e(B) (?
\e$,1(L
\e(B . ?
\e,LL
\e(B) (?
\e$,1(M
\e(B . ?
\e,LM
\e(B)
118 (?
\e$,1(N
\e(B . ?
\e,LN
\e(B) (?
\e$,1(O
\e(B . ?
\e,LO
\e(B) (?
\e$,1(P
\e(B . ?
\e,LP
\e(B) (?
\e$,1(Q
\e(B . ?
\e,LQ
\e(B) (?
\e$,1(R
\e(B . ?
\e,LR
\e(B)
119 (?
\e$,1(S
\e(B . ?
\e,LS
\e(B) (?
\e$,1(T
\e(B . ?
\e,LT
\e(B) (?
\e$,1(U
\e(B . ?
\e,LU
\e(B) (?
\e$,1(V
\e(B . ?
\e,LV
\e(B) (?
\e$,1(W
\e(B . ?
\e,LW
\e(B)
120 (?
\e$,1(X
\e(B . ?
\e,LX
\e(B) (?
\e$,1(Y
\e(B . ?
\e,LY
\e(B) (?
\e$,1(Z
\e(B . ?
\e,LZ
\e(B) (?
\e$,1([
\e(B . ?
\e,L[
\e(B) (?
\e$,1(\
\e(B . ?
\e,L\
\e(B)
121 (?
\e$,1(]
\e(B . ?
\e,L]
\e(B) (?
\e$,1(^
\e(B . ?
\e,L^
\e(B) (?
\e$,1(_
\e(B . ?
\e,L_
\e(B) (?
\e$,1(`
\e(B . ?
\e,L`
\e(B) (?
\e$,1(a
\e(B . ?
\e,La
\e(B)
122 (?
\e$,1(b
\e(B . ?
\e,Lb
\e(B) (?
\e$,1(c
\e(B . ?
\e,Lc
\e(B) (?
\e$,1(d
\e(B . ?
\e,Ld
\e(B) (?
\e$,1(e
\e(B . ?
\e,Le
\e(B) (?
\e$,1(f
\e(B . ?
\e,Lf
\e(B)
123 (?
\e$,1(g
\e(B . ?
\e,Lg
\e(B) (?
\e$,1(h
\e(B . ?
\e,Lh
\e(B) (?
\e$,1(i
\e(B . ?
\e,Li
\e(B) (?
\e$,1(j
\e(B . ?
\e,Lj
\e(B) (?
\e$,1(k
\e(B . ?
\e,Lk
\e(B)
124 (?
\e$,1(l
\e(B . ?
\e,Ll
\e(B) (?
\e$,1(m
\e(B . ?
\e,Lm
\e(B) (?
\e$,1(n
\e(B . ?
\e,Ln
\e(B) (?
\e$,1(o
\e(B . ?
\e,Lo
\e(B) (?
\e$,1(q
\e(B . ?
\e,Lq
\e(B)
125 (?
\e$,1(r
\e(B . ?
\e,Lr
\e(B) (?
\e$,1(s
\e(B . ?
\e,Ls
\e(B) (?
\e$,1(t
\e(B . ?
\e,Lt
\e(B) (?
\e$,1(u
\e(B . ?
\e,Lu
\e(B) (?
\e$,1(v
\e(B . ?
\e,Lv
\e(B)
126 (?
\e$,1(w
\e(B . ?
\e,Lw
\e(B) (?
\e$,1(x
\e(B . ?
\e,Lx
\e(B) (?
\e$,1(y
\e(B . ?
\e,Ly
\e(B) (?
\e$,1(z
\e(B . ?
\e,Lz
\e(B) (?
\e$,1({
\e(B . ?
\e,L{
\e(B)
127 (?
\e$,1(|
\e(B . ?
\e,L|
\e(B) (?
\e$,1(~
\e(B . ?
\e,L~
\e(B) (?
\e$,1(
\7f\e(B . ?
\e,L
\7f\e(B)))
129 (defcustom utf-8-fragment-on-decoding nil
130 "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets.
131 Setting this means that the relevant Cyrillic and Greek characters are
132 decoded into the iso8859 charsets rather than into
133 mule-unicode-0100-24ff. The iso8859 charsets take half as much space
134 in the buffer, but using them may affect how the buffer can be re-encoded
135 and may require a different input method to search for them, for instance.
136 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
137 for mechanisms to make this largely transparent.
139 Setting this variable outside customize has no effect."
142 (define-translation-table 'utf-8-translation-table-for-decode
143 utf-8-translation-table-for-decode)
144 (define-translation-table 'utf-8-translation-table-for-decode))
150 (defcustom utf-8-translate-cjk nil
151 "Whether the `mule-utf-8' coding system should encode many CJK characters.
153 Enabling this loads tables which enable the coding system to encode
154 characters in the charsets `korean-ksc5601', `chinese-gb2312' and
155 `japanese-jisx0208', and to decode the corresponding unicodes into
156 such characters. This works by loading the library `utf-8-subst'; see
157 its commentary. The tables are fairly large (about 33000 entries), so this
158 option is not the default."
159 :link '(emacs-commentary-link "utf-8-subst")
162 (require 'utf-8-subst)
163 (let ((table (make-char-table 'translation-table)))
164 (coding-system-put 'mule-utf-8 'safe-charsets
165 (append (coding-system-get 'mule-utf-8
167 '(korean-ksc5601 chinese-gb2312
169 (maphash (lambda (k v)
171 utf-8-subst-rev-table)
172 (register-char-codings 'mule-utf-8 table)))
178 (define-ccl-program ccl-decode-mule-utf-8
180 ;; charset | bytes in utf-8 | bytes in emacs
181 ;; -----------------------+----------------+---------------
183 ;; -----------------------+----------------+---------------
184 ;; eight-bit-control | 2 | 2
185 ;; eight-bit-graphic | 2 | 1
186 ;; latin-iso8859-1 | 2 | 2
187 ;; -----------------------+----------------+---------------
188 ;; mule-unicode-0100-24ff | 2 | 4
190 ;; -----------------------+----------------+---------------
191 ;; mule-unicode-0100-24ff | 3 | 4
193 ;; mule-unicode-2500-33ff | 3 | 4
194 ;; mule-unicode-e000-ffff | 3 | 4
196 ;; Thus magnification factor is two.
199 ((r5 = ,(charset-id 'eight-bit-control))
200 (r6 = ,(charset-id 'eight-bit-graphic))
204 ;; 1byte encoding, i.e., ascii
207 (if (r0 < #xc0) ; continuation byte (invalid here)
209 (write-multibyte-character r5 r0)
210 (write-multibyte-character r6 r0))
211 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
215 (if ((r1 & #b11000000) != #b10000000)
216 ;; Invalid 2-byte sequence
218 (write-multibyte-character r5 r0)
219 (write-multibyte-character r6 r0))
223 (write-multibyte-character r5 r1)
224 (write-multibyte-character r6 r1))))
226 ((r3 = r0) ; save in case of overlong sequence
230 (r2 = r1) ; save in case of overlong sequence
233 ;; Now r1 holds scalar value
235 (if (r1 < 128) ; `overlong sequence'
237 (write-multibyte-character r5 r3)
238 (write-multibyte-character r6 r3))
242 (write-multibyte-character r5 r2)
243 (write-multibyte-character r6 r2))))
247 ((write-multibyte-character r5 r1))
251 ((r0 = ,(charset-id 'latin-iso8859-1))
253 (write-multibyte-character r0 r1))
255 ;; mule-unicode-0100-24ff (< 0800)
256 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
258 (r2 = (((r1 / 96) + 32) << 7))
262 utf-8-translation-table-for-decode r0 r1)
263 (write-multibyte-character r0 r1))))))))
266 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
270 ;; This is set to 1 if the encoding is invalid.
273 (r3 = (r1 & #b11000000))
274 (r3 |= ((r2 >> 2) & #b00110000))
275 (if (r3 != #b10100000)
277 ((r3 = ((r0 & #x0f) << 12))
278 (r3 += ((r1 & #x3f) << 6))
284 ;; Invalid 3-byte sequence
286 (write-multibyte-character r5 r0)
287 (write-multibyte-character r6 r0))
291 (write-multibyte-character r5 r1)
292 (write-multibyte-character r6 r1)))
296 (write-multibyte-character r5 r2)
297 (write-multibyte-character r6 r2))))
299 ;; mule-unicode-0100-24ff (>= 0800)
301 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
305 (r1 += ((r3 + 32) << 7))
307 utf-8-translation-table-for-decode r0 r1)
308 (write-multibyte-character r0 r1))
310 ;; mule-unicode-2500-33ff
311 ;; Fixme: Perhaps allow translation via
312 ;; utf-8-subst-table for #x2e80 up, so that we use
313 ;; consistent charsets for all of CJK. Would need
314 ;; corresponding change to encoding tables.
316 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
320 (r1 += ((r3 + 32) << 7))
321 (write-multibyte-character r0 r1))
324 ;; Try to convert to CJK chars, else keep
325 ;; them as eight-bit-{control|graphic}.
327 ((r4 = r3) ; don't zap r3
328 (lookup-integer utf-8-subst-table r4 r5)
331 ((write-multibyte-character r4 r5)
332 ;; Zapped through register starvation.
333 (r5 = ,(charset-id 'eight-bit-control)))
334 ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
336 (write-multibyte-character r3 r0)
339 (write-multibyte-character r3 r1)
343 (write-multibyte-character r3 r2))))
345 ;; Surrogates, U+D800 .. U+DFFF
348 (write-multibyte-character r3 r0) ; eight-bit-graphic
351 (write-multibyte-character r3 r1)
355 (write-multibyte-character r3 r2))
357 ;; mule-unicode-e000-ffff
358 ;; Fixme: fffe and ffff are invalid.
359 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
363 (r1 += ((r3 + 32) << 7))
364 (write-multibyte-character r0 r1)))))))))
368 ;; keep those bytes as eight-bit-{control|graphic}
369 ;; Fixme: allow lookup in utf-8-subst-table.
371 ;; r0 > #xf0, thus eight-bit-graphic
372 (write-multibyte-character r6 r0)
374 (if (r1 < #x80) ; invalid byte
376 (write-multibyte-character r5 r1))
377 (write-multibyte-character r6 r1))
379 (if (r2 < #x80) ; invalid byte
381 (write-multibyte-character r5 r2))
382 (write-multibyte-character r6 r2))
384 (if (r3 < #x80) ; invalid byte
386 (write-multibyte-character r5 r3))
387 (write-multibyte-character r6 r3))
388 (if (r0 >= #xf8) ; 5- or 6-byte encoding
391 (if (r1 < #x80) ; invalid byte
393 (write-multibyte-character r5 r1))
394 (write-multibyte-character r6 r1))
395 (if (r0 >= #xfc) ; 6-byte
398 (if (r1 < #x80) ; invalid byte
400 (write-multibyte-character r5 r1))
401 (write-multibyte-character r6 r1)))))))
402 ;; else invalid byte >= #xfe
403 (write-multibyte-character r6 r0))))))
406 "CCL program to decode UTF-8.
407 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
408 mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
410 Encodings of un-representable Unicode characters are decoded asis into
411 eight-bit-control and eight-bit-graphic characters.")
;; CCL encoder for `mule-utf-8'.
;; NOTE(review): this listing of the program is not contiguous (lines are
;; missing between several of the forms below), so only what is visible is
;; described here.
;; Each character is read into r0 (charset id) / r1 (code) and passed through
;; `ucs-mule-to-mule-unicode'.  The program then dispatches on the charset id
;; in r0: ascii, latin-iso8859-1, the three mule-unicode-* charsets,
;; eight-bit-control and eight-bit-graphic.  Anything else is looked up in
;; `utf-8-subst-rev-table'; per the comments below, an unsupported character
;; is output as U+FFFD.
413 (define-ccl-program ccl-encode-mule-utf-8
419 (read-multibyte-character r0 r1)
420 (translate-character ucs-mule-to-mule-unicode r0 r1))
421 (;; We have already done read-multibyte-character.
426 (if (r0 == ,(charset-id 'ascii))
429 (if (r0 == ,(charset-id 'latin-iso8859-1))
431 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
432 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
433 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
434 ((r0 = (((r1 & #x40) >> 6) | #xc2))
439 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
440 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
441 ;; #x3f80 == (0011 1111 1000 0000)b
443 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
444 ;; now r1 holds scalar value
447 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
448 ;; #x07c0 == (0000 0111 1100 0000)b
453 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
454 (r2 = ((r1 & #x3f) | #x80))
460 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
461 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
463 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
464 (r0 = (((r1 & #xf000) >> 12) | #xe0))
465 (r2 = ((r1 & #x3f) | #x80))
471 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
472 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
474 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
475 (r0 = (((r1 & #xf000) >> 12) | #xe0))
476 (r2 = ((r1 & #x3f) | #x80))
482 (if (r0 == ,(charset-id 'eight-bit-control))
484 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
485 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
486 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
490 (if (r0 == ,(charset-id 'eight-bit-graphic))
492 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
493 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
494 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
497 (read-multibyte-character r0 r1)
498 (if (r0 != ,(charset-id 'eight-bit-graphic))
499 (if (r0 != ,(charset-id 'eight-bit-control))
503 ((read-multibyte-character r0 r2)
504 (if (r0 != ,(charset-id 'eight-bit-graphic))
505 (if (r0 != ,(charset-id 'eight-bit-control))
515 ((lookup-character utf-8-subst-rev-table r0 r1)
516 (if r7 ; lookup succeeded
517 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
518 (r2 = ((r0 & #x3f) | #x80))
523 ;; Unsupported character.
524 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
527 (write #xbd)))))))))))
535 "CCL program to encode into UTF-8.")
537 ;; Dummy definition so that the CCL can be checked correctly; the
538 ;; actual data are loaded on demand.
;; Called with only the symbol, so no table contents are supplied here.
;; NOTE(review): this guard tests `boundp' (a variable binding), whereas the
;; guard for `ucs-mule-to-mule-unicode' earlier in this file tests the
;; symbol's `translation-table' property -- confirm that both reliably detect
;; an already-loaded table.
539 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
540 (define-translation-table 'ucs-mule-8859-to-mule-unicode))
542 (define-ccl-program ccl-untranslated-to-ucs
544 (if (r0 < #xf0) ; 3-byte encoding, as above
546 (r3 = (r1 & #b11000000))
547 (r3 |= ((r2 >> 2) & #b00110000))
548 (if (r3 != #b10100000)
550 ((r3 = ((r0 & #x0f) << 12))
551 (r3 += ((r1 & #x3f) << 6))
558 (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe)
568 ((r1 = ((r1 & #x3F) << 12))
569 (r2 = ((r2 & #x3F) << 6))
571 (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
573 "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
574 r0 == 0 for invalid sequence.")
;; Eight-element vector, initially all zeros, that
;; `utf-8-untranslated-to-ucs' fills in and hands to `ccl-execute' as the
;; register set.  The same vector object is reused on every call.
576 (defvar utf-8-ccl-regs (make-vector 8 0))
;; Takes no arguments and works on the current buffer at point.
;; Slots 0-3 of `utf-8-ccl-regs' are loaded with the character at point and
;; the three characters after it.  `char-after' returns nil where there is no
;; character, so each slot falls back to 0 in that case.  The CCL program
;; `ccl-untranslated-to-ucs' is then run over the vector and slot 0 (r0) is
;; returned; that program's docstring says r0 is 0 for an invalid sequence.
578 (defsubst utf-8-untranslated-to-ucs ()
579 "Return the UCS code for an untranslated sequence of raw bytes at point.
580 Only for 3- or 4-byte sequences."
581 (aset utf-8-ccl-regs 0 (or (char-after) 0))
582 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
583 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
584 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
585 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
586 (aref utf-8-ccl-regs 0))
;; Help-echo function; `utf-8-compose' installs it as the `help-echo'
;; property on untranslated sequences.
;; Arguments are WINDOW, OBJECT and POSITION; WINDOW is not used.
;; Returns a string giving the value of the `untranslated-utf-8' property at
;; POSITION in OBJECT, formatted as at least four upper-case hex digits.
;; NOTE(review): `format' with %X needs an integer, so this presumably relies
;; on the property always being present wherever this help-echo is -- confirm.
588 (defun utf-8-help-echo (window object position)
589 (format "Untranslated Unicode U+%04X"
590 (get-char-property position 'untranslated-utf-8 object)))
592 ;; We compose the untranslatable sequences into a single character.
593 ;; This is infelicitous for editing, because there's currently no
594 ;; mechanism for treating compositions as atomic, but is OK for
595 ;; display. They are composed to U+FFFD with help-echo which
596 ;; indicates the unicodes they represent. This function GCs too much.
597 (defsubst utf-8-compose ()
598 "Put a suitable composition on an untranslatable sequence.
599 Return the sequence's length."
600 (let* ((u (utf-8-untranslated-to-ucs))
606 (put-text-property (point) (min (point-max) (+ l (point)))
607 'untranslated-utf-8 u)
608 (put-text-property (point) (min (point-max) (+ l (point)))
609 'help-echo 'utf-8-help-echo)
610 (compose-region (point) (+ l (point)) ?
\e$,3u=
\e(B)
613 (defcustom utf-8-compose-scripts nil
614 "*Non-nil means compose various scripts on decoding utf-8 text."
619 (defun utf-8-post-read-conversion (length)
620 "Compose untranslated utf-8 sequences into single characters.
621 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
623 ;; Can't do eval-when-compile to insert a multibyte constant
624 ;; version of the string in the loop, since it's always loaded as
625 ;; unibyte from a byte-compiled file.
626 (let ((range (string-as-multibyte "^\xe1-\xf7")))
627 (while (and (skip-chars-forward range)
629 (forward-char (utf-8-compose)))))
630 ;; Fixme: Takahashi-san implies it may not work this easily. I
631 ;; asked why but didn't get a reply. -- fx
632 (when (and utf-8-compose-scripts (> length 1))
633 ;; These currently have definitions which cover the relevant
634 ;; unicodes. We could avoid loading thai-util &c by checking
635 ;; whether the region contains any characters with the appropriate
636 ;; categories. There aren't yet Unicode-based rules for Tibetan.
637 (save-excursion (setq length (diacritic-post-read-conversion length)))
638 (save-excursion (setq length (thai-post-read-conversion length)))
639 (save-excursion (setq length (lao-post-read-conversion length)))
641 (setq length (in-is13194-devanagari-post-read-conversion length))))
644 ;; ucs-tables is preloaded
645 ;; (defun utf-8-pre-write-conversion (beg end)
646 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
647 ;; ;; Ensure translation table is loaded.
648 ;; (require 'ucs-tables)
649 ;; ;; Don't do this again.
650 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
655 "UTF-8 encoding for Emacs-supported Unicode characters.
656 The supported Emacs character sets are the following, plus any other
657 characters included in the tables `ucs-mule-to-mule-unicode' and
658 `utf-8-subst-rev-table':
672 mule-unicode-0100-24ff
673 mule-unicode-2500-33ff
674 mule-unicode-e000-ffff
676 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
677 may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
678 \(see user option `utf-8-translate-cjk'); otherwise, sequences of
679 eight-bit-control and eight-bit-graphic characters are used to
680 preserve their byte sequences, and these are composed to display as a
681 single character. Emacs characters that otherwise can't be encoded
682 are encoded as U+FFFD."
684 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
699 vietnamese-viscii-lower
700 vietnamese-viscii-upper
708 mule-unicode-0100-24ff
709 mule-unicode-2500-33ff
710 mule-unicode-e000-ffff)
711 (mime-charset . utf-8)
712 (coding-category . coding-category-utf-8)
713 (valid-codes (0 . 255))
714 ;; (pre-write-conversion . utf-8-pre-write-conversion)
715 (post-read-conversion . utf-8-post-read-conversion)))
;; Make `utf-8' available as an alias for the `mule-utf-8' coding system.
717 (define-coding-system-alias 'utf-8 'mule-utf-8)
719 ;; I think this needs special private charsets defined for the
720 ;; untranslated sequences, if it's going to work well.
722 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
723 ;;; (let* ((prop (get-char-property pos 'composition string))
724 ;;; (l (and prop (- (cadr prop) (car prop)))))
725 ;;; (cond ((and l (> l (- to pos)))
726 ;;; (delete-region pos to))
727 ;;; ((and (> (char-after pos) 224)
728 ;;; (< (char-after pos) 256)
729 ;;; (save-restriction
730 ;;; (narrow-to-region pos to)
731 ;;; (utf-8-compose)))
735 ;;; (aset composition-function-table
737 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
738 ;;; . utf-8-compose-function))))
740 ;;; utf-8.el ends here