]> code.delx.au - gnu-emacs/blob - lisp/international/utf-8.el
*** empty log message ***
[gnu-emacs] / lisp / international / utf-8.el
1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
8 ;; Maintainer: FSF
9 ;; Keywords: multilingual, Unicode, UTF-8, i18n
10
11 ;; This file is part of GNU Emacs.
12
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; any later version.
17
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
22
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26 ;; Boston, MA 02111-1307, USA.
27
28 ;;; Commentary:
29
30 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
31 ;; of the following character sets to and from UTF-8:
32 ;;
33 ;; ascii
34 ;; eight-bit-control
35 ;; latin-iso8859-1
36 ;; mule-unicode-0100-24ff
37 ;; mule-unicode-2500-33ff
38 ;; mule-unicode-e000-ffff
39 ;;
40 ;; On decoding, Unicode characters that do not fit into the above
41 ;; character sets are handled as `eight-bit-control' or
42 ;; `eight-bit-graphic' characters to retain the information about the
43 ;; original byte sequence and text properties record the corresponding
44 ;; unicode.
45 ;;
;; Fixme: note that reading and writing invalid utf-8 may not be
;; idempotent.  Fixing that needs a new charset to represent the raw
;; bytes.
48 ;;
49 ;; Characters from other character sets can be encoded with
50 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
51 ;; registering the translation with `register-char-codings'. Hash
52 ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
53 ;; support encoding and decoding of about a quarter of the CJK space
54 ;; between U+3400 and U+DFFF.
55
56 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
57
;;        scalar       |               utf-8
;;        value        | 1st byte  | 2nd byte  | 3rd byte
;; --------------------+-----------+-----------+----------
;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
64
65 ;;; Code:
66
(defvar ucs-mule-to-mule-unicode (make-translation-table)
  "Translation table for encoding to `mule-utf-8'.")
;; Register the table under its own symbol, unless that registration
;; has been made already (ucs-tables may have been loaded before us).
(or (get 'ucs-mule-to-mule-unicode 'translation-table)
    (define-translation-table 'ucs-mule-to-mule-unicode
      ucs-mule-to-mule-unicode))
73
;; Hash tables behind the optional CJK support.  The decoder looks up
;; `utf-8-subst-table' with a Unicode value (via `lookup-integer') to
;; get an Emacs character; the encoder looks up `utf-8-subst-rev-table'
;; with an Emacs character (via `lookup-character') to get the Unicode
;; value back.  Both start out empty here.
(defvar utf-8-subst-table (make-hash-table :test 'eq))
(defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
;; Register the hash tables under their symbols, which is how the CCL
;; programs below refer to them.
(define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
(define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
78
(defvar utf-8-translation-table-for-decode (make-translation-table)
  "Translation table applied after decoding utf-8 to mule-unicode.
This is only actually applied to characters which would normally be
decoded into mule-unicode-0100-24ff.")
;; Register the table under its symbol; `ccl-decode-mule-utf-8' names
;; it in its `translate-character' statements.
(define-translation-table 'utf-8-translation-table-for-decode
  utf-8-translation-table-for-decode)
85
;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
;; space of mule-unicode.  For Latin scripts this isn't very
;; important.  Hebrew and Arabic might go here too when there's proper
;; support for them.
;;
;; The table is built from code points rather than from literal
;; characters, so it does not depend on how this file is decoded.
;; Each spec is (CHARSET FIRST LAST OFFSET SKIP): for every 8-bit code
;; C of CHARSET in FIRST..LAST that is not in SKIP, the mule-unicode
;; character U+(C + OFFSET) is mapped to the CHARSET character C.
;;   greek-iso8859-7:    #xB4..#xFE <-> U+0384..U+03CE
;;   cyrillic-iso8859-5: #xA1..#xFF <-> U+0401..U+045F
;; The skipped codes have no counterpart at C + OFFSET (for instance
;; #xB7 in ISO 8859-7 is U+00B7 and #xF0 in ISO 8859-5 is U+2116), so
;; they get no entry.
(let ((specs '((greek-iso8859-7    #xb4 #xfe #x2d0 (#xb7 #xbb #xbd #xd2))
               (cyrillic-iso8859-5 #xa1 #xff #x360 (#xad #xf0 #xfd)))))
  (while specs
    (let* ((spec (car specs))
           (charset (nth 0 spec))
           (code (nth 1 spec))
           (last (nth 2 spec))
           (offset (nth 3 spec))
           (skip (nth 4 spec))
           pos)
      (while (<= code last)
        (unless (memq code skip)
          ;; Position of the Unicode character within
          ;; mule-unicode-0100-24ff, which is laid out in rows of 96
          ;; characters with both position codes starting at 32.
          (setq pos (- (+ code offset) #x0100))
          (aset utf-8-translation-table-for-decode
                (make-char 'mule-unicode-0100-24ff
                           (+ (/ pos 96) 32)
                           (+ (% pos 96) 32))
                (make-char charset (- code 128))))
        (setq code (1+ code))))
    (setq specs (cdr specs))))
128
(defcustom utf-8-fragment-on-decoding nil
  "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets.
Setting this means that the relevant Cyrillic and Greek characters are
decoded into the iso8859 charsets rather than into
mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
in the buffer, but using them may affect how the buffer can be re-encoded
and may require a different input method to search for them, for instance.
See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
for mechanisms to make this largely transparent.

Setting this variable outside customize has no effect."
  :group 'mule
  :type 'boolean
  :version "21.4"
  :set (lambda (symbol enable)
         ;; When enabling, register the populated table; otherwise
         ;; register a fresh, empty one under the same symbol.
         (apply 'define-translation-table
                'utf-8-translation-table-for-decode
                (and enable
                     (list utf-8-translation-table-for-decode)))
         (set-default symbol enable)))
149
(defcustom utf-8-translate-cjk nil
  "Whether the `mule-utf-8' coding system should encode many CJK characters.

Enabling this loads tables which enable the coding system to encode
characters in the charsets `korean-ksc5601', `chinese-gb2312' and
`japanese-jisx0208', and to decode the corresponding unicodes into
such characters.  This works by loading the library `utf-8-subst'; see
its commentary.  The tables are fairly large (about 33000 entries), so this
option is not the default."
  :link '(emacs-commentary-link "utf-8-subst")
  :set (lambda (s v)
         (when v
           (require 'utf-8-subst)
           (let ((table (make-char-table 'translation-table))
                 (safe (coding-system-get 'mule-utf-8 'safe-charsets))
                 (missing nil))
             ;; Add only the CJK charsets that are not listed yet, so
             ;; that enabling the option more than once does not keep
             ;; growing `safe-charsets' with duplicates.
             (dolist (charset '(korean-ksc5601 chinese-gb2312
                                japanese-jisx0208))
               (unless (memq charset safe)
                 (setq missing (cons charset missing))))
             (when missing
               (coding-system-put 'mule-utf-8 'safe-charsets
                                  (append safe (nreverse missing))))
             ;; Copy the reverse substitution table (Emacs character
             ;; -> unicode) into a char-table and register it, so the
             ;; characters count as encodable by mule-utf-8.
             (maphash (lambda (k v)
                        (aset table k v))
                      utf-8-subst-rev-table)
             (register-char-codings 'mule-utf-8 table)))
         (set-default s v))
  :version "21.4"
  :type 'boolean
  :group 'mule)
177
(define-ccl-program ccl-decode-mule-utf-8
  ;;
  ;;        charset         | bytes in utf-8 | bytes in emacs
  ;; -----------------------+----------------+---------------
  ;;         ascii          |       1        |       1
  ;; -----------------------+----------------+---------------
  ;;    eight-bit-control   |       2        |       2
  ;;    eight-bit-graphic   |       2        |       1
  ;;     latin-iso8859-1    |       2        |       2
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       2        |       4
  ;;        (< 0800)        |                |
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       3        |       4
  ;;        (>= 0800)       |                |
  ;; mule-unicode-2500-33ff |       3        |       4
  ;; mule-unicode-e000-ffff |       3        |       4
  ;;
  ;; Thus magnification factor is two.
  ;;
  ;; Register use: r0 is the lead byte, r1..r3 the following bytes or
  ;; scratch, r5 and r6 hold the charset ids of eight-bit-control and
  ;; eight-bit-graphic for writing out bytes that can't be decoded.
  `(2
    ((r5 = ,(charset-id 'eight-bit-control))
     (r6 = ,(charset-id 'eight-bit-graphic))
     (loop
      (read r0)

      ;; 1byte encoding, i.e., ascii
      (if (r0 < #x80)
          (write r0)
        (if (r0 < #xc0)             ; continuation byte (invalid here)
            (if (r0 < #xa0)
                (write-multibyte-character r5 r0)
              (write-multibyte-character r6 r0))
          ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
          (if (r0 < #xe0)
              ((read r1)

               (if ((r1 & #b11000000) != #b10000000)
                   ;; Invalid 2-byte sequence
                   ((if (r0 < #xa0)
                        (write-multibyte-character r5 r0)
                      (write-multibyte-character r6 r0))
                    (if (r1 < #x80)
                        (write r1)
                      (if (r1 < #xa0)
                          (write-multibyte-character r5 r1)
                        (write-multibyte-character r6 r1))))

                 ;; Save both raw bytes in case of overlong sequence.
                 ((r3 = r0)
                  (r2 = r1)
                  (r0 &= #x1f)
                  (r0 <<= 6)
                  (r1 &= #x3f)
                  (r1 += r0)
                  ;; Now r1 holds scalar value

                  (if (r1 < 128)    ; `overlong sequence'
                      ((if (r3 < #xa0)
                           (write-multibyte-character r5 r3)
                         (write-multibyte-character r6 r3))
                       (if (r2 < #x80)
                           (write r2)
                         (if (r2 < #xa0)
                             (write-multibyte-character r5 r2)
                           (write-multibyte-character r6 r2))))

                    ;; eight-bit-control
                    (if (r1 < 160)
                        ((write-multibyte-character r5 r1))

                      ;; latin-iso8859-1
                      (if (r1 < 256)
                          ((r0 = ,(charset-id 'latin-iso8859-1))
                           (r1 -= 128)
                           (write-multibyte-character r0 r1))

                        ;; mule-unicode-0100-24ff (< 0800)
                        ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                         (r1 -= #x0100)
                         (r2 = (((r1 / 96) + 32) << 7))
                         (r1 %= 96)
                         (r1 += (r2 + 32))
                         (translate-character
                          utf-8-translation-table-for-decode r0 r1)
                         (write-multibyte-character r0 r1))))))))

            ;; 3byte encoding
            ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
            (if (r0 < #xf0)
                ((read r1 r2)

                 ;; This is set to 1 if the encoding is invalid.
                 (r4 = 0)

                 ;; Gather the top two bits of r1 and of r2 into r3 so
                 ;; that one comparison checks both are 10xxxxxx.
                 (r3 = (r1 & #b11000000))
                 (r3 |= ((r2 >> 2) & #b00110000))
                 (if (r3 != #b10100000)
                     (r4 = 1)
                   ((r3 = ((r0 & #x0f) << 12))
                    (r3 += ((r1 & #x3f) << 6))
                    (r3 += (r2 & #x3f))
                    ;; A value below U+0800 here is an overlong sequence.
                    (if (r3 < #x0800)
                        (r4 = 1))))

                 (if (r4 != 0)
                     ;; Invalid 3-byte sequence
                     ((if (r0 < #xa0)
                          (write-multibyte-character r5 r0)
                        (write-multibyte-character r6 r0))
                      (if (r1 < #x80)
                          (write r1)
                        (if (r1 < #xa0)
                            (write-multibyte-character r5 r1)
                          (write-multibyte-character r6 r1)))
                      (if (r2 < #x80)
                          (write r2)
                        (if (r2 < #xa0)
                            (write-multibyte-character r5 r2)
                          (write-multibyte-character r6 r2))))

                   ;; mule-unicode-0100-24ff (>= 0800)
                   ((if (r3 < #x2500)
                        ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                         (r3 -= #x0100)
                         ;; `//=' leaves the quotient in r3 and the
                         ;; remainder in r7.
                         (r3 //= 96)
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
                         (translate-character
                          utf-8-translation-table-for-decode r0 r1)
                         (write-multibyte-character r0 r1))

                      ;; mule-unicode-2500-33ff
                      ;; Fixme: Perhaps allow translation via
                      ;; utf-8-subst-table for #x2e80 up, so that we use
                      ;; consistent charsets for all of CJK.  Would need
                      ;; corresponding change to encoding tables.
                      (if (r3 < #x3400)
                          ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                           (r3 -= #x2500)
                           (r3 //= 96)
                           (r1 = (r7 + 32))
                           (r1 += ((r3 + 32) << 7))
                           (write-multibyte-character r0 r1))

                        ;; U+3400 .. U+D7FF
                        ;; Try to convert to CJK chars, else keep
                        ;; them as eight-bit-{control|graphic}.
                        (if (r3 < #xd800)
                            ((r4 = r3)  ; don't zap r3
                             (lookup-integer utf-8-subst-table r4 r5)
                             (if r7
                                 ;; got a translation
                                 ((write-multibyte-character r4 r5)
                                  ;; Zapped through register starvation.
                                  (r5 = ,(charset-id 'eight-bit-control)))
                               ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
                               ((r3 = r6)
                                (write-multibyte-character r3 r0)
                                (if (r1 < #xa0)
                                    (r3 = r5))
                                (write-multibyte-character r3 r1)
                                (if (r2 < #xa0)
                                    (r3 = r5)
                                  (r3 = r6))
                                (write-multibyte-character r3 r2))))

                          ;; Surrogates, U+D800 .. U+DFFF
                          (if (r3 < #xe000)
                              ((r3 = r6)
                               (write-multibyte-character r3 r0) ; eight-bit-graphic
                               (if (r1 < #xa0)
                                   (r3 = r5))
                               (write-multibyte-character r3 r1)
                               (if (r2 < #xa0)
                                   (r3 = r5)
                                 (r3 = r6))
                               (write-multibyte-character r3 r2))

                            ;; mule-unicode-e000-ffff
                            ;; Fixme: fffe and ffff are invalid.
                            ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
                             (r3 -= #xe000)
                             (r3 //= 96)
                             (r1 = (r7 + 32))
                             (r1 += ((r3 + 32) << 7))
                             (write-multibyte-character r0 r1)))))))))

              (if (r0 < #xfe)
                  ;; 4byte encoding
                  ;; keep those bytes as eight-bit-{control|graphic}
                  ;; Fixme: allow lookup in utf-8-subst-table.
                  ((read r1 r2 r3)
                   ;; r0 >= #xf0, thus eight-bit-graphic
                   (write-multibyte-character r6 r0)
                   (if (r1 < #xa0)
                       (if (r1 < #x80)  ; invalid byte
                           (write r1)
                         (write-multibyte-character r5 r1))
                     (write-multibyte-character r6 r1))
                   (if (r2 < #xa0)
                       (if (r2 < #x80)  ; invalid byte
                           (write r2)
                         (write-multibyte-character r5 r2))
                     (write-multibyte-character r6 r2))
                   (if (r3 < #xa0)
                       (if (r3 < #x80)  ; invalid byte
                           (write r3)
                         (write-multibyte-character r5 r3))
                     (write-multibyte-character r6 r3))
                   (if (r0 >= #xf8)     ; 5- or 6-byte encoding
                       ((read r1)
                        (if (r1 < #xa0)
                            (if (r1 < #x80) ; invalid byte
                                (write r1)
                              (write-multibyte-character r5 r1))
                          (write-multibyte-character r6 r1))
                        (if (r0 >= #xfc) ; 6-byte
                            ((read r1)
                             (if (r1 < #xa0)
                                 (if (r1 < #x80) ; invalid byte
                                     (write r1)
                                   (write-multibyte-character r5 r1))
                               (write-multibyte-character r6 r1)))))))
                ;; else invalid byte >= #xfe
                (write-multibyte-character r6 r0))))))
      (repeat))))

  "CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
`utf-8-subst-table'.
Encodings of un-representable Unicode characters are decoded asis into
eight-bit-control and eight-bit-graphic characters.")
412
(define-ccl-program ccl-encode-mule-utf-8
  ;; Register use: r0 is the charset id and r1 the code of the
  ;; character being encoded.  r5/r6 hold a character (charset, code)
  ;; that was read ahead while handling an eight-bit-graphic sequence
  ;; and still has to be encoded; r5 is -1 when there is none.
  `(1
    ((r5 = -1)
     (loop
      (if (r5 < 0)
          ((r1 = -1)
           (read-multibyte-character r0 r1)
           (translate-character ucs-mule-to-mule-unicode r0 r1))
        (;; We have already done read-multibyte-character.
         (r0 = r5)
         (r1 = r6)
         (r5 = -1)))

      (if (r0 == ,(charset-id 'ascii))
          (write r1)

        (if (r0 == ,(charset-id 'latin-iso8859-1))
            ;; r1          scalar                  utf-8
            ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
            ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
            ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
            ((r0 = (((r1 & #x40) >> 6) | #xc2))
             (r1 &= #x3f)
             (r1 |= #x80)
             (write r0 r1))

          (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
              ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
               ;; #x3f80 == (0011 1111 1000 0000)b
               (r1 &= #x7f)
               (r1 += (r0 + 224))       ; 224 == -32 + #x0100
               ;; now r1 holds scalar value
               (if (r1 < #x0800)
                   ;; 2byte encoding
                   ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
                    ;; #x07c0 == (0000 0111 1100 0000)b
                    (r1 &= #x3f)
                    (r1 |= #x80)
                    (write r0 r1))
                 ;; 3byte encoding
                 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
                  (r2 = ((r1 & #x3f) | #x80))
                  (r1 &= #x0fc0)
                  (r1 >>= 6)
                  (r1 |= #x80)
                  (write r0 r1 r2))))

            (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
                ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                 (r1 &= #x7f)
                 (r1 += (r0 + 9440))    ; 9440 == -32 + #x2500
                 (r0 = (((r1 & #xf000) >> 12) | #xe0))
                 (r2 = ((r1 & #x3f) | #x80))
                 (r1 &= #x0fc0)
                 (r1 >>= 6)
                 (r1 |= #x80)
                 (write r0 r1 r2))

              (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                   (r1 &= #x7f)
                   (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
                   (r0 = (((r1 & #xf000) >> 12) | #xe0))
                   (r2 = ((r1 & #x3f) | #x80))
                   (r1 &= #x0fc0)
                   (r1 >>= 6)
                   (r1 |= #x80)
                   (write r0 r1 r2))

                (if (r0 == ,(charset-id 'eight-bit-control))
                    ;; r1          scalar                  utf-8
                    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
                    ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
                    ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
                    ((write #xc2)
                     (write r1))

                  (if (r0 == ,(charset-id 'eight-bit-graphic))
                      ;; The byte is written out unchanged, not as the
                      ;; 2-byte UTF-8 form of U+00A0..U+00FF: the
                      ;; decoder uses eight-bit-graphic for the lead
                      ;; byte of sequences it could not translate.  Up
                      ;; to two following characters are then read
                      ;; ahead.  One that is neither eight-bit-graphic
                      ;; nor eight-bit-control is parked in r5/r6 for
                      ;; the next iteration of the loop.
                      ((write r1)
                       (r1 = -1)
                       (read-multibyte-character r0 r1)
                       (if (r0 != ,(charset-id 'eight-bit-graphic))
                           (if (r0 != ,(charset-id 'eight-bit-control))
                               ((r5 = r0)
                                (r6 = r1))))
                       (if (r5 < 0)
                           ;; The first follower (r1) is a raw byte;
                           ;; look at the second one (r2).
                           ((read-multibyte-character r0 r2)
                            (if (r0 != ,(charset-id 'eight-bit-graphic))
                                (if (r0 != ,(charset-id 'eight-bit-control))
                                    ((r5 = r0)
                                     (r6 = r2))))
                            (if (r5 < 0)
                                ;; Both followers are raw bytes: copy
                                ;; them through unchanged.
                                (write r1 r2)
                              ;; Only r1 is a raw byte; r2 was parked.
                              (if (r1 < #xa0)
                                  (write r1)
                                ((write #xc2)
                                 (write r1)))))))

                    ;; Any other charset: try the reverse substitution
                    ;; table, which on success leaves the unicode in r0.
                    ((lookup-character utf-8-subst-rev-table r0 r1)
                     (if r7             ; lookup succeeded
                         ((r1 = (((r0 & #xf000) >> 12) | #xe0))
                          (r2 = ((r0 & #x3f) | #x80))
                          (r0 &= #x0fc0)
                          (r0 >>= 6)
                          (r0 |= #x80)
                          (write r1 r0 r2))
                       ;; Unsupported character.
                       ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
                       ((write #xef)
                        (write #xbf)
                        (write #xbd)))))))))))
      (repeat)))
    ;; Block run at end of input: write out a byte >= #x80 still held
    ;; in r1, raw if it is >= #xa0 and prefixed with #xc2 otherwise.
    (if (r1 >= #xa0)
        (write r1)
      (if (r1 >= #x80)
          ((write #xc2)
           (write r1)))))

  "CCL program to encode into UTF-8.")
536
;; Placeholder translation table, defined only so that the CCL can be
;; checked correctly; the actual data are loaded on demand.  An
;; existing binding is left alone so that we don't zap it.
(or (boundp 'ucs-mule-8859-to-mule-unicode)
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
541
(define-ccl-program ccl-untranslated-to-ucs
  ;; Works purely on registers: the caller stores the bytes of the
  ;; sequence in r0..r3 and reads the result back from r0.  The leading
  ;; 0 is the buffer magnification; no `read' or `write' is done.
  `(0
    (if (r0 < #xf0)                     ; 3-byte encoding, as above
        ((r4 = 0)                       ; set to 1 if the sequence is invalid
         ;; Gather the top two bits of r1 and of r2 into r3 so that one
         ;; comparison checks both are continuation bytes (10xxxxxx).
         (r3 = (r1 & #b11000000))
         (r3 |= ((r2 >> 2) & #b00110000))
         (if (r3 != #b10100000)
             (r4 = 1)
           ((r3 = ((r0 & #x0f) << 12))
            (r3 += ((r1 & #x3f) << 6))
            (r3 += (r2 & #x3f))
            ;; Values below U+0800 are rejected as invalid here.
            (if (r3 < #x0800)
                (r4 = 1))))
         (if (r4 != 0)
             (r0 = 0)
           (r0 = r3)))
      (if (r0 < #xf8)                   ; 4-byte (Mule-UCS recipe)
          ;; Each of r1, r2, r3 must be a continuation byte in turn.
          ((r4 = (r1 >> 6))
           (if (r4 != #b10)
               (r0 = 0)
             ((r4 = (r2 >> 6))
              (if (r4 != #b10)
                  (r0 = 0)
                ((r4 = (r3 >> 6))
                 (if (r4 != #b10)
                     (r0 = 0)
                   ;; 3 bits from r0 and 6 bits from each other byte.
                   ((r1 = ((r1 & #x3F) << 12))
                    (r2 = ((r2 & #x3F) << 6))
                    (r3 &= #x3F)
                    (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
        ;; Lead byte >= #xf8: not handled.
        (r0 = 0))))
  "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
r0 == 0 for invalid sequence.")
575
;; Register vector handed to `ccl-execute' by
;; `utf-8-untranslated-to-ucs': elements 0..3 are filled with the
;; characters at point and element 0 holds the result afterwards.
(defvar utf-8-ccl-regs (make-vector 8 0))
577
(defsubst utf-8-untranslated-to-ucs ()
  "Return the UCS code for an untranslated sequence of raw bytes at point.
Only for 3- or 4-byte sequences."
  ;; Load the four characters starting at point into registers r0..r3,
  ;; using 0 for any position that has no character.
  (let ((base (point)))
    (dotimes (i 4)
      (aset utf-8-ccl-regs i (or (char-after (+ base i)) 0))))
  (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
  ;; The CCL program leaves its result in r0.
  (aref utf-8-ccl-regs 0))
587
(defun utf-8-help-echo (window object position)
  "Return help-echo text for untranslated UTF-8 at POSITION in OBJECT.
The text gives, in hexadecimal, the value of the `untranslated-utf-8'
property found there.  WINDOW is not used."
  (let ((ucs (get-char-property position 'untranslated-utf-8 object)))
    (format "Untranslated Unicode U+%04X" ucs)))
591
;; We compose the untranslatable sequences into a single character.
;; This is infelicitous for editing, because there's currently no
;; mechanism for treating compositions as atomic, but is OK for
;; display.  They are composed to U+FFFD with help-echo which
;; indicates the unicodes they represent.  This function GCs too much.
(defsubst utf-8-compose ()
  "Put a suitable composition on an untranslatable sequence at point.
Return the sequence's length (3 or 4), or nil if the bytes at point
do not form a valid 3- or 4-byte sequence."
  (let* ((u (utf-8-untranslated-to-ucs))
         (l (unless (zerop u)
              (if (>= u #x10000)
                  4
                3))))
    (when l
      ;; Use one clamped end position for the properties and for the
      ;; composition alike.
      (let ((end (min (point-max) (+ l (point)))))
        (put-text-property (point) end 'untranslated-utf-8 u)
        (put-text-property (point) end 'help-echo 'utf-8-help-echo)
        ;; U+FFFD sits at position codes 117, 61 of
        ;; mule-unicode-e000-ffff: (117-32)*96 + (61-32) + #xe000.
        ;; Building it with `make-char' keeps it independent of how
        ;; this file is decoded.
        (compose-region (point) end
                        (make-char 'mule-unicode-e000-ffff 117 61)))
      l)))
612
;; Consulted by `utf-8-post-read-conversion'.
(defcustom utf-8-compose-scripts nil
  "*Non-nil means compose various scripts on decoding utf-8 text."
  :type 'boolean
  :version "21.4"
  :group 'mule)
618
(defun utf-8-post-read-conversion (length)
  "Compose untranslated utf-8 sequences into single characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil.
LENGTH is the number of decoded characters, starting at point; only
that text is examined.  Return the possibly updated length."
  (save-excursion
    (save-restriction
      ;; Confine the scan to the text that was just decoded, so that
      ;; raw bytes which were already in the buffer after it are left
      ;; alone.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      ;; Can't do eval-when-compile to insert a multibyte constant
      ;; version of the string in the loop, since it's always loaded as
      ;; unibyte from a byte-compiled file.
      (let ((range (string-as-multibyte "^\xe1-\xf7")))
        (while (and (skip-chars-forward range)
                    (not (eobp)))
          ;; `utf-8-compose' returns nil for an invalid sequence, in
          ;; which case `forward-char' just steps over one character.
          (forward-char (utf-8-compose))))))
  ;; Fixme: Takahashi-san implies it may not work this easily.  I
  ;; asked why but didn't get a reply.  -- fx
  (when (and utf-8-compose-scripts (> length 1))
    ;; These currently have definitions which cover the relevant
    ;; unicodes.  We could avoid loading thai-util &c by checking
    ;; whether the region contains any characters with the appropriate
    ;; categories.  There aren't yet Unicode-based rules for Tibetan.
    (save-excursion (setq length (diacritic-post-read-conversion length)))
    (save-excursion (setq length (thai-post-read-conversion length)))
    (save-excursion (setq length (lao-post-read-conversion length)))
    (save-excursion
      (setq length (in-is13194-devanagari-post-read-conversion length))))
  length)
643
644 ;; ucs-tables is preloaded
645 ;; (defun utf-8-pre-write-conversion (beg end)
646 ;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
647 ;; ;; Ensure translation table is loaded.
648 ;; (require 'ucs-tables)
649 ;; ;; Don't do this again.
650 ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
651 ;; nil)
652
;; Define the CCL-based coding system `mule-utf-8' (type 4, mnemonic
;; `u') from the decoder and encoder programs above.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are the following, plus any other
characters included in the tables `ucs-mule-to-mule-unicode' and
`utf-8-subst-rev-table':
 ascii
 eight-bit-control
 eight-bit-graphic
 latin-iso8859-1
 latin-iso8859-2
 latin-iso8859-3
 latin-iso8859-4
 cyrillic-iso8859-5
 greek-iso8859-7
 hebrew-iso8859-8
 latin-iso8859-9
 latin-iso8859-14
 latin-iso8859-15
 mule-unicode-0100-24ff
 mule-unicode-2500-33ff
 mule-unicode-e000-ffff

Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
\(see user option `utf-8-translate-cjk'); otherwise, sequences of
eight-bit-control and eight-bit-graphic characters are used to
preserve their byte sequences, and these are composed to display as a
single character.  Emacs characters that otherwise can't be encoded
are encoded as U+FFFD."

 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 '((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    latin-iso8859-15
    latin-iso8859-14
    latin-iso8859-9
    hebrew-iso8859-8
    greek-iso8859-7
    cyrillic-iso8859-5
    latin-iso8859-4
    latin-iso8859-3
    latin-iso8859-2
    vietnamese-viscii-lower
    vietnamese-viscii-upper
    thai-tis620
    ipa
    ethiopic
    indian-is13194
    katakana-jisx0201
    chinese-sisheng
    lao
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff)
   (mime-charset . utf-8)
   (coding-category . coding-category-utf-8)
   (valid-codes (0 . 255))
   ;; (pre-write-conversion . utf-8-pre-write-conversion)
   (post-read-conversion . utf-8-post-read-conversion)))
716
;; Make the plain name `utf-8' an alias for `mule-utf-8'.
(define-coding-system-alias 'utf-8 'mule-utf-8)
718
719 ;; I think this needs special private charsets defined for the
720 ;; untranslated sequences, if it's going to work well.
721
722 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
723 ;;; (let* ((prop (get-char-property pos 'composition string))
724 ;;; (l (and prop (- (cadr prop) (car prop)))))
725 ;;; (cond ((and l (> l (- to pos)))
726 ;;; (delete-region pos to))
727 ;;; ((and (> (char-after pos) 224)
728 ;;; (< (char-after pos) 256)
729 ;;; (save-restriction
730 ;;; (narrow-to-region pos to)
731 ;;; (utf-8-compose)))
732 ;;; t))))
733
734 ;;; (dotimes (i 96)
735 ;;; (aset composition-function-table
736 ;;; (+ 128 i)
737 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
738 ;;; . utf-8-compose-function))))
739
740 ;;; utf-8.el ends here