]> code.delx.au - gnu-emacs/blob - lisp/international/utf-8.el
Comment change.
[gnu-emacs] / lisp / international / utf-8.el
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
2
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001 Free Software Foundation, Inc.
6
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
8 ;; Keywords: multilingual, Unicode, UTF-8, i18n
9
10 ;; This file is part of GNU Emacs.
11
12 ;; GNU Emacs is free software; you can redistribute it and/or modify
13 ;; it under the terms of the GNU General Public License as published by
14 ;; the Free Software Foundation; either version 2, or (at your option)
15 ;; any later version.
16
17 ;; GNU Emacs is distributed in the hope that it will be useful,
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;; GNU General Public License for more details.
21
22 ;; You should have received a copy of the GNU General Public License
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 ;; Boston, MA 02111-1307, USA.
26
27 ;;; Commentary:
28
29 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
30 ;; of the following character sets to and from UTF-8:
31 ;;
32 ;; ascii
33 ;; eight-bit-control
34 ;; latin-iso8859-1
35 ;; mule-unicode-0100-24ff
36 ;; mule-unicode-2500-33ff
37 ;; mule-unicode-e000-ffff
38 ;;
39 ;; On decoding, Unicode characters that do not fit into the above
40 ;; character sets are handled as `eight-bit-control' or
41 ;; `eight-bit-graphic' characters to retain the information about the
42 ;; original byte sequence.
43 ;;
44 ;; Characters from other character sets can be encoded with
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
46 ;; registering the translation with `register-char-codings'.
47
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
49
50 ;; scalar | utf-8
51 ;; value | 1st byte | 2nd byte | 3rd byte
52 ;; --------------------+-----------+-----------+----------
53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
56
57 ;;; Code:
58
59 (defvar ucs-mule-to-mule-unicode (make-translation-table)
60 "Translation table for encoding to `mule-utf-8'.")
61 ;; Could have been done by ucs-tables loaded before.
62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
63 (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode))
64 (define-ccl-program ccl-decode-mule-utf-8
65 ;;
66 ;; charset | bytes in utf-8 | bytes in emacs
67 ;; -----------------------+----------------+---------------
68 ;; ascii | 1 | 1
69 ;; -----------------------+----------------+---------------
70 ;; eight-bit-control | 2 | 2
71 ;; eight-bit-graphic | 2 | 1
72 ;; latin-iso8859-1 | 2 | 2
73 ;; -----------------------+----------------+---------------
74 ;; mule-unicode-0100-24ff | 2 | 4
75 ;; (< 0800) | |
76 ;; -----------------------+----------------+---------------
77 ;; mule-unicode-0100-24ff | 3 | 4
78 ;; (>= 8000) | |
79 ;; mule-unicode-2500-33ff | 3 | 4
80 ;; mule-unicode-e000-ffff | 3 | 4
81 ;;
82 ;; Thus magnification factor is two.
83 ;;
84 `(2
85 ((r5 = ,(charset-id 'eight-bit-control))
86 (r6 = ,(charset-id 'eight-bit-graphic))
87 (loop
88 (read r0)
89
90 ;; 1byte encoding, i.e., ascii
91 (if (r0 < #x80)
92 (write r0)
93
94 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
95 (if (r0 < #xe0)
96 ((read r1)
97
98 (if ((r1 & #b11000000) != #b10000000)
99 ;; Invalid 2-byte sequence
100 ((if (r0 < #xa0)
101 (write-multibyte-character r5 r0)
102 (write-multibyte-character r6 r0))
103 (if (r1 < #x80)
104 (write r1)
105 (if (r1 < #xa0)
106 (write-multibyte-character r5 r1)
107 (write-multibyte-character r6 r1))))
108
109 ((r0 &= #x1f)
110 (r0 <<= 6)
111 (r1 &= #x3f)
112 (r1 += r0)
113 ;; Now r1 holds scalar value
114
115 ;; eight-bit-control
116 (if (r1 < 160)
117 ((write-multibyte-character r5 r1))
118
119 ;; latin-iso8859-1
120 (if (r1 < 256)
121 ((r0 = ,(charset-id 'latin-iso8859-1))
122 (r1 -= 128)
123 (write-multibyte-character r0 r1))
124
125 ;; mule-unicode-0100-24ff (< 0800)
126 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
127 (r1 -= #x0100)
128 (r2 = (((r1 / 96) + 32) << 7))
129 (r1 %= 96)
130 (r1 += (r2 + 32))
131 (write-multibyte-character r0 r1)))))))
132
133 ;; 3byte encoding
134 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
135 (if (r0 < #xf0)
136 ((read r1 r2)
137
138 ;; This is set to 1 if the encoding is invalid.
139 (r4 = 0)
140
141 (r3 = (r1 & #b11000000))
142 (r3 |= ((r2 >> 2) & #b00110000))
143 (if (r3 != #b10100000)
144 (r4 = 1)
145 ((r3 = ((r0 & #x0f) << 12))
146 (r3 += ((r1 & #x3f) << 6))
147 (r3 += (r2 & #x3f))
148 (if (r3 < #x0800)
149 (r4 = 1))))
150
151 (if (r4 != 0)
152 ;; Invalid 3-byte sequence
153 ((if (r0 < #xa0)
154 (write-multibyte-character r5 r0)
155 (write-multibyte-character r6 r0))
156 (if (r1 < #x80)
157 (write r1)
158 (if (r1 < #xa0)
159 (write-multibyte-character r5 r1)
160 (write-multibyte-character r6 r1)))
161 (if (r2 < #x80)
162 (write r2)
163 (if (r2 < #xa0)
164 (write-multibyte-character r5 r2)
165 (write-multibyte-character r6 r2))))
166
167 ;; mule-unicode-0100-24ff (>= 0800)
168 ((if (r3 < #x2500)
169 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
170 (r3 -= #x0100)
171 (r3 //= 96)
172 (r1 = (r7 + 32))
173 (r1 += ((r3 + 32) << 7))
174 (write-multibyte-character r0 r1))
175
176 ;; mule-unicode-2500-33ff
177 (if (r3 < #x3400)
178 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
179 (r3 -= #x2500)
180 (r3 //= 96)
181 (r1 = (r7 + 32))
182 (r1 += ((r3 + 32) << 7))
183 (write-multibyte-character r0 r1))
184
185 ;; U+3400 .. U+DFFF
186 ;; keep those bytes as eight-bit-{control|graphic}
187 (if (r3 < #xe000)
188 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
189 (r3 = r6)
190 (write-multibyte-character r3 r0)
191 (if (r1 < #xa0)
192 (r3 = r5))
193 (write-multibyte-character r3 r1)
194 (if (r2 < #xa0)
195 (r3 = r5)
196 (r3 = r6))
197 (write-multibyte-character r3 r2))
198
199 ;; mule-unicode-e000-ffff
200 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
201 (r3 -= #xe000)
202 (r3 //= 96)
203 (r1 = (r7 + 32))
204 (r1 += ((r3 + 32) << 7))
205 (write-multibyte-character r0 r1))))))))
206
207 ;; 4byte encoding
208 ;; keep those bytes as eight-bit-{control|graphic}
209 ((read r1 r2 r3)
210 ;; r0 > #xf0, thus eight-bit-graphic
211 (write-multibyte-character r6 r0)
212 (if (r1 < #xa0)
213 (write-multibyte-character r5 r1)
214 (write-multibyte-character r6 r1))
215 (if (r2 < #xa0)
216 (write-multibyte-character r5 r2)
217 (write-multibyte-character r6 r2))
218 (if (r3 < #xa0)
219 (write-multibyte-character r5 r3)
220 (write-multibyte-character r6 r3))))))
221
222 (repeat))))
223
224 "CCL program to decode UTF-8.
225 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
226 mule-unicode-*. Encodings of un-representable Unicode characters are
227 decoded asis into eight-bit-control and eight-bit-graphic
228 characters.")
229
230 (define-ccl-program ccl-encode-mule-utf-8
231 `(1
232 ((r5 = -1)
233 (loop
234 (if (r5 < 0)
235 ((r1 = -1)
236 (read-multibyte-character r0 r1)
237 (translate-character ucs-mule-to-mule-unicode r0 r1))
238 (;; We have already done read-multibyte-character.
239 (r0 = r5)
240 (r1 = r6)
241 (r5 = -1)))
242
243 (if (r0 == ,(charset-id 'ascii))
244 (write r1)
245
246 (if (r0 == ,(charset-id 'latin-iso8859-1))
247 ;; r1 scalar utf-8
248 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
249 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
250 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
251 ((r0 = (((r1 & #x40) >> 6) | #xc2))
252 (r1 &= #x3f)
253 (r1 |= #x80)
254 (write r0 r1))
255
256 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
257 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
258 ;; #x3f80 == (0011 1111 1000 0000)b
259 (r1 &= #x7f)
260 (r1 += (r0 + 224)) ; 240 == -32 + #x0100
261 ;; now r1 holds scalar value
262 (if (r1 < #x0800)
263 ;; 2byte encoding
264 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
265 ;; #x07c0 == (0000 0111 1100 0000)b
266 (r1 &= #x3f)
267 (r1 |= #x80)
268 (write r0 r1))
269 ;; 3byte encoding
270 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
271 (r2 = ((r1 & #x3f) | #x80))
272 (r1 &= #x0fc0)
273 (r1 >>= 6)
274 (r1 |= #x80)
275 (write r0 r1 r2))))
276
277 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
278 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
279 (r1 &= #x7f)
280 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
281 (r0 = (((r1 & #xf000) >> 12) | #xe0))
282 (r2 = ((r1 & #x3f) | #x80))
283 (r1 &= #x0fc0)
284 (r1 >>= 6)
285 (r1 |= #x80)
286 (write r0 r1 r2))
287
288 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
289 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
290 (r1 &= #x7f)
291 (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
292 (r0 = (((r1 & #xf000) >> 12) | #xe0))
293 (r2 = ((r1 & #x3f) | #x80))
294 (r1 &= #x0fc0)
295 (r1 >>= 6)
296 (r1 |= #x80)
297 (write r0 r1 r2))
298
299 (if (r0 == ,(charset-id 'eight-bit-control))
300 ;; r1 scalar utf-8
301 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
302 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
303 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
304 ((write #xc2)
305 (write r1))
306
307 (if (r0 == ,(charset-id 'eight-bit-graphic))
308 ;; r1 scalar utf-8
309 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
310 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
311 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
312 ((write r1)
313 (r1 = -1)
314 (read-multibyte-character r0 r1)
315 (if (r0 != ,(charset-id 'eight-bit-graphic))
316 (if (r0 != ,(charset-id 'eight-bit-control))
317 ((r5 = r0)
318 (r6 = r1))))
319 (if (r5 < 0)
320 ((read-multibyte-character r0 r2)
321 (if (r0 != ,(charset-id 'eight-bit-graphic))
322 (if (r0 != ,(charset-id 'eight-bit-control))
323 ((r5 = r0)
324 (r6 = r2))))
325 (if (r5 < 0)
326 (write r1 r2)
327 (if (r1 < #xa0)
328 (write r1)
329 ((write #xc2)
330 (write r1)))))))
331
332 ;; Unsupported character.
333 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
334 ((write #xef)
335 (write #xbf)
336 (write #xbd)))))))))
337 (repeat)))
338 (if (r1 >= #xa0)
339 (write r1)
340 (if (r1 >= #x80)
341 ((write #xc2)
342 (write r1)))))
343
344 "CCL program to encode into UTF-8.
345 Only characters from the charsets ascii, eight-bit-control,
346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
347 Others are encoded as U+FFFD.")
348
349 ;; Dummy definition so that the CCL can be checked correctly; the
350 ;; actual data are loaded on demand.
351 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
352 (define-translation-table 'ucs-mule-8859-to-mule-unicode))
353
354 (defsubst utf-8-untranslated-to-ucs ()
355 (let ((b1 (char-after))
356 (b2 (char-after (1+ (point))))
357 (b3 (char-after (+ 2 (point))))
358 (b4 (char-after (+ 4 (point)))))
359 (if (and b1 b2 b3)
360 (cond ((< b1 ?\xf0)
361 (setq b2 (lsh (logand b2 ?\x3f) 6))
362 (setq b3 (logand b3 ?\x3f))
363 (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
364 (b4
365 (setq b2 (lsh (logand b2 ?\x3f) 12))
366 (setq b3 (lsh (logand b3 ?\x3f) 6))
367 (setq b4 (logand b4 ?\x3f))
368 (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
369 18)))))))))
370
371 (defun utf-8-help-echo (window object position)
372 (format "Untranslated Unicode U+%04X"
373 (get-char-property position 'untranslated-utf-8 object)))
374
375 (defvar utf-8-subst-table nil
376 "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
377
378 ;; We compose the untranslatable sequences into a single character.
379 ;; This is infelicitous for editing, because there's currently no
380 ;; mechanism for treating compositions as atomic, but is OK for
381 ;; display. We try to compose an appropriate character from a hash
382 ;; table of CJK characters to display correctly. Otherwise we use
383 ;; U+FFFD. What we really should have is hash table lookup from CCL
384 ;; so that we could do this properly. This function GCs too much.
385 (defsubst utf-8-compose ()
386 "Put a suitable composition on an untranslatable sequence.
387 Return the sequence's length."
388 (let* ((u (utf-8-untranslated-to-ucs))
389 (l (and u (if (>= u ?\x10000)
390 4
391 3)))
392 (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
393 (when u
394 (put-text-property (point) (min (point-max) (+ l (point)))
395 'untranslated-utf-8 u)
396 (unless subst
397 (put-text-property (point) (min (point-max) (+ l (point)))
398 'help-echo 'utf-8-help-echo)
399 (setq subst ?\e$,3u=\e(B))
400 (compose-region (point) (+ l (point)) subst)
401 l)))
402
403 (defcustom utf-8-compose-scripts nil
404 "*Non-nil means compose various scipts on decoding utf-8 text."
405 :group 'mule
406 :type 'boolean) ; omitted in Emacs 21.1
407
408 (defun utf-8-post-read-conversion (length)
409 "Compose untranslated utf-8 sequences into single characters.
410 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
411 (save-excursion
412 ;; Can't do eval-when-compile to insert a multibyte constant
413 ;; version of the string in the loop, since it's always loaded as
414 ;; unibyte from a byte-compiled file.
415 (let ((range (string-as-multibyte "^\341-\377")))
416 (while (and (skip-chars-forward
417 range)
418 (not (eobp)))
419 (forward-char (utf-8-compose)))))
420 ;; Fixme: Takahashi-san implies it may not work this easily -- needs
421 ;; checking with him.
422 (when (and utf-8-compose-scripts (> length 1))
423 ;; These currently have definitions which cover the relevant
424 ;; Unicodes. We could avoid loading thai-util &c by checking
425 ;; whether the region contains any characters with the appropriate
426 ;; categories. There aren't yet Unicode-based rules for Tibetan.
427 (save-excursion (setq length (diacritic-post-read-conversion length)))
428 (save-excursion (setq length (thai-post-read-conversion length)))
429 (save-excursion (setq length (lao-post-read-conversion length)))
430 (save-excursion (setq length (devanagari-post-read-conversion length))))
431 length)
432
433 (defun utf-8-pre-write-conversion (beg end)
434 "Semi-dummy pre-write function effectively to autoload ucs-tables."
435 ;; Ensure translation table is loaded.
436 (require 'ucs-tables)
437 ;; Don't do this again.
438 (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
439 nil)
440
441 (make-coding-system
442 'mule-utf-8 4 ?u
443 "UTF-8 encoding for Emacs-supported Unicode characters.
444 The supported Emacs character sets are the following, plus others
445 which may be included in the translation table
446 `ucs-mule-to-mule-unicode':
447 ascii
448 eight-bit-control
449 eight-bit-graphic
450 latin-iso8859-1
451 latin-iso8859-2
452 latin-iso8859-3
453 latin-iso8859-4
454 cyrillic-iso8859-5
455 greek-iso8859-7
456 hebrew-iso8859-8
457 latin-iso8859-9
458 latin-iso8859-14
459 latin-iso8859-15
460 mule-unicode-0100-24ff
461 mule-unicode-2500-33ff
462 mule-unicode-e000-ffff
463
464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
465 are decoded into sequences of eight-bit-control and eight-bit-graphic
466 characters to preserve their byte sequences and composed to display as
467 a single character. Emacs characters that can't be encoded to these
468 ranges are encoded as U+FFFD."
469
470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
471 '((safe-charsets
472 ascii
473 eight-bit-control
474 eight-bit-graphic
475 latin-iso8859-1
476 latin-iso8859-15
477 latin-iso8859-14
478 latin-iso8859-9
479 hebrew-iso8859-8
480 greek-iso8859-7
481 cyrillic-iso8859-5
482 latin-iso8859-4
483 latin-iso8859-3
484 latin-iso8859-2
485 vietnamese-viscii-lower
486 vietnamese-viscii-upper
487 thai-tis620
488 ipa
489 ethiopic
490 indian-is13194
491 katakana-jisx0201
492 chinese-sisheng
493 lao
494 mule-unicode-0100-24ff
495 mule-unicode-2500-33ff
496 mule-unicode-e000-ffff)
497 (mime-charset . utf-8)
498 (coding-category . coding-category-utf-8)
499 (valid-codes (0 . 255))
500 (pre-write-conversion . utf-8-pre-write-conversion)
501 (post-read-conversion . utf-8-post-read-conversion)))
502
503 (define-coding-system-alias 'utf-8 'mule-utf-8)
504
505 ;; I think this needs special private charsets defined for the
506 ;; untranslated sequences, if it's going to work well.
507
508 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
509 ;;; (let* ((prop (get-char-property pos 'composition string))
510 ;;; (l (and prop (- (cadr prop) (car prop)))))
511 ;;; (cond ((and l (> l (- to pos)))
512 ;;; (delete-region pos to))
513 ;;; ((and (> (char-after pos) 224)
514 ;;; (< (char-after pos) 256)
515 ;;; (save-restriction
516 ;;; (narrow-to-region pos to)
517 ;;; (utf-8-compose)))
518 ;;; t))))
519
520 ;;; (dotimes (i 96)
521 ;;; (aset composition-function-table
522 ;;; (+ 128 i)
523 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
524 ;;; . utf-8-compose-function))))
525
526 ;;; utf-8.el ends here