1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001 Free Software Foundation, Inc.
7 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
8 ;; Keywords: multilingual, Unicode, UTF-8, i18n
10 ;; This file is part of GNU Emacs.
12 ;; GNU Emacs is free software; you can redistribute it and/or modify
13 ;; it under the terms of the GNU General Public License as published by
14 ;; the Free Software Foundation; either version 2, or (at your option)
17 ;; GNU Emacs is distributed in the hope that it will be useful,
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;; GNU General Public License for more details.
22 ;; You should have received a copy of the GNU General Public License
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 ;; Boston, MA 02111-1307, USA.
29 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
30 ;; of the following character sets to and from UTF-8:
35 ;; mule-unicode-0100-24ff
36 ;; mule-unicode-2500-33ff
37 ;; mule-unicode-e000-ffff
39 ;; On decoding, Unicode characters that do not fit into the above
40 ;; character sets are handled as `eight-bit-control' or
41 ;; `eight-bit-graphic' characters to retain the information about the
42 ;; original byte sequence.
44 ;; Characters from other character sets can be encoded with
45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
46 ;; registering the translation with `register-char-codings'.
48 ;; UTF-8 was originally defined in RFC 2279 (since obsoleted by RFC 3629). A sketch of the encoding is:
51 ;; value | 1st byte | 2nd byte | 3rd byte
52 ;; --------------------+-----------+-----------+----------
53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
(defvar ucs-mule-to-mule-unicode (make-translation-table)
  "Translation table for encoding to `mule-utf-8'.")
;; If ucs-tables was loaded earlier, the table is already registered
;; under this symbol; only register it here when that has not happened.
(or (get 'ucs-mule-to-mule-unicode 'translation-table)
    (define-translation-table 'ucs-mule-to-mule-unicode
      ucs-mule-to-mule-unicode))
64 (define-ccl-program ccl-decode-mule-utf-8
66 ;; charset | bytes in utf-8 | bytes in emacs
67 ;; -----------------------+----------------+---------------
69 ;; -----------------------+----------------+---------------
70 ;; eight-bit-control | 2 | 2
71 ;; eight-bit-graphic | 2 | 1
72 ;; latin-iso8859-1 | 2 | 2
73 ;; -----------------------+----------------+---------------
74 ;; mule-unicode-0100-24ff | 2 | 4
76 ;; -----------------------+----------------+---------------
77 ;; mule-unicode-0100-24ff | 3 | 4
79 ;; mule-unicode-2500-33ff | 3 | 4
80 ;; mule-unicode-e000-ffff | 3 | 4
82 ;; Thus magnification factor is two.
85 ((r5 = ,(charset-id 'eight-bit-control))
86 (r6 = ,(charset-id 'eight-bit-graphic))
90 ;; 1byte encoding, i.e., ascii
94 ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
98 (if ((r1 & #b11000000) != #b10000000)
99 ;; Invalid 2-byte sequence
101 (write-multibyte-character r5 r0)
102 (write-multibyte-character r6 r0))
106 (write-multibyte-character r5 r1)
107 (write-multibyte-character r6 r1))))
113 ;; Now r1 holds scalar value
117 ((write-multibyte-character r5 r1))
121 ((r0 = ,(charset-id 'latin-iso8859-1))
123 (write-multibyte-character r0 r1))
125 ;; mule-unicode-0100-24ff (< 0800)
126 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
128 (r2 = (((r1 / 96) + 32) << 7))
131 (write-multibyte-character r0 r1)))))))
134 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
138 ;; This is set to 1 if the encoding is invalid.
141 (r3 = (r1 & #b11000000))
142 (r3 |= ((r2 >> 2) & #b00110000))
143 (if (r3 != #b10100000)
145 ((r3 = ((r0 & #x0f) << 12))
146 (r3 += ((r1 & #x3f) << 6))
152 ;; Invalid 3-byte sequence
154 (write-multibyte-character r5 r0)
155 (write-multibyte-character r6 r0))
159 (write-multibyte-character r5 r1)
160 (write-multibyte-character r6 r1)))
164 (write-multibyte-character r5 r2)
165 (write-multibyte-character r6 r2))))
167 ;; mule-unicode-0100-24ff (>= 0800)
169 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
173 (r1 += ((r3 + 32) << 7))
174 (write-multibyte-character r0 r1))
176 ;; mule-unicode-2500-33ff
178 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
182 (r1 += ((r3 + 32) << 7))
183 (write-multibyte-character r0 r1))
186 ;; keep those bytes as eight-bit-{control|graphic}
188 ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
190 (write-multibyte-character r3 r0)
193 (write-multibyte-character r3 r1)
197 (write-multibyte-character r3 r2))
199 ;; mule-unicode-e000-ffff
200 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
204 (r1 += ((r3 + 32) << 7))
205 (write-multibyte-character r0 r1))))))))
208 ;; keep those bytes as eight-bit-{control|graphic}
210 ;; r0 > #xf0, thus eight-bit-graphic
211 (write-multibyte-character r6 r0)
213 (write-multibyte-character r5 r1)
214 (write-multibyte-character r6 r1))
216 (write-multibyte-character r5 r2)
217 (write-multibyte-character r6 r2))
219 (write-multibyte-character r5 r3)
220 (write-multibyte-character r6 r3))))))
224 "CCL program to decode UTF-8.
225 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
226 mule-unicode-*. Encodings of un-representable Unicode characters are
227 decoded asis into eight-bit-control and eight-bit-graphic
;; Encoding half of the (decode . encode) CCL pair handed to the
;; `mule-utf-8' coding system definition later in this file.
230 (define-ccl-program ccl-encode-mule-utf-8
236 (read-multibyte-character r0 r1)
237 (translate-character ucs-mule-to-mule-unicode r0 r1))
238 (;; We have already done read-multibyte-character.
243 (if (r0 == ,(charset-id 'ascii))
246 (if (r0 == ,(charset-id 'latin-iso8859-1))
248 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
249 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
250 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
251 ((r0 = (((r1 & #x40) >> 6) | #xc2))
256 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
;; r0 = (bits 7..13 of the code point, minus 32) * 96
257 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
258 ;; #x3f80 == (0011 1111 1000 0000)b
260 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
261 ;; now r1 holds scalar value
264 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
265 ;; #x07c0 == (0000 0111 1100 0000)b
270 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
271 (r2 = ((r1 & #x3f) | #x80))
277 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
278 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
280 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
281 (r0 = (((r1 & #xf000) >> 12) | #xe0))
282 (r2 = ((r1 & #x3f) | #x80))
288 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
289 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
291 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
292 (r0 = (((r1 & #xf000) >> 12) | #xe0))
293 (r2 = ((r1 & #x3f) | #x80))
299 (if (r0 == ,(charset-id 'eight-bit-control))
301 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
302 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
303 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
307 (if (r0 == ,(charset-id 'eight-bit-graphic))
309 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
310 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
311 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
314 (read-multibyte-character r0 r1)
315 (if (r0 != ,(charset-id 'eight-bit-graphic))
316 (if (r0 != ,(charset-id 'eight-bit-control))
320 ((read-multibyte-character r0 r2)
321 (if (r0 != ,(charset-id 'eight-bit-graphic))
322 (if (r0 != ,(charset-id 'eight-bit-control))
332 ;; Unsupported character.
333 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
344 "CCL program to encode into UTF-8.
345 Only characters from the charsets ascii, eight-bit-control,
346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
347 Others are encoded as U+FFFD.")
;; Placeholder registration: it exists only so that the CCL programs
;; can be checked.  The real table contents are loaded on demand.
(or (boundp 'ucs-mule-8859-to-mule-unicode) ; keep a table that already exists
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
(defsubst utf-8-untranslated-to-ucs ()
  "Return the UCS code of the untranslated UTF-8 sequence at point.
The characters starting at point are taken to hold the bytes of a
3- or 4-byte UTF-8 sequence, one byte per character.  Return nil if
fewer than three characters follow point, or if the lead byte calls
for a 4-byte sequence and the fourth character is missing."
  (let ((b1 (char-after))
        (b2 (char-after (1+ (point))))
        (b3 (char-after (+ 2 (point))))
        ;; The fourth byte sits at offset 3; offset 4 would read the
        ;; character *after* the sequence.
        (b4 (char-after (+ 3 (point)))))
    (if (and b1 b2 b3)
        (cond ((< b1 ?\xf0)
               ;; 1110zzzz 10yyyyyy 10xxxxxx
               (setq b2 (lsh (logand b2 ?\x3f) 6))
               (setq b3 (logand b3 ?\x3f))
               (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
              (b4
               ;; 11110www 10zzzzzz 10yyyyyy 10xxxxxx
               (setq b2 (lsh (logand b2 ?\x3f) 12))
               (setq b3 (lsh (logand b3 ?\x3f) 6))
               (setq b4 (logand b4 ?\x3f))
               (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
                                                     18)))))))))
(defun utf-8-help-echo (window object position)
  "Return help text naming the untranslated Unicode character at POSITION.
The code is read from the `untranslated-utf-8' property at POSITION in
OBJECT.  WINDOW is accepted but not used."
  (let ((code (get-char-property position 'untranslated-utf-8 object)))
    (format "Untranslated Unicode U+%04X" code)))
;; Consulted with `gethash' by `utf-8-compose', keyed by the value
;; returned from `utf-8-untranslated-to-ucs'.
375 (defvar utf-8-subst-table nil
376 "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
378 ;; We compose the untranslatable sequences into a single character.
379 ;; This is infelicitous for editing, because there's currently no
380 ;; mechanism for treating compositions as atomic, but is OK for
381 ;; display. We try to compose an appropriate character from a hash
382 ;; table of CJK characters to display correctly. Otherwise we use
383 ;; U+FFFD. What we really should have is hash table lookup from CCL
384 ;; so that we could do this properly. This function GCs too much.
385 (defsubst utf-8-compose ()
386 "Put a suitable composition on an untranslatable sequence.
387 Return the sequence's length."
388 (let* ((u (utf-8-untranslated-to-ucs))
389 (l (and u (if (>= u ?\x10000)
392 (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
394 (put-text-property (point) (min (point-max) (+ l (point)))
395 'untranslated-utf-8 u)
397 (put-text-property (point) (min (point-max) (+ l (point)))
398 'help-echo 'utf-8-help-echo)
399 (setq subst ?
\e$,3u=
\e(B))
400 (compose-region (point) (+ l (point)) subst)
;; Checked by `utf-8-post-read-conversion' before it runs the
;; script-specific post-read conversions.
(defcustom utf-8-compose-scripts nil
  "*Non-nil means compose various scripts on decoding utf-8 text."
  :type 'boolean)                       ; omitted in Emacs 21.1
408 (defun utf-8-post-read-conversion (length)
409 "Compose untranslated utf-8 sequences into single characters.
410 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
412 ;; Can't do eval-when-compile to insert a multibyte constant
413 ;; version of the string in the loop, since it's always loaded as
414 ;; unibyte from a byte-compiled file.
415 (let ((range (string-as-multibyte "^\341-\377")))
416 (while (and (skip-chars-forward
419 (forward-char (utf-8-compose)))))
420 ;; Fixme: Takahashi-san implies it may not work this easily -- needs
421 ;; checking with him.
422 (when (and utf-8-compose-scripts (> length 1))
423 ;; These currently have definitions which cover the relevant
424 ;; Unicodes. We could avoid loading thai-util &c by checking
425 ;; whether the region contains any characters with the appropriate
426 ;; categories. There aren't yet Unicode-based rules for Tibetan.
427 (save-excursion (setq length (diacritic-post-read-conversion length)))
428 (save-excursion (setq length (thai-post-read-conversion length)))
429 (save-excursion (setq length (lao-post-read-conversion length)))
430 (save-excursion (setq length (devanagari-post-read-conversion length))))
433 (defun utf-8-pre-write-conversion (beg end)
434 "Semi-dummy pre-write function effectively to autoload ucs-tables."
435 ;; Ensure translation table is loaded.
436 (require 'ucs-tables)
437 ;; Don't do this again.
438 (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
443 "UTF-8 encoding for Emacs-supported Unicode characters.
444 The supported Emacs character sets are the following, plus others
445 which may be included in the translation table
446 `ucs-mule-to-mule-unicode':
460 mule-unicode-0100-24ff
461 mule-unicode-2500-33ff
462 mule-unicode-e000-ffff
464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
465 are decoded into sequences of eight-bit-control and eight-bit-graphic
466 characters to preserve their byte sequences and composed to display as
467 a single character. Emacs characters that can't be encoded to these
468 ranges are encoded as U+FFFD."
470 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
485 vietnamese-viscii-lower
486 vietnamese-viscii-upper
494 mule-unicode-0100-24ff
495 mule-unicode-2500-33ff
496 mule-unicode-e000-ffff)
497 (mime-charset . utf-8)
498 (coding-category . coding-category-utf-8)
499 (valid-codes (0 . 255))
500 (pre-write-conversion . utf-8-pre-write-conversion)
501 (post-read-conversion . utf-8-post-read-conversion)))
503 (define-coding-system-alias 'utf-8 'mule-utf-8)
505 ;; I think this needs special private charsets defined for the
506 ;; untranslated sequences, if it's going to work well.
508 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
509 ;;; (let* ((prop (get-char-property pos 'composition string))
510 ;;; (l (and prop (- (cadr prop) (car prop)))))
511 ;;; (cond ((and l (> l (- to pos)))
512 ;;; (delete-region pos to))
513 ;;; ((and (> (char-after pos) 224)
514 ;;; (< (char-after pos) 256)
515 ;;; (save-restriction
516 ;;; (narrow-to-region pos to)
517 ;;; (utf-8-compose)))
521 ;;; (aset composition-function-table
523 ;;; `((,(string-as-multibyte "[\200-\237\240-\377]")
524 ;;; . utf-8-compose-function))))
526 ;;; utf-8.el ends here