1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
7 ;; Keywords: multilingual, Unicode, UTF-8, i18n
9 ;; This file is part of GNU Emacs.
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA.
28 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
29 ;; following character sets to and from UTF-8:
34 ;; mule-unicode-0100-24ff
35 ;; mule-unicode-2500-33ff
36 ;; mule-unicode-e000-ffff
38 ;; Characters of other character sets cannot be encoded with
39 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
40 ;; case and syntax information, so things like `downcase' will only
41 ;; work for characters from ASCII and Latin-1.
43 ;; On decoding, Unicode characters that do not fit into the above
44 ;; character sets are handled as `eight-bit-control' or
45 ;; `eight-bit-graphic' characters to retain the information about the
46 ;; original byte sequence.
48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
51 ;; value | 1st byte | 2nd byte | 3rd byte
52 ;; --------------------+-----------+-----------+----------
53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
;; CCL program that decodes UTF-8 byte sequences into Emacs characters.
;; Decoded characters are emitted with `write-multibyte-character', given
;; a charset id register and a code register.  The table below lists, per
;; target charset, how many bytes the character takes in UTF-8 and in
;; Emacs.
59 (define-ccl-program ccl-decode-mule-utf-8
61 ;; charset | bytes in utf-8 | bytes in emacs
62 ;; -----------------------+----------------+---------------
64 ;; -----------------------+----------------+---------------
65 ;; eight-bit-control | 2 | 2
66 ;; latin-iso8859-1 | 2 | 2
67 ;; -----------------------+----------------+---------------
68 ;; mule-unicode-0100-24ff | 2 | 4
70 ;; -----------------------+----------------+---------------
71 ;; mule-unicode-0100-24ff | 3 | 4
73 ;; mule-unicode-2500-33ff | 3 | 4
74 ;; mule-unicode-e000-ffff | 3 | 4
76 ;; Thus magnification factor is two.
;; (Worst case in the table above: 2 bytes of UTF-8 become 4 bytes in Emacs.)
82 ;; 1byte encoding, i.e., ascii
93 ;; now r1 holds scalar value
;; The scalar value in r1 is written out under the charset whose id is
;; loaded into r0 by each branch below.
97 ((r0 = ,(charset-id 'eight-bit-control))
98 (write-multibyte-character r0 r1))
102 ((r0 = ,(charset-id 'latin-iso8859-1))
104 (write-multibyte-character r0 r1))
106 ;; mule-unicode-0100-24ff (< 0800)
107 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
;; r2 = (row index r1 / 96, biased by 32) shifted into bits 7 and up.
109 (r2 = (((r1 / 96) + 32) << 7))
112 (write-multibyte-character r0 r1)))))
;; Three-byte sequence: the low 4 bits of the lead byte (r0) become bits
;; 12-15 of the scalar and the low 6 bits of the second byte (r1) become
;; bits 6-11 (the `zzzz' and `yyyyyy' fields of the 3-byte layout).
117 (r3 = ((r0 & #x0f) << 12))
118 (r3 += ((r1 & #x3f) << 6))
120 ;; now r3 holds scalar value
122 ;; mule-unicode-0100-24ff (>= 0800)
124 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
;; Add (r3 + 32) in bits 7 and up of the code held in r1.
128 (r1 += ((r3 + 32) << 7))
129 (write-multibyte-character r0 r1))
131 ;; mule-unicode-2500-33ff
133 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
137 (r1 += ((r3 + 32) << 7))
138 (write-multibyte-character r0 r1))
141 ;; keep those bytes as eight-bit-{control|graphic}
;; Each of the three original bytes (r0, r1, r2) is written back out as
;; its own eight-bit-control or eight-bit-graphic character.
143 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
144 (r3 = ,(charset-id 'eight-bit-graphic))
145 (write-multibyte-character r3 r0)
147 (r3 = ,(charset-id 'eight-bit-control)))
148 (write-multibyte-character r3 r1)
150 (r3 = ,(charset-id 'eight-bit-control))
151 (r3 = ,(charset-id 'eight-bit-graphic)))
152 (write-multibyte-character r3 r2))
154 ;; mule-unicode-e000-ffff
155 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
159 (r1 += ((r3 + 32) << 7))
160 (write-multibyte-character r0 r1))))))
163 ;; keep those bytes as eight-bit-{control|graphic}
;; Same byte-preserving fallback as above, here for four bytes
;; (r0, r1, r2, r3), using r4 to hold the charset id.
165 ;; r0 > #xf0, thus eight-bit-graphic
;; NOTE(review): presumably r0 >= #xf0 here, given the `r0 < #xf0' bound
;; stated for the 3-byte case above -- confirm against the branch condition.
166 (r4 = ,(charset-id 'eight-bit-graphic))
167 (write-multibyte-character r4 r0)
169 (r4 = ,(charset-id 'eight-bit-control)))
170 (write-multibyte-character r4 r1)
172 (r4 = ,(charset-id 'eight-bit-control))
173 (r4 = ,(charset-id 'eight-bit-graphic)))
174 (write-multibyte-character r4 r2)
176 (r4 = ,(charset-id 'eight-bit-control))
177 (r4 = ,(charset-id 'eight-bit-graphic)))
178 (write-multibyte-character r4 r3)))))
182 "CCL program to decode UTF-8.
183 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
184 mule-unicode-*. Encodings of un-representable Unicode characters are
185 decoded asis into eight-bit-control and eight-bit-graphic
;; CCL program that encodes Emacs characters into UTF-8.  Each character
;; is read with `read-multibyte-character' into r0 (compared below against
;; charset ids) and r1 (its code), then dispatched on the charset.
188 (define-ccl-program ccl-encode-mule-utf-8
194 (read-multibyte-character r0 r1))
195 (;; We have already done read-multibyte-character.
200 (if (r0 == ,(charset-id 'ascii))
203 (if (r0 == ,(charset-id 'latin-iso8859-1))
205 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
206 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
207 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
;; Lead byte is #xc2, or #xc3 when bit 6 of r1 is set.
208 ((r0 = (((r1 & #x40) >> 6) | #xc2))
213 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
;; r0 = (bits 7-13 of r1, minus the bias of 32) * 96.
214 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
215 ;; #x3f80 == (0011 1111 1000 0000)b
217 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
218 ;; now r1 holds scalar value
;; 2-byte form: lead byte = #xc0 | bits 6-10 of the scalar.
221 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
222 ;; #x07c0 == (0000 0111 1100 0000)b
;; 3-byte form: lead byte = #xe0 | bits 12-15 of the scalar;
;; r2 = #x80 | bits 0-5 of the scalar.
227 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
228 (r2 = ((r1 & #x3f) | #x80))
234 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
235 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
237 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
238 (r0 = (((r1 & #xf000) >> 12) | #xe0))
239 (r2 = ((r1 & #x3f) | #x80))
245 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
246 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
248 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
249 (r0 = (((r1 & #xf000) >> 12) | #xe0))
250 (r2 = ((r1 & #x3f) | #x80))
256 (if (r0 == ,(charset-id 'eight-bit-control))
258 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
259 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
260 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
264 (if (r0 == ,(charset-id 'eight-bit-graphic))
266 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
267 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
268 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
;; Read the following character(s); the tests below check whether each
;; one is itself an eight-bit-graphic or eight-bit-control character.
271 (read-multibyte-character r0 r1)
272 (if (r0 != ,(charset-id 'eight-bit-graphic))
273 (if (r0 != ,(charset-id 'eight-bit-control))
277 ((read-multibyte-character r0 r2)
278 (if (r0 != ,(charset-id 'eight-bit-graphic))
279 (if (r0 != ,(charset-id 'eight-bit-control))
289 ;; Unsupported character.
290 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
301 "CCL program to encode into UTF-8.
302 Only characters from the charsets ascii, eight-bit-control,
303 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
304 Others are encoded as U+FFFD.")
;; Documentation string and properties of the `mule-utf-8' coding system.
;; NOTE(review): the text below says U+E200-U+FFFF, but the charset is
;; named mule-unicode-e000-ffff and the encoder's offset for it
;; corresponds to #xe000 -- presumably U+E000 is meant; confirm against
;; the decoder's range test before changing the string.
308 "UTF-8 encoding for Emacs-supported Unicode characters.
309 The supported Emacs character sets are:
314 mule-unicode-0100-24ff
315 mule-unicode-2500-33ff
316 mule-unicode-e000-ffff
318 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
319 are decoded into sequences of eight-bit-control and eight-bit-graphic
320 characters to preserve their byte sequences. Emacs characters out of
321 these ranges are encoded into U+FFFD.
323 Note that, currently, characters in the mule-unicode charsets have no
324 syntax and case information. Thus, for instance, upper- and
325 lower-casing commands won't work with them."
;; The CCL programs used by this coding system, as a
;; (decoder . encoder) pair.
327 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
333 mule-unicode-0100-24ff
334 mule-unicode-2500-33ff
335 mule-unicode-e000-ffff)
336 (mime-charset . utf-8)
337 (coding-category . coding-category-utf-8)
338 (valid-codes (0 . 255))))
;; Make `utf-8' an alias for the `mule-utf-8' coding system.
340 (define-coding-system-alias 'utf-8 'mule-utf-8)