code.delx.au - gnu-emacs/blob - lisp/international/utf-8.el

   1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
   2
   3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
   4 ;; Licensed to the Free Software Foundation.
   5
   6 ;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
   7 ;; Keywords: multilingual, Unicode, UTF-8, i18n
   8
   9 ;; This file is part of GNU Emacs.
  10
  11 ;; GNU Emacs is free software; you can redistribute it and/or modify
  12 ;; it under the terms of the GNU General Public License as published by
  13 ;; the Free Software Foundation; either version 2, or (at your option)
  14 ;; any later version.
  15
  16 ;; GNU Emacs is distributed in the hope that it will be useful,
  17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 ;; GNU General Public License for more details.
  20
  21 ;; You should have received a copy of the GNU General Public License
  22 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 ;; Boston, MA 02111-1307, USA.
  25
  26 ;;; Commentary:
  27
  28 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
  29 ;; following character sets to and from UTF-8:
  30 ;;
  31 ;;   ascii
  32 ;;   eight-bit-control
  33 ;;   latin-iso8859-1
  34 ;;   mule-unicode-0100-24ff
  35 ;;   mule-unicode-2500-33ff
  36 ;;   mule-unicode-e000-ffff
  37 ;;
  38 ;; Characters of other character sets cannot be encoded with
  39 ;; mule-utf-8.  Note that the mule-unicode charsets currently lack
  40 ;; case and syntax information, so things like `downcase' will only
  41 ;; work for characters from ASCII and Latin-1.
  42 ;;
  43 ;; On decoding, Unicode characters that do not fit into the above
  44 ;; character sets are handled as `eight-bit-control' or
  45 ;; `eight-bit-graphic' characters to retain the information about the
  46 ;; original byte sequence.
  47
  48 ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  49
  50 ;;        scalar       |               utf-8
  51 ;;        value        | 1st byte  | 2nd byte  | 3rd byte
  52 ;; --------------------+-----------+-----------+----------
  53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
  54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
  55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
  56
  57 ;;; Code:
  58
  59 (define-ccl-program ccl-decode-mule-utf-8
  60   ;; Byte counts per character, by target charset:
  61   ;;        charset         | bytes in utf-8 | bytes in emacs
  62   ;; -----------------------+----------------+---------------
  63   ;;         ascii          |       1        |       1
  64   ;; -----------------------+----------------+---------------
  65   ;;    eight-bit-control   |       2        |       2
  66   ;;     latin-iso8859-1    |       2        |       2
  67   ;; -----------------------+----------------+---------------
  68   ;; mule-unicode-0100-24ff |       2        |       4
  69   ;;        (< 0800)        |                |
  70   ;; -----------------------+----------------+---------------
  71   ;; mule-unicode-0100-24ff |       3        |       4
  72   ;;        (>= 0800)       |                |
  73   ;; mule-unicode-2500-33ff |       3        |       4
  74   ;; mule-unicode-e000-ffff |       3        |       4
  75   ;;
  76   ;; Thus magnification factor is two.
  77   ;;
  78   `(2                              ; the magnification factor from above
  79     ((loop
  80       (read r0)                    ; r0 = first byte of the next sequence
  81
  82       ;; 1byte encoding, i.e., ascii
  83       (if (r0 < #x80)
  84           (write r0)
  85
  86         ;; 2byte encoding
  87         (if (r0 < #xe0)
  88             ((read r1)
  89              (r0 &= #x1f)
  90              (r0 <<= 6)
  91              (r1 &= #x3f)
  92              (r1 += r0)
  93              ;; now r1 holds scalar value
  94
  95              ;; eight-bit-control (scalar value below #xa0)
  96              (if (r1 < 160)
  97                  ((r0 = ,(charset-id 'eight-bit-control))
  98                   (write-multibyte-character r0 r1))
  99
 100                ;; latin-iso8859-1 (scalar value #xa0 .. #xff)
 101                (if (r1 < 256)
 102                    ((r0 = ,(charset-id 'latin-iso8859-1))
 103                     (r1 -= 128)  ; the code written is the scalar value minus #x80
 104                     (write-multibyte-character r0 r1))
 105
 106                  ;; mule-unicode-0100-24ff (< 0800)
 107                  ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 108                   (r1 -= #x0100) ; offset from the start of the charset
 109                   (r2 = (((r1 / 96) + 32) << 7)) ; (offset / 96) + 32, placed in bits 7 and up
 110                   (r1 %= 96)
 111                   (r1 += (r2 + 32)) ; (offset mod 96) + 32 fills the low 7 bits
 112                   (write-multibyte-character r0 r1)))))
 113
 114           ;; 3byte encoding
 115           (if (r0 < #xf0)
 116               ((read r1 r2)
 117                (r3 = ((r0 & #x0f) << 12))
 118                (r3 += ((r1 & #x3f) << 6))
 119                (r3 += (r2 & #x3f))
 120                ;; now r3 holds scalar value
 121
 122                ;; mule-unicode-0100-24ff (>= 0800)
 123                (if (r3 < #x2500)
 124                    ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 125                     (r3 -= #x0100)
 126                     (r3 //= 96)  ; r3 = quotient; CCL's `//' leaves the remainder in r7
 127                     (r1 = (r7 + 32)) ; low 7 bits: remainder + 32
 128                     (r1 += ((r3 + 32) << 7)) ; bits 7 and up: quotient + 32
 129                     (write-multibyte-character r0 r1))
 130
 131                  ;; mule-unicode-2500-33ff
 132                  (if (r3 < #x3400)
 133                      ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
 134                       (r3 -= #x2500)
 135                       (r3 //= 96) ; remainder goes to r7, as above
 136                       (r1 = (r7 + 32))
 137                       (r1 += ((r3 + 32) << 7))
 138                       (write-multibyte-character r0 r1))
 139
 140                    ;; U+3400 .. U+DFFF
 141                    ;; keep those bytes as eight-bit-{control|graphic}
 142                    (if (r3 < #xe000)
 143                        (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
 144                         (r3 = ,(charset-id 'eight-bit-graphic))
 145                         (write-multibyte-character r3 r0)
 146                         (if (r1 < #xa0) ; no else branch: r3 stays eight-bit-graphic otherwise
 147                             (r3 = ,(charset-id 'eight-bit-control)))
 148                         (write-multibyte-character r3 r1)
 149                         (if (r2 < #xa0)
 150                             (r3 = ,(charset-id 'eight-bit-control))
 151                           (r3 = ,(charset-id 'eight-bit-graphic)))
 152                         (write-multibyte-character r3 r2))
 153
 154                      ;; mule-unicode-e000-ffff
 155                      ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
 156                       (r3 -= #xe000)
 157                       (r3 //= 96) ; remainder goes to r7, as above
 158                       (r1 = (r7 + 32))
 159                       (r1 += ((r3 + 32) << 7))
 160                       (write-multibyte-character r0 r1))))))
 161
 162             ;; 4byte encoding
 163             ;; keep those bytes as eight-bit-{control|graphic}
 164             ((read r1 r2 r3)
 165              ;; r0 >= #xf0, thus eight-bit-graphic
 166              (r4 = ,(charset-id 'eight-bit-graphic))
 167              (write-multibyte-character r4 r0)
 168              (if (r1 < #xa0)    ; no else branch: r4 stays eight-bit-graphic otherwise
 169                  (r4 = ,(charset-id 'eight-bit-control)))
 170              (write-multibyte-character r4 r1)
 171              (if (r2 < #xa0)
 172                  (r4 = ,(charset-id 'eight-bit-control))
 173                (r4 = ,(charset-id 'eight-bit-graphic)))
 174              (write-multibyte-character r4 r2)
 175              (if (r3 < #xa0)
 176                  (r4 = ,(charset-id 'eight-bit-control))
 177                (r4 = ,(charset-id 'eight-bit-graphic)))
 178              (write-multibyte-character r4 r3)))))
 179
 180       (repeat))))
 181
 182   "CCL program to decode UTF-8.
 183 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
 184 mule-unicode-*.  Encodings of un-representable Unicode characters are
 185 decoded asis into eight-bit-control and eight-bit-graphic
 186 characters.")
 187
 188 (define-ccl-program ccl-encode-mule-utf-8
 189   `(1
 190     ((r5 = -1)                     ; r5 < 0 means no look-ahead character is pending
 191      (loop
 192       (if (r5 < 0)
 193           ((r1 = -1)
 194            (read-multibyte-character r0 r1))
 195         (;; We have already done read-multibyte-character.
 196          (r0 = r5)
 197          (r1 = r6)
 198          (r5 = -1)))
 199
 200       (if (r0 == ,(charset-id 'ascii))
 201           (write r1)
 202
 203         (if (r0 == ,(charset-id 'latin-iso8859-1))
 204             ;; r1          scalar                  utf-8
 205             ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 206             ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
 207             ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
 208             ((r0 = (((r1 & #x40) >> 6) | #xc2))
 209              (r1 &= #x3f)
 210              (r1 |= #x80)
 211              (write r0 r1))
 212
 213           (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
 214               ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 215                ;; #x3f80 == (0011 1111 1000 0000)b
 216                (r1 &= #x7f)
 217                (r1 += (r0 + 224))       ; 224 == -32 + #x0100
 218                ;; now r1 holds scalar value
 219                (if (r1 < #x0800)
 220                    ;; 2byte encoding
 221                    ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
 222                     ;; #x07c0 == (0000 0111 1100 0000)b
 223                     (r1 &= #x3f)
 224                     (r1 |= #x80)
 225                     (write r0 r1))
 226                  ;; 3byte encoding
 227                  ((r0 = (((r1 & #xf000) >> 12) | #xe0))
 228                   (r2 = ((r1 & #x3f) | #x80))
 229                   (r1 &= #x0fc0)
 230                   (r1 >>= 6)
 231                   (r1 |= #x80)
 232                   (write r0 r1 r2))))
 233
 234             (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
 235                 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 236                  (r1 &= #x7f)
 237                  (r1 += (r0 + 9440))    ; 9440 == -32 + #x2500
 238                  (r0 = (((r1 & #xf000) >> 12) | #xe0))
 239                  (r2 = ((r1 & #x3f) | #x80))
 240                  (r1 &= #x0fc0)
 241                  (r1 >>= 6)
 242                  (r1 |= #x80)
 243                  (write r0 r1 r2))
 244
 245               (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
 246                   ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 247                    (r1 &= #x7f)
 248                    (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
 249                    (r0 = (((r1 & #xf000) >> 12) | #xe0))
 250                    (r2 = ((r1 & #x3f) | #x80))
 251                    (r1 &= #x0fc0)
 252                    (r1 >>= 6)
 253                    (r1 |= #x80)
 254                    (write r0 r1 r2))
 255
 256                 (if (r0 == ,(charset-id 'eight-bit-control))
 257                     ;; r1          scalar                  utf-8
 258                     ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 259                     ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
 260                     ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
 261                     ((write #xc2)
 262                      (write r1))
 263
 264                   (if (r0 == ,(charset-id 'eight-bit-graphic))
 265                       ;; r1          scalar                  utf-8
 266                       ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 267                       ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
 268                       ;; ff    0000 0000 1111 1111    1100 0011 1011 1111
 269                       ((write r1) ; the byte itself is written out unchanged
 270                        (r1 = -1)
 271                        (read-multibyte-character r0 r1)
 272                        (if (r0 != ,(charset-id 'eight-bit-graphic))
 273                            (if (r0 != ,(charset-id 'eight-bit-control))
 274                                ((r5 = r0) ; not an eight-bit char: keep it for the next iteration
 275                                 (r6 = r1))))
 276                        (if (r5 < 0)
 277                            ((read-multibyte-character r0 r2)
 278                             (if (r0 != ,(charset-id 'eight-bit-graphic))
 279                                 (if (r0 != ,(charset-id 'eight-bit-control))
 280                                     ((r5 = r0) ; likewise for the second look-ahead char
 281                                      (r6 = r2))))
 282                             (if (r5 < 0)
 283                                 (write r1 r2)
 284                               (if (r1 < #xa0)
 285                                   (write r1)
 286                                 ((write #xc2)
 287                                  (write r1)))))))
 288
 289                     ;; Unsupported character.
 290                     ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
 291                     ((write #xef)
 292                      (write #xbf)
 293                      (write #xbd)))))))))
 294       (repeat)))
 295     (if (r1 >= #xa0)               ; third program element; per the CCL docs this block runs at end of input
 296         (write r1)
 297       (if (r1 >= #x80)
 298           ((write #xc2)
 299            (write r1)))))
 300
 301   "CCL program to encode into UTF-8.
 302 Only characters from the charsets ascii, eight-bit-control,
 303 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
 304 Others are encoded as U+FFFD.")
 305
 306 (make-coding-system
 307  'mule-utf-8 4 ?u                  ; per `make-coding-system': TYPE 4 is CCL-based, ?u is the mnemonic
 308  "UTF-8 encoding for Emacs-supported Unicode characters.
 309 The supported Emacs character sets are:
 310    ascii
 311    eight-bit-control
 312    eight-bit-graphic
 313    latin-iso8859-1
 314    mule-unicode-0100-24ff
 315    mule-unicode-2500-33ff
 316    mule-unicode-e000-ffff
 317
 318 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
 319 are decoded into sequences of eight-bit-control and eight-bit-graphic
 320 characters to preserve their byte sequences.  Emacs characters out of
 321 these ranges are encoded into U+FFFD.
 322
 323 Note that, currently, characters in the mule-unicode charsets have no
 324 syntax and case information.  Thus, for instance, upper- and
 325 lower-casing commands won't work with them."
 326
 327  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) ; (DECODER . ENCODER) CCL programs
 328  '((safe-charsets
 329     ascii
 330     eight-bit-control
 331     eight-bit-graphic
 332     latin-iso8859-1
 333     mule-unicode-0100-24ff
 334     mule-unicode-2500-33ff
 335     mule-unicode-e000-ffff)
 336    (mime-charset . utf-8)
 337    (coding-category . coding-category-utf-8)
 338    (valid-codes (0 . 255))))
 339
 340 (define-coding-system-alias 'utf-8 'mule-utf-8) ; make `utf-8' an alias for `mule-utf-8'