code.delx.au - gnu-emacs/blob - lisp/international/utf-8.el

   1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit-*-
   2
   3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
   4 ;; Licensed to the Free Software Foundation.
   5 ;; Copyright (C) 2001 Free Software Foundation, Inc.
   6
   7 ;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
   8 ;; Keywords: multilingual, Unicode, UTF-8, i18n
   9
  10 ;; This file is part of GNU Emacs.
  11
  12 ;; GNU Emacs is free software; you can redistribute it and/or modify
  13 ;; it under the terms of the GNU General Public License as published by
  14 ;; the Free Software Foundation; either version 2, or (at your option)
  15 ;; any later version.
  16
  17 ;; GNU Emacs is distributed in the hope that it will be useful,
  18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ;; GNU General Public License for more details.
  21
  22 ;; You should have received a copy of the GNU General Public License
  23 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  25 ;; Boston, MA 02111-1307, USA.
  26
  27 ;;; Commentary:
  28
  29 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
  30 ;; of the following character sets to and from UTF-8:
  31 ;;
  32 ;;   ascii
  33 ;;   eight-bit-control
  34 ;;   latin-iso8859-1
  35 ;;   mule-unicode-0100-24ff
  36 ;;   mule-unicode-2500-33ff
  37 ;;   mule-unicode-e000-ffff
  38 ;;
  39 ;; On decoding, Unicode characters that do not fit into the above
  40 ;; character sets are handled as `eight-bit-control' or
  41 ;; `eight-bit-graphic' characters to retain the information about the
  42 ;; original byte sequence.
  43 ;;
  44 ;; Characters from other character sets can be encoded with
  45 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
  46 ;; registering the translation with `register-char-codings'.
  47
  48 ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  49
  50 ;;        scalar       |               utf-8
  51 ;;        value        | 1st byte  | 2nd byte  | 3rd byte
  52 ;; --------------------+-----------+-----------+----------
  53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
  54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
  55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
  56
  57 ;;; Code:
  58
  59 (defvar ucs-mule-to-mule-unicode (make-translation-table)
  60   "Translation table for encoding to `mule-utf-8'.")
  61 ;; Could have been done by ucs-tables loaded before.
  62 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
  63   (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode))
  64 (define-ccl-program ccl-decode-mule-utf-8
  65   ;;
  66   ;;        charset         | bytes in utf-8 | bytes in emacs
  67   ;; -----------------------+----------------+---------------
  68   ;;         ascii          |       1        |       1
  69   ;; -----------------------+----------------+---------------
  70   ;;    eight-bit-control   |       2        |       2
  71   ;;    eight-bit-graphic   |       2        |       1
  72   ;;     latin-iso8859-1    |       2        |       2
  73   ;; -----------------------+----------------+---------------
  74   ;; mule-unicode-0100-24ff |       2        |       4
  75   ;;        (< 0800)        |                |
  76   ;; -----------------------+----------------+---------------
  77   ;; mule-unicode-0100-24ff |       3        |       4
  78   ;;        (>= 8000)       |                |
  79   ;; mule-unicode-2500-33ff |       3        |       4
  80   ;; mule-unicode-e000-ffff |       3        |       4
  81   ;;
  82   ;; Thus magnification factor is two.
  83   ;;
  84   `(2
  85     ((r5 = ,(charset-id 'eight-bit-control))
  86      (r6 = ,(charset-id 'eight-bit-graphic))
  87      (loop
  88       (read r0)
  89
  90       ;; 1byte encoding, i.e., ascii
  91       (if (r0 < #x80)
  92           (write r0)
  93
  94         ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
  95         (if (r0 < #xe0)
  96             ((read r1)
  97
  98              (if ((r1 & #b11000000) != #b10000000)
  99                  ;; Invalid 2-byte sequence
 100                  ((if (r0 < #xa0)
 101                       (write-multibyte-character r5 r0)
 102                     (write-multibyte-character r6 r0))
 103                   (if (r1 < #x80)
 104                       (write r1)
 105                     (if (r1 < #xa0)
 106                         (write-multibyte-character r5 r1)
 107                       (write-multibyte-character r6 r1))))
 108
 109                ((r0 &= #x1f)
 110                 (r0 <<= 6)
 111                 (r1 &= #x3f)
 112                 (r1 += r0)
 113                 ;; Now r1 holds scalar value
 114
 115                 ;; eight-bit-control
 116                 (if (r1 < 160)
 117                     ((write-multibyte-character r5 r1))
 118
 119                   ;; latin-iso8859-1
 120                   (if (r1 < 256)
 121                       ((r0 = ,(charset-id 'latin-iso8859-1))
 122                        (r1 -= 128)
 123                        (write-multibyte-character r0 r1))
 124
 125                     ;; mule-unicode-0100-24ff (< 0800)
 126                     ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 127                      (r1 -= #x0100)
 128                      (r2 = (((r1 / 96) + 32) << 7))
 129                      (r1 %= 96)
 130                      (r1 += (r2 + 32))
 131                      (write-multibyte-character r0 r1)))))))
 132
 133           ;; 3byte encoding
 134           ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
 135           (if (r0 < #xf0)
 136               ((read r1 r2)
 137
 138                ;; This is set to 1 if the encoding is invalid.
 139                (r4 = 0)
 140
 141                (r3 = (r1 & #b11000000))
 142                (r3 |= ((r2 >> 2) & #b00110000))
 143                (if (r3 != #b10100000)
 144                    (r4 = 1)
 145                  ((r3 = ((r0 & #x0f) << 12))
 146                   (r3 += ((r1 & #x3f) << 6))
 147                   (r3 += (r2 & #x3f))
 148                   (if (r3 < #x0800)
 149                       (r4 = 1))))
 150
 151                (if (r4 != 0)
 152                    ;; Invalid 3-byte sequence
 153                    ((if (r0 < #xa0)
 154                         (write-multibyte-character r5 r0)
 155                       (write-multibyte-character r6 r0))
 156                     (if (r1 < #x80)
 157                         (write r1)
 158                       (if (r1 < #xa0)
 159                           (write-multibyte-character r5 r1)
 160                         (write-multibyte-character r6 r1)))
 161                     (if (r2 < #x80)
 162                         (write r2)
 163                       (if (r2 < #xa0)
 164                           (write-multibyte-character r5 r2)
 165                         (write-multibyte-character r6 r2))))
 166
 167                  ;; mule-unicode-0100-24ff (>= 0800)
 168                  ((if (r3 < #x2500)
 169                       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 170                        (r3 -= #x0100)
 171                        (r3 //= 96)
 172                        (r1 = (r7 + 32))
 173                        (r1 += ((r3 + 32) << 7))
 174                        (write-multibyte-character r0 r1))
 175
 176                     ;; mule-unicode-2500-33ff
 177                     (if (r3 < #x3400)
 178                         ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
 179                          (r3 -= #x2500)
 180                          (r3 //= 96)
 181                          (r1 = (r7 + 32))
 182                          (r1 += ((r3 + 32) << 7))
 183                          (write-multibyte-character r0 r1))
 184
 185                       ;; U+3400 .. U+DFFF
 186                     ;; keep those bytes as eight-bit-{control|graphic}
 187                       (if (r3 < #xe000)
 188                           ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
 189                            (r3 = r6)
 190                            (write-multibyte-character r3 r0)
 191                            (if (r1 < #xa0)
 192                                (r3 = r5))
 193                            (write-multibyte-character r3 r1)
 194                            (if (r2 < #xa0)
 195                                (r3 = r5)
 196                              (r3 = r6))
 197                            (write-multibyte-character r3 r2))
 198
 199                         ;; mule-unicode-e000-ffff
 200                         ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
 201                          (r3 -= #xe000)
 202                          (r3 //= 96)
 203                          (r1 = (r7 + 32))
 204                          (r1 += ((r3 + 32) << 7))
 205                          (write-multibyte-character r0 r1))))))))
 206
 207             ;; 4byte encoding
 208             ;; keep those bytes as eight-bit-{control|graphic}
 209             ((read r1 r2 r3)
 210              ;; r0 > #xf0, thus eight-bit-graphic
 211              (write-multibyte-character r6 r0)
 212              (if (r1 < #xa0)
 213                  (write-multibyte-character r5 r1)
 214                (write-multibyte-character r6 r1))
 215              (if (r2 < #xa0)
 216                  (write-multibyte-character r5 r2)
 217                (write-multibyte-character r6 r2))
 218              (if (r3 < #xa0)
 219                  (write-multibyte-character r5 r3)
 220                (write-multibyte-character r6 r3))))))
 221
 222       (repeat))))
 223
 224   "CCL program to decode UTF-8.
 225 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
 226 mule-unicode-*.  Encodings of un-representable Unicode characters are
 227 decoded asis into eight-bit-control and eight-bit-graphic
 228 characters.")
 229
 230 (define-ccl-program ccl-encode-mule-utf-8
 231   `(1
 232     ((r5 = -1)
 233      (loop
 234       (if (r5 < 0)
 235           ((r1 = -1)
 236            (read-multibyte-character r0 r1)
 237            (translate-character ucs-mule-to-mule-unicode r0 r1))
 238         (;; We have already done read-multibyte-character.
 239          (r0 = r5)
 240          (r1 = r6)
 241          (r5 = -1)))
 242
 243       (if (r0 == ,(charset-id 'ascii))
 244           (write r1)
 245
 246         (if (r0 == ,(charset-id 'latin-iso8859-1))
 247             ;; r1          scalar                  utf-8
 248             ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 249             ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
 250             ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
 251             ((r0 = (((r1 & #x40) >> 6) | #xc2))
 252              (r1 &= #x3f)
 253              (r1 |= #x80)
 254              (write r0 r1))
 255
 256           (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
 257               ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 258                ;; #x3f80 == (0011 1111 1000 0000)b
 259                (r1 &= #x7f)
 260                (r1 += (r0 + 224))       ; 240 == -32 + #x0100
 261                ;; now r1 holds scalar value
 262                (if (r1 < #x0800)
 263                    ;; 2byte encoding
 264                    ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
 265                     ;; #x07c0 == (0000 0111 1100 0000)b
 266                     (r1 &= #x3f)
 267                     (r1 |= #x80)
 268                     (write r0 r1))
 269                  ;; 3byte encoding
 270                  ((r0 = (((r1 & #xf000) >> 12) | #xe0))
 271                   (r2 = ((r1 & #x3f) | #x80))
 272                   (r1 &= #x0fc0)
 273                   (r1 >>= 6)
 274                   (r1 |= #x80)
 275                   (write r0 r1 r2))))
 276
 277             (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
 278                 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 279                  (r1 &= #x7f)
 280                  (r1 += (r0 + 9440))    ; 9440 == -32 + #x2500
 281                  (r0 = (((r1 & #xf000) >> 12) | #xe0))
 282                  (r2 = ((r1 & #x3f) | #x80))
 283                  (r1 &= #x0fc0)
 284                  (r1 >>= 6)
 285                  (r1 |= #x80)
 286                  (write r0 r1 r2))
 287
 288               (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
 289                   ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 290                    (r1 &= #x7f)
 291                    (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
 292                    (r0 = (((r1 & #xf000) >> 12) | #xe0))
 293                    (r2 = ((r1 & #x3f) | #x80))
 294                    (r1 &= #x0fc0)
 295                    (r1 >>= 6)
 296                    (r1 |= #x80)
 297                    (write r0 r1 r2))
 298
 299                 (if (r0 == ,(charset-id 'eight-bit-control))
 300                     ;; r1          scalar                  utf-8
 301                     ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 302                     ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
 303                     ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
 304                     ((write #xc2)
 305                      (write r1))
 306
 307                   (if (r0 == ,(charset-id 'eight-bit-graphic))
 308                       ;; r1          scalar                  utf-8
 309                       ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 310                       ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
 311                       ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
 312                       ((write r1)
 313                        (r1 = -1)
 314                        (read-multibyte-character r0 r1)
 315                        (if (r0 != ,(charset-id 'eight-bit-graphic))
 316                            (if (r0 != ,(charset-id 'eight-bit-control))
 317                                ((r5 = r0)
 318                                 (r6 = r1))))
 319                        (if (r5 < 0)
 320                            ((read-multibyte-character r0 r2)
 321                             (if (r0 != ,(charset-id 'eight-bit-graphic))
 322                                 (if (r0 != ,(charset-id 'eight-bit-control))
 323                                     ((r5 = r0)
 324                                      (r6 = r2))))
 325                             (if (r5 < 0)
 326                                 (write r1 r2)
 327                               (if (r1 < #xa0)
 328                                   (write r1)
 329                                 ((write #xc2)
 330                                  (write r1)))))))
 331
 332                     ;; Unsupported character.
 333                     ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
 334                     ((write #xef)
 335                      (write #xbf)
 336                      (write #xbd)))))))))
 337       (repeat)))
 338     (if (r1 >= #xa0)
 339         (write r1)
 340       (if (r1 >= #x80)
 341           ((write #xc2)
 342            (write r1)))))
 343
 344   "CCL program to encode into UTF-8.
 345 Only characters from the charsets ascii, eight-bit-control,
 346 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
 347 Others are encoded as U+FFFD.")
 348
 349 ;; Dummy definition so that the CCL can be checked correctly; the
 350 ;; actual data are loaded on demand.
 351 (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
 352   (define-translation-table 'ucs-mule-8859-to-mule-unicode))
 353
 354 (defsubst utf-8-untranslated-to-ucs ()
 355   (let ((b1 (char-after))
 356         (b2 (char-after (1+ (point))))
 357         (b3 (char-after (+ 2 (point))))
 358         (b4 (char-after (+ 4 (point)))))
 359     (if (and b1 b2 b3)
 360         (cond ((< b1 ?\xf0)
 361                (setq b2 (lsh (logand b2 ?\x3f) 6))
 362                (setq b3 (logand b3 ?\x3f))
 363                (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
 364               (b4
 365                (setq b2 (lsh (logand b2 ?\x3f) 12))
 366                (setq b3 (lsh (logand b3 ?\x3f) 6))
 367                (setq b4 (logand b4 ?\x3f))
 368                (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
 369                                                      18)))))))))
 370
 371 (defun utf-8-help-echo (window object position)
 372   (format "Untranslated Unicode U+%04X"
 373           (get-char-property position 'untranslated-utf-8 object)))
 374
 375 (defvar utf-8-subst-table nil
 376   "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
 377
 378 ;; We compose the untranslatable sequences into a single character.
 379 ;; This is infelicitous for editing, because there's currently no
 380 ;; mechanism for treating compositions as atomic, but is OK for
 381 ;; display.  We try to compose an appropriate character from a hash
 382 ;; table of CJK characters to display correctly.  Otherwise we use
 383 ;; U+FFFD.  What we really should have is hash table lookup from CCL
 384 ;; so that we could do this properly.  This function GCs too much.
 385 (defsubst utf-8-compose ()
 386   "Put a suitable composition on an untranslatable sequence.
 387 Return the sequence's length."
 388   (let* ((u (utf-8-untranslated-to-ucs))
 389          (l (and u (if (>= u ?\x10000)
 390                        4
 391                      3)))
 392          (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
 393     (when u
 394       (put-text-property (point) (min (point-max) (+ l (point)))
 395                          'untranslated-utf-8 u)
 396       (unless subst
 397           (put-text-property (point) (min (point-max) (+ l (point)))
 398                              'help-echo 'utf-8-help-echo)
 399           (setq subst ?\e$,3u=\e(B))
 400       (compose-region (point) (+ l (point)) subst)
 401       l)))
 402
 403 (defcustom utf-8-compose-scripts nil
 404   "*Non-nil means compose various scipts on decoding utf-8 text."
 405   :group 'mule
 406   :type 'boolean)       ; omitted in Emacs 21.1
 407
 408 (defun utf-8-post-read-conversion (length)
 409   "Compose untranslated utf-8 sequences into single characters.
 410 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
 411   (save-excursion
 412     ;; Can't do eval-when-compile to insert a multibyte constant
 413     ;; version of the string in the loop, since it's always loaded as
 414     ;; unibyte from a byte-compiled file.
 415     (let ((range (string-as-multibyte "^\341-\377")))
 416       (while (and (skip-chars-forward
 417                    range)
 418                   (not (eobp)))
 419         (forward-char (utf-8-compose)))))
 420   ;; Fixme: Takahashi-san implies it may not work this easily -- needs
 421   ;; checking with him.
 422   (when (and utf-8-compose-scripts (> length 1))
 423     ;; These currently have definitions which cover the relevant
 424     ;; Unicodes.  We could avoid loading thai-util &c by checking
 425     ;; whether the region contains any characters with the appropriate
 426     ;; categories.  There aren't yet Unicode-based rules for Tibetan.
 427     (save-excursion (setq length (diacritic-post-read-conversion length)))
 428     (save-excursion (setq length (thai-post-read-conversion length)))
 429     (save-excursion (setq length (lao-post-read-conversion length)))
 430     (save-excursion (setq length (devanagari-post-read-conversion length))))
 431   length)
 432
 433 (defun utf-8-pre-write-conversion (beg end)
 434   "Semi-dummy pre-write function effectively to autoload ucs-tables."
 435   ;; Ensure translation table is loaded.
 436   (require 'ucs-tables)
 437   ;; Don't do this again.
 438   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
 439   nil)
 440
 441 (make-coding-system
 442  'mule-utf-8 4 ?u
 443  "UTF-8 encoding for Emacs-supported Unicode characters.
 444 The supported Emacs character sets are the following, plus others
 445 which may be included in the translation table
 446 `ucs-mule-to-mule-unicode':
 447  ascii
 448  eight-bit-control
 449  eight-bit-graphic
 450  latin-iso8859-1
 451  latin-iso8859-2
 452  latin-iso8859-3
 453  latin-iso8859-4
 454  cyrillic-iso8859-5
 455  greek-iso8859-7
 456  hebrew-iso8859-8
 457  latin-iso8859-9
 458  latin-iso8859-14
 459  latin-iso8859-15
 460  mule-unicode-0100-24ff
 461  mule-unicode-2500-33ff
 462  mule-unicode-e000-ffff
 463
 464 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
 465 are decoded into sequences of eight-bit-control and eight-bit-graphic
 466 characters to preserve their byte sequences and composed to display as
 467 a single character.  Emacs characters that can't be encoded to these
 468 ranges are encoded as U+FFFD."
 469
 470  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 471  '((safe-charsets
 472     ascii
 473     eight-bit-control
 474     eight-bit-graphic
 475     latin-iso8859-1
 476     latin-iso8859-15
 477     latin-iso8859-14
 478     latin-iso8859-9
 479     hebrew-iso8859-8
 480     greek-iso8859-7
 481     cyrillic-iso8859-5
 482     latin-iso8859-4
 483     latin-iso8859-3
 484     latin-iso8859-2
 485     vietnamese-viscii-lower
 486     vietnamese-viscii-upper
 487     thai-tis620
 488     ipa
 489     ethiopic
 490     indian-is13194
 491     katakana-jisx0201
 492     chinese-sisheng
 493     lao
 494     mule-unicode-0100-24ff
 495     mule-unicode-2500-33ff
 496     mule-unicode-e000-ffff)
 497    (mime-charset . utf-8)
 498    (coding-category . coding-category-utf-8)
 499    (valid-codes (0 . 255))
 500    (pre-write-conversion . utf-8-pre-write-conversion)
 501    (post-read-conversion . utf-8-post-read-conversion)))
 502
 503 (define-coding-system-alias 'utf-8 'mule-utf-8)
 504
 505 ;; I think this needs special private charsets defined for the
 506 ;; untranslated sequences, if it's going to work well.
 507
 508 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
 509 ;;;   (let* ((prop (get-char-property pos 'composition string))
 510 ;;;      (l (and prop (- (cadr prop) (car prop)))))
 511 ;;;     (cond ((and l (> l (- to pos)))
 512 ;;;        (delete-region pos to))
 513 ;;;       ((and (> (char-after pos) 224)
 514 ;;;             (< (char-after pos) 256)
 515 ;;;             (save-restriction
 516 ;;;               (narrow-to-region pos to)
 517 ;;;               (utf-8-compose)))
 518 ;;;        t))))
 519
 520 ;;; (dotimes (i 96)
 521 ;;;   (aset composition-function-table
 522 ;;;     (+ 128 i)
 523 ;;;     `((,(string-as-multibyte "[\200-\237\240-\377]")
 524 ;;;        . utf-8-compose-function))))
 525
 526 ;;; utf-8.el ends here