code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 3, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  24 Boston, MA 02110-1301, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-mule) handlers
  31   3. ISO2022 handlers
  32   4. Shift-JIS and BIG5 handlers
  33   5. CCL handlers
  34   6. End-of-line handlers
  35   7. C library functions
  36   8. Emacs Lisp library functions
  37   9. Post-amble
  38
  39 */
  40
  41 /*** 0. General comments ***/
  42
  43
  44 /*** GENERAL NOTE on CODING SYSTEMS ***
  45
  46   A coding system is an encoding mechanism for one or more character
  47   sets.  Here's a list of coding systems which Emacs can handle.  When
  48   we say "decode", it means converting some other coding system to
  49   Emacs' internal format (emacs-mule), and when we say "encode",
  50   it means converting the coding system emacs-mule to some other
  51   coding system.
  52
  53   0. Emacs' internal format (emacs-mule)
  54
  55   Emacs itself holds a multi-lingual character in buffers and strings
  56   in a special format.  Details are described in section 2.
  57
  58   1. ISO2022
  59
  60   The most famous coding system for multiple character sets.  X's
  61   Compound Text, various EUCs (Extended Unix Code), and coding
  62   systems used in Internet communication such as ISO-2022-JP are
  63   all variants of ISO2022.  Details are described in section 3.
  64
  65   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  66
  67   A coding system to encode character sets: ASCII, JISX0201, and
  68   JISX0208.  Widely used for PC's in Japan.  Details are described in
  69   section 4.
  70
  71   3. BIG5
  72
  73   A coding system to encode the character sets ASCII and Big5.  Widely
  74   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  75   described in section 4.  In this file, when we write "BIG5"
  76   (all uppercase), we mean the coding system, and when we write
  77   "Big5" (capitalized), we mean the character set.
  78
  79   4. Raw text
  80
  81   A coding system for text containing random 8-bit code.  Emacs does
  82   no code conversion on such text except for end-of-line format.
  83
  84   5. Other
  85
  86   If a user wants to read/write text encoded in a coding system not
  87   listed above, he can supply a decoder and an encoder for it as CCL
  88   (Code Conversion Language) programs.  Emacs executes the CCL program
  89   while reading/writing.
  90
  91   Emacs represents a coding system by a Lisp symbol that has a property
  92   `coding-system'.  But, before actually using the coding system, the
  93   information about it is set in a structure of type `struct
  94   coding_system' for rapid processing.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  99
 100   How end-of-line of text is encoded depends on the operating system.
 101   For instance, Unix's format is just one byte of `line-feed' code,
 102   whereas DOS's format is two-byte sequence of `carriage-return' and
 103   `line-feed' codes.  MacOS's format is usually one byte of
 104   `carriage-return'.
 105
 106   Since text character encoding and end-of-line encoding are
 107   independent, any coding system described above can have any
 108   end-of-line format.  So Emacs has information about end-of-line
 109   format in each coding-system.  See section 6 for more details.
 110
 111 */
 112
 113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 114
 115   These functions check if a text between SRC and SRC_END is encoded
 116   in the coding system category XXX.  Each returns an integer value in
 117   which appropriate flag bits for the category XXX are set.  The flag
 118   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 119   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 120   of the range 0x80..0x9F are in multibyte form.  */
 121 #if 0
 122 int
 123 detect_coding_emacs_mule (src, src_end, multibytep)
 124      unsigned char *src, *src_end;
 125      int multibytep;
 126 {
 127   ...
 128 }
 129 #endif
 130
 131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 132
 133   These functions decode SRC_BYTES length of unibyte text at SOURCE
 134   encoded in CODING to Emacs' internal format.  The resulting
 135   multibyte text goes to a place pointed to by DESTINATION, the length
 136   of which should not exceed DST_BYTES.
 137
 138   These functions set the information about original and decoded texts
 139   in the members `produced', `produced_char', `consumed', and
 140   `consumed_char' of the structure *CODING.  They also set the member
 141   `result' to one of CODING_FINISH_XXX indicating how the decoding
 142   finished.
 143
 144   DST_BYTES zero means that the source area and destination area are
 145   overlapped, which means that we can produce a decoded text until it
 146   reaches the head of the not-yet-decoded source text.
 147
 148   Below is a template for these functions.  */
 149 #if 0
 150 static void
 151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 152      struct coding_system *coding;
 153      const unsigned char *source;
 154      unsigned char *destination;
 155      int src_bytes, dst_bytes;
 156 {
 157   ...
 158 }
 159 #endif
 160
 161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 162
 163   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 164   internal multibyte format to CODING.  The resulting unibyte text
 165   goes to a place pointed to by DESTINATION, the length of which
 166   should not exceed DST_BYTES.
 167
 168   These functions set the information about original and encoded texts
 169   in the members `produced', `produced_char', `consumed', and
 170   `consumed_char' of the structure *CODING.  They also set the member
 171   `result' to one of CODING_FINISH_XXX indicating how the encoding
 172   finished.
 173
 174   DST_BYTES zero means that the source area and destination area are
 175   overlapped, which means that we can produce encoded text until it
  176   reaches the head of the not-yet-encoded source text.
 177
 178   Below is a template for these functions.  */
 179 #if 0
 180 static void
 181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 182      struct coding_system *coding;
 183      unsigned char *source, *destination;
 184      int src_bytes, dst_bytes;
 185 {
 186   ...
 187 }
 188 #endif
 189
 190 /*** COMMONLY USED MACROS ***/
 191
 192 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 193    get one, two, and three bytes from the source text respectively.
 194    If there are not enough bytes in the source, they jump to
 195    `label_end_of_loop'.  The caller should set variables `coding',
 196    `src' and `src_end' to appropriate pointer in advance.  These
 197    macros are called from decoding routines `decode_coding_XXX', thus
 198    it is assumed that the source text is unibyte.  */
 199
 200 #define ONE_MORE_BYTE(c1)                                       \
 201   do {                                                          \
 202     if (src >= src_end)                                         \
 203       {                                                         \
 204         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 205         goto label_end_of_loop;                                 \
 206       }                                                         \
 207     c1 = *src++;                                                \
 208   } while (0)
 209
 210 #define TWO_MORE_BYTES(c1, c2)                                  \
 211   do {                                                          \
 212     if (src + 1 >= src_end)                                     \
 213       {                                                         \
 214         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 215         goto label_end_of_loop;                                 \
 216       }                                                         \
 217     c1 = *src++;                                                \
 218     c2 = *src++;                                                \
 219   } while (0)
 220
 221
 222 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 223    form if MULTIBYTEP is nonzero.  In addition, if SRC is not less
 224    than SRC_END, return with RET.  */
 225
 226 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
 227   do {                                                          \
 228     if (src >= src_end)                                         \
 229       {                                                         \
 230         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 231         return ret;                                             \
 232       }                                                         \
 233     c1 = *src++;                                                \
 234     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 235       c1 = *src++ - 0x20;                                       \
 236   } while (0)
 237
 238 /* Set C to the next character at the source text pointed by `src'.
 239    If there are not enough characters in the source, jump to
 240    `label_end_of_loop'.  The caller should set variables `coding'
 241    `src', `src_end', and `translation_table' to appropriate pointers
 242    in advance.  This macro is used in encoding routines
 243    `encode_coding_XXX', thus it assumes that the source text is in
 244    multibyte form except for 8-bit characters.  8-bit characters are
 245    in multibyte form if coding->src_multibyte is nonzero, else they
 246    are represented by a single byte.  */
 247
 248 #define ONE_MORE_CHAR(c)                                        \
 249   do {                                                          \
 250     int len = src_end - src;                                    \
 251     int bytes;                                                  \
 252     if (len <= 0)                                               \
 253       {                                                         \
 254         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 255         goto label_end_of_loop;                                 \
 256       }                                                         \
 257     if (coding->src_multibyte                                   \
 258         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 259       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 260     else                                                        \
 261       c = *src, bytes = 1;                                      \
 262     if (!NILP (translation_table))                              \
 263       c = translate_char (translation_table, c, -1, 0, 0);      \
 264     src += bytes;                                               \
 265   } while (0)
 266
 267
 268 /* Produce a multibyte form of character C to `dst'.  Jump to
 269    `label_end_of_loop' if there's not enough space at `dst'.
 270
 271    If we are now in the middle of a composition sequence, the decoded
 272    character may be ALTCHAR (for the current composition).  In that
 273    case, the character goes to coding->cmp_data->data instead of
 274    `dst'.
 275
 276    This macro is used in decoding routines.  */
 277
 278 #define EMIT_CHAR(c)                                                    \
 279   do {                                                                  \
 280     if (! COMPOSING_P (coding)                                          \
 281         || coding->composing == COMPOSITION_RELATIVE                    \
 282         || coding->composing == COMPOSITION_WITH_RULE)                  \
 283       {                                                                 \
 284         int bytes = CHAR_BYTES (c);                                     \
 285         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 286           {                                                             \
 287             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 288             goto label_end_of_loop;                                     \
 289           }                                                             \
 290         dst += CHAR_STRING (c, dst);                                    \
 291         coding->produced_char++;                                        \
 292       }                                                                 \
 293                                                                         \
 294     if (COMPOSING_P (coding)                                            \
 295         && coding->composing != COMPOSITION_RELATIVE)                   \
 296       {                                                                 \
 297         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 298         coding->composition_rule_follows                                \
 299           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 300       }                                                                 \
 301   } while (0)
 302
 303
 304 #define EMIT_ONE_BYTE(c)                                        \
 305   do {                                                          \
 306     if (dst >= (dst_bytes ? dst_end : src))                     \
 307       {                                                         \
 308         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 309         goto label_end_of_loop;                                 \
 310       }                                                         \
 311     *dst++ = c;                                                 \
 312   } while (0)
 313
 314 #define EMIT_TWO_BYTES(c1, c2)                                  \
 315   do {                                                          \
 316     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 317       {                                                         \
 318         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 319         goto label_end_of_loop;                                 \
 320       }                                                         \
 321     *dst++ = c1, *dst++ = c2;                                   \
 322   } while (0)
 323
 324 #define EMIT_BYTES(from, to)                                    \
 325   do {                                                          \
 326     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 327       {                                                         \
 328         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 329         goto label_end_of_loop;                                 \
 330       }                                                         \
 331     while (from < to)                                           \
 332       *dst++ = *from++;                                         \
 333   } while (0)
 334
 335 \f
 336 /*** 1. Preamble ***/
 337
 338 #ifdef emacs
 339 #include <config.h>
 340 #endif
 341
 342 #include <stdio.h>
 343
 344 #ifdef emacs
 345
 346 #include "lisp.h"
 347 #include "buffer.h"
 348 #include "charset.h"
 349 #include "composite.h"
 350 #include "ccl.h"
 351 #include "coding.h"
 352 #include "window.h"
 353 #include "intervals.h"
 354 #include "frame.h"
 355 #include "termhooks.h"
 356
 357 #else  /* not emacs */
 358
 359 #include "mulelib.h"
 360
 361 #endif /* not emacs */
 362
 363 Lisp_Object Qcoding_system, Qeol_type;
 364 Lisp_Object Qbuffer_file_coding_system;
 365 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 366 Lisp_Object Qno_conversion, Qundecided;
 367 Lisp_Object Qcoding_system_history;
 368 Lisp_Object Qsafe_chars;
 369 Lisp_Object Qvalid_codes;
 370 Lisp_Object Qascii_incompatible;
 371
 372 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 373 Lisp_Object Qcall_process, Qcall_process_region;
 374 Lisp_Object Qstart_process, Qopen_network_stream;
 375 Lisp_Object Qtarget_idx;
 376
 377 extern Lisp_Object Qcompletion_ignore_case;
 378
 379 /* If a symbol has this property, evaluate the value to define the
 380    symbol as a coding system.  */
 381 Lisp_Object Qcoding_system_define_form;
 382
 383 Lisp_Object Vselect_safe_coding_system_function;
 384
 385 int coding_system_require_warning;
 386
 387 /* Mnemonic string for each format of end-of-line.  */
 388 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 389 /* Mnemonic string to indicate format of end-of-line is not yet
 390    decided.  */
 391 Lisp_Object eol_mnemonic_undecided;
 392
 393 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 394    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
 395    This has an effect only for external encoding (i.e. for output to
 396    file and process), not for in-buffer or Lisp string encoding.  */
 397 int system_eol_type;
 398
 399 #ifdef emacs
 400
 401 /* Information about which coding system is safe for which chars.
 402    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 403
 404    GENERIC-LIST is a list of generic coding systems which can encode
 405    any characters.
 406
 407    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 408    corresponding char table that contains safe chars.  */
 409 Lisp_Object Vcoding_system_safe_chars;
 410
 411 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 412
 413 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 414
 415 /* Coding system emacs-mule and raw-text are for converting only
 416    end-of-line format.  */
 417 Lisp_Object Qemacs_mule, Qraw_text;
 418
 419 Lisp_Object Qutf_8;
 420
 421 /* Coding-systems are handed between Emacs Lisp programs and C internal
 422    routines by the following three variables.  */
 423 /* Coding-system for reading files and receiving data from process.  */
 424 Lisp_Object Vcoding_system_for_read;
 425 /* Coding-system for writing files and sending data to process.  */
 426 Lisp_Object Vcoding_system_for_write;
 427 /* Coding-system actually used in the latest I/O.  */
 428 Lisp_Object Vlast_coding_system_used;
 429
 430 /* A vector of length 256 which contains information about special
 431    Latin codes (especially for dealing with Microsoft codes).  */
 432 Lisp_Object Vlatin_extra_code_table;
 433
 434 /* Flag to inhibit code conversion of end-of-line format.  */
 435 int inhibit_eol_conversion;
 436
 437 /* Flag to inhibit ISO2022 escape sequence detection.  */
 438 int inhibit_iso_escape_detection;
 439
 440 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 441 int inherit_process_coding_system;
 442
 443 /* Coding system to be used to encode text for terminal display when
 444    terminal coding system is nil.  */
 445 struct coding_system safe_terminal_coding;
 446
 447 /* Default coding system to be used to write a file.  */
 448 struct coding_system default_buffer_file_coding;
 449
 450 Lisp_Object Vfile_coding_system_alist;
 451 Lisp_Object Vprocess_coding_system_alist;
 452 Lisp_Object Vnetwork_coding_system_alist;
 453
 454 Lisp_Object Vlocale_coding_system;
 455
 456 #endif /* emacs */
 457
 458 Lisp_Object Qcoding_category, Qcoding_category_index;
 459
 460 /* List of symbols `coding-category-xxx' ordered by priority.  */
 461 Lisp_Object Vcoding_category_list;
 462
 463 /* Table of coding categories (Lisp symbols).  */
 464 Lisp_Object Vcoding_category_table;
 465
 466 /* Table of names of symbol for each coding-category.  */
 467 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 468   "coding-category-emacs-mule",
 469   "coding-category-sjis",
 470   "coding-category-iso-7",
 471   "coding-category-iso-7-tight",
 472   "coding-category-iso-8-1",
 473   "coding-category-iso-8-2",
 474   "coding-category-iso-7-else",
 475   "coding-category-iso-8-else",
 476   "coding-category-ccl",
 477   "coding-category-big5",
 478   "coding-category-utf-8",
 479   "coding-category-utf-16-be",
 480   "coding-category-utf-16-le",
 481   "coding-category-raw-text",
 482   "coding-category-binary"
 483 };
 484
 485 /* Table of pointers to coding systems corresponding to each coding
 486    categories.  */
 487 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 488
 489 /* Table of coding category masks.  Nth element is a mask for a coding
 490    category of which priority is Nth.  */
 491 static
 492 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 493
 494 /* Flag to tell if we look up translation table on character code
 495    conversion.  */
 496 Lisp_Object Venable_character_translation;
 497 /* Standard translation table to look up on decoding (reading).  */
 498 Lisp_Object Vstandard_translation_table_for_decode;
 499 /* Standard translation table to look up on encoding (writing).  */
 500 Lisp_Object Vstandard_translation_table_for_encode;
 501
 502 Lisp_Object Qtranslation_table;
 503 Lisp_Object Qtranslation_table_id;
 504 Lisp_Object Qtranslation_table_for_decode;
 505 Lisp_Object Qtranslation_table_for_encode;
 506
 507 /* Alist of charsets vs revision number.  */
 508 Lisp_Object Vcharset_revision_alist;
 509
 510 /* Default coding systems used for process I/O.  */
 511 Lisp_Object Vdefault_process_coding_system;
 512
 513 /* Char table for translating Quail and self-inserting input.  */
 514 Lisp_Object Vtranslation_table_for_input;
 515
 516 /* Global flag to tell that we can't call post-read-conversion and
 517    pre-write-conversion functions.  Usually the value is zero, but it
 518    is set to 1 temporarily while such functions are running.  This is
 519    to avoid infinite recursive call.  */
 520 static int inhibit_pre_post_conversion;
 521
 522 Lisp_Object Qchar_coding_system;
 523
 524 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 525    its validity.  */
 526
 527 Lisp_Object
 528 coding_safe_chars (coding_system)
 529      Lisp_Object coding_system;
 530 {
 531   Lisp_Object coding_spec, plist, safe_chars;
 532
 533   coding_spec = Fget (coding_system, Qcoding_system);
 534   plist = XVECTOR (coding_spec)->contents[3];
 535   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 536   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 537 }
 538
 539 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 540   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 541
 542 \f
 543 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 544
 545 /* Emacs' internal format for representation of multiple character
 546    sets is a kind of multi-byte encoding, i.e. characters are
 547    represented by variable-length sequences of one-byte codes.
 548
 549    ASCII characters and control characters (e.g. `tab', `newline') are
 550    represented by one-byte sequences which are their ASCII codes, in
 551    the range 0x00 through 0x7F.
 552
 553    8-bit characters of the range 0x80..0x9F are represented by
 554    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 555    code + 0x20).
 556
 557    8-bit characters of the range 0xA0..0xFF are represented by
 558    one-byte sequences which are their 8-bit code.
 559
 560    The other characters are represented by a sequence of `base
 561    leading-code', optional `extended leading-code', and one or two
 562    `position-code's.  The length of the sequence is determined by the
 563    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 564    whereas extended leading-code and position-code take the range 0xA0
 565    through 0xFF.  See `charset.h' for more details about leading-code
 566    and position-code.
 567
 568    --- CODE RANGE of Emacs' internal format ---
 569    character set        range
 570    -------------        -----
 571    ascii                0x00..0x7F
 572    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  573    eight-bit-graphic    0xA0..0xFF
 574    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 575    ---------------------------------------------
 576
 577    As this is the internal character representation, the format is
 578    usually not used externally (i.e. in a file or in a data sent to a
 579    process).  But, it is possible to have a text externally in this
 580    format (i.e. by encoding by the coding system `emacs-mule').
 581
 582    In that case, a sequence of one-byte codes has a slightly different
 583    form.
 584
 585    Firstly, all characters in eight-bit-control are represented by
 586    one-byte sequences which are their 8-bit code.
 587
 588    Next, character composition data are represented by the byte
 589    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 590    where,
 591         METHOD is 0xF0 plus one of composition method (enum
 592         composition_method),
 593
 594         BYTES is 0xA0 plus the byte length of these composition data,
 595
 596         CHARS is 0xA0 plus the number of characters composed by these
 597         data,
 598
 599         COMPONENTs are characters of multibyte form or composition
 600         rules encoded by two-byte of ASCII codes.
 601
 602    In addition, for backward compatibility, the following formats are
 603    also recognized as composition data on decoding.
 604
 605    0x80 MSEQ ...
 606    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 607
 608    Here,
 609         MSEQ is a multibyte form but in these special format:
 610           ASCII: 0xA0 ASCII_CODE+0x80,
 611           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 612         RULE is a one byte code of the range 0xA0..0xF0 that
 613         represents a composition rule.
 614   */
 615
 616 enum emacs_code_class_type emacs_code_class[256];
 617
 618 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 619    Check if a text is encoded in Emacs' internal format.  If it is,
 620    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 621
 622 static int
 623 detect_coding_emacs_mule (src, src_end, multibytep)
 624       unsigned char *src, *src_end;
 625       int multibytep;
 626 {
 627   unsigned char c;
 628   int composing = 0;
 629   /* Dummy for ONE_MORE_BYTE.  */
 630   struct coding_system dummy_coding;
 631   struct coding_system *coding = &dummy_coding;
 632
 633   while (1)
 634     {
 635       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
 636                                      CODING_CATEGORY_MASK_EMACS_MULE);
 637       if (composing)
 638         {
 639           if (c < 0xA0)
 640             composing = 0;
 641           else if (c == 0xA0)
 642             {
 643               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
 644               c &= 0x7F;
 645             }
 646           else
 647             c -= 0x20;
 648         }
 649
 650       if (c < 0x20)
 651         {
 652           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 653             return 0;
 654         }
 655       else if (c >= 0x80 && c < 0xA0)
 656         {
 657           if (c == 0x80)
 658             /* Old leading code for a composite character.  */
 659             composing = 1;
 660           else
 661             {
 662               unsigned char *src_base = src - 1;
 663               int bytes;
 664
 665               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 666                                                bytes))
 667                 return 0;
 668               src = src_base + bytes;
 669             }
 670         }
 671     }
 672 }
 673
 674
 675 /* Record the starting position START and METHOD of one composition.  */
 676
 677 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 678   do {                                                          \
 679     struct composition_data *cmp_data = coding->cmp_data;       \
 680     int *data = cmp_data->data + cmp_data->used;                \
 681     coding->cmp_data_start = cmp_data->used;                    \
 682     data[0] = -1;                                               \
 683     data[1] = cmp_data->char_offset + start;                    \
 684     data[3] = (int) method;                                     \
 685     cmp_data->used += 4;                                        \
 686   } while (0)
 687
 688 /* Record the ending position END of the current composition.  */
 689
 690 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 691   do {                                                          \
 692     struct composition_data *cmp_data = coding->cmp_data;       \
 693     int *data = cmp_data->data + coding->cmp_data_start;        \
 694     data[0] = cmp_data->used - coding->cmp_data_start;          \
 695     data[2] = cmp_data->char_offset + end;                      \
 696   } while (0)
 697
 698 /* Record one COMPONENT (alternate character or composition rule).  */
 699
 700 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 701   do {                                                                  \
 702     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 703     if (coding->cmp_data->used - coding->cmp_data_start                 \
 704         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 705       {                                                                 \
 706         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 707         coding->composing = COMPOSITION_NO;                             \
 708       }                                                                 \
 709   } while (0)
 710
 711
 712 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 713    is not less than SRC_END, return -1 without incrementing Src.  */
 714
 715 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 716
 717
 718 /* Decode a character represented as a component of composition
 719    sequence of Emacs 20 style at SRC.  Set C to that character, store
 720    its multibyte form sequence at P, and set P to the end of that
 721    sequence.  If no valid character is found, set C to -1.

        Three cases are distinguished by the first byte read:
        - a byte for which CHAR_HEAD_P holds is rejected;
        - 0xA0 must be followed by a byte >= 0xA0, and the character is
          that byte minus 0x80, stored at P as a single byte;
        - otherwise the byte minus 0x20 must be a base leading code; it
          is stored at P followed by the remaining bytes of the form.
        In the last case the bytes are stored (and P advanced) before
        the sequence is validated, so P may have moved even when C ends
        up as -1.  If the source is already exhausted, C is left
        negative and nothing is stored.

        The macro uses the variables `coding', `src' and `src_end' of
        the enclosing scope.  */
 722
 723 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 724   do {                                                          \
 725     int bytes;                                                  \
 726                                                                 \
 727     c = SAFE_ONE_MORE_BYTE ();                                  \
 728     if (c < 0)                                                  \
 729       break;                                                    \
 730     if (CHAR_HEAD_P (c))                                        \
 731       c = -1;                                                   \
 732     else if (c == 0xA0)                                         \
 733       {                                                         \
 734         c = SAFE_ONE_MORE_BYTE ();                              \
 735         if (c < 0xA0)                                           \
 736           c = -1;                                               \
 737         else                                                    \
 738           {                                                     \
 739             c -= 0x80;                                          \
 740             *p++ = c;                                           \
 741           }                                                     \
 742       }                                                         \
 743     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 744       {                                                         \
 745         unsigned char *p0 = p;                                  \
 746                                                                 \
 747         c -= 0x20;                                              \
 748         *p++ = c;                                               \
 749         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 750         while (--bytes)                                         \
 751           {                                                     \
 752             c = SAFE_ONE_MORE_BYTE ();                          \
 753             if (c < 0)                                          \
 754               break;                                            \
 755             *p++ = c;                                           \
 756           }                                                     \
 757         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 758             || (coding->flags /* We are recovering a file.  */  \
 759                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 760                 && ! CHAR_HEAD_P (p0[1])))                      \
 761           c = STRING_CHAR (p0, bytes);                          \
 762         else                                                    \
 763           c = -1;                                               \
 764       }                                                         \
 765     else                                                        \
 766       c = -1;                                                   \
 767   } while (0)
 768
 769
 770 /* Decode a composition rule represented as a component of composition
 771    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 772    valid rule is found, set C to -1.

        The rule occupies one byte; after subtracting 0xA0 it must lie in
        the range 0..80, and is split into GREF (quotient by 9) and NREF
        (remainder by 9) before being packed with
        COMPOSITION_ENCODE_RULE.  An exhausted source also yields -1.
        The macro assigns to the variables `gref' and `nref' of the
        enclosing scope and reads `src' and `src_end'.  */
 773
 774 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 775   do {                                                  \
 776     c = SAFE_ONE_MORE_BYTE ();                          \
 777     c -= 0xA0;                                          \
 778     if (c < 0 || c >= 81)                               \
 779       c = -1;                                           \
 780     else                                                \
 781       {                                                 \
 782         gref = c / 9, nref = c % 9;                     \
 783         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 784       }                                                 \
 785   } while (0)
 786
 787
 788 /* Decode composition sequence encoded by `emacs-mule' at the source
 789    pointed by SRC.  SRC_END is the end of source.  Store information
 790    of the composition in CODING->cmp_data.
 791
 792    For backward compatibility, decode also a composition sequence of
 793    Emacs 20 style.  In that case, the composition sequence contains
 794    characters that should be extracted into a buffer or string.  Store
 795    those characters at *DESTINATION in multibyte form.
 796
 797    If we encounter an invalid byte sequence, return 0.
 798    If we encounter an insufficient source or destination, or
 799    insufficient space in CODING->cmp_data, return -1.
 800    Otherwise, return consumed bytes in the source.

        SRC must point at the introducing byte of the sequence; that
        byte is skipped without being examined.  In the current format
        the following three bytes are 0xF0 + METHOD, 0xA0 + the byte
        length of the whole sequence (counted from SRC, at least 4) and
        0xA0 + the number of composed characters (at least 1).

        DST_END and DST_BYTES describe the destination: when DST_BYTES
        is zero the limit used for the space check is the current source
        position instead of DST_END.
 801
 802 */
 803 static INLINE int
 804 decode_composition_emacs_mule (coding, src, src_end,
 805                                destination, dst_end, dst_bytes)
 806      struct coding_system *coding;
 807      const unsigned char *src, *src_end;
 808      unsigned char **destination, *dst_end;
 809      int dst_bytes;
 810 {
 811   unsigned char *dst = *destination;
 812   int method, data_len, nchars;
 813   const unsigned char *src_base = src++;
 814   /* Store components of composition.  */
 815   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 816   int ncomponent;
 817   /* Store multibyte form of characters to be composed.  This is for
 818      Emacs 20 style composition sequence.  */
 819   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 820   unsigned char *bufp = buf;
 821   int c, i, gref, nref;
 822
       /* Refuse to start unless a bunch of maximum length still fits in
          the composition data area.  */
 823   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 824       >= COMPOSITION_DATA_SIZE)
 825     {
 826       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 827       return -1;
 828     }
 829
       /* NOTE(review): ONE_MORE_BYTE is defined outside this function;
          presumably it jumps to label_end_of_loop when the source is
          exhausted -- confirm.  */
 830   ONE_MORE_BYTE (c);
 831   if (c - 0xF0 >= COMPOSITION_RELATIVE
 832            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 833     {
 834       int with_rule;
 835
 836       method = c - 0xF0;
 837       with_rule = (method == COMPOSITION_WITH_RULE
 838                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 839       ONE_MORE_BYTE (c);
 840       data_len = c - 0xA0;
 841       if (data_len < 4
 842           || src_base + data_len > src_end)
 843         return 0;
 844       ONE_MORE_BYTE (c);
 845       nchars = c - 0xA0;
           /* A composition must cover at least one character.  Test the
              decoded count, not the raw byte: a raw byte in the range
              0x01..0xA0 would otherwise be accepted and give a zero or
              negative NCHARS.  */
 846       if (nchars < 1)
 847         return 0;
 848       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 849         {
 850           /* If it is longer than this, it can't be valid.  */
 851           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 852             return 0;
 853
               /* With a rule-based method the odd-numbered components are
                  rules, each written as two bytes offset by 32.  */
 854           if (ncomponent % 2 && with_rule)
 855             {
 856               ONE_MORE_BYTE (gref);
 857               gref -= 32;
 858               ONE_MORE_BYTE (nref);
 859               nref -= 32;
 860               c = COMPOSITION_ENCODE_RULE (gref, nref);
 861             }
 862           else
 863             {
 864               int bytes;
 865               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 866                   || (coding->flags /* We are recovering a file.  */
 867                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 868                       && ! CHAR_HEAD_P (src[1])))
 869                 c = STRING_CHAR (src, bytes);
 870               else
 871                 c = *src, bytes = 1;
 872               src += bytes;
 873             }
 874           component[ncomponent] = c;
 875         }
 876     }
 877   else if (c >= 0x80)
 878     {
 879       /* This may be an old Emacs 20 style format.  See the comment at
 880          the section 2 of this file.  */
 881       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
           /* The sequence may continue in the next block; wait for it
              unless this is the last block.  */
 882       if (src == src_end
 883           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 884         goto label_end_of_loop;
 885
           /* Restrict the source to the sequence just delimited and
              rescan it from the byte after the introducer.  */
 886       src_end = src;
 887       src = src_base + 1;
 888       if (c < 0xC0)
 889         {
 890           method = COMPOSITION_RELATIVE;
 891           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 892             {
 893               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 894               if (c < 0)
 895                 break;
 896               component[ncomponent++] = c;
 897             }
 898           if (ncomponent < 2)
 899             return 0;
 900           nchars = ncomponent;
 901         }
 902       else if (c == 0xFF)
 903         {
 904           method = COMPOSITION_WITH_RULE;
 905           src++;
 906           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 907           if (c < 0)
 908             return 0;
 909           component[0] = c;
 910           for (ncomponent = 1;
 911                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 912             {
 913               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 914               if (c < 0)
 915                 break;
 916               component[ncomponent++] = c;
 917               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 918               if (c < 0)
 919                 break;
 920               component[ncomponent++] = c;
 921             }
 922           if (ncomponent < 3)
 923             return 0;
               /* Components alternate character, rule, character, ...  */
 924           nchars = (ncomponent + 1) / 2;
 925         }
 926       else
 927         return 0;
 928     }
 929   else
 930     return 0;
 931
       /* Record the composition only if there is nothing to emit (current
          format) or the extracted characters fit in the destination.  */
 932   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 933     {
 934       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 935       for (i = 0; i < ncomponent; i++)
 936         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 937       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 938       if (buf < bufp)
 939         {
 940           unsigned char *p = buf;
 941           EMIT_BYTES (p, bufp);
 942           *destination += bufp - buf;
 943           coding->produced_char += nchars;
 944         }
 945       return (src - src_base);
 946     }
 947  label_end_of_loop:
 948   return -1;
 949 }
 950
 951 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SRC_BYTES bytes of emacs-mule text at SOURCE into
        DESTINATION.  Each iteration handles one source unit:
        - '\r' is converted according to CODING->eol_type;
        - '\n' is copied, unless the eol type is CR or CRLF and
          CODING_MODE_INHIBIT_INCONSISTENT_EOL is set, in which case
          decoding stops with CODING_FINISH_INCONSISTENT_EOL;
        - 0x80, when CODING->cmp_data is non-null, is handed to
          decode_composition_emacs_mule;
        - a valid multibyte sequence is copied as is;
        - anything else is either copied as is or, when a character head
          byte turns up too early, replaced by the multibyte form of its
          first byte alone.
        On exit CODING->consumed and CODING->consumed_char are both set
        to the number of source bytes fully processed and
        CODING->produced to the number of bytes stored.
        CODING->produced_char is reset to 0 on entry and incremented as
        characters are stored.  */
 952
 953 static void
 954 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 955      struct coding_system *coding;
 956      const unsigned char *source;
 957      unsigned char *destination;
 958      int src_bytes, dst_bytes;
 959 {
 960   const unsigned char *src = source;
 961   const unsigned char *src_end = source + src_bytes;
 962   unsigned char *dst = destination;
 963   unsigned char *dst_end = destination + dst_bytes;
 964   /* SRC_BASE remembers the start position in source in each loop.
 965      The loop will be exited when there's not enough source, or
 966      when there's not enough destination area to produce a
 967      character.  */
 968   const unsigned char *src_base;
 969
 970   coding->produced_char = 0;
 971   while ((src_base = src) < src_end)
 972     {
 973       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 974       const unsigned char *p;
 975       int bytes;
 976
 977       if (*src == '\r')
 978         {
 979           int c = *src++;
 980
 981           if (coding->eol_type == CODING_EOL_CR)
 982             c = '\n';
 983           else if (coding->eol_type == CODING_EOL_CRLF)
 984             {
                   /* "\r\n" collapses to '\n'; a '\r' followed by anything
                      else is kept and the following byte is re-read.  */
 985               ONE_MORE_BYTE (c);
 986               if (c != '\n')
 987                 {
 988                   src--;
 989                   c = '\r';
 990                 }
 991             }
 992           *dst++ = c;
 993           coding->produced_char++;
 994           continue;
 995         }
 996       else if (*src == '\n')
 997         {
 998           if ((coding->eol_type == CODING_EOL_CR
 999                || coding->eol_type == CODING_EOL_CRLF)
1000               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1001             {
1002               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1003               goto label_end_of_loop;
1004             }
1005           *dst++ = *src++;
1006           coding->produced_char++;
1007           continue;
1008         }
1009       else if (*src == 0x80 && coding->cmp_data)
1010         {
1011           /* Start of composition data.  */
1012           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1013                                                          &dst, dst_end,
1014                                                          dst_bytes);
               /* Negative: stop here.  Positive: that many source bytes
                  were consumed.  Zero: not a composition, so the 0x80
                  byte itself is converted by CHAR_STRING below.  */
1015           if (consumed < 0)
1016             goto label_end_of_loop;
1017           else if (consumed > 0)
1018             {
1019               src += consumed;
1020               continue;
1021             }
1022           bytes = CHAR_STRING (*src, tmp);
1023           p = tmp;
1024           src++;
1025         }
1026       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1027                || (coding->flags /* We are recovering a file.  */
1028                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1029                    && ! CHAR_HEAD_P (src[1])))
1030         {
1031           p = src;
1032           src += bytes;
1033         }
1034       else
1035         {
1036           int i, c;
1037
               /* Walk over the number of bytes announced by the first
                  byte, stopping early at a character head.  */
1038           bytes = BYTES_BY_CHAR_HEAD (*src);
1039           src++;
1040           for (i = 1; i < bytes; i++)
1041             {
1042               ONE_MORE_BYTE (c);
1043               if (CHAR_HEAD_P (c))
1044                 break;
1045             }
1046           if (i < bytes)
1047             {
                   /* Truncated sequence: emit only its first byte, in the
                      form produced by CHAR_STRING, and resume right after
                      it.  */
1048               bytes = CHAR_STRING (*src_base, tmp);
1049               p = tmp;
1050               src = src_base + 1;
1051             }
1052           else
1053             {
1054               p = src_base;
1055             }
1056         }
           /* With DST_BYTES zero the limit is the current source position
              rather than DST_END.  */
1057       if (dst + bytes >= (dst_bytes ? dst_end : src))
1058         {
1059           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1060           break;
1061         }
1062       while (bytes--) *dst++ = *p++;
1063       coding->produced_char++;
1064     }
1065  label_end_of_loop:
1066   coding->consumed = coding->consumed_char = src_base - source;
1067   coding->produced = dst - destination;
1068 }
1069
1070
1071 /* Encode composition data stored at DATA into a special byte sequence
1072    starting by 0x80.  Update CODING->cmp_data_start and maybe
1073    CODING->cmp_data for the next call.

        The sequence built is: 0x80, 0xF0 + METHOD (DATA[3]), 0xA0 + the
        byte length of the whole sequence, 0xA0 + the number of composed
        characters (DATA[2] - DATA[1]), then the components DATA[4] ..
        DATA[DATA[0] - 1] in multibyte form.  For the two rule-based
        methods every rule is written as the two bytes 0x20 + GREF and
        0x20 + NREF.

        If the space check fails (it asks for 4 bytes more than the
        length of the sequence), set CODING->result to
        CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop
        without writing anything.  The macro uses the variables `dst',
        `dst_end', `dst_bytes' and `src' and the label
        `label_end_of_loop' of the enclosing function.  */
1074
1075 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1076   do {                                                                  \
1077     unsigned char buf[1024], *p0 = buf, *p;                             \
1078     int len = data[0];                                                  \
1079     int i;                                                              \
1080                                                                         \
1081     buf[0] = 0x80;                                                      \
1082     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1083     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1084     p = buf + 4;                                                        \
1085     if (data[3] == COMPOSITION_WITH_RULE                                \
1086         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1087       {                                                                 \
1088         p += CHAR_STRING (data[4], p);                                  \
1089         for (i = 5; i < len; i += 2)                                    \
1090           {                                                             \
1091             int gref, nref;                                             \
1092              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1093             *p++ = 0x20 + gref;                                         \
1094             *p++ = 0x20 + nref;                                         \
1095             p += CHAR_STRING (data[i + 1], p);                          \
1096           }                                                             \
1097       }                                                                 \
1098     else                                                                \
1099       {                                                                 \
1100         for (i = 4; i < len; i++)                                       \
1101           p += CHAR_STRING (data[i], p);                                \
1102       }                                                                 \
1103     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1104                                                                         \
1105     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1106       {                                                                 \
1107         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1108         goto label_end_of_loop;                                         \
1109       }                                                                 \
1110     while (p0 < p)                                                      \
1111       *dst++ = *p0++;                                                   \
1112     coding->cmp_data_start += data[0];                                  \
1113     if (coding->cmp_data_start == coding->cmp_data->used                \
1114         && coding->cmp_data->next)                                      \
1115       {                                                                 \
1116         coding->cmp_data = coding->cmp_data->next;                      \
1117         coding->cmp_data_start = 0;                                     \
1118       }                                                                 \
1119   } while (0)
1120
1121
1122 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1123                             unsigned char *, int, int));
1124
     /* Encode SRC_BYTES bytes at SOURCE into DESTINATION in the
        `emacs-mule' format.

        When CODING has no composition data (CODING->cmp_data is null or
        empty) the whole job is delegated to encode_eol, declared just
        above.  Otherwise the text is processed one character at a
        time: whenever the current character position equals the start
        of the next recorded composition, the composition is first
        written out with ENCODE_COMPOSITION_EMACS_MULE; then the
        character itself is emitted, with '\n' turned into "\r\n" or
        '\r' according to CODING->eol_type.

        On exit CODING->consumed is the number of source bytes
        processed, and CODING->produced and CODING->produced_char are
        both the number of bytes stored.  */
1125 static void
1126 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1127      struct coding_system *coding;
1128      const unsigned char *source;
1129      unsigned char *destination;
1130      int src_bytes, dst_bytes;
1131 {
1132   const unsigned char *src = source;
1133   const unsigned char *src_end = source + src_bytes;
1134   unsigned char *dst = destination;
1135   unsigned char *dst_end = destination + dst_bytes;
1136   const unsigned char *src_base;
1137   int c;
1138   int char_offset;
1139   int *data;
1140
       /* NOTE(review): not referenced by name in the statements below;
          presumably read by the ONE_MORE_CHAR macro -- confirm before
          removing.  */
1141   Lisp_Object translation_table;
1142
1143   translation_table = Qnil;
1144
1145   /* Optimization for the case that there's no composition.  */
1146   if (!coding->cmp_data || coding->cmp_data->used == 0)
1147     {
1148       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1149       return;
1150     }
1151
1152   char_offset = coding->cmp_data->char_offset;
1153   data = coding->cmp_data->data + coding->cmp_data_start;
       /* The loop has no exit of its own; it is presumably left through
          the jumps to label_end_of_loop made by the macros used inside
          it (ENCODE_COMPOSITION_EMACS_MULE does so when the destination
          is too small).  */
1154   while (1)
1155     {
1156       src_base = src;
1157
1158       /* If SRC starts a composition, encode the information about the
1159          composition in advance.  */
1160       if (coding->cmp_data_start < coding->cmp_data->used
1161           && char_offset + coding->consumed_char == data[1])
1162         {
1163           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have advanced to the next composition data
                  block, so reload both values.  */
1164           char_offset = coding->cmp_data->char_offset;
1165           data = coding->cmp_data->data + coding->cmp_data_start;
1166         }
1167
1168       ONE_MORE_CHAR (c);
1169       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1170                         || coding->eol_type == CODING_EOL_CR))
1171         {
1172           if (coding->eol_type == CODING_EOL_CRLF)
1173             EMIT_TWO_BYTES ('\r', c);
1174           else
1175             EMIT_ONE_BYTE ('\r');
1176         }
1177       else if (SINGLE_BYTE_CHAR_P (c))
1178         {
1179           if (coding->flags && ! ASCII_BYTE_P (c))
1180             {
1181               /* As we are auto saving, retain the multibyte form for
1182                  8-bit chars.  */
1183               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1184               int bytes = CHAR_STRING (c, buf);
1185
1186               if (bytes == 1)
1187                 EMIT_ONE_BYTE (buf[0]);
1188               else
1189                 EMIT_TWO_BYTES (buf[0], buf[1]);
1190             }
1191           else
1192             EMIT_ONE_BYTE (c);
1193         }
1194       else
             /* Any other character: copy its source bytes unchanged.  */
1195         EMIT_BYTES (src_base, src);
1196       coding->consumed_char++;
1197     }
1198  label_end_of_loop:
1199   coding->consumed = src_base - source;
1200   coding->produced = coding->produced_char = dst - destination;
1201   return;
1202 }
1203
1204 \f
1205 /*** 3. ISO2022 handlers ***/
1206
1207 /* The following note describes the coding system ISO2022 briefly.
1208    Since the intention of this note is to help understand the
1209    functions in this file, some parts are NOT ACCURATE or are OVERLY
1210    SIMPLIFIED.  For thorough understanding, please refer to the
1211    original document of ISO2022.  This is equivalent to the standard
1212    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1213
1214    ISO2022 provides many mechanisms to encode several character sets
1215    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1216    is encoded using bytes less than 128.  This may make the encoded
1217    text a little bit longer, but the text passes more easily through
1218    several types of gateway, some of which strip off the MSB (Most
1219    Significant Bit).
1220
1221    There are two kinds of character sets: control character sets and
1222    graphic character sets.  The former contain control characters such
1223    as `newline' and `escape' to provide control functions (control
1224    functions are also provided by escape sequences).  The latter
1225    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1226    two control character sets and many graphic character sets.
1227
1228    Graphic character sets are classified into one of the following
1229    four classes, according to the number of bytes (DIMENSION) and
1230    number of characters in one dimension (CHARS) of the set:
1231    - DIMENSION1_CHARS94
1232    - DIMENSION1_CHARS96
1233    - DIMENSION2_CHARS94
1234    - DIMENSION2_CHARS96
1235
1236    In addition, each character set is assigned an identification tag,
1237    unique for each set, called the "final character" (denoted as <F>
1238    hereafter).  The <F> of each character set is decided by ECMA(*)
1239    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1240    (0x30..0x3F are for private use only).
1241
1242    Note (*): ECMA = European Computer Manufacturers Association
1243
1244    Here are examples of graphic character sets [NAME(<F>)]:
1245         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1246         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1247         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1248         o DIMENSION2_CHARS96 -- none for the moment
1249
1250    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1251         C0 [0x00..0x1F] -- control character plane 0
1252         GL [0x20..0x7F] -- graphic character plane 0
1253         C1 [0x80..0x9F] -- control character plane 1
1254         GR [0xA0..0xFF] -- graphic character plane 1
1255
1256    A control character set is directly designated and invoked to C0 or
1257    C1 by an escape sequence.  The most common case is that:
1258    - ISO646's  control character set is designated/invoked to C0, and
1259    - ISO6429's control character set is designated/invoked to C1,
1260    and usually these designations/invocations are omitted in encoded
1261    text.  In a 7-bit environment, only C0 can be used, and a control
1262    character for C1 is encoded by an appropriate escape sequence to
1263    fit into the environment.  All control characters for C1 are
1264    defined to have corresponding escape sequences.
1265
1266    A graphic character set is at first designated to one of four
1267    graphic registers (G0 through G3), then these graphic registers are
1268    invoked to GL or GR.  These designations and invocations can be
1269    done independently.  The most common case is that G0 is invoked to
1270    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1271    these invocations and designations are omitted in encoded text.
1272    In a 7-bit environment, only GL can be used.
1273
1274    When a graphic character set of CHARS94 is invoked to GL, codes
1275    0x20 and 0x7F of the GL area work as control characters SPACE and
1276    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1277    be used.
1278
1279    There are two ways of invocation: locking-shift and single-shift.
1280    With locking-shift, the invocation lasts until the next different
1281    invocation, whereas with single-shift, the invocation affects the
1282    following character only and doesn't affect the locking-shift
1283    state.  Invocations are done by the following control characters or
1284    escape sequences:
1285
1286    ----------------------------------------------------------------------
1287    abbrev  function                  cntrl escape seq   description
1288    ----------------------------------------------------------------------
1289    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1290    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1291    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1292    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1293    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1294    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1295    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1296    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1297    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1298    ----------------------------------------------------------------------
1299    (*) These are not used by any known coding system.
1300
1301    Control characters for these functions are defined by macros
1302    ISO_CODE_XXX in `coding.h'.
1303
1304    Designations are done by the following escape sequences:
1305    ----------------------------------------------------------------------
1306    escape sequence      description
1307    ----------------------------------------------------------------------
1308    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1309    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1310    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1311    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1312    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1313    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1314    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1315    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1316    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1317    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1318    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1319    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1320    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1321    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1322    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1323    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1324    ----------------------------------------------------------------------
1325
1326    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1327    of dimension 1, chars 94, and final character <F>, etc...
1328
1329    Note (*): Although these designations are not allowed in ISO2022,
1330    Emacs accepts them on decoding, and produces them on encoding
1331    CHARS96 character sets in a coding system which is characterized as
1332    7-bit environment, non-locking-shift, and non-single-shift.
1333
1334    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1335    '(' can be omitted.  We refer to this as "short-form" hereafter.
1336
1337    Now you may notice that there are a lot of ways of encoding the
1338    same multilingual text in ISO2022.  Actually, there exist many
1339    coding systems such as Compound Text (used in X11's inter client
1340    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1341    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1342    localized platforms), and all of these are variants of ISO2022.
1343
1344    In addition to the above, Emacs handles two more kinds of escape
1345    sequences: ISO6429's direction specification and Emacs' private
1346    sequence for specifying character composition.
1347
1348    ISO6429's direction specification takes the following form:
1349         o CSI ']'      -- end of the current direction
1350         o CSI '0' ']'  -- end of the current direction
1351         o CSI '1' ']'  -- start of left-to-right text
1352         o CSI '2' ']'  -- start of right-to-left text
1353    The control character CSI (0x9B: control sequence introducer) is
1354    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1355
1356    Character composition specification takes the following form:
1357         o ESC '0' -- start relative composition
1358         o ESC '1' -- end composition
1359         o ESC '2' -- start rule-base composition (*)
1360         o ESC '3' -- start relative composition with alternate chars  (**)
1361         o ESC '4' -- start rule-base composition with alternate chars  (**)
1362   Since these are not standard escape sequences of any ISO standard,
1363   the use of them with these meanings is restricted to Emacs only.
1364
1365   (*) This form is used only in Emacs 20.5 and older versions,
1366   but the newer versions can safely decode it.
1367   (**) This form is used only in Emacs 21.1 and newer versions,
1368   and the older versions can't decode it.
1369
1370   Here's a list of example usages of these composition escape
1371   sequences (categorized by `enum composition_method').
1372
1373   COMPOSITION_RELATIVE:
1374         ESC 0 CHAR [ CHAR ] ESC 1
1375   COMPOSITION_WITH_RULE:
1376         ESC 2 CHAR [ RULE CHAR ] ESC 1
1377   COMPOSITION_WITH_ALTCHARS:
1378         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1379   COMPOSITION_WITH_RULE_ALTCHARS:
1380         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1381
1382 enum iso_code_class_type iso_code_class[256];
     /* The table above has 256 entries; presumably it is indexed by byte
        value and gives the ISO code class of each byte -- confirm
        against the code that fills it.  */
1383
     /* Non-zero if the coding system at index IDX of coding_system_table
        exists, can take character C (CHARSET is CHARSET_ASCII, or C
        passes CODING_SAFE_CHAR_P for that coding system's safe chars),
        and has a requested designation for CHARSET other than
        CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  The macro assigns to
        the variable `safe_chars' of the enclosing scope.  */
1384 #define CHARSET_OK(idx, charset, c)                                     \
1385   (coding_system_table[idx]                                             \
1386    && (charset == CHARSET_ASCII                                         \
1387        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1388            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1389    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1390                                               charset)                  \
1391        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1392
     /* Non-zero if the coding system at index IDX has a non-negative
        initial designation for register 1.  Unlike CHARSET_OK, this does
        not check that coding_system_table[IDX] is non-null.  */
1393 #define SHIFT_OUT_OK(idx) \
1394   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1395
     /* Non-zero unless composition is disabled for the coding system at
        index IDX.  coding_system_table[IDX] is dereferenced without a
        null check.  */
1396 #define COMPOSITION_OK(idx)     \
1397   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1398
1399 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1400    Check if a text is encoded in ISO2022.  If it is, return an
1401    integer in which the appropriate flag bits among:
1402         CODING_CATEGORY_MASK_ISO_7
1403         CODING_CATEGORY_MASK_ISO_7_TIGHT
1404         CODING_CATEGORY_MASK_ISO_8_1
1405         CODING_CATEGORY_MASK_ISO_8_2
1406         CODING_CATEGORY_MASK_ISO_7_ELSE
1407         CODING_CATEGORY_MASK_ISO_8_ELSE
1408    are set.  If a code which should never appear in ISO2022 is found,
1409    returns 0.  */
1410
1411 static int
1412 detect_coding_iso2022 (src, src_end, multibytep)
1413      unsigned char *src, *src_end;
1414      int multibytep;
1415 {
1416   int mask = CODING_CATEGORY_MASK_ISO;
1417   int mask_found = 0;
1418   int reg[4], shift_out = 0, single_shifting = 0;
1419   int c, c1, charset;
1420   /* Dummy for ONE_MORE_BYTE.  */
1421   struct coding_system dummy_coding;
1422   struct coding_system *coding = &dummy_coding;
1423   Lisp_Object safe_chars;
1424
1425   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1426   while (mask)
1427     {
1428       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1429     retry:
1430       switch (c)
1431         {
1432         case ISO_CODE_ESC:
1433           if (inhibit_iso_escape_detection)
1434             break;
1435           single_shifting = 0;
1436           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1437           if (c >= '(' && c <= '/')
1438             {
1439               /* Designation sequence for a charset of dimension 1.  */
1440               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1441               if (c1 < ' ' || c1 >= 0x80
1442                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1443                 /* Invalid designation sequence.  Just ignore.  */
1444                 break;
1445               reg[(c - '(') % 4] = charset;
1446             }
1447           else if (c == '$')
1448             {
1449               /* Designation sequence for a charset of dimension 2.  */
1450               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1451               if (c >= '@' && c <= 'B')
1452                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1453                 reg[0] = charset = iso_charset_table[1][0][c];
1454               else if (c >= '(' && c <= '/')
1455                 {
1456                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1457                                                  mask & mask_found);
1458                   if (c1 < ' ' || c1 >= 0x80
1459                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1460                     /* Invalid designation sequence.  Just ignore.  */
1461                     break;
1462                   reg[(c - '(') % 4] = charset;
1463                 }
1464               else
1465                 /* Invalid designation sequence.  Just ignore.  */
1466                 break;
1467             }
1468           else if (c == 'N' || c == 'O')
1469             {
1470               /* ESC <Fe> for SS2 or SS3.  */
1471               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1472               break;
1473             }
1474           else if (c >= '0' && c <= '4')
1475             {
1476               /* ESC <Fp> for start/end composition.  */
1477               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1478                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1479               else
1480                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1481               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1482                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1483               else
1484                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1485               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1486                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1487               else
1488                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1489               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1490                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1491               else
1492                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1493               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1494                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1495               else
1496                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1497               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1498                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1499               else
1500                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1501               break;
1502             }
1503           else
1504             /* Invalid escape sequence.  Just ignore.  */
1505             break;
1506
1507           /* We found a valid designation sequence for CHARSET.  */
1508           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1509           c = MAKE_CHAR (charset, 0, 0);
1510           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1511             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1512           else
1513             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1514           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1515             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1516           else
1517             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1518           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1519             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1520           else
1521             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1522           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1523             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1524           else
1525             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1526           break;
1527
1528         case ISO_CODE_SO:
1529           if (inhibit_iso_escape_detection)
1530             break;
1531           single_shifting = 0;
1532           if (shift_out == 0
1533               && (reg[1] >= 0
1534                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1535                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1536             {
1537               /* Locking shift out.  */
1538               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1539               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1540             }
1541           break;
1542
1543         case ISO_CODE_SI:
1544           if (inhibit_iso_escape_detection)
1545             break;
1546           single_shifting = 0;
1547           if (shift_out == 1)
1548             {
1549               /* Locking shift in.  */
1550               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1551               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1552             }
1553           break;
1554
1555         case ISO_CODE_CSI:
1556           single_shifting = 0;
1557         case ISO_CODE_SS2:
1558         case ISO_CODE_SS3:
1559           {
1560             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1561
1562             if (inhibit_iso_escape_detection)
1563               break;
1564             if (c != ISO_CODE_CSI)
1565               {
1566                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1567                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1568                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1569                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1570                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1571                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1572                 single_shifting = 1;
1573               }
1574             if (VECTORP (Vlatin_extra_code_table)
1575                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1576               {
1577                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1578                     & CODING_FLAG_ISO_LATIN_EXTRA)
1579                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1580                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1581                     & CODING_FLAG_ISO_LATIN_EXTRA)
1582                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1583               }
1584             mask &= newmask;
1585             mask_found |= newmask;
1586           }
1587           break;
1588
1589         default:
1590           if (c < 0x80)
1591             {
1592               single_shifting = 0;
1593               break;
1594             }
1595           else if (c < 0xA0)
1596             {
1597               single_shifting = 0;
1598               if (VECTORP (Vlatin_extra_code_table)
1599                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1600                 {
1601                   int newmask = 0;
1602
1603                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1604                       & CODING_FLAG_ISO_LATIN_EXTRA)
1605                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1606                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1607                       & CODING_FLAG_ISO_LATIN_EXTRA)
1608                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1609                   mask &= newmask;
1610                   mask_found |= newmask;
1611                 }
1612               else
1613                 return 0;
1614             }
1615           else
1616             {
1617               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1618                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1619               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1620               /* Check the length of succeeding codes of the range
1621                  0xA0..0FF.  If the byte length is odd, we exclude
1622                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1623                  when we are not single shifting.  */
1624               if (!single_shifting
1625                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1626                 {
1627                   int i = 1;
1628
1629                   c = -1;
1630                   while (src < src_end)
1631                     {
1632                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1633                                                      mask & mask_found);
1634                       if (c < 0xA0)
1635                         break;
1636                       i++;
1637                     }
1638
1639                   if (i & 1 && src < src_end)
1640                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1641                   else
1642                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1643                   if (c >= 0)
1644                     /* This means that we have read one extra byte.  */
1645                     goto retry;
1646                 }
1647             }
1648           break;
1649         }
1650     }
1651   return (mask & mask_found);
1652 }
1653
/* Return the character of charset CHARSET whose 1st position code is
   C1 and whose 2nd position code is C2.  When the variable
   `translation_table' of the expanding scope is non-nil, the result
   is what translate_char returns for that table; otherwise it is the
   plain MAKE_CHAR value.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1663
/* Set designation state into CODING: designate to register REG the
   charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS and
   FINAL_CHAR.

   The designation is accepted when the charset is defined (>= 0) and
   either CODING requests REG for it or a character of it passes
   CODING_SAFE_CHAR_P for `safe_chars'.  On acceptance the charset
   (or its reverse charset, when CODING_MODE_DIRECTION is set in
   coding->mode and a reverse charset exists) is stored as the
   designation of REG.

   In every other case control jumps to `label_invalid_code':
     - FINAL_CHAR is below '0' or is 128 or above;
     - the designation is not acceptable, in which case REG is
       recorded in last_invalid_designation_register;
     - the designation is ASCII to register 0 while the recorded
       last invalid designation register is 0 (see the comment in
       the body).

   The expanding scope must provide `coding', `safe_chars' and the
   label `label_invalid_code'.  The macro declares its own locals
   `charset' and `c', so argument expressions must not refer to
   variables of those names.  REG and FINAL_CHAR are evaluated more
   than once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset, c;                                                        \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    c = MAKE_CHAR (charset, 0, 0);                                         \
    if (charset >= 0                                                       \
        && ((CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)       \
             == (reg))                                                     \
            || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1700
1701 /* Allocate a memory block for storing information about compositions.
1702    The block is chained to the already allocated blocks.  */
1703
1704 void
1705 coding_allocate_composition_data (coding, char_offset)
1706      struct coding_system *coding;
1707      int char_offset;
1708 {
1709   struct composition_data *cmp_data
1710     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1711
1712   cmp_data->char_offset = char_offset;
1713   cmp_data->used = 0;
1714   cmp_data->prev = coding->cmp_data;
1715   cmp_data->next = NULL;
1716   if (coding->cmp_data)
1717     coding->cmp_data->next = cmp_data;
1718   coding->cmp_data = cmp_data;
1719   coding->cmp_data_start = 0;
1720   coding->composing = COMPOSITION_NO;
1721 }
1722
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  The expanding scope must provide
   `coding', the output pointer `dst' and the label
   `label_end_of_loop'.  Three cases are distinguished:
     - composition is disabled for CODING: ESC and the low 7 bits of
       C1 are written to DST and counted as two produced characters;
     - no composition is in progress: a new one is started with the
       method selected by C1, unless coding->cmp_data lacks room, in
       which case coding->result is set to
       CODING_FINISH_INSUFFICIENT_CMP and control jumps to
       `label_end_of_loop';
     - a composition is already in progress: see the comment in the
       body.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1775
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  If a composition is in progress for `coding', close
   it at the current coding->produced_char and go back to
   COMPOSITION_NO.  Otherwise write ESC and C1 to `dst' and count
   them as two produced characters.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = (c1);                                                  \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1792
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read into the variable `c2' with ONE_MORE_BYTE) and add the encoded
   rule to `coding' with CODING_ADD_COMPOSITION_COMPONENT.

   C1 must be an lvalue; it is left decremented by 32.  Values of the
   decremented C1 below 81 are the old one-byte format, values from 81
   up to 92 the new two-byte format, and anything above yields the
   rule 0.  coding->composition_rule_follows is cleared at the
   end.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        if ((c1) < 93)          /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);          \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        /* The value 4 stands for 10 in either position.  */            \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1817
1818
1819 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1820
1821 static void
1822 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1823      struct coding_system *coding;
1824      const unsigned char *source;
1825      unsigned char *destination;
1826      int src_bytes, dst_bytes;
1827 {
1828   const unsigned char *src = source;
1829   const unsigned char *src_end = source + src_bytes;
1830   unsigned char *dst = destination;
1831   unsigned char *dst_end = destination + dst_bytes;
1832   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1833   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1834   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1835   /* SRC_BASE remembers the start position in source in each loop.
1836      The loop will be exited when there's not enough source code
1837      (within macro ONE_MORE_BYTE), or when there's not enough
1838      destination area to produce a character (within macro
1839      EMIT_CHAR).  */
1840   const unsigned char *src_base;
1841   int c, charset;
1842   Lisp_Object translation_table;
1843   Lisp_Object safe_chars;
1844
1845   safe_chars = coding_safe_chars (coding->symbol);
1846
1847   if (NILP (Venable_character_translation))
1848     translation_table = Qnil;
1849   else
1850     {
1851       translation_table = coding->translation_table_for_decode;
1852       if (NILP (translation_table))
1853         translation_table = Vstandard_translation_table_for_decode;
1854     }
1855
1856   coding->result = CODING_FINISH_NORMAL;
1857
1858   while (1)
1859     {
1860       int c1, c2 = 0;
1861
1862       src_base = src;
1863       ONE_MORE_BYTE (c1);
1864
1865       /* We produce no character or one character.  */
1866       switch (iso_code_class [c1])
1867         {
1868         case ISO_0x20_or_0x7F:
1869           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1870             {
1871               DECODE_COMPOSITION_RULE (c1);
1872               continue;
1873             }
1874           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1875             {
1876               /* This is SPACE or DEL.  */
1877               charset = CHARSET_ASCII;
1878               break;
1879             }
1880           /* This is a graphic character, we fall down ...  */
1881
1882         case ISO_graphic_plane_0:
1883           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1884             {
1885               DECODE_COMPOSITION_RULE (c1);
1886               continue;
1887             }
1888           charset = charset0;
1889           break;
1890
1891         case ISO_0xA0_or_0xFF:
1892           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1893               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1894             goto label_invalid_code;
1895           /* This is a graphic character, we fall down ... */
1896
1897         case ISO_graphic_plane_1:
1898           if (charset1 < 0)
1899             goto label_invalid_code;
1900           charset = charset1;
1901           break;
1902
1903         case ISO_control_0:
1904           if (COMPOSING_P (coding))
1905             DECODE_COMPOSITION_END ('1');
1906
1907           /* All ISO2022 control characters in this class have the
1908              same representation in Emacs internal format.  */
1909           if (c1 == '\n'
1910               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1911               && (coding->eol_type == CODING_EOL_CR
1912                   || coding->eol_type == CODING_EOL_CRLF))
1913             {
1914               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1915               goto label_end_of_loop;
1916             }
1917           charset = CHARSET_ASCII;
1918           break;
1919
1920         case ISO_control_1:
1921           if (COMPOSING_P (coding))
1922             DECODE_COMPOSITION_END ('1');
1923           goto label_invalid_code;
1924
1925         case ISO_carriage_return:
1926           if (COMPOSING_P (coding))
1927             DECODE_COMPOSITION_END ('1');
1928
1929           if (coding->eol_type == CODING_EOL_CR)
1930             c1 = '\n';
1931           else if (coding->eol_type == CODING_EOL_CRLF)
1932             {
1933               ONE_MORE_BYTE (c1);
1934               if (c1 != ISO_CODE_LF)
1935                 {
1936                   src--;
1937                   c1 = '\r';
1938                 }
1939             }
1940           charset = CHARSET_ASCII;
1941           break;
1942
1943         case ISO_shift_out:
1944           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1945               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1946             goto label_invalid_code;
1947           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1948           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1949           continue;
1950
1951         case ISO_shift_in:
1952           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1953             goto label_invalid_code;
1954           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1955           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1956           continue;
1957
1958         case ISO_single_shift_2_7:
1959         case ISO_single_shift_2:
1960           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1961             goto label_invalid_code;
1962           /* SS2 is handled as an escape sequence of ESC 'N' */
1963           c1 = 'N';
1964           goto label_escape_sequence;
1965
1966         case ISO_single_shift_3:
1967           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1968             goto label_invalid_code;
1969           /* SS2 is handled as an escape sequence of ESC 'O' */
1970           c1 = 'O';
1971           goto label_escape_sequence;
1972
1973         case ISO_control_sequence_introducer:
1974           /* CSI is handled as an escape sequence of ESC '[' ...  */
1975           c1 = '[';
1976           goto label_escape_sequence;
1977
1978         case ISO_escape:
1979           ONE_MORE_BYTE (c1);
1980         label_escape_sequence:
1981           /* Escape sequences handled by Emacs are invocation,
1982              designation, direction specification, and character
1983              composition specification.  */
1984           switch (c1)
1985             {
1986             case '&':           /* revision of following character set */
1987               ONE_MORE_BYTE (c1);
1988               if (!(c1 >= '@' && c1 <= '~'))
1989                 goto label_invalid_code;
1990               ONE_MORE_BYTE (c1);
1991               if (c1 != ISO_CODE_ESC)
1992                 goto label_invalid_code;
1993               ONE_MORE_BYTE (c1);
1994               goto label_escape_sequence;
1995
1996             case '$':           /* designation of 2-byte character set */
1997               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1998                 goto label_invalid_code;
1999               ONE_MORE_BYTE (c1);
2000               if (c1 >= '@' && c1 <= 'B')
2001                 {       /* designation of JISX0208.1978, GB2312.1980,
2002                            or JISX0208.1980 */
2003                   DECODE_DESIGNATION (0, 2, 94, c1);
2004                 }
2005               else if (c1 >= 0x28 && c1 <= 0x2B)
2006                 {       /* designation of DIMENSION2_CHARS94 character set */
2007                   ONE_MORE_BYTE (c2);
2008                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2009                 }
2010               else if (c1 >= 0x2C && c1 <= 0x2F)
2011                 {       /* designation of DIMENSION2_CHARS96 character set */
2012                   ONE_MORE_BYTE (c2);
2013                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2014                 }
2015               else
2016                 goto label_invalid_code;
2017               /* We must update these variables now.  */
2018               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2019               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2020               continue;
2021
2022             case 'n':           /* invocation of locking-shift-2 */
2023               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2024                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2025                 goto label_invalid_code;
2026               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2027               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2028               continue;
2029
2030             case 'o':           /* invocation of locking-shift-3 */
2031               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2032                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2033                 goto label_invalid_code;
2034               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2035               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2036               continue;
2037
2038             case 'N':           /* invocation of single-shift-2 */
2039               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2040                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2041                 goto label_invalid_code;
2042               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2043               ONE_MORE_BYTE (c1);
2044               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2045                 goto label_invalid_code;
2046               break;
2047
2048             case 'O':           /* invocation of single-shift-3 */
2049               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2050                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2051                 goto label_invalid_code;
2052               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2053               ONE_MORE_BYTE (c1);
2054               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2055                 goto label_invalid_code;
2056               break;
2057
2058             case '0': case '2': case '3': case '4': /* start composition */
2059               DECODE_COMPOSITION_START (c1);
2060               continue;
2061
2062             case '1':           /* end composition */
2063               DECODE_COMPOSITION_END (c1);
2064               continue;
2065
2066             case '[':           /* specification of direction */
2067               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2068                 goto label_invalid_code;
2069               /* For the moment, nested direction is not supported.
2070                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2071                  left-to-right, and nonzero means right-to-left.  */
2072               ONE_MORE_BYTE (c1);
2073               switch (c1)
2074                 {
2075                 case ']':       /* end of the current direction */
2076                   coding->mode &= ~CODING_MODE_DIRECTION;
2077
2078                 case '0':       /* end of the current direction */
2079                 case '1':       /* start of left-to-right direction */
2080                   ONE_MORE_BYTE (c1);
2081                   if (c1 == ']')
2082                     coding->mode &= ~CODING_MODE_DIRECTION;
2083                   else
2084                     goto label_invalid_code;
2085                   break;
2086
2087                 case '2':       /* start of right-to-left direction */
2088                   ONE_MORE_BYTE (c1);
2089                   if (c1 == ']')
2090                     coding->mode |= CODING_MODE_DIRECTION;
2091                   else
2092                     goto label_invalid_code;
2093                   break;
2094
2095                 default:
2096                   goto label_invalid_code;
2097                 }
2098               continue;
2099
2100             case '%':
2101               if (COMPOSING_P (coding))
2102                 DECODE_COMPOSITION_END ('1');
2103               ONE_MORE_BYTE (c1);
2104               if (c1 == '/')
2105                 {
2106                   /* CTEXT extended segment:
2107                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2108                      We keep these bytes as is for the moment.
2109                      They may be decoded by post-read-conversion.  */
2110                   int dim, M, L;
2111                   int size, required;
2112                   int produced_chars;
2113
2114                   ONE_MORE_BYTE (dim);
2115                   ONE_MORE_BYTE (M);
2116                   ONE_MORE_BYTE (L);
2117                   size = ((M - 128) * 128) + (L - 128);
2118                   required = 8 + size * 2;
2119                   if (dst + required > (dst_bytes ? dst_end : src))
2120                     goto label_end_of_loop;
2121                   *dst++ = ISO_CODE_ESC;
2122                   *dst++ = '%';
2123                   *dst++ = '/';
2124                   *dst++ = dim;
2125                   produced_chars = 4;
2126                   dst += CHAR_STRING (M, dst), produced_chars++;
2127                   dst += CHAR_STRING (L, dst), produced_chars++;
2128                   while (size-- > 0)
2129                     {
2130                       ONE_MORE_BYTE (c1);
2131                       dst += CHAR_STRING (c1, dst), produced_chars++;
2132                     }
2133                   coding->produced_char += produced_chars;
2134                 }
2135               else if (c1 == 'G')
2136                 {
2137                   unsigned char *d = dst;
2138                   int produced_chars;
2139
2140                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2141                      ESC % G --UTF-8-BYTES-- ESC % @
2142                      We keep these bytes as is for the moment.
2143                      They may be decoded by post-read-conversion.  */
2144                   if (d + 6 > (dst_bytes ? dst_end : src))
2145                     goto label_end_of_loop;
2146                   *d++ = ISO_CODE_ESC;
2147                   *d++ = '%';
2148                   *d++ = 'G';
2149                   produced_chars = 3;
2150                   while (d + 1 < (dst_bytes ? dst_end : src))
2151                     {
2152                       ONE_MORE_BYTE (c1);
2153                       if (c1 == ISO_CODE_ESC
2154                           && src + 1 < src_end
2155                           && src[0] == '%'
2156                           && src[1] == '@')
2157                         {
2158                           src += 2;
2159                           break;
2160                         }
2161                       d += CHAR_STRING (c1, d), produced_chars++;
2162                     }
2163                   if (d + 3 > (dst_bytes ? dst_end : src))
2164                     goto label_end_of_loop;
2165                   *d++ = ISO_CODE_ESC;
2166                   *d++ = '%';
2167                   *d++ = '@';
2168                   dst = d;
2169                   coding->produced_char += produced_chars + 3;
2170                 }
2171               else
2172                 goto label_invalid_code;
2173               continue;
2174
2175             default:
2176               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2177                 goto label_invalid_code;
2178               if (c1 >= 0x28 && c1 <= 0x2B)
2179                 {       /* designation of DIMENSION1_CHARS94 character set */
2180                   ONE_MORE_BYTE (c2);
2181                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2182                 }
2183               else if (c1 >= 0x2C && c1 <= 0x2F)
2184                 {       /* designation of DIMENSION1_CHARS96 character set */
2185                   ONE_MORE_BYTE (c2);
2186                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2187                 }
2188               else
2189                 goto label_invalid_code;
2190               /* We must update these variables now.  */
2191               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2192               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2193               continue;
2194             }
2195         }
2196
2197       /* Now we know CHARSET and 1st position code C1 of a character.
2198          Produce a multibyte sequence for that character while getting
2199          2nd position code C2 if necessary.  */
2200       if (CHARSET_DIMENSION (charset) == 2)
2201         {
2202           ONE_MORE_BYTE (c2);
2203           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2204             /* C2 is not in a valid range.  */
2205             goto label_invalid_code;
2206         }
2207       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2208       EMIT_CHAR (c);
2209       continue;
2210
2211     label_invalid_code:
2212       coding->errors++;
2213       if (COMPOSING_P (coding))
2214         DECODE_COMPOSITION_END ('1');
2215       src = src_base;
2216       c = *src++;
2217       if (! NILP (translation_table))
2218         c = translate_char (translation_table, c, 0, 0, 0);
2219       EMIT_CHAR (c);
2220     }
2221
2222  label_end_of_loop:
2223   coding->consumed = coding->consumed_char = src_base - source;
2224   coding->produced = dst - destination;
2225   return;
2226 }
2227
2228
2229 /* ISO2022 encoding stuff.  */
2230
2231 /*
   For encoding, it is not enough to say just "ISO2022"; we have to
   specify more details.  In Emacs, each ISO2022 coding system
   variant has the following specifications:
2235         1. Initial designation to G0 through G3.
2236         2. Allows short-form designation?
2237         3. ASCII should be designated to G0 before control characters?
2238         4. ASCII should be designated to G0 at end of line?
2239         5. 7-bit environment or 8-bit environment?
2240         6. Use locking-shift?
2241         7. Use Single-shift?
2242    And the following two are only for Japanese:
2243         8. Use ASCII in place of JIS0201-1976-Roman?
2244         9. Use JISX0208-1983 in place of JISX0208-1978?
2245    These specifications are encoded in `coding->flags' as flag bits
2246    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2247    details.
2248 */
2249
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the new designation
   in CODING.

   The sequence is: an optional revision prefix `ESC & <'@' + revision>'
   (only when the revision number registered for CHARSET is below 255),
   then ESC, `$' for a charset whose dimension is not 1, an intermediate
   byte selected by REG (one of `()*+' for 94-char sets, one of `,-./'
   otherwise), and finally the <final-char> of CHARSET.  The
   intermediate byte is omitted (short form) only for a 94-char set of
   dimension other than 1 that is designated to register 0, whose
   <final-char> is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   CHARSET and REG are evaluated more than once.  DST is taken from
   the scope of the caller.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *designators_94 = "()*+";                                \
    const char *designators_96 = ",-./";                                \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    int revision_number                                                 \
      = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);              \
                                                                        \
    if (revision_number < 255)                                          \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + revision_number;                                 \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        *dst++ = '$';                                                   \
        if (CHARSET_CHARS (charset) != 94)                              \
          *dst++ = (unsigned char) (designators_96[reg]);               \
        else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM)        \
                    && reg == 0                                         \
                    && final_byte >= '@' && final_byte <= 'B'))         \
          /* Short form is not applicable; spell the register out.  */  \
          *dst++ = (unsigned char) (designators_94[reg]);               \
      }                                                                 \
    else if (CHARSET_CHARS (charset) == 94)                             \
      *dst++ = (unsigned char) (designators_94[reg]);                   \
    else                                                                \
      *dst++ = (unsigned char) (designators_96[reg]);                   \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2292
/* The next two macros emit the ISO2022 single-shift functions
   single-shift-2 and single-shift-3 at DST.  When CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the two-byte escape sequence (ESC N
   or ESC O) is produced; otherwise the one-byte control code
   (ISO_CODE_SS2 or ISO_CODE_SS3) is.  In both cases the
   single-shifting state of CODING is set to 1.  CODING and DST are
   taken from the scope of the caller.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2314
/* The next four macros emit the ISO2022 locking-shift functions at
   DST: shift-in (ISO_CODE_SI), shift-out (ISO_CODE_SO),
   locking-shift-2 (ESC n) and locking-shift-3 (ESC o).  Each one also
   records in CODING that graphic plane 0 now has graphic register 0,
   1, 2 or 3, respectively, invoked to it.  CODING and DST are taken
   from the scope of the caller.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2342
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   If CODING is in the single-shifting state, C1 is emitted with the
   high bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise)
   and the state is reset.  Else, if CHARSET is the charset of graphic
   plane 0 the byte is emitted with the high bit cleared, and if it is
   the charset of graphic plane 1, with the high bit set.  Failing all
   that, encode_invocation_designation is called and the tests are
   repeated.

   CHARSET and C1 are parenthesized wherever they are used, so
   arbitrary expressions may be passed; they are still evaluated more
   than once, so they must be free of side effects.  CODING and DST
   are taken from the scope of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F;                                         \
        else                                                            \
          *dst++ = (c1) | 0x80;                                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2375
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   If CODING is in the single-shifting state, C1 and C2 are emitted
   with the high bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set
   (otherwise) and the state is reset.  Else, if CHARSET is the charset
   of graphic plane 0 both bytes are emitted with the high bit cleared,
   and if it is the charset of graphic plane 1, with the high bit set.
   Failing all that, encode_invocation_designation is called and the
   tests are repeated.

   CHARSET, C1 and C2 are parenthesized wherever they are used, so
   arbitrary expressions may be passed; they are still evaluated more
   than once, so they must be free of side effects.  CODING and DST
   are taken from the scope of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                   \
        else                                                            \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                   \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                     \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                     \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2408
/* Produce codes for character C at DST.  C is split by SPLIT_CHAR
   into a charset and position codes.  When the charset is not a
   defined one, the position codes are written to DST as they are (the
   second only if it is not negative).  Otherwise the character goes
   through ENCODE_ISO_CHARACTER_DIMENSION1 or
   ENCODE_ISO_CHARACTER_DIMENSION2 according to the dimension of its
   charset, after these substitutions:
     ASCII -> charset_latin_jisx0201 if CODING_FLAG_ISO_USE_ROMAN is set,
     charset_jisx0208 -> charset_jisx0208_1978 if
       CODING_FLAG_ISO_USE_OLDJIS is set.
   CODING and DST are taken from the scope of the caller.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int iso_charset, iso_c1, iso_c2;                                    \
                                                                        \
    SPLIT_CHAR (c, iso_charset, iso_c1, iso_c2);                        \
    if (! CHARSET_DEFINED_P (iso_charset))                              \
      {                                                                 \
        *dst++ = iso_c1;                                                \
        if (iso_c2 >= 0)                                                \
          *dst++ = iso_c2;                                              \
      }                                                                 \
    else if (CHARSET_DIMENSION (iso_charset) != 1)                      \
      {                                                                 \
        if (iso_charset == charset_jisx0208                             \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          iso_charset = charset_jisx0208_1978;                          \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset, iso_c1, iso_c2);  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (iso_charset == CHARSET_ASCII                                \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          iso_charset = charset_latin_jisx0201;                         \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset, iso_c1);          \
      }                                                                 \
  } while (0)
2438
2439
/* Rather than encoding character C itself, encode
   CODING_REPLACEMENT_CHARACTER in its place: once, or twice when the
   width of the charset of C is greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                              \
  do {                                                          \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) <= 1)                  \
      break;                                                    \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
  } while (0)
2448
2449
2450 /* Produce designation and invocation codes at a place pointed by DST
2451    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2452    Return new DST.  */
2453
2454 unsigned char *
2455 encode_invocation_designation (charset, coding, dst)
2456      int charset;
2457      struct coding_system *coding;
2458      unsigned char *dst;
2459 {
2460   int reg;                      /* graphic register number */
2461
2462   /* At first, check designations.  */
2463   for (reg = 0; reg < 4; reg++)
2464     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2465       break;
2466
2467   if (reg >= 4)
2468     {
2469       /* CHARSET is not yet designated to any graphic registers.  */
2470       /* At first check the requested designation.  */
2471       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2472       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2473         /* Since CHARSET requests no special designation, designate it
2474            to graphic register 0.  */
2475         reg = 0;
2476
2477       ENCODE_DESIGNATION (charset, reg, coding);
2478     }
2479
2480   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2481       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2482     {
2483       /* Since the graphic register REG is not invoked to any graphic
2484          planes, invoke it to graphic plane 0.  */
2485       switch (reg)
2486         {
2487         case 0:                 /* graphic register 0 */
2488           ENCODE_SHIFT_IN;
2489           break;
2490
2491         case 1:                 /* graphic register 1 */
2492           ENCODE_SHIFT_OUT;
2493           break;
2494
2495         case 2:                 /* graphic register 2 */
2496           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2497             ENCODE_SINGLE_SHIFT_2;
2498           else
2499             ENCODE_LOCKING_SHIFT_2;
2500           break;
2501
2502         case 3:                 /* graphic register 3 */
2503           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2504             ENCODE_SINGLE_SHIFT_3;
2505           else
2506             ENCODE_LOCKING_SHIFT_3;
2507           break;
2508         }
2509     }
2510
2511   return dst;
2512 }
2513
/* Produce the 2-byte code for encoded composition rule RULE at DST.
   RULE is decoded by COMPOSITION_DECODE_RULE into two reference
   values; the first byte is 32 + 81 plus the first value and the
   second byte is 32 plus the second value.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
2523
/* Produce at DST the code that starts a composition sequence: ESC 0
   for COMPOSITION_RELATIVE, ESC 3 for COMPOSITION_WITH_ALTCHARS, and
   ESC 4 for any other method.  The method is read from DATA[3] and
   stored in CODING.  For the latter two, CODING is also set up so
   that its component index is 4 past the start of the current
   composition data and no composition rule is expected next.  DATA
   points to an array of integers describing the composition; see the
   comment in coding.h for its format.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2543
/* Produce at DST the code that ends the current composition (ESC 1),
   mark CODING as no longer composing, and advance its position in the
   composition data by DATA[0].  If that position reaches the used
   length of the current composition data block and another block
   follows, move on to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->cmp_data_start += data[0];                          \
    coding->composing = COMPOSITION_NO;                         \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data = coding->cmp_data->next;          \
            coding->cmp_data_start = 0;                         \
          }                                                     \
      }                                                         \
  } while (0)
2559
/* Produce the composition start sequence ESC 0 at DST and put CODING
   into the COMPOSITION_RELATIVE state.  This does not begin a new
   composition: it marks that the components (alternate chars and
   composition rules) of the current one have just been produced and
   that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2571
/* The following three macros produce codes for indicating direction
   of text.  Each one is a complete statement that appends to DST and
   reads CODING from the scope of the caller.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits `ESC [' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set and the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&', as in the other macros of
   this file; comparing the whole flag word with `==' would pick the
   8-bit form as soon as any other flag is set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by `2 ]' and `0 ]' respectively.  */

#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2587
/* Produce at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: a
   shift-in if graphic plane 0 does not have register 0 invoked, then,
   for every graphic register that has an initial designation (a
   non-negative value) differing from its current one, the designation
   of that initial charset.  CODING and DST are taken from the scope
   of the caller.  */

#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0           \
            || (CODING_SPEC_ISO_DESIGNATION (coding, reg)                   \
                == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))      \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
2602
2603 /* Produce designation sequences of charsets in the line started from
2604    SRC to a place pointed by DST, and return updated DST.
2605
2606    If the current block ends before any end-of-line, we may fail to
2607    find all the necessary designations.  */
2608
2609 static unsigned char *
2610 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2611      struct coding_system *coding;
2612      Lisp_Object translation_table;
2613      const unsigned char *src, *src_end;
2614      unsigned char *dst;
2615 {
2616   int charset, c, found = 0, reg;
2617   /* Table of charsets to be designated to each graphic register.  */
2618   int r[4];
2619
2620   for (reg = 0; reg < 4; reg++)
2621     r[reg] = -1;
2622
2623   while (found < 4)
2624     {
2625       ONE_MORE_CHAR (c);
2626       if (c == '\n')
2627         break;
2628
2629       charset = CHAR_CHARSET (c);
2630       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2631       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2632         {
2633           found++;
2634           r[reg] = charset;
2635         }
2636     }
2637
2638  label_end_of_loop:
2639   if (found)
2640     {
2641       for (reg = 0; reg < 4; reg++)
2642         if (r[reg] >= 0
2643             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2644           ENCODE_DESIGNATION (r[reg], reg, coding);
2645     }
2646
2647   return dst;
2648 }
2649
2650 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2651
2652 static void
2653 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2654      struct coding_system *coding;
2655      const unsigned char *source;
2656      unsigned char *destination;
2657      int src_bytes, dst_bytes;
2658 {
2659   const unsigned char *src = source;
2660   const unsigned char *src_end = source + src_bytes;
2661   unsigned char *dst = destination;
2662   unsigned char *dst_end = destination + dst_bytes;
2663   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2664      from DST_END to assure overflow checking is necessary only at the
2665      head of loop.  */
2666   unsigned char *adjusted_dst_end = dst_end - 19;
2667   /* SRC_BASE remembers the start position in source in each loop.
2668      The loop will be exited when there's not enough source text to
2669      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2670      there's not enough destination area to produce encoded codes
2671      (within macro EMIT_BYTES).  */
2672   const unsigned char *src_base;
2673   int c;
2674   Lisp_Object translation_table;
2675   Lisp_Object safe_chars;
2676
2677   if (coding->flags & CODING_FLAG_ISO_SAFE)
2678     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2679
2680   safe_chars = coding_safe_chars (coding->symbol);
2681
2682   if (NILP (Venable_character_translation))
2683     translation_table = Qnil;
2684   else
2685     {
2686       translation_table = coding->translation_table_for_encode;
2687       if (NILP (translation_table))
2688         translation_table = Vstandard_translation_table_for_encode;
2689     }
2690
2691   coding->consumed_char = 0;
2692   coding->errors = 0;
2693   while (1)
2694     {
2695       src_base = src;
2696
2697       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2698         {
2699           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2700           break;
2701         }
2702
2703       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2704           && CODING_SPEC_ISO_BOL (coding))
2705         {
2706           /* We have to produce designation sequences if any now.  */
2707           dst = encode_designation_at_bol (coding, translation_table,
2708                                            src, src_end, dst);
2709           CODING_SPEC_ISO_BOL (coding) = 0;
2710         }
2711
2712       /* Check composition start and end.  */
2713       if (coding->composing != COMPOSITION_DISABLED
2714           && coding->cmp_data_start < coding->cmp_data->used)
2715         {
2716           struct composition_data *cmp_data = coding->cmp_data;
2717           int *data = cmp_data->data + coding->cmp_data_start;
2718           int this_pos = cmp_data->char_offset + coding->consumed_char;
2719
2720           if (coding->composing == COMPOSITION_RELATIVE)
2721             {
2722               if (this_pos == data[2])
2723                 {
2724                   ENCODE_COMPOSITION_END (coding, data);
2725                   cmp_data = coding->cmp_data;
2726                   data = cmp_data->data + coding->cmp_data_start;
2727                 }
2728             }
2729           else if (COMPOSING_P (coding))
2730             {
2731               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2732               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2733                 /* We have consumed components of the composition.
2734                    What follows in SRC is the composition's base
2735                    text.  */
2736                 ENCODE_COMPOSITION_FAKE_START (coding);
2737               else
2738                 {
2739                   int c = cmp_data->data[coding->cmp_data_index++];
2740                   if (coding->composition_rule_follows)
2741                     {
2742                       ENCODE_COMPOSITION_RULE (c);
2743                       coding->composition_rule_follows = 0;
2744                     }
2745                   else
2746                     {
2747                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2748                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2749                         ENCODE_UNSAFE_CHARACTER (c);
2750                       else
2751                         ENCODE_ISO_CHARACTER (c);
2752                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2753                         coding->composition_rule_follows = 1;
2754                     }
2755                   continue;
2756                 }
2757             }
2758           if (!COMPOSING_P (coding))
2759             {
2760               if (this_pos == data[1])
2761                 {
2762                   ENCODE_COMPOSITION_START (coding, data);
2763                   continue;
2764                 }
2765             }
2766         }
2767
2768       ONE_MORE_CHAR (c);
2769
2770       /* Now encode the character C.  */
2771       if (c < 0x20 || c == 0x7F)
2772         {
2773           if (c == '\r')
2774             {
2775               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2776                 {
2777                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2778                     ENCODE_RESET_PLANE_AND_REGISTER;
2779                   *dst++ = c;
2780                   continue;
2781                 }
2782               /* fall down to treat '\r' as '\n' ...  */
2783               c = '\n';
2784             }
2785           if (c == '\n')
2786             {
2787               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2788                 ENCODE_RESET_PLANE_AND_REGISTER;
2789               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2790                 bcopy (coding->spec.iso2022.initial_designation,
2791                        coding->spec.iso2022.current_designation,
2792                        sizeof coding->spec.iso2022.initial_designation);
2793               if (coding->eol_type == CODING_EOL_LF
2794                   || coding->eol_type == CODING_EOL_UNDECIDED)
2795                 *dst++ = ISO_CODE_LF;
2796               else if (coding->eol_type == CODING_EOL_CRLF)
2797                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2798               else
2799                 *dst++ = ISO_CODE_CR;
2800               CODING_SPEC_ISO_BOL (coding) = 1;
2801             }
2802           else
2803             {
2804               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2805                 ENCODE_RESET_PLANE_AND_REGISTER;
2806               *dst++ = c;
2807             }
2808         }
2809       else if (ASCII_BYTE_P (c))
2810         ENCODE_ISO_CHARACTER (c);
2811       else if (SINGLE_BYTE_CHAR_P (c))
2812         {
2813           *dst++ = c;
2814           coding->errors++;
2815         }
2816       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2817                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2818         ENCODE_UNSAFE_CHARACTER (c);
2819       else
2820         ENCODE_ISO_CHARACTER (c);
2821
2822       coding->consumed_char++;
2823     }
2824
2825  label_end_of_loop:
2826   coding->consumed = src_base - source;
2827   coding->produced = coding->produced_char = dst - destination;
2828 }
2829
2830 \f
2831 /*** 4. SJIS and BIG5 handlers ***/
2832
2833 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2834    quite widely.  So, for the moment, Emacs supports them in the bare
2835    C code.  But, in the future, they may be supported only by CCL.  */
2836
2837 /* SJIS is a coding system encoding three character sets: ASCII, right
2838    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2839    as is.  A character of charset katakana-jisx0201 is encoded by
2840    "position-code + 0x80".  A character of charset japanese-jisx0208
2841    is encoded in 2-byte but two position-codes are divided and shifted
2842    so that it fits in the range below.
2843
2844    --- CODE RANGE of SJIS ---
2845    (character set)      (range)
2846    ASCII                0x00 .. 0x7F
2847    KATAKANA-JISX0201    0xA1 .. 0xDF
2848    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2849             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2850    -------------------------------
2851
2852 */
2853
2854 /* BIG5 is a coding system encoding two character sets: ASCII and
2855    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2856    character set and is encoded in two bytes.
2857
2858    --- CODE RANGE of BIG5 ---
2859    (character set)      (range)
2860    ASCII                0x00 .. 0x7F
2861    Big5 (1st byte)      0xA1 .. 0xFE
2862         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2863    --------------------------
2864
2865    Since the number of characters in Big5 is larger than maximum
2866    characters in Emacs' charset (96x96), it can't be handled as one
2867    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2868    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2869    contains frequently used characters and the latter contains less
2870    frequently used characters.  */
2871
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros read all of their inputs into TEMP (and test B1 or
   CHARSET) before storing into any output, so an output argument may
   name the same variable as an input, as in
   DECODE_BIG5 (c1, c2, charset, c1, c2).

   Every argument is parenthesized in the expansions so that an
   expression such as `raw & 0xFF' can be passed.  Several arguments
   are evaluated more than once, so they must not have side
   effects.  */

/* Number of Big5 characters which have the same code in 1st byte:
   94 trail bytes in 0xA1..0xFE plus 63 trail bytes in 0x40..0x7E,
   i.e. 157.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* TEMP is the linear index of the character counted from lead byte
   0xA1, trail byte 0x40; trail bytes 0xA1..0xFE directly follow
   0x40..0x7E in that numbering (hence the 0x62 offset).  Lead bytes
   from 0xC9 on belong to `charset_big5_2', whose index restarts at
   zero.  The index is then re-split into rows of 94 codes.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: rebuild the linear index from the 94x94
   position-codes, add back the offset of `charset_big5_2', and split
   it into a lead byte and a trail byte.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2904
2905 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2906    Check if a text is encoded in SJIS.  If it is, return
2907    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2908
2909 static int
2910 detect_coding_sjis (src, src_end, multibytep)
2911      unsigned char *src, *src_end;
2912      int multibytep;
2913 {
2914   int c;
2915   /* Dummy for ONE_MORE_BYTE.  */
2916   struct coding_system dummy_coding;
2917   struct coding_system *coding = &dummy_coding;
2918
2919   while (1)
2920     {
2921       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2922       if (c < 0x80)
2923         continue;
2924       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2925         return 0;
2926       if (c <= 0x9F || c >= 0xE0)
2927         {
2928           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2929           if (c < 0x40 || c == 0x7F || c > 0xFC)
2930             return 0;
2931         }
2932     }
2933 }
2934
2935 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2936    Check if a text is encoded in BIG5.  If it is, return
2937    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2938
2939 static int
2940 detect_coding_big5 (src, src_end, multibytep)
2941      unsigned char *src, *src_end;
2942      int multibytep;
2943 {
2944   int c;
2945   /* Dummy for ONE_MORE_BYTE.  */
2946   struct coding_system dummy_coding;
2947   struct coding_system *coding = &dummy_coding;
2948
2949   while (1)
2950     {
2951       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2952       if (c < 0x80)
2953         continue;
2954       if (c < 0xA1 || c > 0xFE)
2955         return 0;
2956       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2957       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2958         return 0;
2959     }
2960 }
2961
2962 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2963    Check if a text is encoded in UTF-8.  If it is, return
2964    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2965
2966 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2967 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2968 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2969 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2970 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2971 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2972 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2973
2974 static int
2975 detect_coding_utf_8 (src, src_end, multibytep)
2976      unsigned char *src, *src_end;
2977      int multibytep;
2978 {
2979   unsigned char c;
2980   int seq_maybe_bytes;
2981   /* Dummy for ONE_MORE_BYTE.  */
2982   struct coding_system dummy_coding;
2983   struct coding_system *coding = &dummy_coding;
2984
2985   while (1)
2986     {
2987       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2988       if (UTF_8_1_OCTET_P (c))
2989         continue;
2990       else if (UTF_8_2_OCTET_LEADING_P (c))
2991         seq_maybe_bytes = 1;
2992       else if (UTF_8_3_OCTET_LEADING_P (c))
2993         seq_maybe_bytes = 2;
2994       else if (UTF_8_4_OCTET_LEADING_P (c))
2995         seq_maybe_bytes = 3;
2996       else if (UTF_8_5_OCTET_LEADING_P (c))
2997         seq_maybe_bytes = 4;
2998       else if (UTF_8_6_OCTET_LEADING_P (c))
2999         seq_maybe_bytes = 5;
3000       else
3001         return 0;
3002
3003       do
3004         {
3005           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3006           if (!UTF_8_EXTRA_OCTET_P (c))
3007             return 0;
3008           seq_maybe_bytes--;
3009         }
3010       while (seq_maybe_bytes > 0);
3011     }
3012 }
3013
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the
   first two bytes are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
3019
/* Predicates on a 16-bit UTF-16 code unit VAL.  */

/* Nonzero if VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits are compared; masking with 0xD800 alone would also
   match low surrogates and values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3029
3030 static int
3031 detect_coding_utf_16 (src, src_end, multibytep)
3032      unsigned char *src, *src_end;
3033      int multibytep;
3034 {
3035   unsigned char c1, c2;
3036   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3037   struct coding_system dummy_coding;
3038   struct coding_system *coding = &dummy_coding;
3039
3040   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3041   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3042
3043   if ((c1 == 0xFF) && (c2 == 0xFE))
3044     return CODING_CATEGORY_MASK_UTF_16_LE;
3045   else if ((c1 == 0xFE) && (c2 == 0xFF))
3046     return CODING_CATEGORY_MASK_UTF_16_BE;
3047   return 0;
3048 }
3049
3050 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3051    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3052
3053 static void
3054 decode_coding_sjis_big5 (coding, source, destination,
3055                          src_bytes, dst_bytes, sjis_p)
3056      struct coding_system *coding;
3057      const unsigned char *source;
3058      unsigned char  *destination;
3059      int src_bytes, dst_bytes;
3060      int sjis_p;
3061 {
3062   const unsigned char *src = source;
3063   const unsigned char *src_end = source + src_bytes;
3064   unsigned char *dst = destination;
3065   unsigned char *dst_end = destination + dst_bytes;
3066   /* SRC_BASE remembers the start position in source in each loop.
3067      The loop will be exited when there's not enough source code
3068      (within macro ONE_MORE_BYTE), or when there's not enough
3069      destination area to produce a character (within macro
3070      EMIT_CHAR).  */
3071   const unsigned char *src_base;
3072   Lisp_Object translation_table;
3073
3074   if (NILP (Venable_character_translation))
3075     translation_table = Qnil;
3076   else
3077     {
3078       translation_table = coding->translation_table_for_decode;
3079       if (NILP (translation_table))
3080         translation_table = Vstandard_translation_table_for_decode;
3081     }
3082
3083   coding->produced_char = 0;
3084   while (1)
3085     {
3086       int c, charset, c1, c2 = 0;
3087
3088       src_base = src;
3089       ONE_MORE_BYTE (c1);
3090
3091       if (c1 < 0x80)
3092         {
3093           charset = CHARSET_ASCII;
3094           if (c1 < 0x20)
3095             {
3096               if (c1 == '\r')
3097                 {
3098                   if (coding->eol_type == CODING_EOL_CRLF)
3099                     {
3100                       ONE_MORE_BYTE (c2);
3101                       if (c2 == '\n')
3102                         c1 = c2;
3103                       else
3104                         /* To process C2 again, SRC is subtracted by 1.  */
3105                         src--;
3106                     }
3107                   else if (coding->eol_type == CODING_EOL_CR)
3108                     c1 = '\n';
3109                 }
3110               else if (c1 == '\n'
3111                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3112                        && (coding->eol_type == CODING_EOL_CR
3113                            || coding->eol_type == CODING_EOL_CRLF))
3114                 {
3115                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3116                   goto label_end_of_loop;
3117                 }
3118             }
3119         }
3120       else
3121         {
3122           if (sjis_p)
3123             {
3124               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3125                 goto label_invalid_code;
3126               if (c1 <= 0x9F || c1 >= 0xE0)
3127                 {
3128                   /* SJIS -> JISX0208 */
3129                   ONE_MORE_BYTE (c2);
3130                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3131                     goto label_invalid_code;
3132                   DECODE_SJIS (c1, c2, c1, c2);
3133                   charset = charset_jisx0208;
3134                 }
3135               else
3136                 /* SJIS -> JISX0201-Kana */
3137                 charset = charset_katakana_jisx0201;
3138             }
3139           else
3140             {
3141               /* BIG5 -> Big5 */
3142               if (c1 < 0xA0 || c1 > 0xFE)
3143                 goto label_invalid_code;
3144               ONE_MORE_BYTE (c2);
3145               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3146                 goto label_invalid_code;
3147               DECODE_BIG5 (c1, c2, charset, c1, c2);
3148             }
3149         }
3150
3151       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3152       EMIT_CHAR (c);
3153       continue;
3154
3155     label_invalid_code:
3156       coding->errors++;
3157       src = src_base;
3158       c = *src++;
3159       EMIT_CHAR (c);
3160     }
3161
3162  label_end_of_loop:
3163   coding->consumed = coding->consumed_char = src_base - source;
3164   coding->produced = dst - destination;
3165   return;
3166 }
3167
3168 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3169    This function can encode charsets `ascii', `katakana-jisx0201',
3170    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3171    are sure that all these charsets are registered as official charset
3172    (i.e. do not have extended leading-codes).  Characters of other
3173    charsets are produced without any encoding.  If SJIS_P is 1, encode
3174    SJIS text, else encode BIG5 text.  */
3175
3176 static void
3177 encode_coding_sjis_big5 (coding, source, destination,
3178                          src_bytes, dst_bytes, sjis_p)
3179      struct coding_system *coding;
3180      unsigned char *source, *destination;
3181      int src_bytes, dst_bytes;
3182      int sjis_p;
3183 {
3184   unsigned char *src = source;
3185   unsigned char *src_end = source + src_bytes;
3186   unsigned char *dst = destination;
3187   unsigned char *dst_end = destination + dst_bytes;
3188   /* SRC_BASE remembers the start position in source in each loop.
3189      The loop will be exited when there's not enough source text to
3190      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3191      there's not enough destination area to produce encoded codes
3192      (within macro EMIT_BYTES).  */
3193   unsigned char *src_base;
3194   Lisp_Object translation_table;
3195
3196   if (NILP (Venable_character_translation))
3197     translation_table = Qnil;
3198   else
3199     {
3200       translation_table = coding->translation_table_for_encode;
3201       if (NILP (translation_table))
3202         translation_table = Vstandard_translation_table_for_encode;
3203     }
3204
3205   while (1)
3206     {
3207       int c, charset, c1, c2;
3208
3209       src_base = src;
3210       ONE_MORE_CHAR (c);
3211
3212       /* Now encode the character C.  */
3213       if (SINGLE_BYTE_CHAR_P (c))
3214         {
3215           switch (c)
3216             {
3217             case '\r':
3218               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3219                 {
3220                   EMIT_ONE_BYTE (c);
3221                   break;
3222                 }
3223               c = '\n';
3224             case '\n':
3225               if (coding->eol_type == CODING_EOL_CRLF)
3226                 {
3227                   EMIT_TWO_BYTES ('\r', c);
3228                   break;
3229                 }
3230               else if (coding->eol_type == CODING_EOL_CR)
3231                 c = '\r';
3232             default:
3233               EMIT_ONE_BYTE (c);
3234             }
3235         }
3236       else
3237         {
3238           SPLIT_CHAR (c, charset, c1, c2);
3239           if (sjis_p)
3240             {
3241               if (charset == charset_jisx0208
3242                   || charset == charset_jisx0208_1978)
3243                 {
3244                   ENCODE_SJIS (c1, c2, c1, c2);
3245                   EMIT_TWO_BYTES (c1, c2);
3246                 }
3247               else if (charset == charset_katakana_jisx0201)
3248                 EMIT_ONE_BYTE (c1 | 0x80);
3249               else if (charset == charset_latin_jisx0201)
3250                 EMIT_ONE_BYTE (c1);
3251               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3252                 {
3253                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3254                   if (CHARSET_WIDTH (charset) > 1)
3255                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3256                 }
3257               else
3258                 /* There's no way other than producing the internal
3259                    codes as is.  */
3260                 EMIT_BYTES (src_base, src);
3261             }
3262           else
3263             {
3264               if (charset == charset_big5_1 || charset == charset_big5_2)
3265                 {
3266                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3267                   EMIT_TWO_BYTES (c1, c2);
3268                 }
3269               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3270                 {
3271                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3272                   if (CHARSET_WIDTH (charset) > 1)
3273                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3274                 }
3275               else
3276                 /* There's no way other than producing the internal
3277                    codes as is.  */
3278                 EMIT_BYTES (src_base, src);
3279             }
3280         }
3281       coding->consumed_char++;
3282     }
3283
3284  label_end_of_loop:
3285   coding->consumed = src_base - source;
3286   coding->produced = coding->produced_char = dst - destination;
3287 }
3288
3289 \f
3290 /*** 5. CCL handlers ***/
3291
3292 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3293    Check if a text is encoded in a coding system of which
3294    encoder/decoder are written in CCL program.  If it is, return
3295    CODING_CATEGORY_MASK_CCL, else return 0.  */
3296
3297 static int
3298 detect_coding_ccl (src, src_end, multibytep)
3299      unsigned char *src, *src_end;
3300      int multibytep;
3301 {
3302   unsigned char *valid;
3303   int c;
3304   /* Dummy for ONE_MORE_BYTE.  */
3305   struct coding_system dummy_coding;
3306   struct coding_system *coding = &dummy_coding;
3307
3308   /* No coding system is assigned to coding-category-ccl.  */
3309   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3310     return 0;
3311
3312   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3313   while (1)
3314     {
3315       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3316       if (! valid[c])
3317         return 0;
3318     }
3319 }
3320
3321 \f
3322 /*** 6. End-of-line handlers ***/
3323
3324 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3325
3326 static void
3327 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3328      struct coding_system *coding;
3329      const unsigned char *source;
3330      unsigned char *destination;
3331      int src_bytes, dst_bytes;
3332 {
3333   const unsigned char *src = source;
3334   unsigned char *dst = destination;
3335   const unsigned char *src_end = src + src_bytes;
3336   unsigned char *dst_end = dst + dst_bytes;
3337   Lisp_Object translation_table;
3338   /* SRC_BASE remembers the start position in source in each loop.
3339      The loop will be exited when there's not enough source code
3340      (within macro ONE_MORE_BYTE), or when there's not enough
3341      destination area to produce a character (within macro
3342      EMIT_CHAR).  */
3343   const unsigned char *src_base;
3344   int c;
3345
3346   translation_table = Qnil;
3347   switch (coding->eol_type)
3348     {
3349     case CODING_EOL_CRLF:
3350       while (1)
3351         {
3352           src_base = src;
3353           ONE_MORE_BYTE (c);
3354           if (c == '\r')
3355             {
3356               ONE_MORE_BYTE (c);
3357               if (c != '\n')
3358                 {
3359                   src--;
3360                   c = '\r';
3361                 }
3362             }
3363           else if (c == '\n'
3364                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3365             {
3366               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3367               goto label_end_of_loop;
3368             }
3369           EMIT_CHAR (c);
3370         }
3371       break;
3372
3373     case CODING_EOL_CR:
3374       while (1)
3375         {
3376           src_base = src;
3377           ONE_MORE_BYTE (c);
3378           if (c == '\n')
3379             {
3380               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3381                 {
3382                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3383                   goto label_end_of_loop;
3384                 }
3385             }
3386           else if (c == '\r')
3387             c = '\n';
3388           EMIT_CHAR (c);
3389         }
3390       break;
3391
3392     default:                    /* no need for EOL handling */
3393       while (1)
3394         {
3395           src_base = src;
3396           ONE_MORE_BYTE (c);
3397           EMIT_CHAR (c);
3398         }
3399     }
3400
3401  label_end_of_loop:
3402   coding->consumed = coding->consumed_char = src_base - source;
3403   coding->produced = dst - destination;
3404   return;
3405 }
3406
3407 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3408    format of end-of-line according to `coding->eol_type'.  It also
3409    convert multibyte form 8-bit characters to unibyte if
3410    CODING->src_multibyte is nonzero.  If `coding->mode &
3411    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3412    also means end-of-line.  */
3413
3414 static void
3415 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3416      struct coding_system *coding;
3417      const unsigned char *source;
3418      unsigned char *destination;
3419      int src_bytes, dst_bytes;
3420 {
3421   const unsigned char *src = source;
3422   unsigned char *dst = destination;
3423   const unsigned char *src_end = src + src_bytes;
3424   unsigned char *dst_end = dst + dst_bytes;
3425   Lisp_Object translation_table;
3426   /* SRC_BASE remembers the start position in source in each loop.
3427      The loop will be exited when there's not enough source text to
3428      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3429      there's not enough destination area to produce encoded codes
3430      (within macro EMIT_BYTES).  */
3431   const unsigned char *src_base;
3432   unsigned char *tmp;
3433   int c;
3434   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3435
3436   translation_table = Qnil;
3437   if (coding->src_multibyte
3438       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3439     {
3440       src_end--;
3441       src_bytes--;
3442       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3443     }
3444
3445   if (coding->eol_type == CODING_EOL_CRLF)
3446     {
3447       while (src < src_end)
3448         {
3449           src_base = src;
3450           c = *src++;
3451           if (c >= 0x20)
3452             EMIT_ONE_BYTE (c);
3453           else if (c == '\n' || (c == '\r' && selective_display))
3454             EMIT_TWO_BYTES ('\r', '\n');
3455           else
3456             EMIT_ONE_BYTE (c);
3457         }
3458       src_base = src;
3459     label_end_of_loop:
3460       ;
3461     }
3462   else
3463     {
3464       if (!dst_bytes || src_bytes <= dst_bytes)
3465         {
3466           safe_bcopy (src, dst, src_bytes);
3467           src_base = src_end;
3468           dst += src_bytes;
3469         }
3470       else
3471         {
3472           if (coding->src_multibyte
3473               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3474             dst_bytes--;
3475           safe_bcopy (src, dst, dst_bytes);
3476           src_base = src + dst_bytes;
3477           dst = destination + dst_bytes;
3478           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3479         }
3480       if (coding->eol_type == CODING_EOL_CR)
3481         {
3482           for (tmp = destination; tmp < dst; tmp++)
3483             if (*tmp == '\n') *tmp = '\r';
3484         }
3485       else if (selective_display)
3486         {
3487           for (tmp = destination; tmp < dst; tmp++)
3488             if (*tmp == '\r') *tmp = '\n';
3489         }
3490     }
3491   if (coding->src_multibyte)
3492     dst = destination + str_as_unibyte (destination, dst - destination);
3493
3494   coding->consumed = src_base - source;
3495   coding->produced = dst - destination;
3496   coding->produced_char = coding->produced;
3497 }
3498
3499 \f
3500 /*** 7. C library functions ***/
3501
3502 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3503    has a property `coding-system'.  The value of this property is a
3504    vector of length 5 (called the coding-vector).  Among elements of
3505    this vector, the first (element[0]) and the fifth (element[4])
3506    carry important information for decoding/encoding.  Before
3507    decoding/encoding, this information should be set in fields of a
3508    structure of type `coding_system'.
3509
3510    The value of the property `coding-system' can be a symbol of another
3511    subsidiary coding-system.  In that case, Emacs gets coding-vector
3512    from that symbol.
3513
3514    `element[0]' contains information to be set in `coding->type'.  The
3515    value and its meaning is as follows:
3516
3517    0 -- coding_type_emacs_mule
3518    1 -- coding_type_sjis
3519    2 -- coding_type_iso2022
3520    3 -- coding_type_big5
3521    4 -- coding_type_ccl encoder/decoder written in CCL
3522    nil -- coding_type_no_conversion
3523    t -- coding_type_undecided (automatic conversion on decoding,
3524                                no-conversion on encoding)
3525
3526    `element[4]' contains information to be set in `coding->flags' and
3527    `coding->spec'.  The meaning varies by `coding->type'.
3528
3529    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3530    of length 32 (of which the first 13 sub-elements are used now).
3531    Meanings of these sub-elements are:
3532
3533    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3534         If the value is an integer of valid charset, the charset is
3535         assumed to be designated to graphic register N initially.
3536
3537         If the value is minus, it is a minus value of charset which
3538         reserves graphic register N, which means that the charset is
3539         not designated initially but should be designated to graphic
3540         register N just before encoding a character in that charset.
3541
3542         If the value is nil, graphic register N is never used on
3543         encoding.
3544
3545    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3546         Each value takes t or nil.  See the section ISO2022 of
3547         `coding.h' for more information.
3548
3549    If `coding->type' is `coding_type_big5', element[4] is t to denote
3550    BIG5-ETen or nil to denote BIG5-HKU.
3551
3552    If `coding->type' takes the other value, element[4] is ignored.
3553
3554    Emacs Lisp's coding systems also carry information about format of
3555    end-of-line in a value of property `eol-type'.  If the value is
3556    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3557    means CODING_EOL_CR.  If it is not integer, it should be a vector
3558    of subsidiary coding systems of which property `eol-type' has one
3559    of the above values.
3560
3561 */
3562
3563 /* Extract information for decoding/encoding from the coding-system
3564    symbol CODING_SYSTEM and set it in CODING, which is zero-cleared
3565    first.  If CODING_SYSTEM is nil or invalid, CODING is set up for
3566    no conversion; return -1 if it is non-nil but invalid, else 0.  */
3567
3568 int
3569 setup_coding_system (coding_system, coding)
3570      Lisp_Object coding_system;
3571      struct coding_system *coding;
3572 {
3573   Lisp_Object coding_spec, coding_type, eol_type, plist;
3574   Lisp_Object val;
3575
3576   /* At first, zero clear all members.  */
3577   bzero (coding, sizeof (struct coding_system));
3578
3579   /* Initialize some fields required for all kinds of coding systems.  */
3580   coding->symbol = coding_system;
3581   coding->heading_ascii = -1;
3582   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3583   coding->composing = COMPOSITION_DISABLED;
3584   coding->cmp_data = NULL;
3585   /* nil takes the fallback path too, but is not reported as an error.  */
3586   if (NILP (coding_system))
3587     goto label_invalid_coding_system;
3588
3589   coding_spec = Fget (coding_system, Qcoding_system);
3590   /* A valid spec is a vector of 5 elements whose element 3 is a cons.  */
3591   if (!VECTORP (coding_spec)
3592       || XVECTOR (coding_spec)->size != 5
3593       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3594     goto label_invalid_coding_system;
3595   /* A vector as eol type means that the eol format is not decided yet.  */
3596   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3597   if (VECTORP (eol_type))
3598     {
3599       coding->eol_type = CODING_EOL_UNDECIDED;
3600       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3601       if (system_eol_type != CODING_EOL_LF)
3602         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3603     }
3604   else if (XFASTINT (eol_type) == 1)  /* NOTE(review): no INTEGERP check; confirm safe for nil.  */
3605     {
3606       coding->eol_type = CODING_EOL_CRLF;
3607       coding->common_flags
3608         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3609     }
3610   else if (XFASTINT (eol_type) == 2)
3611     {
3612       coding->eol_type = CODING_EOL_CR;
3613       coding->common_flags
3614         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3615     }
3616   else
3617     {
3618       coding->common_flags = 0;
3619       coding->eol_type = CODING_EOL_LF;
3620     }
3621
3622   coding_type = XVECTOR (coding_spec)->contents[0];
3623   /* Try short cut: t means undecided, any other symbol no conversion.  */
3624   if (SYMBOLP (coding_type))
3625     {
3626       if (EQ (coding_type, Qt))
3627         {
3628           coding->type = coding_type_undecided;
3629           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3630         }
3631       else
3632         coding->type = coding_type_no_conversion;
3633       /* Initialize this member.  Any thing other than
3634          CODING_CATEGORY_IDX_UTF_16_BE and
3635          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3636          special treatment in detect_eol.  */
3637       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3638
3639       return 0;
3640     }
3641
3642   /* Get values of coding system properties:
3643      `post-read-conversion', `pre-write-conversion',
3644      `translation-table-for-decode', `translation-table-for-encode'.  */
3645   plist = XVECTOR (coding_spec)->contents[3];
3646   /* Pre & post conversion functions should be disabled if
3647      inhibit_pre_post_conversion is nonzero.  This is the case that a
3648      code conversion function is called while those functions are running.  */
3649   if (! inhibit_pre_post_conversion)
3650     {
3651       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3652       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3653     }
3654   val = Fplist_get (plist, Qtranslation_table_for_decode);
3655   if (SYMBOLP (val))
3656     val = Fget (val, Qtranslation_table_for_decode);
3657   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3658   val = Fplist_get (plist, Qtranslation_table_for_encode);
3659   if (SYMBOLP (val))
3660     val = Fget (val, Qtranslation_table_for_encode);
3661   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3662   val = Fplist_get (plist, Qcoding_category);  /* Mandatory; its index must be an integer.  */
3663   if (!NILP (val))
3664     {
3665       val = Fget (val, Qcoding_category_index);
3666       if (INTEGERP (val))
3667         coding->category_idx = XINT (val);
3668       else
3669         goto label_invalid_coding_system;
3670     }
3671   else
3672     goto label_invalid_coding_system;
3673
3674   /* If the coding system has non-nil `composition' property, enable
3675      composition handling.  */
3676   val = Fplist_get (plist, Qcomposition);
3677   if (!NILP (val))
3678     coding->composing = COMPOSITION_NO;
3679
3680   /* If the coding system is ascii-incompatible, record it in
3681      common_flags.   */
3682   val = Fplist_get (plist, Qascii_incompatible);
3683   if (! NILP (val))
3684     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3685
3686   switch (XFASTINT (coding_type))
3687     {
3688     case 0:
3689       coding->type = coding_type_emacs_mule;
3690       coding->common_flags
3691         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3692       if (!NILP (coding->post_read_conversion))  /* Redundant: both masks are set above.  */
3693         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3694       if (!NILP (coding->pre_write_conversion))
3695         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3696       break;
3697
3698     case 1:
3699       coding->type = coding_type_sjis;
3700       coding->common_flags
3701         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3702       break;
3703
3704     case 2:
3705       coding->type = coding_type_iso2022;
3706       coding->common_flags
3707         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3708       {
3709         Lisp_Object val, temp;
3710         Lisp_Object *flags;
3711         int i, charset, reg_bits = 0;
3712
3713         val = XVECTOR (coding_spec)->contents[4];
3714
3715         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3716           goto label_invalid_coding_system;
3717
3718         flags = XVECTOR (val)->contents;
3719         coding->flags
3720           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3721              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3722              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3723              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3724              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3725              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3726              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3727              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3728              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3729              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3730              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3731              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3732              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3733              );
3734
3735         /* Invoke graphic register 0 to plane 0.  */
3736         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3737         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3738         CODING_SPEC_ISO_INVOCATION (coding, 1)
3739           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3740         /* Not single shifting at first.  */
3741         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3742         /* Beginning of buffer should also be regarded as bol. */
3743         CODING_SPEC_ISO_BOL (coding) = 1;
3744         /* Default every revision number to 255; the alist overrides it.  */
3745         for (charset = 0; charset <= MAX_CHARSET; charset++)
3746           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3747         val = Vcharset_revision_alist;
3748         while (CONSP (val))
3749           {
3750             charset = get_charset_id (Fcar_safe (XCAR (val)));
3751             if (charset >= 0
3752                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3753                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3754               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3755             val = XCDR (val);
3756           }
3757
3758         /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3759            FLAGS[REG] can be one of below:
3760                 integer CHARSET: CHARSET occupies register REG,
3761                 t: designate nothing to REG initially, but can be used
3762                   by any charsets,
3763                 list of integer, nil, or t: designate the first
3764                   element (if integer) to REG initially, the remaining
3765                   elements (if integer) are designated to REG on request,
3766                   if an element is t, REG can be used by any charsets,
3767                 nil: REG is never used.  */
3768         for (charset = 0; charset <= MAX_CHARSET; charset++)
3769           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3770             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3771         for (i = 0; i < 4; i++)
3772           {
3773             if ((INTEGERP (flags[i])
3774                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3775                 || (charset = get_charset_id (flags[i])) >= 0)
3776               {
3777                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3778                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3779               }
3780             else if (EQ (flags[i], Qt))
3781               {
3782                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3783                 reg_bits |= 1 << i;
3784                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3785               }
3786             else if (CONSP (flags[i]))
3787               {
3788                 Lisp_Object tail;
3789                 tail = flags[i];
3790
3791                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3792                 if ((INTEGERP (XCAR (tail))
3793                      && (charset = XINT (XCAR (tail)),
3794                          CHARSET_VALID_P (charset)))
3795                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3796                   {
3797                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3798                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3799                   }
3800                 else
3801                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3802                 tail = XCDR (tail);
3803                 while (CONSP (tail))
3804                   {
3805                     if ((INTEGERP (XCAR (tail))
3806                          && (charset = XINT (XCAR (tail)),
3807                              CHARSET_VALID_P (charset)))
3808                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3809                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3810                         = i;
3811                     else if (EQ (XCAR (tail), Qt))
3812                       reg_bits |= 1 << i;
3813                     tail = XCDR (tail);
3814                   }
3815               }
3816             else
3817               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3818
3819             CODING_SPEC_ISO_DESIGNATION (coding, i)
3820               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3821           }
3822
3823         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3824           {
3825             /* REG 1 can be used only by locking shift in 7-bit env.  */
3826             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3827               reg_bits &= ~2;
3828             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3829               /* Without any shifting, only REG 0 and 1 can be used.  */
3830               reg_bits &= 3;
3831           }
3832         /* Pick a register from REG_BITS for charsets that requested none.  */
3833         if (reg_bits)
3834           for (charset = 0; charset <= MAX_CHARSET; charset++)
3835             {
3836               if (CHARSET_DEFINED_P (charset)
3837                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3838                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3839                 {
3840                   /* There exist some default graphic registers to be
3841                      used by CHARSET.  */
3842
3843                   /* We had better avoid designating a charset of
3844                      CHARS96 to REG 0 as far as possible.  */
3845                   if (CHARSET_CHARS (charset) == 96)
3846                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3847                       = (reg_bits & 2
3848                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3849                   else
3850                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3851                       = (reg_bits & 1
3852                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3853                 }
3854             }
3855       }
3856       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3857       coding->spec.iso2022.last_invalid_designation_register = -1;
3858       break;
3859
3860     case 3:
3861       coding->type = coding_type_big5;
3862       coding->common_flags
3863         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3864       coding->flags
3865         = (NILP (XVECTOR (coding_spec)->contents[4])
3866            ? CODING_FLAG_BIG5_HKU
3867            : CODING_FLAG_BIG5_ETEN);
3868       break;
3869
3870     case 4:
3871       coding->type = coding_type_ccl;
3872       coding->common_flags
3873         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3874       {
3875         val = XVECTOR (coding_spec)->contents[4];  /* Cons of the decoder and the encoder.  */
3876         if (! CONSP (val)
3877             || setup_ccl_program (&(coding->spec.ccl.decoder),
3878                                   XCAR (val)) < 0
3879             || setup_ccl_program (&(coding->spec.ccl.encoder),
3880                                   XCDR (val)) < 0)
3881           goto label_invalid_coding_system;
3882
3883         bzero (coding->spec.ccl.valid_codes, 256);
3884         val = Fplist_get (plist, Qvalid_codes);
3885         if (CONSP (val))  /* Elements: CODE or (START . END), all within 0..255.  */
3886           {
3887             Lisp_Object this;
3888
3889             for (; CONSP (val); val = XCDR (val))
3890               {
3891                 this = XCAR (val);
3892                 if (INTEGERP (this)
3893                     && XINT (this) >= 0 && XINT (this) < 256)
3894                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3895                 else if (CONSP (this)
3896                          && INTEGERP (XCAR (this))
3897                          && INTEGERP (XCDR (this)))
3898                   {
3899                     int start = XINT (XCAR (this));
3900                     int end = XINT (XCDR (this));
3901
3902                     if (start >= 0 && start <= end && end < 256)
3903                       while (start <= end)
3904                         coding->spec.ccl.valid_codes[start++] = 1;
3905                   }
3906               }
3907           }
3908       }
3909       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3910       coding->spec.ccl.cr_carryover = 0;
3911       coding->spec.ccl.eight_bit_carryover[0] = 0;
3912       break;
3913
3914     case 5:
3915       coding->type = coding_type_raw_text;
3916       break;
3917
3918     default:
3919       goto label_invalid_coding_system;
3920     }
3921   return 0;
3922   /* Fallback for nil and invalid coding systems: no conversion.  */
3923  label_invalid_coding_system:
3924   coding->type = coding_type_no_conversion;
3925   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3926   coding->common_flags = 0;
3927   coding->eol_type = CODING_EOL_UNDECIDED;
3928   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3929   return NILP (coding_system) ? 0 : -1;
3930 }
3931
3932 /* Release all memory blocks of composition information of CODING.  */
3933
3934 void
3935 coding_free_composition_data (coding)
3936      struct coding_system *coding;
3937 {
3938   struct composition_data *block, *following;
3939
3940   if (coding->cmp_data == NULL)
3941     return;
3942   /* The blocks are chained in both directions.  Rewind from the
3943      current block to the first one, then free them in order.  */
3944   for (block = coding->cmp_data; block->prev != NULL; block = block->prev)
3945     ;
3946   /* Save the link before the block holding it is freed.  */
3947   for (; block != NULL; block = following)
3948     {
3949       following = block->next;
3950       xfree (block);
3951     }
3952   coding->cmp_data = NULL;
3953 }
3954
3955 /* Store POS in the `char_offset' member of the memory block that
3956    coding->cmp_data points to and of every block chained after it.  */
3957
3958 void
3959 coding_adjust_composition_offset (coding, pos)
3960      struct coding_system *coding;
3961      int pos;
3962 {
3963   struct composition_data *block = coding->cmp_data;
3964
3965   while (block != NULL)
3966     block->char_offset = pos, block = block->next;
3967 }
3968
3969 /* Switch CODING, which must already be set up for some coding system,
3970    to raw-text.  If the eol format of CODING is already decided, the
3971    matching subsidiary of raw-text is used instead.  Nothing is done
3972    when CODING is of raw-text type already.  */
3973
3974 void
3975 setup_raw_text_coding_system (coding)
3976      struct coding_system *coding;
3977 {
3978   Lisp_Object eol_variants;
3979
3980   if (coding->type == coding_type_raw_text)
3981     return;
3982
3983   coding->type = coding_type_raw_text;
3984   coding->symbol = Qraw_text;
3985   if (coding->eol_type != CODING_EOL_UNDECIDED)
3986     {
3987       /* The `eol-type' property of raw-text is expected to be a vector
3988          of three subsidiaries, indexed by the eol type of CODING.  */
3989       eol_variants = Fget (Qraw_text, Qeol_type);
3990       if (VECTORP (eol_variants) && XVECTOR (eol_variants)->size == 3)
3991         coding->symbol = XVECTOR (eol_variants)->contents[coding->eol_type];
3992     }
3993   /* Redo the whole setup for the symbol chosen above.  */
3994   setup_coding_system (coding->symbol, coding);
3995 }
3996
3997 /* Emacs has a mechanism to automatically detect a coding system if it
3998    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3999    it's impossible to distinguish some coding systems accurately
4000    because they use the same range of codes.  So, at first, coding
4001    systems are categorized into the following categories:
4002
4003    o coding-category-emacs-mule
4004
4005         The category for a coding system which has the same code range
4006         as Emacs' internal format.  Assigned the coding-system (Lisp
4007         symbol) `emacs-mule' by default.
4008
4009    o coding-category-sjis
4010
4011         The category for a coding system which has the same code range
4012         as SJIS.  Assigned the coding-system (Lisp
4013         symbol) `japanese-shift-jis' by default.
4014
4015    o coding-category-iso-7
4016
4017         The category for a coding system which has the same code range
4018         as ISO2022 of 7-bit environment.  This doesn't use any locking
4019         shift and single shift functions.  This can encode/decode all
4020         charsets.  Assigned the coding-system (Lisp symbol)
4021         `iso-2022-7bit' by default.
4022
4023    o coding-category-iso-7-tight
4024
4025         Same as coding-category-iso-7 except that this can
4026         encode/decode only the specified charsets.
4027
4028    o coding-category-iso-8-1
4029
4030         The category for a coding system which has the same code range
4031         as ISO2022 of 8-bit environment and graphic plane 1 used only
4032         for DIMENSION1 charset.  This doesn't use any locking shift
4033         and single shift functions.  Assigned the coding-system (Lisp
4034         symbol) `iso-latin-1' by default.
4035
4036    o coding-category-iso-8-2
4037
4038         The category for a coding system which has the same code range
4039         as ISO2022 of 8-bit environment and graphic plane 1 used only
4040         for DIMENSION2 charset.  This doesn't use any locking shift
4041         and single shift functions.  Assigned the coding-system (Lisp
4042         symbol) `japanese-iso-8bit' by default.
4043
4044    o coding-category-iso-7-else
4045
4046         The category for a coding system which has the same code range
4047         as ISO2022 of 7-bit environment but uses locking shift or
4048         single shift functions.  Assigned the coding-system (Lisp
4049         symbol) `iso-2022-7bit-lock' by default.
4050
4051    o coding-category-iso-8-else
4052
4053         The category for a coding system which has the same code range
4054         as ISO2022 of 8-bit environment but uses locking shift or
4055         single shift functions.  Assigned the coding-system (Lisp
4056         symbol) `iso-2022-8bit-ss2' by default.
4057
4058    o coding-category-big5
4059
4060         The category for a coding system which has the same code range
4061         as BIG5.  Assigned the coding-system (Lisp symbol)
4062         `cn-big5' by default.
4063
4064    o coding-category-utf-8
4065
4066         The category for a coding system which has the same code range
4067         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4068         symbol) `utf-8' by default.
4069
4070    o coding-category-utf-16-be
4071
4072         The category for a coding system in which a text has an
4073         Unicode signature (cf. Unicode Standard) in the order of BIG
4074         endian at the head.  Assigned the coding-system (Lisp symbol)
4075         `utf-16-be' by default.
4076
4077    o coding-category-utf-16-le
4078
4079         The category for a coding system in which a text has an
4080         Unicode signature (cf. Unicode Standard) in the order of
4081         LITTLE endian at the head.  Assigned the coding-system (Lisp
4082         symbol) `utf-16-le' by default.
4083
4084    o coding-category-ccl
4085
4086         The category for a coding system of which encoder/decoder is
4087         written in CCL programs.  The default value is nil, i.e., no
4088         coding system is assigned.
4089
4090    o coding-category-binary
4091
4092         The category for a coding system not categorized in any of the
4093         above.  Assigned the coding-system (Lisp symbol)
4094         `no-conversion' by default.
4095
4096    Each of them is a Lisp symbol and the value is an actual
4097    `coding-system' (this is also a Lisp symbol) assigned by a user.
4098    What Emacs does actually is to detect a category of coding system.
4099    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4100    decide a single possible category, it selects a category of the
4101    highest priority.  Priorities of categories are also specified by a
4102    user in a Lisp variable `coding-category-list'.
4103
4104 */
4105 /* Indexed by byte value; detect_coding_mask skips bytes whose entry is nonzero.  */
4106 static
4107 int ascii_skip_code[256];
4108
4109 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4110    Return an integer in which the flag bits (CODING_CATEGORY_MASK_XXX
4111    in `coding.h') of the possible coding categories are set, or 0 if
4112    nothing other than ASCII is found.  If PRIORITIES is non-NULL,
4113    it should point to the table `coding_priorities'; in that case only
4114    the entry of the highest priority that matches is returned (or
4115    CODING_CATEGORY_MASK_RAW_TEXT if none does).  If MULTIBYTEP is
4116    nonzero, 8-bit codes of the range 0x80..0x9F are in multibyte form.
4117
4118    How many ASCII characters are at the head is returned as *SKIP.  */
4119
4120 static int
4121 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4122      unsigned char *source;
4123      int src_bytes, *priorities, *skip;
4124      int multibytep;
4125 {
4126   register unsigned char c;
4127   unsigned char *src = source, *src_end = source + src_bytes;
4128   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4129   int i;
4130
4131   /* At first, skip all ASCII characters and control characters except
4132      for three ISO2022 specific control characters.  */
4133   ascii_skip_code[ISO_CODE_SO] = 0;
4134   ascii_skip_code[ISO_CODE_SI] = 0;
4135   ascii_skip_code[ISO_CODE_ESC] = 0;
4136
4137  label_loop_detect_coding:
4138   while (src < src_end && ascii_skip_code[*src]) src++;
4139   *skip = src - source;
4140
4141   if (src >= src_end)
4142     /* We found nothing other than ASCII.  There's nothing to do.  */
4143     return 0;
4144
4145   c = *src;
4146   /* The text seems to be encoded in some multilingual coding system.
4147      Now, try to find in which coding system the text is encoded.  */
4148   if (c < 0x80)
4149     {
4150       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4151       /* C is an ISO2022 specific control code of C0.  */
4152       mask = detect_coding_iso2022 (src, src_end, multibytep);
4153       if (mask == 0)
4154         {
4155           /* No valid ISO2022 code follows C.  Skip that kind of control
4156              code from now on and try again.  */
4157           src++;
4158           if (c == ISO_CODE_ESC)
4159             ascii_skip_code[ISO_CODE_ESC] = 1;
4160           else
4161             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4162           goto label_loop_detect_coding;
4163         }
4164       if (priorities)
4165         {
4166           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4167             if (mask & priorities[i])
4168               return priorities[i];
4169           return CODING_CATEGORY_MASK_RAW_TEXT;
4170         }
4171     }
4172   else
4173     {
4174       int try;
4175       /* SRC keeps pointing at C; NEXT is the first byte after it.  */
4176       unsigned char *next = src + 1;
4177       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL && next < src_end)
4178         c = src[1] - 0x20, next = src + 2;  /* C occupies two bytes.  */
4179
4180       if (c < 0xA0)
4181         {
4182           /* C is the first byte of SJIS character code,
4183              or a leading-code of Emacs' internal format (emacs-mule),
4184              or the first byte of UTF-16.  */
4185           try = (CODING_CATEGORY_MASK_SJIS
4186                   | CODING_CATEGORY_MASK_EMACS_MULE
4187                   | CODING_CATEGORY_MASK_UTF_16_BE
4188                   | CODING_CATEGORY_MASK_UTF_16_LE);
4189
4190           /* Or, if C is a special latin extra code, or an ISO2022 specific
4191              control code of C1 (SS2 or SS3), or a CSI that is followed (at
4192              NEXT, not at SRC) by ']' or by one of '0', '1', '2' and then
4193              ']', also consider the possibility of ISO2022 codings.  */
4194           if ((VECTORP (Vlatin_extra_code_table)
4195                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4196               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4197               || (c == ISO_CODE_CSI
4198                   && (next < src_end
4199                       && (*next == ']'
4200                           || ((*next == '0' || *next == '1' || *next == '2')
4201                               && next + 1 < src_end
4202                               && next[1] == ']')))))
4203             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4204                      | CODING_CATEGORY_MASK_ISO_8BIT);
4205         }
4206       else
4207         /* C is a character of ISO2022 in graphic plane right,
4208            or a SJIS's 1-byte character code (i.e. JISX0201),
4209            or the first byte of BIG5's 2-byte code,
4210            or the first byte of UTF-8/16.  */
4211         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4212                 | CODING_CATEGORY_MASK_ISO_8BIT
4213                 | CODING_CATEGORY_MASK_SJIS
4214                 | CODING_CATEGORY_MASK_BIG5
4215                 | CODING_CATEGORY_MASK_UTF_8
4216                 | CODING_CATEGORY_MASK_UTF_16_BE
4217                 | CODING_CATEGORY_MASK_UTF_16_LE);
4218
4219       /* Or, we may have to consider the possibility of CCL.  */
4220       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4221           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4222               ->spec.ccl.valid_codes)[c])
4223         try |= CODING_CATEGORY_MASK_CCL;
4224
4225       mask = 0;
4226       utf16_examined_p = iso2022_examined_p = 0;
4227       if (priorities)
4228         {
4229           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4230             {
4231               if (!iso2022_examined_p
4232                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4233                 {
4234                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4235                   iso2022_examined_p = 1;
4236                 }
4237               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4238                 mask |= detect_coding_sjis (src, src_end, multibytep);
4239               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4240                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4241               else if (!utf16_examined_p
4242                        && (priorities[i] & try &
4243                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4244                 {
4245                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4246                   utf16_examined_p = 1;
4247                 }
4248               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4249                 mask |= detect_coding_big5 (src, src_end, multibytep);
4250               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4251                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4252               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4253                 mask |= detect_coding_ccl (src, src_end, multibytep);
4254               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4255                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4256               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4257                 mask |= CODING_CATEGORY_MASK_BINARY;
4258               if (mask & priorities[i])
4259                 return priorities[i];
4260             }
4261           return CODING_CATEGORY_MASK_RAW_TEXT;
4262         }
4263       if (try & CODING_CATEGORY_MASK_ISO)
4264         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4265       if (try & CODING_CATEGORY_MASK_SJIS)
4266         mask |= detect_coding_sjis (src, src_end, multibytep);
4267       if (try & CODING_CATEGORY_MASK_BIG5)
4268         mask |= detect_coding_big5 (src, src_end, multibytep);
4269       if (try & CODING_CATEGORY_MASK_UTF_8)
4270         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4271       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4272         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4273       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4274         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4275       if (try & CODING_CATEGORY_MASK_CCL)
4276         mask |= detect_coding_ccl (src, src_end, multibytep);
4277     }
4278   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4279 }
4280
4281 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded
4282    and set CODING up for the coding system that was found.  If no
4283    category is detected, only CODING->heading_ascii is updated.  */
4284
4285 void
4286 detect_coding (coding, src, src_bytes)
4287      struct coding_system *coding;
4288      const unsigned char *src;
4289      int src_bytes;
4290 {
4291   Lisp_Object detected, subsidiaries;
4292   unsigned int category;
4293   int heading, found;
4294   int saved_src_multibyte, saved_dst_multibyte;
4295
4296   found = detect_coding_mask (src, src_bytes, coding_priorities, &heading,
4297                               coding->src_multibyte);
4298   coding->heading_ascii = heading;
4299   if (found == 0)
4300     return;
4301
4302   /* FOUND holds the bit of the category of the highest priority.
4303      The position of its lowest set bit is the category index.  */
4304   for (category = 0; found != 0 && (found & 1) == 0; category++)
4305     found >>= 1;
4306   if (found == 0)
4307     category = CODING_CATEGORY_IDX_RAW_TEXT;
4308
4309   detected
4310     = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[category]);
4311
4312   /* If the eol format is already decided, use the subsidiary coding
4313      system for it.  NOTE(review): eol_type is not range-checked.  */
4314   if (coding->eol_type != CODING_EOL_UNDECIDED)
4315     {
4316       subsidiaries = Fget (detected, Qeol_type);
4317       if (VECTORP (subsidiaries))
4318         detected = XVECTOR (subsidiaries)->contents[coding->eol_type];
4319     }
4320
4321   /* setup_coding_system clears CODING; carry these slots across.  */
4322   saved_src_multibyte = coding->src_multibyte;
4323   saved_dst_multibyte = coding->dst_multibyte;
4324   setup_coding_system (detected, coding);
4325   coding->src_multibyte = saved_src_multibyte;
4326   coding->dst_multibyte = saved_dst_multibyte;
4327   coding->heading_ascii = heading;
4328 }
4329
4330 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4331    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4332    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4333
4334    How many non-eol characters are at the head is returned as *SKIP.  */
4335
4336 #define MAX_EOL_CHECK_COUNT 3
4337
4338 static int
4339 detect_eol_type (source, src_bytes, skip)
4340      const unsigned char *source;
4341      int src_bytes, *skip;
4342 {
4343   const unsigned char *src = source, *src_end = src + src_bytes;
4344   unsigned char c;
4345   int total = 0;                /* How many end-of-lines are found so far.  */
4346   int eol_type = CODING_EOL_UNDECIDED;
4347   int this_eol_type;
4348
4349   *skip = 0;
4350
4351   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4352     {
4353       c = *src++;
4354       if (c == '\n' || c == '\r')
4355         {
4356           if (*skip == 0)
4357             *skip = src - 1 - source;
4358           total++;
4359           if (c == '\n')
4360             this_eol_type = CODING_EOL_LF;
4361           else if (src >= src_end || *src != '\n')
4362             this_eol_type = CODING_EOL_CR;
4363           else
4364             this_eol_type = CODING_EOL_CRLF, src++;
4365
4366           if (eol_type == CODING_EOL_UNDECIDED)
4367             /* This is the first end-of-line.  */
4368             eol_type = this_eol_type;
4369           else if (eol_type != this_eol_type)
4370             {
4371               /* The found type is different from what found before.  */
4372               eol_type = CODING_EOL_INCONSISTENT;
4373               break;
4374             }
4375         }
4376     }
4377
4378   if (*skip == 0)
4379     *skip = src_end - source;
4380   return eol_type;
4381 }
4382
4383 /* Like detect_eol_type, but detect EOL type in 2-octet
4384    big-endian/little-endian format for coding systems utf-16-be and
4385    utf-16-le.  */
4386
4387 static int
4388 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4389      const unsigned char *source;
4390      int src_bytes, *skip, big_endian_p;
4391 {
4392   const unsigned char *src = source, *src_end = src + src_bytes;
4393   unsigned int c1, c2;
4394   int total = 0;                /* How many end-of-lines are found so far.  */
4395   int eol_type = CODING_EOL_UNDECIDED;
4396   int this_eol_type;
4397   int msb, lsb;
4398
4399   if (big_endian_p)
4400     msb = 0, lsb = 1;
4401   else
4402     msb = 1, lsb = 0;
4403
4404   *skip = 0;
4405
4406   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4407     {
4408       c1 = (src[msb] << 8) | (src[lsb]);
4409       src += 2;
4410
4411       if (c1 == '\n' || c1 == '\r')
4412         {
4413           if (*skip == 0)
4414             *skip = src - 2 - source;
4415           total++;
4416           if (c1 == '\n')
4417             {
4418               this_eol_type = CODING_EOL_LF;
4419             }
4420           else
4421             {
4422               if ((src + 1) >= src_end)
4423                 {
4424                   this_eol_type = CODING_EOL_CR;
4425                 }
4426               else
4427                 {
4428                   c2 = (src[msb] << 8) | (src[lsb]);
4429                   if (c2 == '\n')
4430                     this_eol_type = CODING_EOL_CRLF, src += 2;
4431                   else
4432                     this_eol_type = CODING_EOL_CR;
4433                 }
4434             }
4435
4436           if (eol_type == CODING_EOL_UNDECIDED)
4437             /* This is the first end-of-line.  */
4438             eol_type = this_eol_type;
4439           else if (eol_type != this_eol_type)
4440             {
4441               /* The found type is different from what found before.  */
4442               eol_type = CODING_EOL_INCONSISTENT;
4443               break;
4444             }
4445         }
4446     }
4447
4448   if (*skip == 0)
4449     *skip = src_end - source;
4450   return eol_type;
4451 }
4452
4453 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4454    is encoded.  If it detects an appropriate format of end-of-line, it
4455    sets the information in *CODING.  */
4456
4457 void
4458 detect_eol (coding, src, src_bytes)
4459      struct coding_system *coding;
4460      const unsigned char *src;
4461      int src_bytes;
4462 {
4463   Lisp_Object val;
4464   int skip;
4465   int eol_type;
4466
4467   switch (coding->category_idx)
4468     {
4469     case CODING_CATEGORY_IDX_UTF_16_BE:
4470       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4471       break;
4472     case CODING_CATEGORY_IDX_UTF_16_LE:
4473       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4474       break;
4475     default:
4476       eol_type = detect_eol_type (src, src_bytes, &skip);
4477       break;
4478     }
4479
4480   if (coding->heading_ascii > skip)
4481     coding->heading_ascii = skip;
4482   else
4483     skip = coding->heading_ascii;
4484
4485   if (eol_type == CODING_EOL_UNDECIDED)
4486     return;
4487   if (eol_type == CODING_EOL_INCONSISTENT)
4488     {
4489 #if 0
4490       /* This code is suppressed until we find a better way to
4491          distinguish raw text file and binary file.  */
4492
4493       /* If we have already detected that the coding is raw-text, the
4494          coding should actually be no-conversion.  */
4495       if (coding->type == coding_type_raw_text)
4496         {
4497           setup_coding_system (Qno_conversion, coding);
4498           return;
4499         }
4500       /* Else, let's decode only text code anyway.  */
4501 #endif /* 0 */
4502       eol_type = CODING_EOL_LF;
4503     }
4504
4505   val = Fget (coding->symbol, Qeol_type);
4506   if (VECTORP (val) && XVECTOR (val)->size == 3)
4507     {
4508       int src_multibyte = coding->src_multibyte;
4509       int dst_multibyte = coding->dst_multibyte;
4510       struct composition_data *cmp_data = coding->cmp_data;
4511
4512       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4513       coding->src_multibyte = src_multibyte;
4514       coding->dst_multibyte = dst_multibyte;
4515       coding->heading_ascii = skip;
4516       coding->cmp_data = cmp_data;
4517     }
4518 }
4519
/* Number of bytes added on top of the estimated size of a conversion
   buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte length of a source text is multiplied to
   estimate the size of the decoded text.  CODING is a pointer to a
   struct coding_system: 3 for ISO2022, the magnification recorded in
   the CCL decoder for CCL, and 2 for everything else.  CODING is
   parenthesized so that any pointer expression may be given, but it
   is evaluated more than once and so must be free of side effects.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4528
4529 /* Return maximum size (bytes) of a buffer enough for decoding
4530    SRC_BYTES of text encoded in CODING.  */
4531
4532 int
4533 decoding_buffer_size (coding, src_bytes)
4534      struct coding_system *coding;
4535      int src_bytes;
4536 {
4537   return (src_bytes * DECODING_BUFFER_MAG (coding)
4538           + CONVERSION_BUFFER_EXTRA_ROOM);
4539 }
4540
4541 /* Return maximum size (bytes) of a buffer enough for encoding
4542    SRC_BYTES of text to CODING.  */
4543
4544 int
4545 encoding_buffer_size (coding, src_bytes)
4546      struct coding_system *coding;
4547      int src_bytes;
4548 {
4549   int magnification;
4550
4551   if (coding->type == coding_type_ccl)
4552     {
4553       magnification = coding->spec.ccl.encoder.buf_magnification;
4554       if (coding->eol_type == CODING_EOL_CRLF)
4555         magnification *= 2;
4556     }
4557   else if (CODING_REQUIRE_ENCODING (coding))
4558     magnification = 3;
4559   else
4560     magnification = 1;
4561
4562   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4563 }
4564
/* Scratch buffer used while converting text from one coding to
   another.  */
struct conversion_buffer
{
  /* Size of DATA in bytes.  */
  int size;

  /* 1 if DATA was allocated by alloca, 0 otherwise.  */
  int on_stack;

  /* The storage itself.  */
  unsigned char *data;
};
4572
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   The memory comes from alloca when LEN is less than MAX_ALLOCA and
   from xmalloc otherwise; BUF.on_stack records which one was used.

   Both arguments are parenthesized in the expansion so that arbitrary
   expressions may be given, but each of them is evaluated more than
   once and so must be free of side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4588
4589 /* Double the allocated memory for *BUF.  */
4590 static void
4591 extend_conversion_buffer (buf)
4592      struct conversion_buffer *buf;
4593 {
4594   if (buf->on_stack)
4595     {
4596       unsigned char *save = buf->data;
4597       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4598       bcopy (save, buf->data, buf->size);
4599       buf->on_stack = 0;
4600     }
4601   else
4602     {
4603       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4604     }
4605   buf->size *= 2;
4606 }
4607
4608 /* Free the allocated memory for BUF if it is not on stack.  */
4609 static void
4610 free_conversion_buffer (buf)
4611      struct conversion_buffer *buf;
4612 {
4613   if (!buf->on_stack)
4614     xfree (buf->data);
4615 }
4616
4617 int
4618 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4619      struct coding_system *coding;
4620      unsigned char *source, *destination;
4621      int src_bytes, dst_bytes, encodep;
4622 {
4623   struct ccl_program *ccl
4624     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4625   unsigned char *dst = destination;
4626
4627   ccl->suppress_error = coding->suppress_error;
4628   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4629   if (encodep)
4630     {
4631       /* On encoding, EOL format is converted within ccl_driver.  For
4632          that, setup proper information in the structure CCL.  */
4633       ccl->eol_type = coding->eol_type;
4634       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4635         ccl->eol_type = CODING_EOL_LF;
4636       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4637       ccl->eight_bit_control = coding->dst_multibyte;
4638     }
4639   else
4640     ccl->eight_bit_control = 1;
4641   ccl->multibyte = coding->src_multibyte;
4642   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4643     {
4644       /* Move carryover bytes to DESTINATION.  */
4645       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4646       while (*p)
4647         *dst++ = *p++;
4648       coding->spec.ccl.eight_bit_carryover[0] = 0;
4649       if (dst_bytes)
4650         dst_bytes -= dst - destination;
4651     }
4652
4653   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4654                                   &(coding->consumed))
4655                       + dst - destination);
4656
4657   if (encodep)
4658     {
4659       coding->produced_char = coding->produced;
4660       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4661     }
4662   else if (!ccl->eight_bit_control)
4663     {
4664       /* The produced bytes forms a valid multibyte sequence. */
4665       coding->produced_char
4666         = multibyte_chars_in_text (destination, coding->produced);
4667       coding->spec.ccl.eight_bit_carryover[0] = 0;
4668     }
4669   else
4670     {
4671       /* On decoding, the destination should always multibyte.  But,
4672          CCL program might have been generated an invalid multibyte
4673          sequence.  Here we make such a sequence valid as
4674          multibyte.  */
4675       int bytes
4676         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4677
4678       if ((coding->consumed < src_bytes
4679            || !ccl->last_block)
4680           && coding->produced >= 1
4681           && destination[coding->produced - 1] >= 0x80)
4682         {
4683           /* We should not convert the tailing 8-bit codes to
4684              multibyte form even if they doesn't form a valid
4685              multibyte sequence.  They may form a valid sequence in
4686              the next call.  */
4687           int carryover = 0;
4688
4689           if (destination[coding->produced - 1] < 0xA0)
4690             carryover = 1;
4691           else if (coding->produced >= 2)
4692             {
4693               if (destination[coding->produced - 2] >= 0x80)
4694                 {
4695                   if (destination[coding->produced - 2] < 0xA0)
4696                     carryover = 2;
4697                   else if (coding->produced >= 3
4698                            && destination[coding->produced - 3] >= 0x80
4699                            && destination[coding->produced - 3] < 0xA0)
4700                     carryover = 3;
4701                 }
4702             }
4703           if (carryover > 0)
4704             {
4705               BCOPY_SHORT (destination + coding->produced - carryover,
4706                            coding->spec.ccl.eight_bit_carryover,
4707                            carryover);
4708               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4709               coding->produced -= carryover;
4710             }
4711         }
4712       coding->produced = str_as_multibyte (destination, bytes,
4713                                            coding->produced,
4714                                            &(coding->produced_char));
4715     }
4716
4717   switch (ccl->status)
4718     {
4719     case CCL_STAT_SUSPEND_BY_SRC:
4720       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4721       break;
4722     case CCL_STAT_SUSPEND_BY_DST:
4723       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4724       break;
4725     case CCL_STAT_QUIT:
4726     case CCL_STAT_INVALID_CMD:
4727       coding->result = CODING_FINISH_INTERRUPT;
4728       break;
4729     default:
4730       coding->result = CODING_FINISH_NORMAL;
4731       break;
4732     }
4733   return coding->result;
4734 }
4735
4736 /* Decode EOL format of the text at PTR of BYTES length destructively
4737    according to CODING->eol_type.  This is called after the CCL
4738    program produced a decoded text at PTR.  If we do CRLF->LF
4739    conversion, update CODING->produced and CODING->produced_char.  */
4740
4741 static void
4742 decode_eol_post_ccl (coding, ptr, bytes)
4743      struct coding_system *coding;
4744      unsigned char *ptr;
4745      int bytes;
4746 {
4747   Lisp_Object val, saved_coding_symbol;
4748   unsigned char *pend = ptr + bytes;
4749   int dummy;
4750
4751   /* Remember the current coding system symbol.  We set it back when
4752      an inconsistent EOL is found so that `last-coding-system-used' is
4753      set to the coding system that doesn't specify EOL conversion.  */
4754   saved_coding_symbol = coding->symbol;
4755
4756   coding->spec.ccl.cr_carryover = 0;
4757   if (coding->eol_type == CODING_EOL_UNDECIDED)
4758     {
4759       /* Here, to avoid the call of setup_coding_system, we directly
4760          call detect_eol_type.  */
4761       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4762       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4763         coding->eol_type = CODING_EOL_LF;
4764       if (coding->eol_type != CODING_EOL_UNDECIDED)
4765         {
4766           val = Fget (coding->symbol, Qeol_type);
4767           if (VECTORP (val) && XVECTOR (val)->size == 3)
4768             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4769         }
4770       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4771     }
4772
4773   if (coding->eol_type == CODING_EOL_LF
4774       || coding->eol_type == CODING_EOL_UNDECIDED)
4775     {
4776       /* We have nothing to do.  */
4777       ptr = pend;
4778     }
4779   else if (coding->eol_type == CODING_EOL_CRLF)
4780     {
4781       unsigned char *pstart = ptr, *p = ptr;
4782
4783       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4784           && *(pend - 1) == '\r')
4785         {
4786           /* If the last character is CR, we can't handle it here
4787              because LF will be in the not-yet-decoded source text.
4788              Record that the CR is not yet processed.  */
4789           coding->spec.ccl.cr_carryover = 1;
4790           coding->produced--;
4791           coding->produced_char--;
4792           pend--;
4793         }
4794       while (ptr < pend)
4795         {
4796           if (*ptr == '\r')
4797             {
4798               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4799                 {
4800                   *p++ = '\n';
4801                   ptr += 2;
4802                 }
4803               else
4804                 {
4805                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4806                     goto undo_eol_conversion;
4807                   *p++ = *ptr++;
4808                 }
4809             }
4810           else if (*ptr == '\n'
4811                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4812             goto undo_eol_conversion;
4813           else
4814             *p++ = *ptr++;
4815           continue;
4816
4817         undo_eol_conversion:
4818           /* We have faced with inconsistent EOL format at PTR.
4819              Convert all LFs before PTR back to CRLFs.  */
4820           for (p--, ptr--; p >= pstart; p--)
4821             {
4822               if (*p == '\n')
4823                 *ptr-- = '\n', *ptr-- = '\r';
4824               else
4825                 *ptr-- = *p;
4826             }
4827           /*  If carryover is recorded, cancel it because we don't
4828               convert CRLF anymore.  */
4829           if (coding->spec.ccl.cr_carryover)
4830             {
4831               coding->spec.ccl.cr_carryover = 0;
4832               coding->produced++;
4833               coding->produced_char++;
4834               pend++;
4835             }
4836           p = ptr = pend;
4837           coding->eol_type = CODING_EOL_LF;
4838           coding->symbol = saved_coding_symbol;
4839         }
4840       if (p < pend)
4841         {
4842           /* As each two-byte sequence CRLF was converted to LF, (PEND
4843              - P) is the number of deleted characters.  */
4844           coding->produced -= pend - p;
4845           coding->produced_char -= pend - p;
4846         }
4847     }
4848   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4849     {
4850       unsigned char *p = ptr;
4851
4852       for (; ptr < pend; ptr++)
4853         {
4854           if (*ptr == '\r')
4855             *ptr = '\n';
4856           else if (*ptr == '\n'
4857                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4858             {
4859               for (; p < ptr; p++)
4860                 {
4861                   if (*p == '\n')
4862                     *p = '\r';
4863                 }
4864               ptr = pend;
4865               coding->eol_type = CODING_EOL_LF;
4866               coding->symbol = saved_coding_symbol;
4867             }
4868         }
4869     }
4870 }
4871
4872 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4873    decoding, it may detect coding system and format of end-of-line if
4874    those are not yet decided.  The source should be unibyte, the
4875    result is multibyte if CODING->dst_multibyte is nonzero, else
4876    unibyte.  */
4877
4878 int
4879 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4880      struct coding_system *coding;
4881      const unsigned char *source;
4882      unsigned char *destination;
4883      int src_bytes, dst_bytes;
4884 {
4885   int extra = 0;
4886
4887   if (coding->type == coding_type_undecided)
4888     detect_coding (coding, source, src_bytes);
4889
4890   if (coding->eol_type == CODING_EOL_UNDECIDED
4891       && coding->type != coding_type_ccl)
4892     {
4893       detect_eol (coding, source, src_bytes);
4894       /* We had better recover the original eol format if we
4895          encounter an inconsistent eol format while decoding.  */
4896       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4897     }
4898
4899   coding->produced = coding->produced_char = 0;
4900   coding->consumed = coding->consumed_char = 0;
4901   coding->errors = 0;
4902   coding->result = CODING_FINISH_NORMAL;
4903
4904   switch (coding->type)
4905     {
4906     case coding_type_sjis:
4907       decode_coding_sjis_big5 (coding, source, destination,
4908                                src_bytes, dst_bytes, 1);
4909       break;
4910
4911     case coding_type_iso2022:
4912       decode_coding_iso2022 (coding, source, destination,
4913                              src_bytes, dst_bytes);
4914       break;
4915
4916     case coding_type_big5:
4917       decode_coding_sjis_big5 (coding, source, destination,
4918                                src_bytes, dst_bytes, 0);
4919       break;
4920
4921     case coding_type_emacs_mule:
4922       decode_coding_emacs_mule (coding, source, destination,
4923                                 src_bytes, dst_bytes);
4924       break;
4925
4926     case coding_type_ccl:
4927       if (coding->spec.ccl.cr_carryover)
4928         {
4929           /* Put the CR which was not processed by the previous call
4930              of decode_eol_post_ccl in DESTINATION.  It will be
4931              decoded together with the following LF by the call to
4932              decode_eol_post_ccl below.  */
4933           *destination = '\r';
4934           coding->produced++;
4935           coding->produced_char++;
4936           dst_bytes--;
4937           extra = coding->spec.ccl.cr_carryover;
4938         }
4939       ccl_coding_driver (coding, source, destination + extra,
4940                          src_bytes, dst_bytes, 0);
4941       if (coding->eol_type != CODING_EOL_LF)
4942         {
4943           coding->produced += extra;
4944           coding->produced_char += extra;
4945           decode_eol_post_ccl (coding, destination, coding->produced);
4946         }
4947       break;
4948
4949     default:
4950       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4951     }
4952
4953   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4954       && coding->mode & CODING_MODE_LAST_BLOCK
4955       && coding->consumed == src_bytes)
4956     coding->result = CODING_FINISH_NORMAL;
4957
4958   if (coding->mode & CODING_MODE_LAST_BLOCK
4959       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4960     {
4961       const unsigned char *src = source + coding->consumed;
4962       unsigned char *dst = destination + coding->produced;
4963
4964       src_bytes -= coding->consumed;
4965       coding->errors++;
4966       if (COMPOSING_P (coding))
4967         DECODE_COMPOSITION_END ('1');
4968       while (src_bytes--)
4969         {
4970           int c = *src++;
4971           dst += CHAR_STRING (c, dst);
4972           coding->produced_char++;
4973         }
4974       coding->consumed = coding->consumed_char = src - source;
4975       coding->produced = dst - destination;
4976       coding->result = CODING_FINISH_NORMAL;
4977     }
4978
4979   if (!coding->dst_multibyte)
4980     {
4981       coding->produced = str_as_unibyte (destination, coding->produced);
4982       coding->produced_char = coding->produced;
4983     }
4984
4985   return coding->result;
4986 }
4987
4988 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4989    multibyteness of the source is CODING->src_multibyte, the
4990    multibyteness of the result is always unibyte.  */
4991
4992 int
4993 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4994      struct coding_system *coding;
4995      const unsigned char *source;
4996      unsigned char *destination;
4997      int src_bytes, dst_bytes;
4998 {
4999   coding->produced = coding->produced_char = 0;
5000   coding->consumed = coding->consumed_char = 0;
5001   coding->errors = 0;
5002   coding->result = CODING_FINISH_NORMAL;
5003   if (coding->eol_type == CODING_EOL_UNDECIDED)
5004     coding->eol_type = CODING_EOL_LF;
5005
5006   switch (coding->type)
5007     {
5008     case coding_type_sjis:
5009       encode_coding_sjis_big5 (coding, source, destination,
5010                                src_bytes, dst_bytes, 1);
5011       break;
5012
5013     case coding_type_iso2022:
5014       encode_coding_iso2022 (coding, source, destination,
5015                              src_bytes, dst_bytes);
5016       break;
5017
5018     case coding_type_big5:
5019       encode_coding_sjis_big5 (coding, source, destination,
5020                                src_bytes, dst_bytes, 0);
5021       break;
5022
5023     case coding_type_emacs_mule:
5024       encode_coding_emacs_mule (coding, source, destination,
5025                                 src_bytes, dst_bytes);
5026       break;
5027
5028     case coding_type_ccl:
5029       ccl_coding_driver (coding, source, destination,
5030                          src_bytes, dst_bytes, 1);
5031       break;
5032
5033     default:
5034       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5035     }
5036
5037   if (coding->mode & CODING_MODE_LAST_BLOCK
5038       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5039     {
5040       const unsigned char *src = source + coding->consumed;
5041       unsigned char *dst = destination + coding->produced;
5042
5043       if (coding->type == coding_type_iso2022)
5044         ENCODE_RESET_PLANE_AND_REGISTER;
5045       if (COMPOSING_P (coding))
5046         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5047       if (coding->consumed < src_bytes)
5048         {
5049           int len = src_bytes - coding->consumed;
5050
5051           BCOPY_SHORT (src, dst, len);
5052           if (coding->src_multibyte)
5053             len = str_as_unibyte (dst, len);
5054           dst += len;
5055           coding->consumed = src_bytes;
5056         }
5057       coding->produced = coding->produced_char = dst - destination;
5058       coding->result = CODING_FINISH_NORMAL;
5059     }
5060
5061   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5062       && coding->consumed == src_bytes)
5063     coding->result = CODING_FINISH_NORMAL;
5064
5065   return coding->result;
5066 }
5067
5068 /* Scan text in the region between *BEG and *END (byte positions),
5069    skip characters which we don't have to decode by coding system
5070    CODING at the head and tail, then set *BEG and *END to the region
5071    of the text we actually have to convert.  The caller should move
5072    the gap out of the region in advance if the region is from a
5073    buffer.
5074
5075    If STR is not NULL, *BEG and *END are indices into STR.  */
5076
5077 static void
5078 shrink_decoding_region (beg, end, coding, str)
5079      int *beg, *end;
5080      struct coding_system *coding;
5081      unsigned char *str;
5082 {
5083   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5084   int eol_conversion;
5085   Lisp_Object translation_table;
5086
5087   if (coding->type == coding_type_ccl
5088       || coding->type == coding_type_undecided
5089       || coding->eol_type != CODING_EOL_LF
5090       || !NILP (coding->post_read_conversion)
5091       || coding->composing != COMPOSITION_DISABLED)
5092     {
5093       /* We can't skip any data.  */
5094       return;
5095     }
5096   if (coding->type == coding_type_no_conversion
5097       || coding->type == coding_type_raw_text
5098       || coding->type == coding_type_emacs_mule)
5099     {
5100       /* We need no conversion, but don't have to skip any data here.
5101          Decoding routine handles them effectively anyway.  */
5102       return;
5103     }
5104
5105   translation_table = coding->translation_table_for_decode;
5106   if (NILP (translation_table) && !NILP (Venable_character_translation))
5107     translation_table = Vstandard_translation_table_for_decode;
5108   if (CHAR_TABLE_P (translation_table))
5109     {
5110       int i;
5111       for (i = 0; i < 128; i++)
5112         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5113           break;
5114       if (i < 128)
5115         /* Some ASCII character should be translated.  We give up
5116            shrinking.  */
5117         return;
5118     }
5119
5120   if (coding->heading_ascii >= 0)
5121     /* Detection routine has already found how much we can skip at the
5122        head.  */
5123     *beg += coding->heading_ascii;
5124
5125   if (str)
5126     {
5127       begp_orig = begp = str + *beg;
5128       endp_orig = endp = str + *end;
5129     }
5130   else
5131     {
5132       begp_orig = begp = BYTE_POS_ADDR (*beg);
5133       endp_orig = endp = begp + *end - *beg;
5134     }
5135
5136   eol_conversion = (coding->eol_type == CODING_EOL_CR
5137                     || coding->eol_type == CODING_EOL_CRLF);
5138
5139   switch (coding->type)
5140     {
5141     case coding_type_sjis:
5142     case coding_type_big5:
5143       /* We can skip all ASCII characters at the head.  */
5144       if (coding->heading_ascii < 0)
5145         {
5146           if (eol_conversion)
5147             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5148           else
5149             while (begp < endp && *begp < 0x80) begp++;
5150         }
5151       /* We can skip all ASCII characters at the tail except for the
5152          second byte of SJIS or BIG5 code.  */
5153       if (eol_conversion)
5154         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5155       else
5156         while (begp < endp && endp[-1] < 0x80) endp--;
5157       /* Do not consider LF as ascii if preceded by CR, since that
5158          confuses eol decoding. */
5159       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5160         endp++;
5161       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5162         endp++;
5163       break;
5164
5165     case coding_type_iso2022:
5166       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5167         /* We can't skip any data.  */
5168         break;
5169       if (coding->heading_ascii < 0)
5170         {
5171           /* We can skip all ASCII characters at the head except for a
5172              few control codes.  */
5173           while (begp < endp && (c = *begp) < 0x80
5174                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5175                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5176                  && (!eol_conversion || c != ISO_CODE_LF))
5177             begp++;
5178         }
5179       switch (coding->category_idx)
5180         {
5181         case CODING_CATEGORY_IDX_ISO_8_1:
5182         case CODING_CATEGORY_IDX_ISO_8_2:
5183           /* We can skip all ASCII characters at the tail.  */
5184           if (eol_conversion)
5185             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5186           else
5187             while (begp < endp && endp[-1] < 0x80) endp--;
5188           /* Do not consider LF as ascii if preceded by CR, since that
5189              confuses eol decoding. */
5190           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5191             endp++;
5192           break;
5193
5194         case CODING_CATEGORY_IDX_ISO_7:
5195         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5196           {
5197             /* We can skip all characters at the tail except for 8-bit
5198                codes and ESC and the following 2-byte at the tail.  */
5199             unsigned char *eight_bit = NULL;
5200
5201             if (eol_conversion)
5202               while (begp < endp
5203                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5204                 {
5205                   if (!eight_bit && c & 0x80) eight_bit = endp;
5206                   endp--;
5207                 }
5208             else
5209               while (begp < endp
5210                      && (c = endp[-1]) != ISO_CODE_ESC)
5211                 {
5212                   if (!eight_bit && c & 0x80) eight_bit = endp;
5213                   endp--;
5214                 }
5215             /* Do not consider LF as ascii if preceded by CR, since that
5216                confuses eol decoding. */
5217             if (begp < endp && endp < endp_orig
5218                 && endp[-1] == '\r' && endp[0] == '\n')
5219               endp++;
5220             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5221               {
5222                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5223                   /* This is an ASCII designation sequence.  We can
5224                      surely skip the tail.  But, if we have
5225                      encountered an 8-bit code, skip only the codes
5226                      after that.  */
5227                   endp = eight_bit ? eight_bit : endp + 2;
5228                 else
5229                   /* Hmmm, we can't skip the tail.  */
5230                   endp = endp_orig;
5231               }
5232             else if (eight_bit)
5233               endp = eight_bit;
5234           }
5235         }
5236       break;
5237
5238     default:
5239       abort ();
5240     }
5241   *beg += begp - begp_orig;
5242   *end += endp - endp_orig;
5243   return;
5244 }
5245
5246 /* Like shrink_decoding_region but for encoding.  */
5247
5248 static void
5249 shrink_encoding_region (beg, end, coding, str)
5250      int *beg, *end;
5251      struct coding_system *coding;
5252      unsigned char *str;
5253 {
5254   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5255   int eol_conversion;
5256   Lisp_Object translation_table;
5257
5258   if (coding->type == coding_type_ccl
5259       || coding->eol_type == CODING_EOL_CRLF
5260       || coding->eol_type == CODING_EOL_CR
5261       || (coding->cmp_data && coding->cmp_data->used > 0))
5262     {
5263       /* We can't skip any data.  */
5264       return;
5265     }
5266   if (coding->type == coding_type_no_conversion
5267       || coding->type == coding_type_raw_text
5268       || coding->type == coding_type_emacs_mule
5269       || coding->type == coding_type_undecided)
5270     {
5271       /* We need no conversion, but don't have to skip any data here.
5272          Encoding routine handles them effectively anyway.  */
5273       return;
5274     }
5275
5276   translation_table = coding->translation_table_for_encode;
5277   if (NILP (translation_table) && !NILP (Venable_character_translation))
5278     translation_table = Vstandard_translation_table_for_encode;
5279   if (CHAR_TABLE_P (translation_table))
5280     {
5281       int i;
5282       for (i = 0; i < 128; i++)
5283         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5284           break;
5285       if (i < 128)
5286         /* Some ASCII character should be translated.  We give up
5287            shrinking.  */
5288         return;
5289     }
5290
5291   if (str)
5292     {
5293       begp_orig = begp = str + *beg;
5294       endp_orig = endp = str + *end;
5295     }
5296   else
5297     {
5298       begp_orig = begp = BYTE_POS_ADDR (*beg);
5299       endp_orig = endp = begp + *end - *beg;
5300     }
5301
5302   eol_conversion = (coding->eol_type == CODING_EOL_CR
5303                     || coding->eol_type == CODING_EOL_CRLF);
5304
5305   /* Here, we don't have to check coding->pre_write_conversion because
5306      the caller is expected to have handled it already.  */
5307   switch (coding->type)
5308     {
5309     case coding_type_iso2022:
5310       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5311         /* We can't skip any data.  */
5312         break;
5313       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5314         {
5315           unsigned char *bol = begp;
5316           while (begp < endp && *begp < 0x80)
5317             {
5318               begp++;
5319               if (begp[-1] == '\n')
5320                 bol = begp;
5321             }
5322           begp = bol;
5323           goto label_skip_tail;
5324         }
5325       /* fall down ... */
5326
5327     case coding_type_sjis:
5328     case coding_type_big5:
5329       /* We can skip all ASCII characters at the head and tail.  */
5330       if (eol_conversion)
5331         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5332       else
5333         while (begp < endp && *begp < 0x80) begp++;
5334     label_skip_tail:
5335       if (eol_conversion)
5336         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5337       else
5338         while (begp < endp && *(endp - 1) < 0x80) endp--;
5339       break;
5340
5341     default:
5342       abort ();
5343     }
5344
5345   *beg += begp - begp_orig;
5346   *end += endp - endp_orig;
5347   return;
5348 }
5349
/* Shrinking the conversion region has an overhead of its own, so it
   is attempted only when the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region *BEG..*END is longer than
   shrink_conversion_region_threshhold, shrink it with
   shrink_encoding_region (when ENCODEP is nonzero) or with
   shrink_decoding_region (when ENCODEP is zero).  BEG, END, CODING
   and STR are handed to those functions unchanged.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5363
5364 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5365    Vlast_coding_system_used and the remaining elements are buffers to
5366    kill.  */
5367 static Lisp_Object
5368 code_convert_region_unwind (arg)
5369      Lisp_Object arg;
5370 {
5371   struct gcpro gcpro1;
5372   GCPRO1 (arg);
5373
5374   inhibit_pre_post_conversion = 0;
5375   Vlast_coding_system_used = XCAR (arg);
5376   for (arg = XCDR (arg); CONSP (arg); arg = XCDR (arg))
5377     Fkill_buffer (XCAR (arg));
5378
5379   UNGCPRO;
5380   return Qnil;
5381 }
5382
5383 /* Store information about all compositions in the range FROM and TO
5384    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5385    buffer or a string, defaults to the current buffer.  */
5386
5387 void
5388 coding_save_composition (coding, from, to, obj)
5389      struct coding_system *coding;
5390      int from, to;
5391      Lisp_Object obj;
5392 {
5393   Lisp_Object prop;
5394   int start, end;
5395
5396   if (coding->composing == COMPOSITION_DISABLED)
5397     return;
5398   if (!coding->cmp_data)
5399     coding_allocate_composition_data (coding, from);
5400   if (!find_composition (from, to, &start, &end, &prop, obj)
5401       || end > to)
5402     return;
5403   if (start < from
5404       && (!find_composition (end, to, &start, &end, &prop, obj)
5405           || end > to))
5406     return;
5407   coding->composing = COMPOSITION_NO;
5408   do
5409     {
5410       if (COMPOSITION_VALID_P (start, end, prop))
5411         {
5412           enum composition_method method = COMPOSITION_METHOD (prop);
5413           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5414               >= COMPOSITION_DATA_SIZE)
5415             coding_allocate_composition_data (coding, from);
5416           /* For relative composition, we remember start and end
5417              positions, for the other compositions, we also remember
5418              components.  */
5419           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5420           if (method != COMPOSITION_RELATIVE)
5421             {
5422               /* We must store a*/
5423               Lisp_Object val, ch;
5424
5425               val = COMPOSITION_COMPONENTS (prop);
5426               if (CONSP (val))
5427                 while (CONSP (val))
5428                   {
5429                     ch = XCAR (val), val = XCDR (val);
5430                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5431                   }
5432               else if (VECTORP (val) || STRINGP (val))
5433                 {
5434                   int len = (VECTORP (val)
5435                              ? XVECTOR (val)->size : SCHARS (val));
5436                   int i;
5437                   for (i = 0; i < len; i++)
5438                     {
5439                       ch = (STRINGP (val)
5440                             ? Faref (val, make_number (i))
5441                             : XVECTOR (val)->contents[i]);
5442                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5443                     }
5444                 }
5445               else              /* INTEGERP (val) */
5446                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5447             }
5448           CODING_ADD_COMPOSITION_END (coding, end - from);
5449         }
5450       start = end;
5451     }
5452   while (start < to
5453          && find_composition (start, to, &start, &end, &prop, obj)
5454          && end <= to);
5455
5456   /* Make coding->cmp_data point to the first memory block.  */
5457   while (coding->cmp_data->prev)
5458     coding->cmp_data = coding->cmp_data->prev;
5459   coding->cmp_data_start = 0;
5460 }
5461
5462 /* Reflect the saved information about compositions to OBJ.
5463    CODING->cmp_data points to a memory block for the information.  OBJ
5464    is a buffer or a string, defaults to the current buffer.  */
5465
5466 void
5467 coding_restore_composition (coding, obj)
5468      struct coding_system *coding;
5469      Lisp_Object obj;
5470 {
5471   struct composition_data *cmp_data = coding->cmp_data;
5472
5473   if (!cmp_data)
5474     return;
5475
5476   while (cmp_data->prev)
5477     cmp_data = cmp_data->prev;
5478
5479   while (cmp_data)
5480     {
5481       int i;
5482
5483       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5484            i += cmp_data->data[i])
5485         {
5486           int *data = cmp_data->data + i;
5487           enum composition_method method = (enum composition_method) data[3];
5488           Lisp_Object components;
5489
5490           if (data[0] < 0 || i + data[0] > cmp_data->used)
5491             /* Invalid composition data.  */
5492             break;
5493
5494           if (method == COMPOSITION_RELATIVE)
5495             components = Qnil;
5496           else
5497             {
5498               int len = data[0] - 4, j;
5499               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5500
5501               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5502                   && len % 2 == 0)
5503                 len --;
5504               if (len < 1)
5505                 /* Invalid composition data.  */
5506                 break;
5507               for (j = 0; j < len; j++)
5508                 args[j] = make_number (data[4 + j]);
5509               components = (method == COMPOSITION_WITH_ALTCHARS
5510                             ? Fstring (len, args)
5511                             : Fvector (len, args));
5512             }
5513           compose_text (data[1], data[2], components, Qnil, obj);
5514         }
5515       cmp_data = cmp_data->next;
5516     }
5517 }
5518
5519 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5520    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5521    coding system CODING, and return the status code of code conversion
5522    (currently, this value has no meaning).
5523
5524    How many characters (and bytes) are converted to how many
5525    characters (and bytes) are recorded in members of the structure
5526    CODING.
5527
5528    If REPLACE is nonzero, we do various things as if the original text
5529    is deleted and a new text is inserted.  See the comments in
5530    replace_range (insdel.c) to know what we are doing.
5531
5532    If REPLACE is zero, it is assumed that the source text is unibyte.
5533    Otherwise, it is assumed that the source text is multibyte.

   Every return statement in this function returns 0.  On the normal
   exit path, CODING->consumed, CODING->consumed_char,
   CODING->produced and CODING->produced_char are set from the final
   region and the amount of inserted text.  The early exit taken when
   shrinking leaves nothing to convert sets only CODING->produced and
   CODING->produced_char.  */
5534
5535 int
5536 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5537      int from, from_byte, to, to_byte, encodep, replace;
5538      struct coding_system *coding;
5539 {
5540   int len = to - from, len_byte = to_byte - from_byte;
5541   int nchars_del = 0, nbytes_del = 0;
5542   int require, inserted, inserted_byte;
  /* HEAD_SKIP and TAIL_SKIP are assigned only in the shrinking step
     below, which is also the only place that can make TOTAL_SKIP
     nonzero; they are read later only under `total_skip > 0'.  */
5543   int head_skip, tail_skip, total_skip = 0;
5544   Lisp_Object saved_coding_symbol;
5545   int first = 1;
5546   unsigned char *src, *dst;
5547   Lisp_Object deletion;
5548   int orig_point = PT, orig_len = len;
5549   int prev_Z;
5550   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5551
5552   deletion = Qnil;
5553   saved_coding_symbol = coding->symbol;
5554
  /* If point is strictly inside the region, move it to FROM and
     remember FROM as the position to restore at the end.  */
5555   if (from < PT && PT < to)
5556     {
5557       TEMP_SET_PT_BOTH (from, from_byte);
5558       orig_point = from;
5559     }
5560
5561   if (replace)
5562     {
5563       int saved_from = from;
5564       int saved_inhibit_modification_hooks;
5565
      /* prepare_to_modify_buffer is given the address of FROM; if
         FROM comes back changed, recompute TO and the byte positions
         from it, keeping the character length LEN.  */
5566       prepare_to_modify_buffer (from, to, &from);
5567       if (saved_from != from)
5568         {
5569           to = from + len;
5570           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5571           len_byte = to_byte - from_byte;
5572         }
5573
5574       /* The code conversion routine can not preserve text properties
5575          for now.  So, we must remove all text properties in the
5576          region.  Here, we must suppress all modification hooks.  */
5577       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5578       inhibit_modification_hooks = 1;
5579       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5580       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5581     }
5582
5583   coding->heading_ascii = 0;
5584
5585   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5586     {
5587       /* We must detect encoding of text and eol format.  */
5588
      /* The detectors below are handed a plain address and a length,
         so make sure the gap does not lie inside the region.  */
5589       if (from < GPT && to > GPT)
5590         move_gap_both (from, from_byte);
5591       if (coding->type == coding_type_undecided)
5592         {
5593           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5594           if (coding->type == coding_type_undecided)
5595             {
5596               /* It seems that the text contains only ASCII, but we
5597                  should not leave it undecided because the deeper
5598                  decoding routine (decode_coding) tries to detect the
5599                  encodings again in vain.  */
5600               coding->type = coding_type_emacs_mule;
5601               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5602               /* As emacs-mule decoder will handle composition, we
5603                  need this setting to allocate coding->cmp_data
5604                  later.  */
5605               coding->composing = COMPOSITION_NO;
5606             }
5607         }
5608       if (coding->eol_type == CODING_EOL_UNDECIDED
5609           && coding->type != coding_type_ccl)
5610         {
5611           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5612           if (coding->eol_type == CODING_EOL_UNDECIDED)
5613             coding->eol_type = CODING_EOL_LF;
5614           /* We had better recover the original eol format if we
5615              encounter an inconsistent eol format while decoding.  */
5616           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5617         }
5618     }
5619
5620   /* Now we convert the text.  */
5621
5622   /* For encoding, we must process pre-write-conversion in advance.  */
5623   if (! inhibit_pre_post_conversion
5624       && encodep
5625       && SYMBOLP (coding->pre_write_conversion)
5626       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5627     {
5628       /* The function in pre-write-conversion may put a new text in a
5629          new buffer.  */
5630       struct buffer *prev = current_buffer;
5631       Lisp_Object new;
5632
5633       record_unwind_protect (code_convert_region_unwind,
5634                              Fcons (Vlast_coding_system_used, Qnil));
5635       /* We should not call any more pre-write/post-read-conversion
5636          functions while this pre-write-conversion is running.  */
5637       inhibit_pre_post_conversion = 1;
5638       call2 (coding->pre_write_conversion,
5639              make_number (from), make_number (to));
5640       inhibit_pre_post_conversion = 0;
5641       /* Discard the unwind protect.  */
5642       specpdl_ptr--;
5643
5644       if (current_buffer != prev)
5645         {
          /* The function left us in another buffer: replace the
             region of the previous buffer with the accessible text
             (BEGV..ZV) of that buffer, kill it, and recompute the
             region bounds and the saved point.  */
5646           len = ZV - BEGV;
5647           new = Fcurrent_buffer ();
5648           set_buffer_internal_1 (prev);
5649           del_range_2 (from, from_byte, to, to_byte, 0);
5650           TEMP_SET_PT_BOTH (from, from_byte);
5651           insert_from_buffer (XBUFFER (new), 1, len, 0);
5652           Fkill_buffer (new);
5653           if (orig_point >= to)
5654             orig_point += len - orig_len;
5655           else if (orig_point > from)
5656             orig_point = from;
5657           orig_len = len;
5658           to = from + len;
5659           from_byte = CHAR_TO_BYTE (from);
5660           to_byte = CHAR_TO_BYTE (to);
5661           len_byte = to_byte - from_byte;
5662           TEMP_SET_PT_BOTH (from, from_byte);
5663         }
5664     }
5665
  /* Remember what is being replaced for the adjust_after_replace or
     adjust_after_replace_noundo call made after the conversion: the
     text itself when undo is enabled, otherwise only its size in
     characters and bytes.  */
5666   if (replace)
5667     {
5668       if (! EQ (current_buffer->undo_list, Qt))
5669         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5670       else
5671         {
5672           nchars_del = to - from;
5673           nbytes_del = to_byte - from_byte;
5674         }
5675     }
5676
5677   if (coding->composing != COMPOSITION_DISABLED)
5678     {
5679       if (encodep)
5680         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5681       else
5682         coding_allocate_composition_data (coding, from);
5683     }
5684
5685   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5686      if we must run CCL program or there are compositions to
5687      encode.  */
5688   if (coding->type != coding_type_ccl
5689       && (! coding->cmp_data || coding->cmp_data->used == 0))
5690     {
5691       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5692
5693       if (from < GPT && GPT < to)
5694         move_gap_both (from, from_byte);
5695       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5696       if (from_byte == to_byte
5697           && (encodep || NILP (coding->post_read_conversion))
5698           && ! CODING_REQUIRE_FLUSHING (coding))
5699         {
          /* Shrinking left nothing to convert, and there is neither a
             post-read-conversion to run nor anything to flush: report
             the text as produced unchanged and finish here.  */
5700           coding->produced = len_byte;
5701           coding->produced_char = len;
5702           if (!replace)
5703             /* We must record and adjust for this new text now.  */
5704             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5705           coding_free_composition_data (coding);
5706           return 0;
5707         }
5708
      /* The same byte counts are applied to the character positions
         FROM and TO and to LEN.  NOTE(review): this relies on every
         skipped byte being one character -- confirm against the
         shrink functions.  */
5709       head_skip = from_byte - from_byte_orig;
5710       tail_skip = to_byte_orig - to_byte;
5711       total_skip = head_skip + tail_skip;
5712       from += head_skip;
5713       to -= tail_skip;
5714       len -= total_skip; len_byte -= total_skip;
5715     }
5716
5717   /* For conversion, we must put the gap before the text in addition to
5718      making the gap larger for efficient decoding.  The required gap
5719      size starts from 2000 which is the magic number used in make_gap.
5720      But, after one batch of conversion, it will be incremented if we
5721      find that it is not enough.  */
5722   require = 2000;
5723
5724   if (GAP_SIZE  < require)
5725     make_gap (require - GAP_SIZE);
5726   move_gap_both (from, from_byte);
5727
5728   inserted = inserted_byte = 0;
5729
  /* Take the source text out of the buffer accounting: its LEN_BYTE
     bytes are now counted as the end of the gap (see the diagrams in
     the loop below), and the buffer sizes shrink accordingly.  */
5730   GAP_SIZE += len_byte;
5731   ZV -= len;
5732   Z -= len;
5733   ZV_BYTE -= len_byte;
5734   Z_BYTE -= len_byte;
5735
5736   if (GPT - BEG < BEG_UNCHANGED)
5737     BEG_UNCHANGED = GPT - BEG;
5738   if (Z - GPT < END_UNCHANGED)
5739     END_UNCHANGED = Z - GPT;
5740
5741   if (!encodep && coding->src_multibyte)
5742     {
5743       /* Decoding routines expects that the source text is unibyte.
5744          We must convert 8-bit characters of multibyte form to
5745          unibyte.  */
5746       int len_byte_orig = len_byte;
5747       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
      /* If the text became shorter, slide it so that it still ends
         at GAP_END_ADDR.  */
5748       if (len_byte < len_byte_orig)
5749         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5750                     len_byte);
5751       coding->src_multibyte = 0;
5752     }
5753
  /* Convert in batches.  Each iteration converts as much as it can
     from SRC into DST; the code after the call decides, from RESULT,
     whether to stop, to retry, or to enlarge the gap and go on.  */
5754   for (;;)
5755     {
5756       int result;
5757
5758       /* The buffer memory is now:
5759          +--------+converted-text+---------+-------original-text-------+---+
5760          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5761                   |<---------------------- GAP ----------------------->|  */
5762       src = GAP_END_ADDR - len_byte;
5763       dst = GPT_ADDR + inserted_byte;
5764
5765       if (encodep)
5766         result = encode_coding (coding, src, dst, len_byte, 0);
5767       else
5768         {
5769           if (coding->composing != COMPOSITION_DISABLED)
5770             coding->cmp_data->char_offset = from + inserted;
5771           result = decode_coding (coding, src, dst, len_byte, 0);
5772         }
5773
5774       /* The buffer memory is now:
5775          +--------+-------converted-text----+--+------original-text----+---+
5776          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5777                   |<---------------------- GAP ----------------------->|  */
5778
5779       inserted += coding->produced_char;
5780       inserted_byte += coding->produced;
5781       len_byte -= coding->consumed;
5782
5783       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5784         {
          /* Allocate more composition data and convert the rest.  */
5785           coding_allocate_composition_data (coding, from + inserted);
5786           continue;
5787         }
5788
5789       src += coding->consumed;
5790       dst += coding->produced;
5791
5792       if (result == CODING_FINISH_NORMAL)
5793         {
5794           src += len_byte;
5795           break;
5796         }
5797       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5798         {
5799           unsigned char *pend = dst, *p = pend - inserted_byte;
5800           Lisp_Object eol_type;
5801
5802           /* Encode LFs back to the original eol format (CR or CRLF).  */
5803           if (coding->eol_type == CODING_EOL_CR)
5804             {
5805               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5806             }
5807           else
5808             {
5809               int count = 0;
5810
              /* COUNT is the number of LFs in the text converted so
                 far, i.e. the number of CR bytes to insert.  */
5811               while (p < pend) if (*p++ == '\n') count++;
5812               if (src - dst < count)
5813                 {
5814                   /* We don't have sufficient room for encoding LFs
5815                      back to CRLF.  We must record converted and
5816                      not-yet-converted text back to the buffer
5817                      content, enlarge the gap, then record them out of
5818                      the buffer contents again.  */
5819                   int add = len_byte + inserted_byte;
5820
5821                   GAP_SIZE -= add;
5822                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5823                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5824                   make_gap (count - GAP_SIZE);
5825                   GAP_SIZE += add;
5826                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5827                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5828                   /* Don't forget to update SRC, DST, and PEND.  */
5829                   src = GAP_END_ADDR - len_byte;
5830                   dst = GPT_ADDR + inserted_byte;
5831                   pend = dst;
5832                 }
5833               inserted += count;
5834               inserted_byte += count;
5835               coding->produced += count;
              /* Copy the converted text backward to its new place,
                 COUNT bytes further on, putting a CR in front of
                 every LF; stop once all COUNT CRs are placed.  */
5836               p = dst = pend + count;
5837               while (count)
5838                 {
5839                   *--p = *--pend;
5840                   if (*p == '\n') count--, *--p = '\r';
5841                 }
5842             }
5843
5844           /* Suppress eol-format conversion in the further conversion.  */
5845           coding->eol_type = CODING_EOL_LF;
5846
5847           /* Set the coding system symbol to that for Unix-like EOL.  */
5848           eol_type = Fget (saved_coding_symbol, Qeol_type);
5849           if (VECTORP (eol_type)
5850               && XVECTOR (eol_type)->size == 3
5851               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5852             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5853           else
5854             coding->symbol = saved_coding_symbol;
5855
5856           continue;
5857         }
5858       if (len_byte <= 0)
5859         {
          /* No source left.  Unless this is a CCL coding system that
             has not yet been run with CODING_MODE_LAST_BLOCK set, we
             are done; otherwise set that flag and run once more.  */
5860           if (coding->type != coding_type_ccl
5861               || coding->mode & CODING_MODE_LAST_BLOCK)
5862             break;
5863           coding->mode |= CODING_MODE_LAST_BLOCK;
5864           continue;
5865         }
5866       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5867         {
5868           /* The source text ends in invalid codes.  Let's just
5869              make them valid buffer contents, and finish conversion.  */
5870           if (multibyte_p)
5871             {
5872               unsigned char *start = dst;
5873
              /* Each remaining source byte becomes one character,
                 written with CHAR_STRING.  */
5874               inserted += len_byte;
5875               while (len_byte--)
5876                 {
5877                   int c = *src++;
5878                   dst += CHAR_STRING (c, dst);
5879                 }
5880
5881               inserted_byte += dst - start;
5882             }
5883           else
5884             {
              /* Copy the remaining source bytes as they are.  */
5885               inserted += len_byte;
5886               inserted_byte += len_byte;
5887               while (len_byte--)
5888                 *dst++ = *src++;
5889             }
5890           break;
5891         }
5892       if (result == CODING_FINISH_INTERRUPT)
5893         {
5894           /* The conversion procedure was interrupted by a user.  */
5895           break;
5896         }
5897       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5898       if (coding->consumed < 1)
5899         {
5900           /* It's quite strange to require more memory without
5901              consuming any bytes.  Perhaps CCL program bug.  */
5902           break;
5903         }
5904       if (first)
5905         {
5906           /* We have just done the first batch of conversion which was
5907              stopped because of insufficient gap.  Let's reconsider the
5908              required gap size (i.e. SRC - DST) now.
5909
5910              We have converted ORIG bytes (== coding->consumed) into
5911              NEW bytes (coding->produced).  To convert the remaining
5912              LEN bytes, we may need REQUIRE bytes of gap, where:
5913                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5914                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5915              Here, we are sure that NEW >= ORIG.  */
5916
5917           if (coding->produced <= coding->consumed)
5918             {
5919               /* This happens because of CCL-based coding system with
5920                  eol-type CRLF.  */
5921               require = 0;
5922             }
5923           else
5924             {
5925               float ratio = coding->produced - coding->consumed;
5926               ratio /= coding->consumed;
5927               require = len_byte * ratio;
5928             }
5929           first = 0;
5930         }
5931       if ((src - dst) < (require + 2000))
5932         {
5933           /* See the comment above the previous call of make_gap.  */
5934           int add = len_byte + inserted_byte;
5935
5936           GAP_SIZE -= add;
5937           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5938           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5939           make_gap (require + 2000);
5940           GAP_SIZE += add;
5941           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5942           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5943         }
5944     }
5945   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5946
5947   if (encodep && coding->dst_multibyte)
5948     {
5949       /* The output is unibyte.  We must convert 8-bit characters to
5950          multibyte form.  */
5951       if (inserted_byte * 2 > GAP_SIZE)
5952         {
          /* Same dance as in the loop above: temporarily count the
             converted text as buffer contents around make_gap.  */
5953           GAP_SIZE -= inserted_byte;
5954           ZV += inserted_byte; Z += inserted_byte;
5955           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5956           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5957           make_gap (inserted_byte - GAP_SIZE);
5958           GAP_SIZE += inserted_byte;
5959           ZV -= inserted_byte; Z -= inserted_byte;
5960           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5961           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5962         }
5963       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5964     }
5965
5966   /* If we shrank the conversion area, adjust it now.  */
5967   if (total_skip > 0)
5968     {
      /* Put the skipped tail right after the converted text, then
         fold the skipped head and tail back into the inserted counts,
         the gap and buffer positions, and the region bounds.  */
5969       if (tail_skip > 0)
5970         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5971       inserted += total_skip; inserted_byte += total_skip;
5972       GAP_SIZE += total_skip;
5973       GPT -= head_skip; GPT_BYTE -= head_skip;
5974       ZV -= total_skip; ZV_BYTE -= total_skip;
5975       Z -= total_skip; Z_BYTE -= total_skip;
5976       from -= head_skip; from_byte -= head_skip;
5977       to += tail_skip; to_byte += tail_skip;
5978     }
5979
  /* Hand the new text over to the buffer, and take INSERTED from the
     change of Z observed across that call.  */
5980   prev_Z = Z;
5981   if (! EQ (current_buffer->undo_list, Qt))
5982     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5983   else
5984     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5985                                  inserted, inserted_byte);
5986   inserted = Z - prev_Z;
5987
5988   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5989     coding_restore_composition (coding, Fcurrent_buffer ());
5990   coding_free_composition_data (coding);
5991
5992   if (! inhibit_pre_post_conversion
5993       && ! encodep && ! NILP (coding->post_read_conversion))
5994     {
5995       Lisp_Object val;
5996       Lisp_Object saved_coding_system;
5997
      /* Call post-read-conversion with point at FROM and the number
         of inserted characters as its argument.  Its return value
         must be a number (CHECK_NUMBER), but INSERTED is updated from
         the change of Z, not from that value.  */
5998       if (from != PT)
5999         TEMP_SET_PT_BOTH (from, from_byte);
6000       prev_Z = Z;
6001       record_unwind_protect (code_convert_region_unwind,
6002                              Fcons (Vlast_coding_system_used, Qnil));
6003       saved_coding_system = Vlast_coding_system_used;
6004       Vlast_coding_system_used = coding->symbol;
6005       /* We should not call any more pre-write/post-read-conversion
6006          functions while this post-read-conversion is running.  */
6007       inhibit_pre_post_conversion = 1;
6008       val = call1 (coding->post_read_conversion, make_number (inserted));
6009       inhibit_pre_post_conversion = 0;
6010       coding->symbol = Vlast_coding_system_used;
6011       Vlast_coding_system_used = saved_coding_system;
6012       /* Discard the unwind protect.  */
6013       specpdl_ptr--;
6014       CHECK_NUMBER (val);
6015       inserted += Z - prev_Z;
6016     }
6017
  /* Restore point: a point that was at or after the end of the
     original text is shifted by the change in length; one inside the
     text goes to FROM; one before FROM is left alone.  */
6018   if (orig_point >= from)
6019     {
6020       if (orig_point >= from + orig_len)
6021         orig_point += inserted - orig_len;
6022       else
6023         orig_point = from;
6024       TEMP_SET_PT (orig_point);
6025     }
6026
6027   if (replace)
6028     {
6029       signal_after_change (from, to - from, inserted);
6030       update_compositions (from, from + inserted, CHECK_BORDER);
6031     }
6032
  /* Record the overall amounts of consumed and produced text.  */
6033   {
6034     coding->consumed = to_byte - from_byte;
6035     coding->consumed_char = to - from;
6036     coding->produced = inserted_byte;
6037     coding->produced_char = inserted;
6038   }
6039
6040   return 0;
6041 }
6042
6043 /* Name (or base name) of work buffer for code conversion.
   set_conversion_work_buffer passes it to Fget_buffer_create, and to
   Fgenerate_new_buffer_name when the buffer of this name is already
   the current buffer.  */
6044 static Lisp_Object Vcode_conversion_workbuf_name;
6045
6046 /* Set the current buffer to the working buffer prepared for
6047    code-conversion.  MULTIBYTE specifies the multibyteness of the
6048    buffer.  Return the buffer we set if it must be killed after use.
6049    Otherwise return Qnil.  */
6050
6051 static Lisp_Object
6052 set_conversion_work_buffer (multibyte)
6053      int multibyte;
6054 {
6055   Lisp_Object buffer, buffer_to_kill;
6056   struct buffer *buf;
6057
6058   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6059   buf = XBUFFER (buffer);
6060   if (buf == current_buffer)
6061     {
6062       /* As we are already in the work buffer, we must generate a new
6063          buffer for the work.  */
6064       Lisp_Object name;
6065
6066       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6067       buffer = buffer_to_kill = Fget_buffer_create (name);
6068       buf = XBUFFER (buffer);
6069     }
6070   else
6071     buffer_to_kill = Qnil;
6072
6073   delete_all_overlays (buf);
6074   buf->directory = current_buffer->directory;
6075   buf->read_only = Qnil;
6076   buf->filename = Qnil;
6077   buf->undo_list = Qt;
6078   eassert (buf->overlays_before == NULL);
6079   eassert (buf->overlays_after == NULL);
6080   set_buffer_internal (buf);
6081   if (BEG != BEGV || Z != ZV)
6082     Fwiden ();
6083   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6084   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6085   return buffer_to_kill;
6086 }
6087
/* Run the pre-write-conversion function (if ENCODEP is nonzero) or
   the post-read-conversion function (otherwise) of CODING on the
   text of STR, and return the resulting text as a string.

   STR is inserted into the conversion work buffer, the function is
   called there, and the text of the buffer that is current
   afterwards is returned.  The unwind forms recorded here are run by
   unbind_to before returning; the first of them makes the originally
   current buffer current again.  */
6088 Lisp_Object
6089 run_pre_post_conversion_on_str (str, coding, encodep)
6090      Lisp_Object str;
6091      struct coding_system *coding;
6092      int encodep;
6093 {
6094   int count = SPECPDL_INDEX ();
6095   struct gcpro gcpro1, gcpro2;
6096   int multibyte = STRING_MULTIBYTE (str);
6097   Lisp_Object old_deactivate_mark;
6098   Lisp_Object buffer_to_kill;
6099   Lisp_Object unwind_arg;
6100
  /* Come back to the buffer that is current now when unwinding.  */
6101   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6102   /* It is not crucial to specbind this.  */
6103   old_deactivate_mark = Vdeactivate_mark;
6104   GCPRO2 (str, old_deactivate_mark);
6105
6106   /* We must insert the contents of STR as is without
6107      unibyte<->multibyte conversion.  For that, we adjust the
6108      multibyteness of the working buffer to that of STR.  */
6109   buffer_to_kill = set_conversion_work_buffer (multibyte);
  /* UNWIND_ARG is a list whose first element is the current value of
     Vlast_coding_system_used and whose remaining elements are
     buffers; presumably code_convert_region_unwind restores the
     former and kills the latter -- confirm at its definition.  */
6110   if (NILP (buffer_to_kill))
6111     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6112   else
6113     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6114   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6115
6116   insert_from_string (str, 0, 0,
6117                       SCHARS (str), SBYTES (str), 0);
6118   UNGCPRO;
6119   inhibit_pre_post_conversion = 1;
6120   if (encodep)
6121     {
6122       struct buffer *prev = current_buffer;
6123
      /* The function is called with the bounds of the whole work
         buffer text.  */
6124       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6125       if (prev != current_buffer)
6126         /* We must kill the current buffer too.  */
6127         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6128     }
6129   else
6130     {
      /* The function is called with point at BEG and the number of
         characters in the buffer as its argument.  Whatever it
         leaves in Vlast_coding_system_used becomes CODING->symbol.  */
6131       Vlast_coding_system_used = coding->symbol;
6132       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6133       call1 (coding->post_read_conversion, make_number (Z - BEG));
6134       coding->symbol = Vlast_coding_system_used;
6135     }
6136   inhibit_pre_post_conversion = 0;
6137   Vdeactivate_mark = old_deactivate_mark;
  /* Take the text of whichever buffer is current now; unbind_to then
     runs the unwind forms recorded above and returns STR.  */
6138   str = make_buffer_string (BEG, Z, 1);
6139   return unbind_to (count, str);
6140 }
6141
6142
6143 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6144    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6145    is intended that this function is called from encode_terminal_code,
6146    the pre-write-conversion function is run by safe_call and thus
6147    "Error during redisplay: ..." is logged when an error occurs.
6148
6149    Store the resulting text in *STR and set CODING->produced_char and
6150    CODING->produced to the number of characters and bytes
6151    respectively.  If the size of *STR is too small, enlarge it by
6152    xrealloc and update *STR and *SIZE.

   CODING->src_multibyte is also set, to the multibyteness of the
   buffer that is current when the conversion function returns.  The
   buffer that was current on entry is made current again before
   returning.  */
6153
6154 void
6155 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6156      unsigned char **str;
6157      int *size, nchars, nbytes;
6158      struct coding_system *coding;
6159 {
6160   struct gcpro gcpro1, gcpro2;
6161   struct buffer *cur = current_buffer;
6162   struct buffer *prev;
6163   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6164   Lisp_Object args[3];
6165   Lisp_Object buffer_to_kill;
6166
6167   /* It is not crucial to specbind this.  */
6168   old_deactivate_mark = Vdeactivate_mark;
6169   old_last_coding_system_used = Vlast_coding_system_used;
6170   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6171
6172   /* We must insert the contents of STR as is without
6173      unibyte<->multibyte conversion.  For that, we adjust the
6174      multibyteness of the working buffer to that of STR.  */
6175   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6176   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6177   UNGCPRO;
6178   inhibit_pre_post_conversion = 1;
  /* Remember the work buffer so that a buffer switch made by the
     conversion function can be noticed below.  */
6179   prev = current_buffer;
  /* Call the function with the bounds of the whole work buffer
     text.  */
6180   args[0] = coding->pre_write_conversion;
6181   args[1] = make_number (BEG);
6182   args[2] = make_number (Z);
6183   safe_call (3, args);
6184   inhibit_pre_post_conversion = 0;
6185   Vdeactivate_mark = old_deactivate_mark;
6186   Vlast_coding_system_used = old_last_coding_system_used;
6187   coding->produced_char = Z - BEG;
6188   coding->produced = Z_BYTE - BEG_BYTE;
6189   if (coding->produced > *size)
6190     {
6191       *size = coding->produced;
6192       *str = xrealloc (*str, *size);
6193     }
  /* NOTE(review): when the gap lies strictly inside the text it is
     moved to BEG, and then CODING->produced bytes are copied
     starting at BEG_ADDR.  This assumes BEG_ADDR addresses
     contiguous buffer text at that point -- confirm against the
     definitions of BEG_ADDR and move_gap.  */
6194   if (BEG < GPT && GPT < Z)
6195     move_gap (BEG);
6196   bcopy (BEG_ADDR, *str, coding->produced);
6197   coding->src_multibyte
6198     = ! NILP (current_buffer->enable_multibyte_characters);
  /* The function left a different buffer current; kill that buffer
     before going back to the original one.  */
6199   if (prev != current_buffer)
6200     Fkill_buffer (Fcurrent_buffer ());
6201   set_buffer_internal (cur);
6202   if (! NILP (buffer_to_kill))
6203     Fkill_buffer (buffer_to_kill);
6204 }
6205
6206
/* Decode the string STR according to CODING and return the decoded
   text as a string.

   If no decoding is needed and CODING has no fboundp
   post-read-conversion function, STR itself is returned when NOCOPY
   is nonzero (or when STR was already replaced by the result of
   Fstring_as_unibyte or Fstring_as_multibyte); otherwise a copy of
   it is returned.

   CODING->consumed, consumed_char, produced and produced_char are
   set to the counts for the whole string, including the leading and
   trailing bytes that were skipped rather than decoded.  */
6207 Lisp_Object
6208 decode_coding_string (str, coding, nocopy)
6209      Lisp_Object str;
6210      struct coding_system *coding;
6211      int nocopy;
6212 {
6213   int len;
6214   struct conversion_buffer buf;
6215   int from, to_byte;
6216   Lisp_Object saved_coding_symbol;
6217   int result;
6218   int require_decoding;
6219   int shrinked_bytes = 0;
6220   Lisp_Object newstr;
6221   int consumed, consumed_char, produced, produced_char;
6222
6223   from = 0;
6224   to_byte = SBYTES (str);
6225
6226   saved_coding_symbol = coding->symbol;
6227   coding->src_multibyte = STRING_MULTIBYTE (str);
6228   coding->dst_multibyte = 1;
6229   coding->heading_ascii = 0;
6230
6231   if (CODING_REQUIRE_DETECTION (coding))
6232     {
6233       /* See the comments in code_convert_region.  */
6234       if (coding->type == coding_type_undecided)
6235         {
6236           detect_coding (coding, SDATA (str), to_byte);
6237           if (coding->type == coding_type_undecided)
6238             {
6239               coding->type = coding_type_emacs_mule;
6240               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6241               /* As emacs-mule decoder will handle composition, we
6242                  need this setting to allocate coding->cmp_data
6243                  later.  */
6244               coding->composing = COMPOSITION_NO;
6245             }
6246         }
6247       if (coding->eol_type == CODING_EOL_UNDECIDED
6248           && coding->type != coding_type_ccl)
6249         {
6250           saved_coding_symbol = coding->symbol;
6251           detect_eol (coding, SDATA (str), to_byte);
6252           if (coding->eol_type == CODING_EOL_UNDECIDED)
6253             coding->eol_type = CODING_EOL_LF;
6254           /* We had better recover the original eol format if we
6255              encounter an inconsistent eol format while decoding.  */
6256           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6257         }
6258     }
6259
  /* For these two types the result is not made multibyte.  */
6260   if (coding->type == coding_type_no_conversion
6261       || coding->type == coding_type_raw_text)
6262     coding->dst_multibyte = 0;
6263
6264   require_decoding = CODING_REQUIRE_DECODING (coding);
6265
6266   if (STRING_MULTIBYTE (str))
6267     {
6268       /* Decoding routines expect the source text to be unibyte.  */
6269       str = Fstring_as_unibyte (str);
6270       to_byte = SBYTES (str);
6271       nocopy = 1;
6272       coding->src_multibyte = 0;
6273     }
6274
6275   /* Try to skip the heading and tailing ASCIIs.  */
6276   if (require_decoding && coding->type != coding_type_ccl)
6277     {
6278       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6279                                 0);
6280       if (from == to_byte)
6281         require_decoding = 0;
      /* Number of bytes skipped at both ends; they are copied into
         the result unchanged below.  */
6282       shrinked_bytes = from + (SBYTES (str) - to_byte);
6283     }
6284
  /* Nothing to decode and no post-read-conversion function to run:
     hand back STR itself or a copy of it.  */
6285   if (!require_decoding
6286       && !(SYMBOLP (coding->post_read_conversion)
6287            && !NILP (Ffboundp (coding->post_read_conversion))))
6288     {
6289       coding->consumed = SBYTES (str);
6290       coding->consumed_char = SCHARS (str);
6291       if (coding->dst_multibyte)
6292         {
6293           str = Fstring_as_multibyte (str);
6294           nocopy = 1;
6295         }
6296       coding->produced = SBYTES (str);
6297       coding->produced_char = SCHARS (str);
6298       return (nocopy ? str : Fcopy_sequence (str));
6299     }
6300
6301   if (coding->composing != COMPOSITION_DISABLED)
6302     coding_allocate_composition_data (coding, from);
6303   len = decoding_buffer_size (coding, to_byte - from);
6304   allocate_conversion_buffer (buf, len);
6305
  /* Decode repeatedly, accumulating the counts of each call, until
     the decoder finishes, is interrupted, or reports insufficient
     source without having consumed anything.  */
6306   consumed = consumed_char = produced = produced_char = 0;
6307   while (1)
6308     {
6309       result = decode_coding (coding, SDATA (str) + from + consumed,
6310                               buf.data + produced, to_byte - from - consumed,
6311                               buf.size - produced);
6312       consumed += coding->consumed;
6313       consumed_char += coding->consumed_char;
6314       produced += coding->produced;
6315       produced_char += coding->produced_char;
6316       if (result == CODING_FINISH_NORMAL
6317           || result == CODING_FINISH_INTERRUPT
6318           || (result == CODING_FINISH_INSUFFICIENT_SRC
6319               && coding->consumed == 0))
6320         break;
6321       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6322         coding_allocate_composition_data (coding, from + produced_char);
6323       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6324         extend_conversion_buffer (&buf);
6325       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6326         {
6327           Lisp_Object eol_type;
6328
6329           /* Recover the original EOL format.  */
6330           if (coding->eol_type == CODING_EOL_CR)
6331             {
6332               unsigned char *p;
6333               for (p = buf.data; p < buf.data + produced; p++)
6334                 if (*p == '\n') *p = '\r';
6335             }
6336           else if (coding->eol_type == CODING_EOL_CRLF)
6337             {
6338               int num_eol = 0;
6339               unsigned char *p0, *p1;
6340               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6341                 if (*p0 == '\n') num_eol++;
              /* NOTE(review): a single extension is assumed to make
                 room for NUM_EOL more bytes -- confirm against
                 extend_conversion_buffer.  */
6342               if (produced + num_eol >= buf.size)
6343                 extend_conversion_buffer (&buf);
              /* Copy backward from the end so that the text can be
                 expanded in place, putting '\r' before each '\n'.  */
6344               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6345                 {
6346                   *--p1 = *--p0;
6347                   if (*p0 == '\n') *--p1 = '\r';
6348                 }
6349               produced += num_eol;
6350               produced_char += num_eol;
6351             }
6352           /* Suppress eol-format conversion in the further conversion.  */
6353           coding->eol_type = CODING_EOL_LF;
6354
6355           /* Set the coding system symbol to that for Unix-like EOL.  */
6356           eol_type = Fget (saved_coding_symbol, Qeol_type);
6357           if (VECTORP (eol_type)
6358               && XVECTOR (eol_type)->size == 3
6359               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6360             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6361           else
6362             coding->symbol = saved_coding_symbol;
6363
6364
6365         }
6366     }
6367
6368   coding->consumed = consumed;
6369   coding->consumed_char = consumed_char;
6370   coding->produced = produced;
6371   coding->produced_char = produced_char;
6372
  /* Build the result: skipped head, decoded text, skipped tail.  */
6373   if (coding->dst_multibyte)
6374     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6375                                            produced + shrinked_bytes);
6376   else
6377     newstr = make_uninit_string (produced + shrinked_bytes);
6378   if (from > 0)
6379     STRING_COPYIN (newstr, 0, SDATA (str), from);
6380   STRING_COPYIN (newstr, from, buf.data, produced);
6381   if (shrinked_bytes > from)
6382     STRING_COPYIN (newstr, from + produced,
6383                    SDATA (str) + to_byte,
6384                    shrinked_bytes - from);
6385   free_conversion_buffer (&buf);
6386
  /* Account for the skipped bytes; each of them is counted as one
     character as well.  */
6387   coding->consumed += shrinked_bytes;
6388   coding->consumed_char += shrinked_bytes;
6389   coding->produced += shrinked_bytes;
6390   coding->produced_char += shrinked_bytes;
6391
6392   if (coding->cmp_data && coding->cmp_data->used)
6393     coding_restore_composition (coding, newstr);
6394   coding_free_composition_data (coding);
6395
6396   if (SYMBOLP (coding->post_read_conversion)
6397       && !NILP (Ffboundp (coding->post_read_conversion)))
6398     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6399
6400   return newstr;
6401 }
6402
/* Encode the string STR according to CODING and return the encoded
   text as a string.

   If CODING has an fboundp pre-write-conversion function, it is run
   on STR first and its result is what gets encoded.  If no encoding
   is needed, STR itself is returned when NOCOPY is nonzero (or when
   a new string was already made here); otherwise a copy of it is
   returned.

   CODING->src_multibyte and CODING->dst_multibyte are set here, and
   CODING->consumed, consumed_char, produced and produced_char are
   set before returning.  */
6403 Lisp_Object
6404 encode_coding_string (str, coding, nocopy)
6405      Lisp_Object str;
6406      struct coding_system *coding;
6407      int nocopy;
6408 {
6409   int len;
6410   struct conversion_buffer buf;
6411   int from, to, to_byte;
6412   int result;
6413   int shrinked_bytes = 0;
6414   Lisp_Object newstr;
6415   int consumed, consumed_char, produced, produced_char;
6416
6417   if (SYMBOLP (coding->pre_write_conversion)
6418       && !NILP (Ffboundp (coding->pre_write_conversion)))
6419     {
6420       str = run_pre_post_conversion_on_str (str, coding, 1);
6421       /* As STR is just newly generated, we don't have to copy it
6422          anymore.  */
6423       nocopy = 1;
6424     }
6425
6426   from = 0;
6427   to = SCHARS (str);
6428   to_byte = SBYTES (str);
6429
6430   /* Encoding routines determine the multibyteness of the source text
6431      by coding->src_multibyte.  */
6432   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6433   coding->dst_multibyte = 0;
6434   if (! CODING_REQUIRE_ENCODING (coding))
6435     goto no_need_of_encoding;
6436
6437   if (coding->composing != COMPOSITION_DISABLED)
6438     coding_save_composition (coding, from, to, str);
6439
6440   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6441      if we must run CCL program or there are compositions to
6442      encode.  */
6443   coding->heading_ascii = 0;
6444   if (coding->type != coding_type_ccl
6445       && (! coding->cmp_data || coding->cmp_data->used == 0))
6446     {
6447       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6448                                 1);
6449       if (from == to_byte)
6450         {
6451           coding_free_composition_data (coding);
6452           goto no_need_of_encoding;
6453         }
      /* Number of bytes skipped at both ends; they are copied into
         the result unchanged below.  */
6454       shrinked_bytes = from + (SBYTES (str) - to_byte);
6455     }
6456
6457   len = encoding_buffer_size (coding, to_byte - from);
6458   allocate_conversion_buffer (buf, len);
6459
  /* Encode repeatedly, accumulating the counts of each call, until
     the encoder finishes, is interrupted, or reports insufficient
     source without having consumed anything.  */
6460   consumed = consumed_char = produced = produced_char = 0;
6461   while (1)
6462     {
6463       result = encode_coding (coding, SDATA (str) + from + consumed,
6464                               buf.data + produced, to_byte - from - consumed,
6465                               buf.size - produced);
6466       consumed += coding->consumed;
6467       consumed_char += coding->consumed_char;
6468       produced += coding->produced;
6469       produced_char += coding->produced_char;
6470       if (result == CODING_FINISH_NORMAL
6471           || result == CODING_FINISH_INTERRUPT
6472           || (result == CODING_FINISH_INSUFFICIENT_SRC
6473               && coding->consumed == 0))
6474         break;
6475       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6476       extend_conversion_buffer (&buf);
6477     }
6478
  /* NOTE(review): unlike decode_coding_string, the counts stored
     here do not include SHRINKED_BYTES although the returned string
     does -- confirm whether any caller depends on these values.  */
6479   coding->consumed = consumed;
6480   coding->consumed_char = consumed_char;
6481   coding->produced = produced;
6482   coding->produced_char = produced_char;
6483
  /* Build the result: skipped head, encoded text, skipped tail.  */
6484   newstr = make_uninit_string (produced + shrinked_bytes);
6485   if (from > 0)
6486     STRING_COPYIN (newstr, 0, SDATA (str), from);
6487   STRING_COPYIN (newstr, from, buf.data, produced);
6488   if (shrinked_bytes > from)
6489     STRING_COPYIN (newstr, from + produced,
6490                    SDATA (str) + to_byte,
6491                    shrinked_bytes - from);
6492
6493   free_conversion_buffer (&buf);
6494   coding_free_composition_data (coding);
6495
6496   return newstr;
6497
  /* Reached when STR needs no encoding at all: return STR (made
     unibyte if necessary) or a copy of it.  */
6498  no_need_of_encoding:
6499   coding->consumed = SBYTES (str);
6500   coding->consumed_char = SCHARS (str);
6501   if (STRING_MULTIBYTE (str))
6502     {
6503       if (nocopy)
6504         /* We are sure that STR doesn't contain a multibyte
6505            character.  */
6506         STRING_SET_UNIBYTE (str);
6507       else
6508         {
6509           str = Fstring_as_unibyte (str);
6510           nocopy = 1;
6511         }
6512     }
6513   coding->produced = SBYTES (str);
6514   coding->produced_char = SCHARS (str);
6515   return (nocopy ? str : Fcopy_sequence (str));
6516 }
6517
6518 \f
6519 #ifdef emacs
6520 /*** 8. Emacs Lisp library functions ***/
6521
6522 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6523        doc: /* Return t if OBJECT is nil or a coding-system.
6524 See the documentation of `make-coding-system' for information
6525 about coding-system objects.  */)
6526      (obj)
6527      Lisp_Object obj;
6528 {
6529   if (NILP (obj))
6530     return Qt;
6531   if (!SYMBOLP (obj))
6532     return Qnil;
6533   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6534     return Qt;
6535   /* Get coding-spec vector for OBJ.  */
6536   obj = Fget (obj, Qcoding_system);
6537   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6538           ? Qt : Qnil);
6539 }
6540
6541 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6542        Sread_non_nil_coding_system, 1, 1, 0,
6543        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6544      (prompt)
6545      Lisp_Object prompt;
6546 {
6547   Lisp_Object val;
6548   do
6549     {
6550       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6551                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6552     }
6553   while (SCHARS (val) == 0);
6554   return (Fintern (val, Qnil));
6555 }
6556
6557 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6558        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6559 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
6560 Ignores case when completing coding systems (all Emacs coding systems
6561 are lower-case).  */)
6562      (prompt, default_coding_system)
6563      Lisp_Object prompt, default_coding_system;
6564 {
6565   Lisp_Object val;
6566   int count = SPECPDL_INDEX ();
6567
6568   if (SYMBOLP (default_coding_system))
6569     default_coding_system = SYMBOL_NAME (default_coding_system);
6570   specbind (Qcompletion_ignore_case, Qt);
6571   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6572                           Qt, Qnil, Qcoding_system_history,
6573                           default_coding_system, Qnil);
6574   unbind_to (count, Qnil);
6575   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6576 }
6577
6578 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6579        1, 1, 0,
6580        doc: /* Check validity of CODING-SYSTEM.
6581 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6582 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6583 The value of this property should be a vector of length 5.  */)
6584      (coding_system)
6585      Lisp_Object coding_system;
6586 {
6587   Lisp_Object define_form;
6588
6589   define_form = Fget (coding_system, Qcoding_system_define_form);
6590   if (! NILP (define_form))
6591     {
6592       Fput (coding_system, Qcoding_system_define_form, Qnil);
6593       safe_eval (define_form);
6594     }
6595   if (!NILP (Fcoding_system_p (coding_system)))
6596     return coding_system;
6597   xsignal1 (Qcoding_system_error, coding_system);
6598 }
6599 \f
6600 Lisp_Object
6601 detect_coding_system (src, src_bytes, highest, multibytep)
6602      const unsigned char *src;
6603      int src_bytes, highest;
6604      int multibytep;
6605 {
6606   int coding_mask, eol_type;
6607   Lisp_Object val, tmp;
6608   int dummy;
6609
6610   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6611   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6612   if (eol_type == CODING_EOL_INCONSISTENT)
6613     eol_type = CODING_EOL_UNDECIDED;
6614
6615   if (!coding_mask)
6616     {
6617       val = Qundecided;
6618       if (eol_type != CODING_EOL_UNDECIDED)
6619         {
6620           Lisp_Object val2;
6621           val2 = Fget (Qundecided, Qeol_type);
6622           if (VECTORP (val2))
6623             val = XVECTOR (val2)->contents[eol_type];
6624         }
6625       return (highest ? val : Fcons (val, Qnil));
6626     }
6627
6628   /* At first, gather possible coding systems in VAL.  */
6629   val = Qnil;
6630   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6631     {
6632       Lisp_Object category_val, category_index;
6633
6634       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6635       category_val = Fsymbol_value (XCAR (tmp));
6636       if (!NILP (category_val)
6637           && NATNUMP (category_index)
6638           && (coding_mask & (1 << XFASTINT (category_index))))
6639         {
6640           val = Fcons (category_val, val);
6641           if (highest)
6642             break;
6643         }
6644     }
6645   if (!highest)
6646     val = Fnreverse (val);
6647
6648   /* Then, replace the elements with subsidiary coding systems.  */
6649   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6650     {
6651       if (eol_type != CODING_EOL_UNDECIDED
6652           && eol_type != CODING_EOL_INCONSISTENT)
6653         {
6654           Lisp_Object eol;
6655           eol = Fget (XCAR (tmp), Qeol_type);
6656           if (VECTORP (eol))
6657             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6658         }
6659     }
6660   return (highest ? XCAR (val) : val);
6661 }
6662
6663 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6664        2, 3, 0,
6665        doc: /* Detect how the byte sequence in the region is encoded.
6666 Return a list of possible coding systems used on decoding a byte
6667 sequence containing the bytes in the region between START and END when
6668 the coding system `undecided' is specified.  The list is ordered by
6669 priority decided in the current language environment.
6670
6671 If only ASCII characters are found (except for such ISO-2022 control
6672 characters ISO-2022 as ESC), it returns a list of single element
6673 `undecided' or its subsidiary coding system according to a detected
6674 end-of-line format.
6675
6676 If optional argument HIGHEST is non-nil, return the coding system of
6677 highest priority.  */)
6678      (start, end, highest)
6679      Lisp_Object start, end, highest;
6680 {
6681   int from, to;
6682   int from_byte, to_byte;
6683   int include_anchor_byte = 0;
6684
6685   CHECK_NUMBER_COERCE_MARKER (start);
6686   CHECK_NUMBER_COERCE_MARKER (end);
6687
6688   validate_region (&start, &end);
6689   from = XINT (start), to = XINT (end);
6690   from_byte = CHAR_TO_BYTE (from);
6691   to_byte = CHAR_TO_BYTE (to);
6692
6693   if (from < GPT && to >= GPT)
6694     move_gap_both (to, to_byte);
6695   /* If we an anchor byte `\0' follows the region, we include it in
6696      the detecting source.  Then code detectors can handle the tailing
6697      byte sequence more accurately.
6698
6699      Fix me: This is not a perfect solution.  It is better that we
6700      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6701   */
6702   if (to == Z || (to == GPT && GAP_SIZE > 0))
6703     include_anchor_byte = 1;
6704   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6705                                to_byte - from_byte + include_anchor_byte,
6706                                !NILP (highest),
6707                                !NILP (current_buffer
6708                                       ->enable_multibyte_characters));
6709 }
6710
6711 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6712        1, 2, 0,
6713        doc: /* Detect how the byte sequence in STRING is encoded.
6714 Return a list of possible coding systems used on decoding a byte
6715 sequence containing the bytes in STRING when the coding system
6716 `undecided' is specified.  The list is ordered by priority decided in
6717 the current language environment.
6718
6719 If only ASCII characters are found (except for such ISO-2022 control
6720 characters ISO-2022 as ESC), it returns a list of single element
6721 `undecided' or its subsidiary coding system according to a detected
6722 end-of-line format.
6723
6724 If optional argument HIGHEST is non-nil, return the coding system of
6725 highest priority.  */)
6726      (string, highest)
6727      Lisp_Object string, highest;
6728 {
6729   CHECK_STRING (string);
6730
6731   return detect_coding_system (SDATA (string),
6732                                /* "+ 1" is to include the anchor byte
6733                                   `\0'.  With this, code detectors can
6734                                   handle the tailing bytes more
6735                                   accurately.  */
6736                                SBYTES (string) + 1,
6737                                !NILP (highest),
6738                                STRING_MULTIBYTE (string));
6739 }
6740
6741 /*  Subroutine for Ffind_coding_systems_region_internal.
6742
6743     Return a list of coding systems that safely encode the multibyte
6744     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6745     possible coding systems.  If it is nil, it means that we have not
6746     yet found any coding systems.
6747
6748     WORK_TABLE is a char-table whose elements are set to t once they
6749     have been looked up.
6750
6751     If a non-ASCII single byte char is found, set
6752     *single_byte_char_found to 1.  */
6753
6754 static Lisp_Object
6755 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6756      unsigned char *p, *pend;
6757      Lisp_Object safe_codings, work_table;
6758      int *single_byte_char_found;
6759 {
6760   int c, len;
6761   Lisp_Object val, ch;
6762   Lisp_Object prev, tail;
6763
6764   if (NILP (safe_codings))
6765     goto done_safe_codings;
6766   while (p < pend)
6767     {
6768       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6769       p += len;
6770       if (ASCII_BYTE_P (c))
6771         /* We can ignore ASCII characters here.  */
6772         continue;
6773       if (SINGLE_BYTE_CHAR_P (c))
6774         *single_byte_char_found = 1;
6775       /* Check the safe coding systems for C.  */
6776       ch = make_number (c);
6777       val = Faref (work_table, ch);
6778       if (EQ (val, Qt))
6779         /* This element was already checked.  Ignore it.  */
6780         continue;
6781       /* Remember that we checked this element.  */
6782       Faset (work_table, ch, Qt);
6783
6784       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6785         {
6786           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6787           int encodable;
6788
6789           elt = XCAR (tail);
6790           if (CONSP (XCDR (elt)))
6791             {
6792               /* This entry has this format now:
6793                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6794                           ACCEPT-LATIN-EXTRA ) */
6795               val = XCDR (elt);
6796               encodable = ! NILP (Faref (XCAR (val), ch));
6797               if (! encodable)
6798                 {
6799                   val = XCDR (val);
6800                   translation_table = XCAR (val);
6801                   hash_table = XCAR (XCDR (val));
6802                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6803                 }
6804             }
6805           else
6806             {
6807               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6808               encodable = ! NILP (Faref (XCDR (elt), ch));
6809               if (! encodable)
6810                 {
6811                   /* Transform the format to:
6812                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6813                        ACCEPT-LATIN-EXTRA )  */
6814                   val = Fget (XCAR (elt), Qcoding_system);
6815                   translation_table
6816                     = Fplist_get (AREF (val, 3),
6817                                   Qtranslation_table_for_encode);
6818                   if (SYMBOLP (translation_table))
6819                     translation_table = Fget (translation_table,
6820                                               Qtranslation_table);
6821                   hash_table
6822                     = (CHAR_TABLE_P (translation_table)
6823                        ? XCHAR_TABLE (translation_table)->extras[1]
6824                        : Qnil);
6825                   accept_latin_extra
6826                     = ((EQ (AREF (val, 0), make_number (2))
6827                         && VECTORP (AREF (val, 4)))
6828                        ? AREF (AREF (val, 4), 16)
6829                        : Qnil);
6830                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6831                                         translation_table, hash_table,
6832                                         accept_latin_extra));
6833                 }
6834             }
6835
6836           if (! encodable
6837               && ((CHAR_TABLE_P (translation_table)
6838                    && ! NILP (Faref (translation_table, ch)))
6839                   || (HASH_TABLE_P (hash_table)
6840                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6841                   || (SINGLE_BYTE_CHAR_P (c)
6842                       && ! NILP (accept_latin_extra)
6843                       && VECTORP (Vlatin_extra_code_table)
6844                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6845             encodable = 1;
6846           if (encodable)
6847             prev = tail;
6848           else
6849             {
6850               /* Exclude this coding system from SAFE_CODINGS.  */
6851               if (EQ (tail, safe_codings))
6852                 {
6853                   safe_codings = XCDR (safe_codings);
6854                   if (NILP (safe_codings))
6855                     goto done_safe_codings;
6856                 }
6857               else
6858                 XSETCDR (prev, XCDR (tail));
6859             }
6860         }
6861     }
6862
6863  done_safe_codings:
6864   /* If the above loop was terminated before P reaches PEND, it means
6865      SAFE_CODINGS was set to nil.  If we have not yet found an
6866      non-ASCII single-byte char, check it now.  */
6867   if (! *single_byte_char_found)
6868     while (p < pend)
6869       {
6870         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6871         p += len;
6872         if (! ASCII_BYTE_P (c)
6873             && SINGLE_BYTE_CHAR_P (c))
6874           {
6875             *single_byte_char_found = 1;
6876             break;
6877           }
6878       }
6879   return safe_codings;
6880 }
6881
6882 DEFUN ("find-coding-systems-region-internal",
6883        Ffind_coding_systems_region_internal,
6884        Sfind_coding_systems_region_internal, 2, 2, 0,
6885        doc: /* Internal use only.  */)
6886      (start, end)
6887      Lisp_Object start, end;
6888 {
6889   Lisp_Object work_table, safe_codings;
6890   int non_ascii_p = 0;
6891   int single_byte_char_found = 0;
6892   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6893
6894   if (STRINGP (start))
6895     {
6896       if (!STRING_MULTIBYTE (start))
6897         return Qt;
6898       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6899       p2 = p2end = p1end;
6900       if (SCHARS (start) != SBYTES (start))
6901         non_ascii_p = 1;
6902     }
6903   else
6904     {
6905       int from, to, stop;
6906
6907       CHECK_NUMBER_COERCE_MARKER (start);
6908       CHECK_NUMBER_COERCE_MARKER (end);
6909       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6910         args_out_of_range (start, end);
6911       if (NILP (current_buffer->enable_multibyte_characters))
6912         return Qt;
6913       from = CHAR_TO_BYTE (XINT (start));
6914       to = CHAR_TO_BYTE (XINT (end));
6915       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6916       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6917       if (stop == to)
6918         p2 = p2end = p1end;
6919       else
6920         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6921       if (XINT (end) - XINT (start) != to - from)
6922         non_ascii_p = 1;
6923     }
6924
6925   if (!non_ascii_p)
6926     {
6927       /* We are sure that the text contains no multibyte character.
6928          Check if it contains eight-bit-graphic.  */
6929       p = p1;
6930       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6931       if (p == p1end)
6932         {
6933           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6934           if (p == p2end)
6935             return Qt;
6936         }
6937     }
6938
6939   /* The text contains non-ASCII characters.  */
6940
6941   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6942   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6943
6944   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6945                                     &single_byte_char_found);
6946   if (p2 < p2end)
6947     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6948                                       &single_byte_char_found);
6949   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6950     safe_codings = Qt;
6951   else
6952     {
6953       /* Turn safe_codings to a list of coding systems... */
6954       Lisp_Object val;
6955
6956       if (single_byte_char_found)
6957         /* ... and append these for eight-bit chars.  */
6958         val = Fcons (Qraw_text,
6959                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6960       else
6961         /* ... and append generic coding systems.  */
6962         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6963
6964       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6965         val = Fcons (XCAR (XCAR (safe_codings)), val);
6966       safe_codings = val;
6967     }
6968
6969   return safe_codings;
6970 }
6971
6972
6973 /* Search from position POS for such characters that are unencodable
6974    accoding to SAFE_CHARS, and return a list of their positions.  P
6975    points where in the memory the character at POS exists.  Limit the
6976    search at PEND or when Nth unencodable characters are found.
6977
6978    If SAFE_CHARS is a char table, an element for an unencodable
6979    character is nil.
6980
6981    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6982
6983    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6984    eight-bit-graphic characters are unencodable.  */
6985
6986 static Lisp_Object
6987 unencodable_char_position (safe_chars, pos, p, pend, n)
6988      Lisp_Object safe_chars;
6989      int pos;
6990      unsigned char *p, *pend;
6991      int n;
6992 {
6993   Lisp_Object pos_list;
6994
6995   pos_list = Qnil;
6996   while (p < pend)
6997     {
6998       int len;
6999       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7000
7001       if (c >= 128
7002           && (CHAR_TABLE_P (safe_chars)
7003               ? NILP (CHAR_TABLE_REF (safe_chars, c))
7004               : (NILP (safe_chars) || c < 256)))
7005         {
7006           pos_list = Fcons (make_number (pos), pos_list);
7007           if (--n <= 0)
7008             break;
7009         }
7010       pos++;
7011       p += len;
7012     }
7013   return Fnreverse (pos_list);
7014 }
7015
7016
7017 DEFUN ("unencodable-char-position", Funencodable_char_position,
7018        Sunencodable_char_position, 3, 5, 0,
7019        doc: /*
7020 Return position of first un-encodable character in a region.
7021 START and END specfiy the region and CODING-SYSTEM specifies the
7022 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7023
7024 If optional 4th argument COUNT is non-nil, it specifies at most how
7025 many un-encodable characters to search.  In this case, the value is a
7026 list of positions.
7027
7028 If optional 5th argument STRING is non-nil, it is a string to search
7029 for un-encodable characters.  In that case, START and END are indexes
7030 to the string.  */)
7031      (start, end, coding_system, count, string)
7032      Lisp_Object start, end, coding_system, count, string;
7033 {
7034   int n;
7035   Lisp_Object safe_chars;
7036   struct coding_system coding;
7037   Lisp_Object positions;
7038   int from, to;
7039   unsigned char *p, *pend;
7040
7041   if (NILP (string))
7042     {
7043       validate_region (&start, &end);
7044       from = XINT (start);
7045       to = XINT (end);
7046       if (NILP (current_buffer->enable_multibyte_characters))
7047         return Qnil;
7048       p = CHAR_POS_ADDR (from);
7049       if (to == GPT)
7050         pend = GPT_ADDR;
7051       else
7052         pend = CHAR_POS_ADDR (to);
7053     }
7054   else
7055     {
7056       CHECK_STRING (string);
7057       CHECK_NATNUM (start);
7058       CHECK_NATNUM (end);
7059       from = XINT (start);
7060       to = XINT (end);
7061       if (from > to
7062           || to > SCHARS (string))
7063         args_out_of_range_3 (string, start, end);
7064       if (! STRING_MULTIBYTE (string))
7065         return Qnil;
7066       p = SDATA (string) + string_char_to_byte (string, from);
7067       pend = SDATA (string) + string_char_to_byte (string, to);
7068     }
7069
7070   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7071
7072   if (NILP (count))
7073     n = 1;
7074   else
7075     {
7076       CHECK_NATNUM (count);
7077       n = XINT (count);
7078     }
7079
7080   if (coding.type == coding_type_no_conversion
7081       || coding.type == coding_type_raw_text)
7082     return Qnil;
7083
7084   if (coding.type == coding_type_undecided)
7085     safe_chars = Qnil;
7086   else
7087     safe_chars = coding_safe_chars (coding_system);
7088
7089   if (STRINGP (string)
7090       || from >= GPT || to <= GPT)
7091     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7092   else
7093     {
7094       Lisp_Object args[2];
7095
7096       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7097       n -= XINT (Flength (args[0]));
7098       if (n <= 0)
7099         positions = args[0];
7100       else
7101         {
7102           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7103                                                pend, n);
7104           positions = Fappend (2, args);
7105         }
7106     }
7107
7108   return  (NILP (count) ? Fcar (positions) : positions);
7109 }
7110
7111
7112 Lisp_Object
7113 code_convert_region1 (start, end, coding_system, encodep)
7114      Lisp_Object start, end, coding_system;
7115      int encodep;
7116 {
7117   struct coding_system coding;
7118   int from, to;
7119
7120   CHECK_NUMBER_COERCE_MARKER (start);
7121   CHECK_NUMBER_COERCE_MARKER (end);
7122   CHECK_SYMBOL (coding_system);
7123
7124   validate_region (&start, &end);
7125   from = XFASTINT (start);
7126   to = XFASTINT (end);
7127
7128   if (NILP (coding_system))
7129     return make_number (to - from);
7130
7131   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7132     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7133
7134   coding.mode |= CODING_MODE_LAST_BLOCK;
7135   coding.src_multibyte = coding.dst_multibyte
7136     = !NILP (current_buffer->enable_multibyte_characters);
7137   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7138                        &coding, encodep, 1);
7139   Vlast_coding_system_used = coding.symbol;
7140   return make_number (coding.produced_char);
7141 }
7142
7143 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7144        3, 3, "r\nzCoding system: ",
7145        doc: /* Decode the current region from the specified coding system.
7146 When called from a program, takes three arguments:
7147 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7148 This function sets `last-coding-system-used' to the precise coding system
7149 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7150 not fully specified.)
7151 It returns the length of the decoded text.  */)
7152      (start, end, coding_system)
7153      Lisp_Object start, end, coding_system;
7154 {
7155   return code_convert_region1 (start, end, coding_system, 0);
7156 }
7157
7158 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7159        3, 3, "r\nzCoding system: ",
7160        doc: /* Encode the current region into the specified coding system.
7161 When called from a program, takes three arguments:
7162 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7163 This function sets `last-coding-system-used' to the precise coding system
7164 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7165 not fully specified.)
7166 It returns the length of the encoded text.  */)
7167      (start, end, coding_system)
7168      Lisp_Object start, end, coding_system;
7169 {
7170   return code_convert_region1 (start, end, coding_system, 1);
7171 }
7172
7173 Lisp_Object
7174 code_convert_string1 (string, coding_system, nocopy, encodep)
7175      Lisp_Object string, coding_system, nocopy;
7176      int encodep;
7177 {
7178   struct coding_system coding;
7179
7180   CHECK_STRING (string);
7181   CHECK_SYMBOL (coding_system);
7182
7183   if (NILP (coding_system))
7184     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7185
7186   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7187     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7188
7189   coding.mode |= CODING_MODE_LAST_BLOCK;
7190   string = (encodep
7191             ? encode_coding_string (string, &coding, !NILP (nocopy))
7192             : decode_coding_string (string, &coding, !NILP (nocopy)));
7193   Vlast_coding_system_used = coding.symbol;
7194
7195   return string;
7196 }
7197
7198 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7199        2, 3, 0,
7200        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7201 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7202 if the decoding operation is trivial.
7203 This function sets `last-coding-system-used' to the precise coding system
7204 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7205 not fully specified.)  */)
7206      (string, coding_system, nocopy)
7207      Lisp_Object string, coding_system, nocopy;
7208 {
7209   return code_convert_string1 (string, coding_system, nocopy, 0);
7210 }
7211
7212 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7213        2, 3, 0,
7214        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7215 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7216 if the encoding operation is trivial.
7217 This function sets `last-coding-system-used' to the precise coding system
7218 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7219 not fully specified.)  */)
7220      (string, coding_system, nocopy)
7221      Lisp_Object string, coding_system, nocopy;
7222 {
7223   return code_convert_string1 (string, coding_system, nocopy, 1);
7224 }
7225
7226 /* Encode or decode STRING according to CODING_SYSTEM.
7227    Do not set Vlast_coding_system_used.
7228
7229    This function is called only from macros DECODE_FILE and
7230    ENCODE_FILE, thus we ignore character composition.  */
7231
7232 Lisp_Object
7233 code_convert_string_norecord (string, coding_system, encodep)
7234      Lisp_Object string, coding_system;
7235      int encodep;
7236 {
7237   struct coding_system coding;
7238
7239   CHECK_STRING (string);
7240   CHECK_SYMBOL (coding_system);
7241
7242   if (NILP (coding_system))
7243     return string;
7244
7245   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7246     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7247
7248   coding.composing = COMPOSITION_DISABLED;
7249   coding.mode |= CODING_MODE_LAST_BLOCK;
7250   return (encodep
7251           ? encode_coding_string (string, &coding, 1)
7252           : decode_coding_string (string, &coding, 1));
7253 }
7254 \f
7255 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7256        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7257 Return the corresponding character.  */)
7258      (code)
7259      Lisp_Object code;
7260 {
7261   unsigned char c1, c2, s1, s2;
7262   Lisp_Object val;
7263
7264   CHECK_NUMBER (code);
7265   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7266   if (s1 == 0)
7267     {
7268       if (s2 < 0x80)
7269         XSETFASTINT (val, s2);
7270       else if (s2 >= 0xA0 || s2 <= 0xDF)
7271         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7272       else
7273         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7274     }
7275   else
7276     {
7277       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7278           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7279         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7280       DECODE_SJIS (s1, s2, c1, c2);
7281       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7282     }
7283   return val;
7284 }
7285
7286 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7287        doc: /* Encode a Japanese character CH to shift_jis encoding.
7288 Return the corresponding code in SJIS.  */)
7289      (ch)
7290      Lisp_Object ch;
7291 {
7292   int charset, c1, c2, s1, s2;
7293   Lisp_Object val;
7294
7295   CHECK_NUMBER (ch);
7296   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7297   if (charset == CHARSET_ASCII)
7298     {
7299       val = ch;
7300     }
7301   else if (charset == charset_jisx0208
7302            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7303     {
7304       ENCODE_SJIS (c1, c2, s1, s2);
7305       XSETFASTINT (val, (s1 << 8) | s2);
7306     }
7307   else if (charset == charset_katakana_jisx0201
7308            && c1 > 0x20 && c2 < 0xE0)
7309     {
7310       XSETFASTINT (val, c1 | 0x80);
7311     }
7312   else
7313     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7314   return val;
7315 }
7316
7317 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7318        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7319 Return the corresponding character.  */)
7320      (code)
7321      Lisp_Object code;
7322 {
7323   int charset;
7324   unsigned char b1, b2, c1, c2;
7325   Lisp_Object val;
7326
7327   CHECK_NUMBER (code);
7328   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7329   if (b1 == 0)
7330     {
7331       if (b2 >= 0x80)
7332         error ("Invalid BIG5 code: %x", XFASTINT (code));
7333       val = code;
7334     }
7335   else
7336     {
7337       if ((b1 < 0xA1 || b1 > 0xFE)
7338           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7339         error ("Invalid BIG5 code: %x", XFASTINT (code));
7340       DECODE_BIG5 (b1, b2, charset, c1, c2);
7341       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7342     }
7343   return val;
7344 }
7345
7346 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7347        doc: /* Encode the Big5 character CH to BIG5 coding system.
7348 Return the corresponding character code in Big5.  */)
7349      (ch)
7350      Lisp_Object ch;
7351 {
7352   int charset, c1, c2, b1, b2;
7353   Lisp_Object val;
7354
7355   CHECK_NUMBER (ch);
7356   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7357   if (charset == CHARSET_ASCII)
7358     {
7359       val = ch;
7360     }
7361   else if ((charset == charset_big5_1
7362             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7363            || (charset == charset_big5_2
7364                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7365     {
7366       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7367       XSETFASTINT (val, (b1 << 8) | b2);
7368     }
7369   else
7370     error ("Can't encode to Big5: %d", XFASTINT (ch));
7371   return val;
7372 }
7373 \f
7374 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7375        Sset_terminal_coding_system_internal, 1, 2, 0,
7376        doc: /* Internal use only.  */)
7377      (coding_system, terminal)
7378      Lisp_Object coding_system;
7379      Lisp_Object terminal;
7380 {
7381   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
7382   CHECK_SYMBOL (coding_system);
7383   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7384   /* We had better not send unsafe characters to terminal.  */
7385   terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7386   /* Character composition should be disabled.  */
7387   terminal_coding->composing = COMPOSITION_DISABLED;
7388   /* Error notification should be suppressed.  */
7389   terminal_coding->suppress_error = 1;
7390   terminal_coding->src_multibyte = 1;
7391   terminal_coding->dst_multibyte = 0;
7392   return Qnil;
7393 }
7394
7395 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7396        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7397        doc: /* Internal use only.  */)
7398      (coding_system)
7399      Lisp_Object coding_system;
7400 {
7401   CHECK_SYMBOL (coding_system);
7402   setup_coding_system (Fcheck_coding_system (coding_system),
7403                        &safe_terminal_coding);
7404   /* Character composition should be disabled.  */
7405   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7406   /* Error notification should be suppressed.  */
7407   safe_terminal_coding.suppress_error = 1;
7408   safe_terminal_coding.src_multibyte = 1;
7409   safe_terminal_coding.dst_multibyte = 0;
7410   return Qnil;
7411 }
7412
7413 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7414        Sterminal_coding_system, 0, 1, 0,
7415        doc: /* Return coding system specified for terminal output on the given terminal.
7416 TERMINAL may be a terminal id, a frame, or nil for the selected
7417 frame's terminal device.  */)
7418      (terminal)
7419      Lisp_Object terminal;
7420 {
7421   return TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
7422 }
7423
7424 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7425        Sset_keyboard_coding_system_internal, 1, 2, 0,
7426        doc: /* Internal use only.  */)
7427      (coding_system, terminal)
7428      Lisp_Object coding_system;
7429      Lisp_Object terminal;
7430 {
7431   struct terminal *t = get_terminal (terminal, 1);
7432   CHECK_SYMBOL (coding_system);
7433
7434   setup_coding_system (Fcheck_coding_system (coding_system),
7435                        TERMINAL_KEYBOARD_CODING (t));
7436   /* Character composition should be disabled.  */
7437   TERMINAL_KEYBOARD_CODING (t)->composing = COMPOSITION_DISABLED;
7438   return Qnil;
7439 }
7440
7441 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7442        Skeyboard_coding_system, 0, 1, 0,
7443        doc: /* Return coding system for decoding keyboard input on TERMINAL.
7444 TERMINAL may be a terminal id, a frame, or nil for the selected
7445 frame's terminal device.  */)
7446      (terminal)
7447      Lisp_Object terminal;
7448 {
7449   return TERMINAL_KEYBOARD_CODING (get_terminal (terminal, 1))->symbol;
7450 }
7451
7452 \f
7453 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7454        Sfind_operation_coding_system,  1, MANY, 0,
7455        doc: /* Choose a coding system for an operation based on the target name.
7456 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7457 DECODING-SYSTEM is the coding system to use for decoding
7458 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7459 for encoding (in case OPERATION does encoding).
7460
7461 The first argument OPERATION specifies an I/O primitive:
7462   For file I/O, `insert-file-contents' or `write-region'.
7463   For process I/O, `call-process', `call-process-region', or `start-process'.
7464   For network I/O, `open-network-stream'.
7465
7466 The remaining arguments should be the same arguments that were passed
7467 to the primitive.  Depending on which primitive, one of those arguments
7468 is selected as the TARGET.  For example, if OPERATION does file I/O,
7469 whichever argument specifies the file name is TARGET.
7470
7471 TARGET has a meaning which depends on OPERATION:
7472   For file I/O, TARGET is a file name (except for the special case below).
7473   For process I/O, TARGET is a process name.
7474   For network I/O, TARGET is a service name or a port number
7475
7476 This function looks up what specified for TARGET in,
7477 `file-coding-system-alist', `process-coding-system-alist',
7478 or `network-coding-system-alist' depending on OPERATION.
7479 They may specify a coding system, a cons of coding systems,
7480 or a function symbol to call.
7481 In the last case, we call the function with one argument,
7482 which is a list of all the arguments given to this function.
7483 If the function can't decide a coding system, it can return
7484 `undecided' so that the normal code-detection is performed.
7485
7486 If OPERATION is `insert-file-contents', the argument corresponding to
7487 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7488 file name to look up, and BUFFER is a buffer that contains the file's
7489 contents (not yet decoded).  If `file-coding-system-alist' specifies a
7490 function to call for FILENAME, that function should examine the
7491 contents of BUFFER instead of reading the file.
7492
7493 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
7494      (nargs, args)
7495      int nargs;
7496      Lisp_Object *args;
7497 {
7498   Lisp_Object operation, target_idx, target, val;
7499   register Lisp_Object chain;
7500
7501   if (nargs < 2)
7502     error ("Too few arguments");
7503   operation = args[0];
7504   if (!SYMBOLP (operation)
7505       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7506     error ("Invalid first argument");
7507   if (nargs < 1 + XINT (target_idx))
7508     error ("Too few arguments for operation: %s",
7509            SDATA (SYMBOL_NAME (operation)));
7510   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7511      argument to write-region) is string, it must be treated as a
7512      target file name.  */
7513   if (EQ (operation, Qwrite_region)
7514       && nargs > 5
7515       && STRINGP (args[5]))
7516     target_idx = make_number (4);
7517   target = args[XINT (target_idx) + 1];
7518   if (!(STRINGP (target)
7519         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7520             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7521         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7522     error ("Invalid argument %d", XINT (target_idx) + 1);
7523   if (CONSP (target))
7524     target = XCAR (target);
7525
7526   chain = ((EQ (operation, Qinsert_file_contents)
7527             || EQ (operation, Qwrite_region))
7528            ? Vfile_coding_system_alist
7529            : (EQ (operation, Qopen_network_stream)
7530               ? Vnetwork_coding_system_alist
7531               : Vprocess_coding_system_alist));
7532   if (NILP (chain))
7533     return Qnil;
7534
7535   for (; CONSP (chain); chain = XCDR (chain))
7536     {
7537       Lisp_Object elt;
7538       elt = XCAR (chain);
7539
7540       if (CONSP (elt)
7541           && ((STRINGP (target)
7542                && STRINGP (XCAR (elt))
7543                && fast_string_match (XCAR (elt), target) >= 0)
7544               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7545         {
7546           val = XCDR (elt);
7547           /* Here, if VAL is both a valid coding system and a valid
7548              function symbol, we return VAL as a coding system.  */
7549           if (CONSP (val))
7550             return val;
7551           if (! SYMBOLP (val))
7552             return Qnil;
7553           if (! NILP (Fcoding_system_p (val)))
7554             return Fcons (val, val);
7555           if (! NILP (Ffboundp (val)))
7556             {
7557               /* We use call1 rather than safe_call1
7558                  so as to get bug reports about functions called here
7559                  which don't handle the current interface.  */
7560               val = call1 (val, Flist (nargs, args));
7561               if (CONSP (val))
7562                 return val;
7563               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7564                 return Fcons (val, val);
7565             }
7566           return Qnil;
7567         }
7568     }
7569   return Qnil;
7570 }
7571
7572 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7573        Supdate_coding_systems_internal, 0, 0, 0,
7574        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7575 When values of any coding categories are changed, you must
7576 call this function.  */)
7577      ()
7578 {
7579   int i;
7580
7581   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7582     {
7583       Lisp_Object val;
7584
7585       val = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[i]);
7586       if (!NILP (val))
7587         {
7588           if (! coding_system_table[i])
7589             coding_system_table[i] = ((struct coding_system *)
7590                                       xmalloc (sizeof (struct coding_system)));
7591           setup_coding_system (val, coding_system_table[i]);
7592         }
7593       else if (coding_system_table[i])
7594         {
7595           xfree (coding_system_table[i]);
7596           coding_system_table[i] = NULL;
7597         }
7598     }
7599
7600   return Qnil;
7601 }
7602
7603 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7604        Sset_coding_priority_internal, 0, 0, 0,
7605        doc: /* Update internal database for the current value of `coding-category-list'.
7606 This function is internal use only.  */)
7607      ()
7608 {
7609   int i = 0, idx;
7610   Lisp_Object val;
7611
7612   val = Vcoding_category_list;
7613
7614   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7615     {
7616       if (! SYMBOLP (XCAR (val)))
7617         break;
7618       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7619       if (idx >= CODING_CATEGORY_IDX_MAX)
7620         break;
7621       coding_priorities[i++] = (1 << idx);
7622       val = XCDR (val);
7623     }
7624   /* If coding-category-list is valid and contains all coding
7625      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7626      the following code saves Emacs from crashing.  */
7627   while (i < CODING_CATEGORY_IDX_MAX)
7628     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7629
7630   return Qnil;
7631 }
7632
7633 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7634        Sdefine_coding_system_internal, 1, 1, 0,
7635        doc: /* Register CODING-SYSTEM as a base coding system.
7636 This function is internal use only.  */)
7637      (coding_system)
7638      Lisp_Object coding_system;
7639 {
7640   Lisp_Object safe_chars, slot;
7641
7642   if (NILP (Fcheck_coding_system (coding_system)))
7643     xsignal1 (Qcoding_system_error, coding_system);
7644
7645   safe_chars = coding_safe_chars (coding_system);
7646   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7647     error ("No valid safe-chars property for %s",
7648            SDATA (SYMBOL_NAME (coding_system)));
7649
7650   if (EQ (safe_chars, Qt))
7651     {
7652       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7653         XSETCAR (Vcoding_system_safe_chars,
7654                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7655     }
7656   else
7657     {
7658       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7659       if (NILP (slot))
7660         XSETCDR (Vcoding_system_safe_chars,
7661                  nconc2 (XCDR (Vcoding_system_safe_chars),
7662                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7663       else
7664         XSETCDR (slot, safe_chars);
7665     }
7666   return Qnil;
7667 }
7668
7669 #endif /* emacs */
7670
7671 \f
7672 /*** 9. Post-amble ***/
7673
7674 void
7675 init_coding_once ()
7676 {
7677   int i;
7678
7679   /* Emacs' internal format specific initialize routine.  */
7680   for (i = 0; i <= 0x20; i++)
7681     emacs_code_class[i] = EMACS_control_code;
7682   emacs_code_class[0x0A] = EMACS_linefeed_code;
7683   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7684   for (i = 0x21 ; i < 0x7F; i++)
7685     emacs_code_class[i] = EMACS_ascii_code;
7686   emacs_code_class[0x7F] = EMACS_control_code;
7687   for (i = 0x80; i < 0xFF; i++)
7688     emacs_code_class[i] = EMACS_invalid_code;
7689   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7690   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7691   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7692   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7693
7694   /* ISO2022 specific initialize routine.  */
7695   for (i = 0; i < 0x20; i++)
7696     iso_code_class[i] = ISO_control_0;
7697   for (i = 0x21; i < 0x7F; i++)
7698     iso_code_class[i] = ISO_graphic_plane_0;
7699   for (i = 0x80; i < 0xA0; i++)
7700     iso_code_class[i] = ISO_control_1;
7701   for (i = 0xA1; i < 0xFF; i++)
7702     iso_code_class[i] = ISO_graphic_plane_1;
7703   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7704   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7705   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7706   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7707   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7708   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7709   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7710   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7711   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7712   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7713
7714   setup_coding_system (Qnil, &safe_terminal_coding);
7715   setup_coding_system (Qnil, &default_buffer_file_coding);
7716
7717   bzero (coding_system_table, sizeof coding_system_table);
7718
7719   bzero (ascii_skip_code, sizeof ascii_skip_code);
7720   for (i = 0; i < 128; i++)
7721     ascii_skip_code[i] = 1;
7722
7723 #if defined (MSDOS) || defined (WINDOWSNT)
7724   system_eol_type = CODING_EOL_CRLF;
7725 #else
7726   system_eol_type = CODING_EOL_LF;
7727 #endif
7728
7729   inhibit_pre_post_conversion = 0;
7730 }
7731
7732 #ifdef emacs
7733
7734 void
7735 syms_of_coding ()
7736 {
7737   staticpro (&Vcode_conversion_workbuf_name);
7738   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7739
7740   Qtarget_idx = intern ("target-idx");
7741   staticpro (&Qtarget_idx);
7742
7743   Qcoding_system_history = intern ("coding-system-history");
7744   staticpro (&Qcoding_system_history);
7745   Fset (Qcoding_system_history, Qnil);
7746
7747   /* Target FILENAME is the first argument.  */
7748   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7749   /* Target FILENAME is the third argument.  */
7750   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7751
7752   Qcall_process = intern ("call-process");
7753   staticpro (&Qcall_process);
7754   /* Target PROGRAM is the first argument.  */
7755   Fput (Qcall_process, Qtarget_idx, make_number (0));
7756
7757   Qcall_process_region = intern ("call-process-region");
7758   staticpro (&Qcall_process_region);
7759   /* Target PROGRAM is the third argument.  */
7760   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7761
7762   Qstart_process = intern ("start-process");
7763   staticpro (&Qstart_process);
7764   /* Target PROGRAM is the third argument.  */
7765   Fput (Qstart_process, Qtarget_idx, make_number (2));
7766
7767   Qopen_network_stream = intern ("open-network-stream");
7768   staticpro (&Qopen_network_stream);
7769   /* Target SERVICE is the fourth argument.  */
7770   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7771
7772   Qcoding_system = intern ("coding-system");
7773   staticpro (&Qcoding_system);
7774
7775   Qeol_type = intern ("eol-type");
7776   staticpro (&Qeol_type);
7777
7778   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7779   staticpro (&Qbuffer_file_coding_system);
7780
7781   Qpost_read_conversion = intern ("post-read-conversion");
7782   staticpro (&Qpost_read_conversion);
7783
7784   Qpre_write_conversion = intern ("pre-write-conversion");
7785   staticpro (&Qpre_write_conversion);
7786
7787   Qno_conversion = intern ("no-conversion");
7788   staticpro (&Qno_conversion);
7789
7790   Qundecided = intern ("undecided");
7791   staticpro (&Qundecided);
7792
7793   Qcoding_system_p = intern ("coding-system-p");
7794   staticpro (&Qcoding_system_p);
7795
7796   Qcoding_system_error = intern ("coding-system-error");
7797   staticpro (&Qcoding_system_error);
7798
7799   Fput (Qcoding_system_error, Qerror_conditions,
7800         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7801   Fput (Qcoding_system_error, Qerror_message,
7802         build_string ("Invalid coding system"));
7803
7804   Qcoding_category = intern ("coding-category");
7805   staticpro (&Qcoding_category);
7806   Qcoding_category_index = intern ("coding-category-index");
7807   staticpro (&Qcoding_category_index);
7808
7809   Vcoding_category_table
7810     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7811   staticpro (&Vcoding_category_table);
7812   {
7813     int i;
7814     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7815       {
7816         XVECTOR (Vcoding_category_table)->contents[i]
7817           = intern (coding_category_name[i]);
7818         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7819               Qcoding_category_index, make_number (i));
7820       }
7821   }
7822
7823   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7824   staticpro (&Vcoding_system_safe_chars);
7825
7826   Qtranslation_table = intern ("translation-table");
7827   staticpro (&Qtranslation_table);
7828   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7829
7830   Qtranslation_table_id = intern ("translation-table-id");
7831   staticpro (&Qtranslation_table_id);
7832
7833   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7834   staticpro (&Qtranslation_table_for_decode);
7835
7836   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7837   staticpro (&Qtranslation_table_for_encode);
7838
7839   Qsafe_chars = intern ("safe-chars");
7840   staticpro (&Qsafe_chars);
7841
7842   Qchar_coding_system = intern ("char-coding-system");
7843   staticpro (&Qchar_coding_system);
7844
7845   /* Intern this now in case it isn't already done.
7846      Setting this variable twice is harmless.
7847      But don't staticpro it here--that is done in alloc.c.  */
7848   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7849   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7850   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7851
7852   Qvalid_codes = intern ("valid-codes");
7853   staticpro (&Qvalid_codes);
7854
7855   Qascii_incompatible = intern ("ascii-incompatible");
7856   staticpro (&Qascii_incompatible);
7857
7858   Qemacs_mule = intern ("emacs-mule");
7859   staticpro (&Qemacs_mule);
7860
7861   Qraw_text = intern ("raw-text");
7862   staticpro (&Qraw_text);
7863
7864   Qutf_8 = intern ("utf-8");
7865   staticpro (&Qutf_8);
7866
7867   Qcoding_system_define_form = intern ("coding-system-define-form");
7868   staticpro (&Qcoding_system_define_form);
7869
7870   defsubr (&Scoding_system_p);
7871   defsubr (&Sread_coding_system);
7872   defsubr (&Sread_non_nil_coding_system);
7873   defsubr (&Scheck_coding_system);
7874   defsubr (&Sdetect_coding_region);
7875   defsubr (&Sdetect_coding_string);
7876   defsubr (&Sfind_coding_systems_region_internal);
7877   defsubr (&Sunencodable_char_position);
7878   defsubr (&Sdecode_coding_region);
7879   defsubr (&Sencode_coding_region);
7880   defsubr (&Sdecode_coding_string);
7881   defsubr (&Sencode_coding_string);
7882   defsubr (&Sdecode_sjis_char);
7883   defsubr (&Sencode_sjis_char);
7884   defsubr (&Sdecode_big5_char);
7885   defsubr (&Sencode_big5_char);
7886   defsubr (&Sset_terminal_coding_system_internal);
7887   defsubr (&Sset_safe_terminal_coding_system_internal);
7888   defsubr (&Sterminal_coding_system);
7889   defsubr (&Sset_keyboard_coding_system_internal);
7890   defsubr (&Skeyboard_coding_system);
7891   defsubr (&Sfind_operation_coding_system);
7892   defsubr (&Supdate_coding_systems_internal);
7893   defsubr (&Sset_coding_priority_internal);
7894   defsubr (&Sdefine_coding_system_internal);
7895
7896   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7897                doc: /* List of coding systems.
7898
7899 Do not alter the value of this variable manually.  This variable should be
7900 updated by the functions `make-coding-system' and
7901 `define-coding-system-alias'.  */);
7902   Vcoding_system_list = Qnil;
7903
7904   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7905                doc: /* Alist of coding system names.
7906 Each element is one element list of coding system name.
7907 This variable is given to `completing-read' as TABLE argument.
7908
7909 Do not alter the value of this variable manually.  This variable should be
7910 updated by the functions `make-coding-system' and
7911 `define-coding-system-alias'.  */);
7912   Vcoding_system_alist = Qnil;
7913
7914   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7915                doc: /* List of coding-categories (symbols) ordered by priority.
7916
7917 On detecting a coding system, Emacs tries code detection algorithms
7918 associated with each coding-category one by one in this order.  When
7919 one algorithm agrees with a byte sequence of source text, the coding
7920 system bound to the corresponding coding-category is selected.
7921
7922 Don't modify this variable directly, but use `set-coding-priority'.  */);
7923   {
7924     int i;
7925
7926     Vcoding_category_list = Qnil;
7927     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7928       Vcoding_category_list
7929         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7930                  Vcoding_category_list);
7931   }
7932
7933   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7934                doc: /* Specify the coding system for read operations.
7935 It is useful to bind this variable with `let', but do not set it globally.
7936 If the value is a coding system, it is used for decoding on read operation.
7937 If not, an appropriate element is used from one of the coding system alists:
7938 There are three such tables, `file-coding-system-alist',
7939 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7940   Vcoding_system_for_read = Qnil;
7941
7942   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7943                doc: /* Specify the coding system for write operations.
7944 Programs bind this variable with `let', but you should not set it globally.
7945 If the value is a coding system, it is used for encoding of output,
7946 when writing it to a file and when sending it to a file or subprocess.
7947
7948 If this does not specify a coding system, an appropriate element
7949 is used from one of the coding system alists:
7950 There are three such tables, `file-coding-system-alist',
7951 `process-coding-system-alist', and `network-coding-system-alist'.
7952 For output to files, if the above procedure does not specify a coding system,
7953 the value of `buffer-file-coding-system' is used.  */);
7954   Vcoding_system_for_write = Qnil;
7955
7956   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7957                doc: /* Coding system used in the latest file or process I/O.
7958 Also set by `encode-coding-region', `decode-coding-region',
7959 `encode-coding-string' and `decode-coding-string'.  */);
7960   Vlast_coding_system_used = Qnil;
7961
7962   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7963                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7964 See info node `Coding Systems' and info node `Text and Binary' concerning
7965 such conversion.  */);
7966   inhibit_eol_conversion = 0;
7967
7968   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7969                doc: /* Non-nil means process buffer inherits coding system of process output.
7970 Bind it to t if the process output is to be treated as if it were a file
7971 read from some filesystem.  */);
7972   inherit_process_coding_system = 0;
7973
7974   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7975                doc: /* Alist to decide a coding system to use for a file I/O operation.
7976 The format is ((PATTERN . VAL) ...),
7977 where PATTERN is a regular expression matching a file name,
7978 VAL is a coding system, a cons of coding systems, or a function symbol.
7979 If VAL is a coding system, it is used for both decoding and encoding
7980 the file contents.
7981 If VAL is a cons of coding systems, the car part is used for decoding,
7982 and the cdr part is used for encoding.
7983 If VAL is a function symbol, the function must return a coding system
7984 or a cons of coding systems which are used as above.  The function is
7985 called with an argument that is a list of the arguments with which
7986 `find-operation-coding-system' was called.  If the function can't decide
7987 a coding system, it can return `undecided' so that the normal
7988 code-detection is performed.
7989
7990 See also the function `find-operation-coding-system'
7991 and the variable `auto-coding-alist'.  */);
7992   Vfile_coding_system_alist = Qnil;
7993
7994   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7995     doc: /* Alist to decide a coding system to use for a process I/O operation.
7996 The format is ((PATTERN . VAL) ...),
7997 where PATTERN is a regular expression matching a program name,
7998 VAL is a coding system, a cons of coding systems, or a function symbol.
7999 If VAL is a coding system, it is used for both decoding what received
8000 from the program and encoding what sent to the program.
8001 If VAL is a cons of coding systems, the car part is used for decoding,
8002 and the cdr part is used for encoding.
8003 If VAL is a function symbol, the function must return a coding system
8004 or a cons of coding systems which are used as above.
8005
8006 See also the function `find-operation-coding-system'.  */);
8007   Vprocess_coding_system_alist = Qnil;
8008
8009   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8010     doc: /* Alist to decide a coding system to use for a network I/O operation.
8011 The format is ((PATTERN . VAL) ...),
8012 where PATTERN is a regular expression matching a network service name
8013 or is a port number to connect to,
8014 VAL is a coding system, a cons of coding systems, or a function symbol.
8015 If VAL is a coding system, it is used for both decoding what received
8016 from the network stream and encoding what sent to the network stream.
8017 If VAL is a cons of coding systems, the car part is used for decoding,
8018 and the cdr part is used for encoding.
8019 If VAL is a function symbol, the function must return a coding system
8020 or a cons of coding systems which are used as above.
8021
8022 See also the function `find-operation-coding-system'.  */);
8023   Vnetwork_coding_system_alist = Qnil;
8024
8025   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8026                doc: /* Coding system to use with system messages.
8027 Also used for decoding keyboard input on X Window system.  */);
8028   Vlocale_coding_system = Qnil;
8029
8030   /* The eol mnemonics are reset in startup.el system-dependently.  */
8031   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8032                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8033   eol_mnemonic_unix = build_string (":");
8034
8035   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8036                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8037   eol_mnemonic_dos = build_string ("\\");
8038
8039   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8040                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8041   eol_mnemonic_mac = build_string ("/");
8042
8043   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8044                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8045   eol_mnemonic_undecided = build_string (":");
8046
8047   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8048                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8049   Venable_character_translation = Qt;
8050
8051   DEFVAR_LISP ("standard-translation-table-for-decode",
8052                &Vstandard_translation_table_for_decode,
8053                doc: /* Table for translating characters while decoding.  */);
8054   Vstandard_translation_table_for_decode = Qnil;
8055
8056   DEFVAR_LISP ("standard-translation-table-for-encode",
8057                &Vstandard_translation_table_for_encode,
8058                doc: /* Table for translating characters while encoding.  */);
8059   Vstandard_translation_table_for_encode = Qnil;
8060
8061   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8062                doc: /* Alist of charsets vs revision numbers.
8063 While encoding, if a charset (car part of an element) is found,
8064 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8065   Vcharset_revision_alist = Qnil;
8066
8067   DEFVAR_LISP ("default-process-coding-system",
8068                &Vdefault_process_coding_system,
8069                doc: /* Cons of coding systems used for process I/O by default.
8070 The car part is used for decoding a process output,
8071 the cdr part is used for encoding a text to be sent to a process.  */);
8072   Vdefault_process_coding_system = Qnil;
8073
8074   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8075                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8076 This is a vector of length 256.
8077 If Nth element is non-nil, the existence of code N in a file
8078 \(or output of subprocess) doesn't prevent it to be detected as
8079 a coding system of ISO 2022 variant which has a flag
8080 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8081 or reading output of a subprocess.
8082 Only 128th through 159th elements has a meaning.  */);
8083   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8084
8085   DEFVAR_LISP ("select-safe-coding-system-function",
8086                &Vselect_safe_coding_system_function,
8087                doc: /* Function to call to select safe coding system for encoding a text.
8088
8089 If set, this function is called to force a user to select a proper
8090 coding system which can encode the text in the case that a default
8091 coding system used in each operation can't encode the text.
8092
8093 The default value is `select-safe-coding-system' (which see).  */);
8094   Vselect_safe_coding_system_function = Qnil;
8095
8096   DEFVAR_BOOL ("coding-system-require-warning",
8097                &coding_system_require_warning,
8098                doc: /* Internal use only.
8099 If non-nil, on writing a file, `select-safe-coding-system-function' is
8100 called even if `coding-system-for-write' is non-nil.  The command
8101 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8102   coding_system_require_warning = 0;
8103
8104
8105   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8106                &inhibit_iso_escape_detection,
8107                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8108
8109 By default, on reading a file, Emacs tries to detect how the text is
8110 encoded.  This code detection is sensitive to escape sequences.  If
8111 the sequence is valid as ISO2022, the code is determined as one of
8112 the ISO2022 encodings, and the file is decoded by the corresponding
8113 coding system (e.g. `iso-2022-7bit').
8114
8115 However, there may be a case that you want to read escape sequences in
8116 a file as is.  In such a case, you can set this variable to non-nil.
8117 Then, as the code detection ignores any escape sequences, no file is
8118 detected as encoded in some ISO2022 encoding.  The result is that all
8119 escape sequences become visible in a buffer.
8120
8121 The default value is nil, and it is strongly recommended not to change
8122 it.  That is because many Emacs Lisp source files that contain
8123 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8124 in Emacs's distribution, and they won't be decoded correctly on
8125 reading if you suppress escape sequence detection.
8126
8127 The other way to read escape sequences in a file without decoding is
8128 to explicitly specify some coding system that doesn't use ISO2022's
8129 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8130   inhibit_iso_escape_detection = 0;
8131
8132   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8133                doc: /* Char table for translating self-inserting characters.
8134 This is applied to the result of input methods, not their input.  See also
8135 `keyboard-translate-table'.  */);
8136     Vtranslation_table_for_input = Qnil;
8137 }
8138
8139 char *
8140 emacs_strerror (error_number)
8141      int error_number;
8142 {
8143   char *str;
8144
8145   synchronize_system_messages_locale ();
8146   str = strerror (error_number);
8147
8148   if (! NILP (Vlocale_coding_system))
8149     {
8150       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8151                                                       Vlocale_coding_system,
8152                                                       0);
8153       str = (char *) SDATA (dec);
8154     }
8155
8156   return str;
8157 }
8158
8159 #endif /* emacs */
8160
8161 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8162    (do not change this comment) */