code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7
   8 This file is part of GNU Emacs.
   9
  10 GNU Emacs is free software; you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation; either version 2, or (at your option)
  13 any later version.
  14
  15 GNU Emacs is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with GNU Emacs; see the file COPYING.  If not, write to
  22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  23 Boston, MA 02110-1301, USA.  */
  24
  25 /*** TABLE OF CONTENTS ***
  26
  27   0. General comments
  28   1. Preamble
  29   2. Emacs' internal format (emacs-mule) handlers
  30   3. ISO2022 handlers
  31   4. Shift-JIS and BIG5 handlers
  32   5. CCL handlers
  33   6. End-of-line handlers
  34   7. C library functions
  35   8. Emacs Lisp library functions
  36   9. Post-amble
  37
  38 */
  39
  40 /*** 0. General comments ***/
  41
  42
  43 /*** GENERAL NOTE on CODING SYSTEMS ***
  44
  45   A coding system is an encoding mechanism for one or more character
  46   sets.  Here's a list of coding systems which Emacs can handle.  When
  47   we say "decode", it means converting some other coding system to
  48   Emacs' internal format (emacs-mule), and when we say "encode",
  49   it means converting the coding system emacs-mule to some other
  50   coding system.
  51
  52   0. Emacs' internal format (emacs-mule)
  53
  54   Emacs itself holds a multi-lingual character in buffers and strings
  55   in a special format.  Details are described in section 2.
  56
  57   1. ISO2022
  58
  59   The most famous coding system for multiple character sets.  X's
  60   Compound Text, various EUCs (Extended Unix Code), and coding
  61   systems used in Internet communication such as ISO-2022-JP are
  62   all variants of ISO2022.  Details are described in section 3.
  63
  64   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  65
  66   A coding system to encode character sets: ASCII, JISX0201, and
  67   JISX0208.  Widely used for PC's in Japan.  Details are described in
  68   section 4.
  69
  70   3. BIG5
  71
  72   A coding system to encode the character sets ASCII and Big5.  Widely
  73   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  74   described in section 4.  In this file, when we write "BIG5"
  75   (all uppercase), we mean the coding system, and when we write
  76   "Big5" (capitalized), we mean the character set.
  77
  78   4. Raw text
  79
  80   A coding system for text containing random 8-bit code.  Emacs does
  81   no code conversion on such text except for end-of-line format.
  82
  83   5. Other
  84
  85   If a user wants to read/write text encoded in a coding system not
  86   listed above, he can supply a decoder and an encoder for it as CCL
  87   (Code Conversion Language) programs.  Emacs executes the CCL program
  88   while reading/writing.
  89
  90   Emacs represents a coding system by a Lisp symbol that has a property
  91   `coding-system'.  But, before actually using the coding system, the
  92   information about it is set in a structure of type `struct
  93   coding_system' for rapid processing.  See section 6 for more details.
  94
  95 */
  96
  97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  98
  99   How end-of-line of text is encoded depends on the operating system.
 100   For instance, Unix's format is just one byte of `line-feed' code,
 101   whereas DOS's format is two-byte sequence of `carriage-return' and
 102   `line-feed' codes.  MacOS's format is usually one byte of
 103   `carriage-return'.
 104
 105   Since text character encoding and end-of-line encoding are
 106   independent, any coding system described above can have any
 107   end-of-line format.  So Emacs has information about end-of-line
 108   format in each coding-system.  See section 6 for more details.
 109
 110 */
 111
 112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 113
 114   These functions check if a text between SRC and SRC_END is encoded
 115   in the coding system category XXX.  Each returns an integer value in
 116   which appropriate flag bits for the category XXX are set.  The flag
 117   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 118   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 119   of the range 0x80..0x9F are in multibyte form.  */
 120 #if 0
 121 int
 122 detect_coding_emacs_mule (src, src_end, multibytep)
 123      unsigned char *src, *src_end;
 124      int multibytep;
 125 {
 126   ...
 127 }
 128 #endif
 129
 130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 131
 132   These functions decode SRC_BYTES length of unibyte text at SOURCE
 133   encoded in CODING to Emacs' internal format.  The resulting
 134   multibyte text goes to a place pointed to by DESTINATION, the length
 135   of which should not exceed DST_BYTES.
 136
 137   These functions set the information about original and decoded texts
 138   in the members `produced', `produced_char', `consumed', and
 139   `consumed_char' of the structure *CODING.  They also set the member
 140   `result' to one of CODING_FINISH_XXX indicating how the decoding
 141   finished.
 142
 143   DST_BYTES zero means that the source area and destination area are
 144   overlapped, which means that we can produce a decoded text until it
 145   reaches the head of the not-yet-decoded source text.
 146
 147   Below is a template for these functions.  */
 148 #if 0
 149 static void
 150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 151      struct coding_system *coding;
 152      const unsigned char *source;
 153      unsigned char *destination;
 154      int src_bytes, dst_bytes;
 155 {
 156   ...
 157 }
 158 #endif
 159
 160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 161
 162   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 163   internal multibyte format to CODING.  The resulting unibyte text
 164   goes to a place pointed to by DESTINATION, the length of which
 165   should not exceed DST_BYTES.
 166
 167   These functions set the information about original and encoded texts
 168   in the members `produced', `produced_char', `consumed', and
 169   `consumed_char' of the structure *CODING.  They also set the member
 170   `result' to one of CODING_FINISH_XXX indicating how the encoding
 171   finished.
 172
 173   DST_BYTES zero means that the source area and destination area are
 174   overlapped, which means that we can produce encoded text until it
 175   reaches the head of the not-yet-encoded source text.
 176
 177   Below is a template for these functions.  */
 178 #if 0
 179 static void
 180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 181      struct coding_system *coding;
 182      unsigned char *source, *destination;
 183      int src_bytes, dst_bytes;
 184 {
 185   ...
 186 }
 187 #endif
 188
 189 /*** COMMONLY USED MACROS ***/
 190
 191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 192    get one and two bytes from the source text respectively.
 193    If there are not enough bytes in the source, they jump to
 194    `label_end_of_loop'.  The caller should set variables `coding',
 195    `src' and `src_end' to appropriate pointer in advance.  These
 196    macros are called from decoding routines `decode_coding_XXX', thus
 197    it is assumed that the source text is unibyte.  */
 198
 199 #define ONE_MORE_BYTE(c1)                                       \
 200   do {                                                          \
 201     if (src >= src_end)                                         \
 202       {                                                         \
 203         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 204         goto label_end_of_loop;                                 \
 205       }                                                         \
 206     c1 = *src++;                                                \
 207   } while (0)
 208
 209 #define TWO_MORE_BYTES(c1, c2)                                  \
 210   do {                                                          \
 211     if (src + 1 >= src_end)                                     \
 212       {                                                         \
 213         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 214         goto label_end_of_loop;                                 \
 215       }                                                         \
 216     c1 = *src++;                                                \
 217     c2 = *src++;                                                \
 218   } while (0)
 219
 220
 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 222    form if MULTIBYTEP is nonzero.  */
 223
 224 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 225   do {                                                          \
 226     if (src >= src_end)                                         \
 227       {                                                         \
 228         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 229         goto label_end_of_loop;                                 \
 230       }                                                         \
 231     c1 = *src++;                                                \
 232     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 233       c1 = *src++ - 0x20;                                       \
 234   } while (0)
 235
 236 /* Set C to the next character at the source text pointed by `src'.
 237    If there are not enough characters in the source, jump to
 238    `label_end_of_loop'.  The caller should set variables `coding'
 239    `src', `src_end', and `translation_table' to appropriate pointers
 240    in advance.  This macro is used in encoding routines
 241    `encode_coding_XXX', thus it assumes that the source text is in
 242    multibyte form except for 8-bit characters.  8-bit characters are
 243    in multibyte form if coding->src_multibyte is nonzero, else they
 244    are represented by a single byte.  */
 245
 246 #define ONE_MORE_CHAR(c)                                        \
 247   do {                                                          \
 248     int len = src_end - src;                                    \
 249     int bytes;                                                  \
 250     if (len <= 0)                                               \
 251       {                                                         \
 252         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 253         goto label_end_of_loop;                                 \
 254       }                                                         \
 255     if (coding->src_multibyte                                   \
 256         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 257       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 258     else                                                        \
 259       c = *src, bytes = 1;                                      \
 260     if (!NILP (translation_table))                              \
 261       c = translate_char (translation_table, c, -1, 0, 0);      \
 262     src += bytes;                                               \
 263   } while (0)
 264
 265
 266 /* Produce a multibyte form of character C to `dst'.  Jump to
 267    `label_end_of_loop' if there's not enough space at `dst'.
 268
 269    If we are now in the middle of a composition sequence, the decoded
 270    character may be ALTCHAR (for the current composition).  In that
 271    case, the character goes to coding->cmp_data->data instead of
 272    `dst'.
 273
 274    This macro is used in decoding routines.  */
 275
 276 #define EMIT_CHAR(c)                                                    \
 277   do {                                                                  \
 278     if (! COMPOSING_P (coding)                                          \
 279         || coding->composing == COMPOSITION_RELATIVE                    \
 280         || coding->composing == COMPOSITION_WITH_RULE)                  \
 281       {                                                                 \
 282         int bytes = CHAR_BYTES (c);                                     \
 283         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 284           {                                                             \
 285             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 286             goto label_end_of_loop;                                     \
 287           }                                                             \
 288         dst += CHAR_STRING (c, dst);                                    \
 289         coding->produced_char++;                                        \
 290       }                                                                 \
 291                                                                         \
 292     if (COMPOSING_P (coding)                                            \
 293         && coding->composing != COMPOSITION_RELATIVE)                   \
 294       {                                                                 \
 295         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 296         coding->composition_rule_follows                                \
 297           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 298       }                                                                 \
 299   } while (0)
 300
 301
 302 #define EMIT_ONE_BYTE(c)                                        \
 303   do {                                                          \
 304     if (dst >= (dst_bytes ? dst_end : src))                     \
 305       {                                                         \
 306         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 307         goto label_end_of_loop;                                 \
 308       }                                                         \
 309     *dst++ = c;                                                 \
 310   } while (0)
 311
 312 #define EMIT_TWO_BYTES(c1, c2)                                  \
 313   do {                                                          \
 314     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 315       {                                                         \
 316         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 317         goto label_end_of_loop;                                 \
 318       }                                                         \
 319     *dst++ = c1, *dst++ = c2;                                   \
 320   } while (0)
 321
 322 #define EMIT_BYTES(from, to)                                    \
 323   do {                                                          \
 324     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 325       {                                                         \
 326         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 327         goto label_end_of_loop;                                 \
 328       }                                                         \
 329     while (from < to)                                           \
 330       *dst++ = *from++;                                         \
 331   } while (0)
 332
 333 \f
 334 /*** 1. Preamble ***/
 335
 336 #ifdef emacs
 337 #include <config.h>
 338 #endif
 339
 340 #include <stdio.h>
 341
 342 #ifdef emacs
 343
 344 #include "lisp.h"
 345 #include "buffer.h"
 346 #include "charset.h"
 347 #include "composite.h"
 348 #include "ccl.h"
 349 #include "coding.h"
 350 #include "window.h"
 351 #include "intervals.h"
 352
 353 #else  /* not emacs */
 354
 355 #include "mulelib.h"
 356
 357 #endif /* not emacs */
 358
 359 Lisp_Object Qcoding_system, Qeol_type;  /* Qcoding_system is the property coding_safe_chars passes to Fget to fetch a coding system's spec.  */
 360 Lisp_Object Qbuffer_file_coding_system;
 361 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 362 Lisp_Object Qno_conversion, Qundecided;
 363 Lisp_Object Qcoding_system_history;
 364 Lisp_Object Qsafe_chars;  /* Key coding_safe_chars looks up in the plist held in slot 3 of the spec.  */
 365 Lisp_Object Qvalid_codes;
 366 Lisp_Object Qascii_incompatible;
 367
 368 extern Lisp_Object Qinsert_file_contents, Qwrite_region;  /* Defined in another file.  */
 369 Lisp_Object Qcall_process, Qcall_process_region;  /* NOTE(review): presumably these and the next two name the I/O primitives of the same names -- confirm where they are initialized.  */
 370 Lisp_Object Qstart_process, Qopen_network_stream;
 371 Lisp_Object Qtarget_idx;
 372
 373 /* If a symbol has this property, evaluate the value to define the
 374    symbol as a coding system.  */
 375 Lisp_Object Qcoding_system_define_form;
 376
 377 Lisp_Object Vselect_safe_coding_system_function;
 378
 379 int coding_system_require_warning;
 380
 381 /* Mnemonic string for each format of end-of-line.  */
 382 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 383 /* Mnemonic string to indicate format of end-of-line is not yet
 384    decided.  */
 385 Lisp_Object eol_mnemonic_undecided;
 386
 387 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 388    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
 389    This has an effect only for external encoding (i.e. for output to
 390    file and process), not for in-buffer or Lisp string encoding.  */
 391 int system_eol_type;
 392
 393 #ifdef emacs
 394
 395 /* Information about which coding system is safe for which chars.
 396    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 397
 398    GENERIC-LIST is a list of generic coding systems which can encode
 399    any characters.
 400
 401    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 402    corresponding char table that contains safe chars.  */
 403 Lisp_Object Vcoding_system_safe_chars;
 404
 405 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 406
 407 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 408
 409 /* Coding system emacs-mule and raw-text are for converting only
 410    end-of-line format.  */
 411 Lisp_Object Qemacs_mule, Qraw_text;
 412
 413 Lisp_Object Qutf_8;
 414
 415 /* Coding-systems are handed between Emacs Lisp programs and C internal
 416    routines by the following three variables.  */
 417 /* Coding-system for reading files and receiving data from process.  */
 418 Lisp_Object Vcoding_system_for_read;
 419 /* Coding-system for writing files and sending data to process.  */
 420 Lisp_Object Vcoding_system_for_write;
 421 /* Coding-system actually used in the latest I/O.  */
 422 Lisp_Object Vlast_coding_system_used;
 423
 424 /* A vector of length 256 which contains information about special
 425    Latin codes (especially for dealing with Microsoft codes).  */
 426 Lisp_Object Vlatin_extra_code_table;
 427
 428 /* Flag to inhibit code conversion of end-of-line format.  */
 429 int inhibit_eol_conversion;
 430
 431 /* Flag to inhibit ISO2022 escape sequence detection.  */
 432 int inhibit_iso_escape_detection;
 433
 434 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 435 int inherit_process_coding_system;
 436
 437 /* Coding system to be used to encode text for terminal display.  */
 438 struct coding_system terminal_coding;
 439
 440 /* Coding system to be used to encode text for terminal display when
 441    terminal coding system is nil.  */
 442 struct coding_system safe_terminal_coding;
 443
 444 /* Coding system of what is sent from terminal keyboard.  */
 445 struct coding_system keyboard_coding;
 446
 447 /* Default coding system to be used to write a file.  */
 448 struct coding_system default_buffer_file_coding;
 449
 450 Lisp_Object Vfile_coding_system_alist;
 451 Lisp_Object Vprocess_coding_system_alist;
 452 Lisp_Object Vnetwork_coding_system_alist;
 453
 454 Lisp_Object Vlocale_coding_system;
 455
 456 #endif /* emacs */
 457
 458 Lisp_Object Qcoding_category, Qcoding_category_index;
 459
 460 /* List of symbols `coding-category-xxx' ordered by priority.  */
 461 Lisp_Object Vcoding_category_list;
 462
 463 /* Table of coding categories (Lisp symbols).  */
 464 Lisp_Object Vcoding_category_table;
 465
 466 /* Table of the symbol names of the coding categories, one string
        per category, CODING_CATEGORY_IDX_MAX entries at most.
        NOTE(review): the entries presumably have to stay in the order
        of the category indices declared outside this file -- confirm
        before reordering.  */
 467 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 468   "coding-category-emacs-mule",
 469   "coding-category-sjis",
 470   "coding-category-iso-7",
 471   "coding-category-iso-7-tight",
 472   "coding-category-iso-8-1",
 473   "coding-category-iso-8-2",
 474   "coding-category-iso-7-else",
 475   "coding-category-iso-8-else",
 476   "coding-category-ccl",
 477   "coding-category-big5",
 478   "coding-category-utf-8",
 479   "coding-category-utf-16-be",
 480   "coding-category-utf-16-le",
 481   "coding-category-raw-text",
 482   "coding-category-binary"
 483 };
 484
 485 /* Table of pointers to the coding system corresponding to each
 486    coding category.  */
 487 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 488
 489 /* Table of coding category masks.  The Nth element is the mask of
 490    the coding category whose priority is Nth.  */
 491 static
 492 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 493
 494 /* Flag to tell if we look up translation table on character code
 495    conversion.  */
 496 Lisp_Object Venable_character_translation;
 497 /* Standard translation table to look up on decoding (reading).  */
 498 Lisp_Object Vstandard_translation_table_for_decode;
 499 /* Standard translation table to look up on encoding (writing).  */
 500 Lisp_Object Vstandard_translation_table_for_encode;
 501
 502 Lisp_Object Qtranslation_table;
 503 Lisp_Object Qtranslation_table_id;
 504 Lisp_Object Qtranslation_table_for_decode;
 505 Lisp_Object Qtranslation_table_for_encode;
 506
 507 /* Alist of charsets vs revision number.  */
 508 Lisp_Object Vcharset_revision_alist;
 509
 510 /* Default coding systems used for process I/O.  */
 511 Lisp_Object Vdefault_process_coding_system;
 512
 513 /* Char table for translating Quail and self-inserting input.  */
 514 Lisp_Object Vtranslation_table_for_input;
 515
 516 /* Global flag telling that we must not call post-read-conversion and
 517    pre-write-conversion functions.  Usually the value is zero, but it
 518    is set to 1 temporarily while such functions are running.  This
 519    avoids infinite recursion.  */
 520 static int inhibit_pre_post_conversion;
 521
 522 Lisp_Object Qchar_coding_system;
 523
 524 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 525    its validity.  */
 526
 527 Lisp_Object
 528 coding_safe_chars (coding_system)
 529      Lisp_Object coding_system;
 530 {
 531   Lisp_Object coding_spec, plist, safe_chars;
 532
 533   coding_spec = Fget (coding_system, Qcoding_system);
 534   plist = XVECTOR (coding_spec)->contents[3];
 535   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 536   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 537 }
 538
 539 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 540   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 541
 542 \f
 543 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 544
 545 /* Emacs' internal format for representation of multiple character
 546    sets is a kind of multi-byte encoding, i.e. characters are
 547    represented by variable-length sequences of one-byte codes.
 548
 549    ASCII characters and control characters (e.g. `tab', `newline') are
 550    represented by one-byte sequences which are their ASCII codes, in
 551    the range 0x00 through 0x7F.
 552
 553    8-bit characters of the range 0x80..0x9F are represented by
 554    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 555    code + 0x20).
 556
 557    8-bit characters of the range 0xA0..0xFF are represented by
 558    one-byte sequences which are their 8-bit code.
 559
 560    The other characters are represented by a sequence of `base
 561    leading-code', optional `extended leading-code', and one or two
 562    `position-code's.  The length of the sequence is determined by the
 563    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 564    whereas extended leading-code and position-code take the range 0xA0
 565    through 0xFF.  See `charset.h' for more details about leading-code
 566    and position-code.
 567
 568    --- CODE RANGE of Emacs' internal format ---
 569    character set        range
 570    -------------        -----
 571    ascii                0x00..0x7F
 572    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 573    eight-bit-graphic    0xA0..0xFF
 574    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 575    ---------------------------------------------
 576
 577    As this is the internal character representation, the format is
 578    usually not used externally (i.e. in a file or in a data sent to a
 579    process).  But, it is possible to have a text externally in this
 580    format (i.e. by encoding by the coding system `emacs-mule').
 581
 582    In that case, a sequence of one-byte codes has a slightly different
 583    form.
 584
 585    Firstly, all characters in eight-bit-control are represented by
 586    one-byte sequences which are their 8-bit code.
 587
 588    Next, character composition data are represented by the byte
 589    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 590    where,
 591         METHOD is 0xF0 plus one of composition method (enum
 592         composition_method),
 593
 594         BYTES is 0xA0 plus the byte length of these composition data,
 595
 596         CHARS is 0xA0 plus the number of characters composed by these
 597         data,
 598
 599         COMPONENTs are characters of multibyte form or composition
 600         rules encoded by two-byte of ASCII codes.
 601
 602    In addition, for backward compatibility, the following formats are
 603    also recognized as composition data on decoding.
 604
 605    0x80 MSEQ ...
 606    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 607
 608    Here,
 609         MSEQ is a multibyte form, but in one of these special formats:
 610           ASCII: 0xA0 ASCII_CODE+0x80,
 611           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 612         RULE is a one byte code of the range 0xA0..0xF0 that
 613         represents a composition rule.
 614   */
 615
 616 enum emacs_code_class_type emacs_code_class[256];
 617
 618 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 619    Check if a text is encoded in Emacs' internal format.  If it is,
 620    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 621
 622 static int
 623 detect_coding_emacs_mule (src, src_end, multibytep)
 624       unsigned char *src, *src_end;
 625       int multibytep;
 626 {
 627   unsigned char c;
 628   int composing = 0;
 629   /* Dummy for ONE_MORE_BYTE.  */
 630   struct coding_system dummy_coding;
 631   struct coding_system *coding = &dummy_coding;
 632
 633   while (1)
 634     {
 635       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 636
 637       if (composing)
 638         {
 639           if (c < 0xA0)
 640             composing = 0;
 641           else if (c == 0xA0)
 642             {
 643               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 644               c &= 0x7F;
 645             }
 646           else
 647             c -= 0x20;
 648         }
 649
 650       if (c < 0x20)
 651         {
 652           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 653             return 0;
 654         }
 655       else if (c >= 0x80 && c < 0xA0)
 656         {
 657           if (c == 0x80)
 658             /* Old leading code for a composite character.  */
 659             composing = 1;
 660           else
 661             {
 662               unsigned char *src_base = src - 1;
 663               int bytes;
 664
 665               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 666                                                bytes))
 667                 return 0;
 668               src = src_base + bytes;
 669             }
 670         }
 671     }
 672  label_end_of_loop:
 673   return CODING_CATEGORY_MASK_EMACS_MULE;
 674 }
 675
 676
 677 /* Record the starting position START and METHOD of one composition.  */
 678
 679 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 680   do {                                                          \
 681     struct composition_data *cmp_data = coding->cmp_data;       \
 682     int *data = cmp_data->data + cmp_data->used;                \
 683     coding->cmp_data_start = cmp_data->used;                    \
 684     data[0] = -1;                                               \
 685     data[1] = cmp_data->char_offset + start;                    \
 686     data[3] = (int) method;                                     \
 687     cmp_data->used += 4;                                        \
 688   } while (0)
 689
 690 /* Record the ending position END of the current composition.  */
 691
 692 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 693   do {                                                          \
 694     struct composition_data *cmp_data = coding->cmp_data;       \
 695     int *data = cmp_data->data + coding->cmp_data_start;        \
 696     data[0] = cmp_data->used - coding->cmp_data_start;          \
 697     data[2] = cmp_data->char_offset + end;                      \
 698   } while (0)
 699
 700 /* Record one COMPONENT (alternate character or composition rule).  */
 701
 702 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 703   do {                                                                  \
 704     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 705     if (coding->cmp_data->used - coding->cmp_data_start                 \
 706         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 707       {                                                                 \
 708         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 709         coding->composing = COMPOSITION_NO;                             \
 710       }                                                                 \
 711   } while (0)
 712
 713
 714 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 715    is not less than SRC_END, return -1 without incrementing Src.  */
 716
 717 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 718
 719
 720 /* Decode a character represented as a component of composition
 721    sequence of Emacs 20 style at SRC.  Set C to that character, store
 722    its multibyte form sequence at P, and set P to the end of that
 723    sequence.  If no valid character is found, set C to -1.
        The expansion site must provide SRC, SRC_END and CODING; SRC is
        advanced by SAFE_ONE_MORE_BYTE.  Bytes may already have been
        stored at P (and P advanced) when C ends up as -1.  */
 724
 725 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 726   do {                                                          \
 727     int bytes;                                                  \
 728                                                                 \
 729     c = SAFE_ONE_MORE_BYTE ();                                  \
         /* A negative C means the source is exhausted.  */         \
 730     if (c < 0)                                                  \
 731       break;                                                    \
 732     if (CHAR_HEAD_P (c))                                        \
 733       c = -1;                                                   \
         /* 0xA0 must be followed by one byte >= 0xA0; subtracting  \
            0x80 from that byte gives the character.  */            \
 734     else if (c == 0xA0)                                         \
 735       {                                                         \
 736         c = SAFE_ONE_MORE_BYTE ();                              \
 737         if (c < 0xA0)                                           \
 738           c = -1;                                               \
 739         else                                                    \
 740           {                                                     \
 741             c -= 0x80;                                          \
 742             *p++ = c;                                           \
 743           }                                                     \
 744       }                                                         \
         /* Otherwise C must be a leading code stored with 0x20     \
            added.  Undo that, then copy the remaining bytes of the \
            sequence as counted by BYTES_BY_CHAR_HEAD.  */          \
 745     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 746       {                                                         \
 747         unsigned char *p0 = p;                                  \
 748                                                                 \
 749         c -= 0x20;                                              \
 750         *p++ = c;                                               \
 751         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 752         while (--bytes)                                         \
 753           {                                                     \
 754             c = SAFE_ONE_MORE_BYTE ();                          \
 755             if (c < 0)                                          \
 756               break;                                            \
 757             *p++ = c;                                           \
 758           }                                                     \
             /* Accept the copied bytes only if                      \
                UNIBYTE_STR_AS_MULTIBYTE_P approves them, or if     \
                CODING->flags is set and they are an 8-bit-control  \
                leading code followed by a non-head byte.  */       \
 759         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 760             || (coding->flags /* We are recovering a file.  */  \
 761                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 762                 && ! CHAR_HEAD_P (p0[1])))                      \
 763           c = STRING_CHAR (p0, bytes);                          \
 764         else                                                    \
 765           c = -1;                                               \
 766       }                                                         \
 767     else                                                        \
 768       c = -1;                                                   \
 769   } while (0)
 770
 771
 772 /* Read one byte at SRC as a composition rule of an Emacs 20 style
 773    composition sequence and set C to the encoded rule.  If no valid
 774    rule is found (including an exhausted source), set C to -1.  GREF
        and NREF of the expansion site are overwritten for a valid rule.  */
 775
 776 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 777   do {                                                  \
 778     c = SAFE_ONE_MORE_BYTE () - 0xA0;                   \
 779     if (0 <= c && c < 81)                               \
 780       {                                                 \
 781         gref = c / 9;                                   \
 782         nref = c % 9;                                   \
 783         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 784       }                                                 \
 785     else                                                \
 786       c = -1;                                           \
 787   } while (0)
 788
 789
 790 /* Decode composition sequence encoded by `emacs-mule' at the source
 791    pointed by SRC.  SRC_END is the end of source.  Store information
 792    of the composition in CODING->cmp_data.  The byte at SRC itself is
        skipped without being examined; the caller invokes this function
        when that byte is 0x80.
 793
 794    For backward compatibility, decode also a composition sequence of
 795    Emacs 20 style.  In that case, the composition sequence contains
 796    characters that should be extracted into a buffer or string.  Store
 797    those characters at *DESTINATION in multibyte form, and advance
        *DESTINATION and CODING->produced_char accordingly.
 798
 799    If we encounter an invalid byte sequence, return 0.
 800    If we encounter an insufficient source or destination, or
 801    insufficient space in CODING->cmp_data, return -1 (in the last
        case CODING->result is set to CODING_FINISH_INSUFFICIENT_CMP).
 802    Otherwise, return consumed bytes in the source.
 803
 804 */
 805 static INLINE int
 806 decode_composition_emacs_mule (coding, src, src_end,
 807                                destination, dst_end, dst_bytes)
 808      struct coding_system *coding;
 809      const unsigned char *src, *src_end;
 810      unsigned char **destination, *dst_end;
 811      int dst_bytes;
 812 {
 813   unsigned char *dst = *destination;
 814   int method, data_len, nchars;
 815   const unsigned char *src_base = src++;
 816   /* Store components of composition.  */
 817   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 818   int ncomponent;
 819   /* Store multibyte form of characters to be composed.  This is for
 820      Emacs 20 style composition sequence.  */
 821   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 822   unsigned char *bufp = buf;
 823   int c, i, gref, nref;
 824
       /* Refuse to start unless CODING->cmp_data can still take a whole
          bunch of COMPOSITION_DATA_MAX_BUNCH_LENGTH entries.  */
 825   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 826       >= COMPOSITION_DATA_SIZE)
 827     {
 828       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 829       return -1;
 830     }
 831
       /* The byte after the skipped one selects the format.
          NOTE(review): ONE_MORE_BYTE is defined outside this part of the
          file; presumably it jumps to label_end_of_loop when the source
          runs out -- confirm.  */
 832   ONE_MORE_BYTE (c);
 833   if (c - 0xF0 >= COMPOSITION_RELATIVE
 834            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 835     {
           /* Current format: 0xF0 + METHOD, then 0xA0 + length of the
              whole sequence in bytes, then 0xA0 + number of composed
              characters, then the components.  */
 836       int with_rule;
 837
 838       method = c - 0xF0;
 839       with_rule = (method == COMPOSITION_WITH_RULE
 840                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 841       ONE_MORE_BYTE (c);
 842       data_len = c - 0xA0;
 843       if (data_len < 4
 844           || src_base + data_len > src_end)
 845         return 0;
 846       ONE_MORE_BYTE (c);
 847       nchars = c - 0xA0;
           /* Validate the decoded count, not the raw byte: any byte
              below 0xA1 would give a zero or negative NCHARS and thus a
              composition that ends at or before its start.  */
 848       if (nchars < 1)
 849         return 0;
 850       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 851         {
 852           /* If it is longer than this, it can't be valid.  */
 853           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 854             return 0;
 855
               /* With rules, odd-numbered components are rules stored as
                  two bytes, each offset by 32.  */
 856           if (ncomponent % 2 && with_rule)
 857             {
 858               ONE_MORE_BYTE (gref);
 859               gref -= 32;
 860               ONE_MORE_BYTE (nref);
 861               nref -= 32;
 862               c = COMPOSITION_ENCODE_RULE (gref, nref);
 863             }
 864           else
 865             {
                   /* A character: take a whole multibyte sequence when
                      one is accepted, otherwise the single byte.  */
 866               int bytes;
 867               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 868                   || (coding->flags /* We are recovering a file.  */
 869                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 870                       && ! CHAR_HEAD_P (src[1])))
 871                 c = STRING_CHAR (src, bytes);
 872               else
 873                 c = *src, bytes = 1;
 874               src += bytes;
 875             }
 876           component[ncomponent] = c;
 877         }
 878     }
 879   else if (c >= 0x80)
 880     {
 881       /* This may be an old Emacs 20 style format.  See the comment at
 882          the section 2 of this file.  */
           /* The sequence extends up to the next character head.  If it
              reaches the end of the source and more input may follow,
              give up for now.  */
 883       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 884       if (src == src_end
 885           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 886         goto label_end_of_loop;
 887
           /* Restrict the source to this sequence and rescan it from the
              byte that followed the skipped one.  */
 888       src_end = src;
 889       src = src_base + 1;
 890       if (c < 0xC0)
 891         {
 892           method = COMPOSITION_RELATIVE;
 893           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 894             {
 895               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 896               if (c < 0)
 897                 break;
 898               component[ncomponent++] = c;
 899             }
 900           if (ncomponent < 2)
 901             return 0;
 902           nchars = ncomponent;
 903         }
 904       else if (c == 0xFF)
 905         {
               /* 0xFF announces rules between the characters:
                  CHAR RULE CHAR [RULE CHAR ...].  */
 906           method = COMPOSITION_WITH_RULE;
 907           src++;
 908           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 909           if (c < 0)
 910             return 0;
 911           component[0] = c;
 912           for (ncomponent = 1;
 913                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 914             {
 915               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 916               if (c < 0)
 917                 break;
 918               component[ncomponent++] = c;
 919               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 920               if (c < 0)
 921                 break;
 922               component[ncomponent++] = c;
 923             }
 924           if (ncomponent < 3)
 925             return 0;
 926           nchars = (ncomponent + 1) / 2;
 927         }
 928       else
 929         return 0;
 930     }
 931   else
 932     return 0;
 933
       /* Commit only if the characters collected in BUF (if any) fit in
          the destination.  When DST_BYTES is zero the limit is SRC
          instead of DST_END (presumably source and destination then
          share one buffer).  */
 934   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 935     {
 936       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 937       for (i = 0; i < ncomponent; i++)
 938         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 939       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 940       if (buf < bufp)
 941         {
               /* Emacs 20 style only: output the extracted characters.
                  EMIT_BYTES is defined outside this part of the file.  */
 942           unsigned char *p = buf;
 943           EMIT_BYTES (p, bufp);
 944           *destination += bufp - buf;
 945           coding->produced_char += nchars;
 946         }
 947       return (src - src_base);
 948     }
 949  label_end_of_loop:
 950   return -1;
 951 }
 952
 953 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode the SRC_BYTES bytes at SOURCE, which are in emacs-mule
        format, into DESTINATION.  A '\r' is converted according to
        CODING->eol_type, composition data introduced by 0x80 is handed
        to decode_composition_emacs_mule when CODING->cmp_data is set,
        and a byte that does not start an acceptable multibyte sequence
        is emitted by itself through CHAR_STRING.  On return
        CODING->consumed, CODING->consumed_char, CODING->produced and
        CODING->produced_char tell how far the conversion got.  */
 954
 955 static void
 956 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 957      struct coding_system *coding;
 958      const unsigned char *source;
 959      unsigned char *destination;
 960      int src_bytes, dst_bytes;
 961 {
 962   const unsigned char *src = source;
 963   const unsigned char *src_end = source + src_bytes;
 964   unsigned char *dst = destination;
 965   unsigned char *dst_end = destination + dst_bytes;
 966   /* SRC_BASE remembers the start position in source in each loop.
 967      The loop will be exited when there's not enough source code, or
 968      when there's not enough destination area to produce a
 969      character.  */
 970   const unsigned char *src_base;
 971
 972   coding->produced_char = 0;
 973   while ((src_base = src) < src_end)
 974     {
 975       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 976       const unsigned char *p;
 977       int bytes;
 978
 979       if (*src == '\r')
 980         {
 981           int c = *src++;
 982
               /* CODING_EOL_CR: CR becomes a newline.  CODING_EOL_CRLF:
                  CR LF becomes a newline, while a CR followed by
                  anything else stays a CR and that byte is pushed back.  */
 983           if (coding->eol_type == CODING_EOL_CR)
 984             c = '\n';
 985           else if (coding->eol_type == CODING_EOL_CRLF)
 986             {
 987               ONE_MORE_BYTE (c);
 988               if (c != '\n')
 989                 {
 990                   src--;
 991                   c = '\r';
 992                 }
 993             }
 994           *dst++ = c;
 995           coding->produced_char++;
 996           continue;
 997         }
 998       else if (*src == '\n')
 999         {
               /* A bare LF under a CR or CRLF convention stops the
                  conversion when inconsistent EOLs are inhibited.  */
1000           if ((coding->eol_type == CODING_EOL_CR
1001                || coding->eol_type == CODING_EOL_CRLF)
1002               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1003             {
1004               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1005               goto label_end_of_loop;
1006             }
1007           *dst++ = *src++;
1008           coding->produced_char++;
1009           continue;
1010         }
1011       else if (*src == 0x80 && coding->cmp_data)
1012         {
1013           /* Start of composition data.  */
1014           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1015                                                          &dst, dst_end,
1016                                                          dst_bytes);
               /* Negative: not enough source, destination or cmp_data
                  space.  Positive: that many source bytes were handled.
                  Zero: not a valid composition, so the 0x80 byte itself
                  is emitted as a character below.  */
1017           if (consumed < 0)
1018             goto label_end_of_loop;
1019           else if (consumed > 0)
1020             {
1021               src += consumed;
1022               continue;
1023             }
1024           bytes = CHAR_STRING (*src, tmp);
1025           p = tmp;
1026           src++;
1027         }
1028       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1029                || (coding->flags /* We are recovering a file.  */
1030                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1031                    && ! CHAR_HEAD_P (src[1])))
1032         {
               /* An acceptable sequence: copy it unchanged.  */
1033           p = src;
1034           src += bytes;
1035         }
1036       else
1037         {
1038           int i, c;
1039
               /* Read the trailing bytes that the first byte calls for.
                  If a character head turns up before all of them are
                  read, emit only the first byte via CHAR_STRING and
                  resume right after it; otherwise copy the bytes as
                  they are.  */
1040           bytes = BYTES_BY_CHAR_HEAD (*src);
1041           src++;
1042           for (i = 1; i < bytes; i++)
1043             {
1044               ONE_MORE_BYTE (c);
1045               if (CHAR_HEAD_P (c))
1046                 break;
1047             }
1048           if (i < bytes)
1049             {
1050               bytes = CHAR_STRING (*src_base, tmp);
1051               p = tmp;
1052               src = src_base + 1;
1053             }
1054           else
1055             {
1056               p = src_base;
1057             }
1058         }
           /* When DST_BYTES is zero the limit is SRC rather than DST_END
              (presumably source and destination then share one buffer).
              NOTE(review): the two end-of-line branches above store to
              DST without a check like this one -- confirm callers
              always leave room.  */
1059       if (dst + bytes >= (dst_bytes ? dst_end : src))
1060         {
1061           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1062           break;
1063         }
1064       while (bytes--) *dst++ = *p++;
1065       coding->produced_char++;
1066     }
1067  label_end_of_loop:
       /* Report SRC_BASE, not SRC, so that a unit that was only partly
          read is not counted as consumed.  */
1068   coding->consumed = coding->consumed_char = src_base - source;
1069   coding->produced = dst - destination;
1070 }
1071
1072
1073 /* Encode composition data stored at DATA into a special byte sequence
1074    starting by 0x80.  Update CODING->cmp_data_start and maybe
1075    CODING->cmp_data for the next call.
        As read here, DATA[0] is the number of ints in the record,
        DATA[2] - DATA[1] the number of composed characters, DATA[3] the
        method and DATA[4] onward the components.  The expansion site
        must provide DST, DST_END, DST_BYTES, SRC and the label
        label_end_of_loop, which is jumped to (with CODING->result set
        to CODING_FINISH_INSUFFICIENT_DST) when the output does not fit.  */
1076
1077 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1078   do {                                                                  \
1079     unsigned char buf[1024], *p0 = buf, *p;                             \
1080     int len = data[0];                                                  \
1081     int i;                                                              \
1082                                                                         \
         /* Four header bytes; buf[2] is filled in once the total        \
            length is known.  */                                         \
1083     buf[0] = 0x80;                                                      \
1084     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1085     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1086     p = buf + 4;                                                        \
1087     if (data[3] == COMPOSITION_WITH_RULE                                \
1088         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1089       {                                                                 \
             /* CHAR, then (RULE CHAR) pairs; a rule is written as two   \
                bytes, 0x20 + GREF and 0x20 + NREF.  */                  \
1090         p += CHAR_STRING (data[4], p);                                  \
1091         for (i = 5; i < len; i += 2)                                    \
1092           {                                                             \
1093             int gref, nref;                                             \
1094              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1095             *p++ = 0x20 + gref;                                         \
1096             *p++ = 0x20 + nref;                                         \
1097             p += CHAR_STRING (data[i + 1], p);                          \
1098           }                                                             \
1099       }                                                                 \
1100     else                                                                \
1101       {                                                                 \
1102         for (i = 4; i < len; i++)                                       \
1103           p += CHAR_STRING (data[i], p);                                \
1104       }                                                                 \
1105     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1106                                                                         \
         /* When DST_BYTES is zero the limit is SRC rather than DST_END. \
            */                                                           \
1107     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1108       {                                                                 \
1109         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1110         goto label_end_of_loop;                                         \
1111       }                                                                 \
1112     while (p0 < p)                                                      \
1113       *dst++ = *p0++;                                                   \
         /* Step past this record, moving on to the next                  \
            composition_data block once this one is used up.  */         \
1114     coding->cmp_data_start += data[0];                                  \
1115     if (coding->cmp_data_start == coding->cmp_data->used                \
1116         && coding->cmp_data->next)                                      \
1117       {                                                                 \
1118         coding->cmp_data = coding->cmp_data->next;                      \
1119         coding->cmp_data_start = 0;                                     \
1120       }                                                                 \
1121   } while (0)
1122
1123
1124 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1125                             unsigned char *, int, int));
1126
     /* Encode the SRC_BYTES bytes at SOURCE into emacs-mule format at
        DESTINATION.  When CODING has no composition data the work is
        passed to encode_eol.  Otherwise each composition recorded in
        CODING->cmp_data is written out (ENCODE_COMPOSITION_EMACS_MULE)
        just before the character at which it starts, and newlines are
        converted according to CODING->eol_type.  On return
        CODING->consumed, CODING->produced and CODING->produced_char are
        set; CODING->consumed_char is advanced by one per character.  */
1127 static void
1128 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1129      struct coding_system *coding;
1130      const unsigned char *source;
1131      unsigned char *destination;
1132      int src_bytes, dst_bytes;
1133 {
1134   const unsigned char *src = source;
1135   const unsigned char *src_end = source + src_bytes;
1136   unsigned char *dst = destination;
1137   unsigned char *dst_end = destination + dst_bytes;
1138   const unsigned char *src_base;
1139   int c;
1140   int char_offset;
1141   int *data;
1142
       /* NOTE(review): TRANSLATION_TABLE is not referenced by name in
          this function; presumably ONE_MORE_CHAR (defined outside this
          part of the file) uses it -- confirm before removing.  */
1143   Lisp_Object translation_table;
1144
1145   translation_table = Qnil;
1146
1147   /* Optimization for the case that there's no composition.  */
1148   if (!coding->cmp_data || coding->cmp_data->used == 0)
1149     {
1150       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1151       return;
1152     }
1153
1154   char_offset = coding->cmp_data->char_offset;
1155   data = coding->cmp_data->data + coding->cmp_data_start;
       /* No exit is visible in the loop itself; presumably ONE_MORE_CHAR
          and the EMIT_* macros jump to label_end_of_loop when source or
          destination runs out, as ENCODE_COMPOSITION_EMACS_MULE does.  */
1156   while (1)
1157     {
1158       src_base = src;
1159
1160       /* If SRC starts a composition, encode the information about the
1161          composition in advance.  */
1162       if (coding->cmp_data_start < coding->cmp_data->used
1163           && char_offset + coding->consumed_char == data[1])
1164         {
1165           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have switched CODING->cmp_data, so
                  reload both cached values.  */
1166           char_offset = coding->cmp_data->char_offset;
1167           data = coding->cmp_data->data + coding->cmp_data_start;
1168         }
1169
1170       ONE_MORE_CHAR (c);
1171       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1172                         || coding->eol_type == CODING_EOL_CR))
1173         {
1174           if (coding->eol_type == CODING_EOL_CRLF)
1175             EMIT_TWO_BYTES ('\r', c);
1176           else
1177             EMIT_ONE_BYTE ('\r');
1178         }
1179       else if (SINGLE_BYTE_CHAR_P (c))
1180         {
1181           if (coding->flags && ! ASCII_BYTE_P (c))
1182             {
1183               /* As we are auto saving, retain the multibyte form for
1184                  8-bit chars.  */
1185               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1186               int bytes = CHAR_STRING (c, buf);
1187
1188               if (bytes == 1)
1189                 EMIT_ONE_BYTE (buf[0]);
1190               else
1191                 EMIT_TWO_BYTES (buf[0], buf[1]);
1192             }
1193           else
1194             EMIT_ONE_BYTE (c);
1195         }
1196       else
             /* Any other character: copy its source bytes unchanged.  */
1197         EMIT_BYTES (src_base, src);
1198       coding->consumed_char++;
1199     }
1200  label_end_of_loop:
       /* Report SRC_BASE, the start of the unit being processed when
          the loop was left.  */
1201   coding->consumed = src_base - source;
1202   coding->produced = coding->produced_char = dst - destination;
1203   return;
1204 }
1205
1206 \f
1207 /*** 3. ISO2022 handlers ***/
1208
1209 /* The following note describes the coding system ISO2022 briefly.
1210    Since the intention of this note is to help understand the
1211    functions in this file, some parts are NOT ACCURATE or are OVERLY
1212    SIMPLIFIED.  For thorough understanding, please refer to the
1213    original document of ISO2022.  This is equivalent to the standard
1214    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1215
1216    ISO2022 provides many mechanisms to encode several character sets
1217    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1218    is encoded using bytes less than 128.  This may make the encoded
1219    text a little bit longer, but the text passes more easily through
1220    several types of gateway, some of which strip off the MSB (Most
1221    Significant Bit).
1222
1223    There are two kinds of character sets: control character sets and
1224    graphic character sets.  The former contain control characters such
1225    as `newline' and `escape' to provide control functions (control
1226    functions are also provided by escape sequences).  The latter
1227    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1228    two control character sets and many graphic character sets.
1229
1230    Graphic character sets are classified into one of the following
1231    four classes, according to the number of bytes (DIMENSION) and
1232    number of characters in one dimension (CHARS) of the set:
1233    - DIMENSION1_CHARS94
1234    - DIMENSION1_CHARS96
1235    - DIMENSION2_CHARS94
1236    - DIMENSION2_CHARS96
1237
1238    In addition, each character set is assigned an identification tag,
1239    unique for each set, called the "final character" (denoted as <F>
1240    hereafter).  The <F> of each character set is decided by ECMA(*)
1241    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1242    (0x30..0x3F are for private use only).
1243
1244    Note (*): ECMA = European Computer Manufacturers Association
1245
1246    Here are examples of graphic character sets [NAME(<F>)]:
1247         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1248         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1249         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1250         o DIMENSION2_CHARS96 -- none for the moment
1251
1252    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1253         C0 [0x00..0x1F] -- control character plane 0
1254         GL [0x20..0x7F] -- graphic character plane 0
1255         C1 [0x80..0x9F] -- control character plane 1
1256         GR [0xA0..0xFF] -- graphic character plane 1
1257
1258    A control character set is directly designated and invoked to C0 or
1259    C1 by an escape sequence.  The most common case is that:
1260    - ISO646's  control character set is designated/invoked to C0, and
1261    - ISO6429's control character set is designated/invoked to C1,
1262    and usually these designations/invocations are omitted in encoded
1263    text.  In a 7-bit environment, only C0 can be used, and a control
1264    character for C1 is encoded by an appropriate escape sequence to
1265    fit into the environment.  All control characters for C1 are
1266    defined to have corresponding escape sequences.
1267
1268    A graphic character set is at first designated to one of four
1269    graphic registers (G0 through G3), then these graphic registers are
1270    invoked to GL or GR.  These designations and invocations can be
1271    done independently.  The most common case is that G0 is invoked to
1272    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1273    these invocations and designations are omitted in encoded text.
1274    In a 7-bit environment, only GL can be used.
1275
1276    When a graphic character set of CHARS94 is invoked to GL, codes
1277    0x20 and 0x7F of the GL area work as control characters SPACE and
1278    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1279    be used.
1280
1281    There are two ways of invocation: locking-shift and single-shift.
1282    With locking-shift, the invocation lasts until the next different
1283    invocation, whereas with single-shift, the invocation affects the
1284    following character only and doesn't affect the locking-shift
1285    state.  Invocations are done by the following control characters or
1286    escape sequences:
1287
1288    ----------------------------------------------------------------------
1289    abbrev  function                  cntrl escape seq   description
1290    ----------------------------------------------------------------------
1291    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1292    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1293    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1294    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1295    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1296    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1297    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1298    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1299    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1300    ----------------------------------------------------------------------
1301    (*) These are not used by any known coding system.
1302
1303    Control characters for these functions are defined by macros
1304    ISO_CODE_XXX in `coding.h'.
1305
1306    Designations are done by the following escape sequences:
1307    ----------------------------------------------------------------------
1308    escape sequence      description
1309    ----------------------------------------------------------------------
1310    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1311    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1312    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1313    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1314    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1315    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1316    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1317    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1318    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1319    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1320    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1321    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1322    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1323    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1324    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1325    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1326    ----------------------------------------------------------------------
1327
1328    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1329    of dimension 1, chars 94, and final character <F>, etc...
1330
1331    Note (*): Although these designations are not allowed in ISO2022,
1332    Emacs accepts them on decoding, and produces them on encoding
1333    CHARS96 character sets in a coding system which is characterized as
1334    7-bit environment, non-locking-shift, and non-single-shift.
1335
1336    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1337    '(' can be omitted.  We refer to this as "short-form" hereafter.
1338
1339    Now you may notice that there are a lot of ways of encoding the
1340    same multilingual text in ISO2022.  Actually, there exist many
1341    coding systems such as Compound Text (used in X11's inter client
1342    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1343    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1344    localized platforms), and all of these are variants of ISO2022.
1345
1346    In addition to the above, Emacs handles two more kinds of escape
1347    sequences: ISO6429's direction specification and Emacs' private
1348    sequence for specifying character composition.
1349
1350    ISO6429's direction specification takes the following form:
1351         o CSI ']'      -- end of the current direction
1352         o CSI '0' ']'  -- end of the current direction
1353         o CSI '1' ']'  -- start of left-to-right text
1354         o CSI '2' ']'  -- start of right-to-left text
1355    The control character CSI (0x9B: control sequence introducer) is
1356    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1357
1358    Character composition specification takes the following form:
1359         o ESC '0' -- start relative composition
1360         o ESC '1' -- end composition
1361         o ESC '2' -- start rule-base composition (*)
1362         o ESC '3' -- start relative composition with alternate chars  (**)
1363         o ESC '4' -- start rule-base composition with alternate chars  (**)
1364   Since these are not standard escape sequences of any ISO standard,
1365   the use of them with these meanings is restricted to Emacs only.
1366
1367   (*) This form is used only in Emacs 20.5 and older versions,
1368   but the newer versions can safely decode it.
1369   (**) This form is used only in Emacs 21.1 and newer versions,
1370   and the older versions can't decode it.
1371
1372   Here's a list of example usages of these composition escape
1373   sequences (categorized by `enum composition_method').
1374
1375   COMPOSITION_RELATIVE:
1376         ESC 0 CHAR [ CHAR ] ESC 1
1377   COMPOSITION_WITH_RULE:
1378         ESC 2 CHAR [ RULE CHAR ] ESC 1
1379   COMPOSITION_WITH_ALTCHARS:
1380         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1381   COMPOSITION_WITH_RULE_ALTCHARS:
1382         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1383
1384 enum iso_code_class_type iso_code_class[256];
     /* The table above holds one `enum iso_code_class_type' for each of
        256 indices -- presumably one per byte value; it is filled in
        outside this part of the file.  */
1385
     /* Non-zero if coding_system_table[IDX] exists, CHARSET is ASCII or C
        is one of that coding system's safe characters, and the coding
        system has a requested designation for CHARSET.  The variable
        SAFE_CHARS of the expansion site is assigned as a side effect.
        CHARSET and C are parenthesized so that any expression can be
        passed for them.  */
1386 #define CHARSET_OK(idx, charset, c)                                     \
1387   (coding_system_table[idx]                                             \
1388    && ((charset) == CHARSET_ASCII                                       \
1389        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1390            CODING_SAFE_CHAR_P (safe_chars, (c))))                       \
1391    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1392                                               (charset))                \
1393        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1394
     /* Non-zero if coding_system_table[IDX] has a non-negative initial
        designation for index 1 (presumably graphic register G1).  */
1395 #define SHIFT_OUT_OK(idx) \
1396   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1397
     /* Non-zero unless composition is disabled for
        coding_system_table[IDX].  */
1398 #define COMPOSITION_OK(idx)     \
1399   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1400
1401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1402    Check if a text is encoded in ISO2022.  If it is, return an
1403    integer in which the appropriate flag bits among:
1404         CODING_CATEGORY_MASK_ISO_7
1405         CODING_CATEGORY_MASK_ISO_7_TIGHT
1406         CODING_CATEGORY_MASK_ISO_8_1
1407         CODING_CATEGORY_MASK_ISO_8_2
1408         CODING_CATEGORY_MASK_ISO_7_ELSE
1409         CODING_CATEGORY_MASK_ISO_8_ELSE
1410    are set.  If a code which should never appear in ISO2022 is found,
1411    return 0.  */
1412
1413 static int
1414 detect_coding_iso2022 (src, src_end, multibytep)
1415      unsigned char *src, *src_end;
1416      int multibytep;
1417 {
1418   int mask = CODING_CATEGORY_MASK_ISO;
1419   int mask_found = 0;
1420   int reg[4], shift_out = 0, single_shifting = 0;
1421   int c, c1, charset;
1422   /* Dummy for ONE_MORE_BYTE.  */
1423   struct coding_system dummy_coding;
1424   struct coding_system *coding = &dummy_coding;
1425   Lisp_Object safe_chars;
1426
1427   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1428   while (mask && src < src_end)
1429     {
1430       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1431     retry:
1432       switch (c)
1433         {
1434         case ISO_CODE_ESC:
1435           if (inhibit_iso_escape_detection)
1436             break;
1437           single_shifting = 0;
1438           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1439           if (c >= '(' && c <= '/')
1440             {
1441               /* Designation sequence for a charset of dimension 1.  */
1442               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1443               if (c1 < ' ' || c1 >= 0x80
1444                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1445                 /* Invalid designation sequence.  Just ignore.  */
1446                 break;
1447               reg[(c - '(') % 4] = charset;
1448             }
1449           else if (c == '$')
1450             {
1451               /* Designation sequence for a charset of dimension 2.  */
1452               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1453               if (c >= '@' && c <= 'B')
1454                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1455                 reg[0] = charset = iso_charset_table[1][0][c];
1456               else if (c >= '(' && c <= '/')
1457                 {
1458                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1459                   if (c1 < ' ' || c1 >= 0x80
1460                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1461                     /* Invalid designation sequence.  Just ignore.  */
1462                     break;
1463                   reg[(c - '(') % 4] = charset;
1464                 }
1465               else
1466                 /* Invalid designation sequence.  Just ignore.  */
1467                 break;
1468             }
1469           else if (c == 'N' || c == 'O')
1470             {
1471               /* ESC <Fe> for SS2 or SS3.  */
1472               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1473               break;
1474             }
1475           else if (c >= '0' && c <= '4')
1476             {
1477               /* ESC <Fp> for start/end composition.  */
1478               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1479                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480               else
1481                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1483                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484               else
1485                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1487                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1488               else
1489                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1490               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1491                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1492               else
1493                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1494               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1495                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1496               else
1497                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1498               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1499                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1500               else
1501                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1502               break;
1503             }
1504           else
1505             /* Invalid escape sequence.  Just ignore.  */
1506             break;
1507
1508           /* We found a valid designation sequence for CHARSET.  */
1509           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1510           c = MAKE_CHAR (charset, 0, 0);
1511           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1512             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1513           else
1514             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1515           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1516             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1517           else
1518             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1519           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1520             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1521           else
1522             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1523           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1524             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1525           else
1526             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1527           break;
1528
1529         case ISO_CODE_SO:
1530           if (inhibit_iso_escape_detection)
1531             break;
1532           single_shifting = 0;
1533           if (shift_out == 0
1534               && (reg[1] >= 0
1535                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1536                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1537             {
1538               /* Locking shift out.  */
1539               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1540               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1541             }
1542           break;
1543
1544         case ISO_CODE_SI:
1545           if (inhibit_iso_escape_detection)
1546             break;
1547           single_shifting = 0;
1548           if (shift_out == 1)
1549             {
1550               /* Locking shift in.  */
1551               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1552               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1553             }
1554           break;
1555
1556         case ISO_CODE_CSI:
1557           single_shifting = 0;
1558         case ISO_CODE_SS2:
1559         case ISO_CODE_SS3:
1560           {
1561             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1562
1563             if (inhibit_iso_escape_detection)
1564               break;
1565             if (c != ISO_CODE_CSI)
1566               {
1567                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1568                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1569                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1570                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1571                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1572                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1573                 single_shifting = 1;
1574               }
1575             if (VECTORP (Vlatin_extra_code_table)
1576                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1577               {
1578                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1579                     & CODING_FLAG_ISO_LATIN_EXTRA)
1580                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1581                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1582                     & CODING_FLAG_ISO_LATIN_EXTRA)
1583                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1584               }
1585             mask &= newmask;
1586             mask_found |= newmask;
1587           }
1588           break;
1589
1590         default:
1591           if (c < 0x80)
1592             {
1593               single_shifting = 0;
1594               break;
1595             }
1596           else if (c < 0xA0)
1597             {
1598               single_shifting = 0;
1599               if (VECTORP (Vlatin_extra_code_table)
1600                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1601                 {
1602                   int newmask = 0;
1603
1604                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1605                       & CODING_FLAG_ISO_LATIN_EXTRA)
1606                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1607                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1608                       & CODING_FLAG_ISO_LATIN_EXTRA)
1609                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1610                   mask &= newmask;
1611                   mask_found |= newmask;
1612                 }
1613               else
1614                 return 0;
1615             }
1616           else
1617             {
1618               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1619                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1620               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1621               /* Check the length of succeeding codes of the range
1622                  0xA0..0FF.  If the byte length is odd, we exclude
1623                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1624                  when we are not single shifting.  */
1625               if (!single_shifting
1626                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1627                 {
1628                   int i = 1;
1629
1630                   c = -1;
1631                   while (src < src_end)
1632                     {
1633                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1634                       if (c < 0xA0)
1635                         break;
1636                       i++;
1637                     }
1638
1639                   if (i & 1 && src < src_end)
1640                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1641                   else
1642                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1643                   if (c >= 0)
1644                     /* This means that we have read one extra byte.  */
1645                     goto retry;
1646                 }
1647             }
1648           break;
1649         }
1650     }
1651  label_end_of_loop:
1652   return (mask & mask_found);
1653 }
1654
/* Yield the character code for the character of charset CHARSET whose
   1st position code is C1 and whose 2nd position code is C2.  The
   variable `translation_table' is taken from the expansion site: when
   it is non-nil the result of translate_char on that table is
   yielded, otherwise the value of MAKE_CHAR is yielded unchanged.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (! NILP (translation_table)                                   \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1664
/* Set designation state into CODING: designate to register REG the
   charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS and
   FINAL_CHAR.

   The designation is accepted when the charset is valid (non-negative)
   and either CODING requests designating it to REG or a character of
   the charset passes CODING_SAFE_CHAR_P.  Otherwise REG is recorded in
   last_invalid_designation_register and control jumps to
   `label_invalid_code'.  Control also jumps there when FINAL_CHAR is
   outside '0'..127, and when ASCII is designated to register 0 right
   after an invalid designation to register 0.

   The expansion uses the variables `coding' and `safe_chars' and the
   label `label_invalid_code' of the expansion site, and declares its
   own locals named `charset' and `c'; the arguments therefore must not
   mention those two names.  REG and FINAL_CHAR are evaluated more than
   once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset, c;                                                        \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    /* MAKE_CHAR is evaluated only once CHARSET is known to be valid       \
       and only when the safe-character test is actually needed.  */       \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)        \
            == (reg)                                                       \
            || (c = MAKE_CHAR (charset, 0, 0),                             \
                CODING_SAFE_CHAR_P (safe_chars, c))))                      \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1701
1702 /* Allocate a memory block for storing information about compositions.
1703    The block is chained to the already allocated blocks.  */
1704
1705 void
1706 coding_allocate_composition_data (coding, char_offset)
1707      struct coding_system *coding;
1708      int char_offset;
1709 {
1710   struct composition_data *cmp_data
1711     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1712
1713   cmp_data->char_offset = char_offset;
1714   cmp_data->used = 0;
1715   cmp_data->prev = coding->cmp_data;
1716   cmp_data->next = NULL;
1717   if (coding->cmp_data)
1718     coding->cmp_data->next = cmp_data;
1719   coding->cmp_data = cmp_data;
1720   coding->cmp_data_start = 0;
1721   coding->composing = COMPOSITION_NO;
1722 }
1723
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte following ESC.  Three cases are distinguished:
   - composition is disabled for CODING: the two bytes ESC and
     (C1 & 0x7f) are stored at DST;
   - no composition is in progress: a new one is started, unless
     coding->cmp_data lacks room, in which case coding->result is set
     to CODING_FINISH_INSUFFICIENT_CMP and control jumps to
     `label_end_of_loop';
   - a composition is already in progress: the ALTCHARS methods are
     switched to COMPOSITION_RELATIVE.

   The expansion uses `coding', `dst' and `label_end_of_loop' of the
   expansion site.  C1 is evaluated more than once.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1776
/* Handle composition end sequence ESC 1.  C1 is the byte following
   ESC.  While a composition is in progress, its end is recorded at
   the current produced-character count and CODING leaves the
   composing state.  Otherwise the two bytes ESC and C1 are stored at
   DST as they are.  Uses `coding' and `dst' of the expansion site.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1793
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be a modifiable lvalue: it is decremented by 32 in place
   and then evaluated several more times.  The second byte, when
   needed, is read with ONE_MORE_BYTE into the variable `c2' of the
   expansion site.  If C1 - 32 is 93 or more, neither format applies
   and a rule of 0 is stored.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    else if ((c1) < 93)         /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);            \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1818
1819
1820 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1821
1822 static void
1823 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1824      struct coding_system *coding;
1825      const unsigned char *source;
1826      unsigned char *destination;
1827      int src_bytes, dst_bytes;
1828 {
1829   const unsigned char *src = source;
1830   const unsigned char *src_end = source + src_bytes;
1831   unsigned char *dst = destination;
1832   unsigned char *dst_end = destination + dst_bytes;
1833   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1834   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1835   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1836   /* SRC_BASE remembers the start position in source in each loop.
1837      The loop will be exited when there's not enough source code
1838      (within macro ONE_MORE_BYTE), or when there's not enough
1839      destination area to produce a character (within macro
1840      EMIT_CHAR).  */
1841   const unsigned char *src_base;
1842   int c, charset;
1843   Lisp_Object translation_table;
1844   Lisp_Object safe_chars;
1845
1846   safe_chars = coding_safe_chars (coding->symbol);
1847
1848   if (NILP (Venable_character_translation))
1849     translation_table = Qnil;
1850   else
1851     {
1852       translation_table = coding->translation_table_for_decode;
1853       if (NILP (translation_table))
1854         translation_table = Vstandard_translation_table_for_decode;
1855     }
1856
1857   coding->result = CODING_FINISH_NORMAL;
1858
1859   while (1)
1860     {
1861       int c1, c2 = 0;
1862
1863       src_base = src;
1864       ONE_MORE_BYTE (c1);
1865
1866       /* We produce no character or one character.  */
1867       switch (iso_code_class [c1])
1868         {
1869         case ISO_0x20_or_0x7F:
1870           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1871             {
1872               DECODE_COMPOSITION_RULE (c1);
1873               continue;
1874             }
1875           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1876             {
1877               /* This is SPACE or DEL.  */
1878               charset = CHARSET_ASCII;
1879               break;
1880             }
1881           /* This is a graphic character, we fall down ...  */
1882
1883         case ISO_graphic_plane_0:
1884           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1885             {
1886               DECODE_COMPOSITION_RULE (c1);
1887               continue;
1888             }
1889           charset = charset0;
1890           break;
1891
1892         case ISO_0xA0_or_0xFF:
1893           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1894               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1895             goto label_invalid_code;
1896           /* This is a graphic character, we fall down ... */
1897
1898         case ISO_graphic_plane_1:
1899           if (charset1 < 0)
1900             goto label_invalid_code;
1901           charset = charset1;
1902           break;
1903
1904         case ISO_control_0:
1905           if (COMPOSING_P (coding))
1906             DECODE_COMPOSITION_END ('1');
1907
1908           /* All ISO2022 control characters in this class have the
1909              same representation in Emacs internal format.  */
1910           if (c1 == '\n'
1911               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1912               && (coding->eol_type == CODING_EOL_CR
1913                   || coding->eol_type == CODING_EOL_CRLF))
1914             {
1915               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1916               goto label_end_of_loop;
1917             }
1918           charset = CHARSET_ASCII;
1919           break;
1920
1921         case ISO_control_1:
1922           if (COMPOSING_P (coding))
1923             DECODE_COMPOSITION_END ('1');
1924           goto label_invalid_code;
1925
1926         case ISO_carriage_return:
1927           if (COMPOSING_P (coding))
1928             DECODE_COMPOSITION_END ('1');
1929
1930           if (coding->eol_type == CODING_EOL_CR)
1931             c1 = '\n';
1932           else if (coding->eol_type == CODING_EOL_CRLF)
1933             {
1934               ONE_MORE_BYTE (c1);
1935               if (c1 != ISO_CODE_LF)
1936                 {
1937                   src--;
1938                   c1 = '\r';
1939                 }
1940             }
1941           charset = CHARSET_ASCII;
1942           break;
1943
1944         case ISO_shift_out:
1945           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1946               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1947             goto label_invalid_code;
1948           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1949           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950           continue;
1951
1952         case ISO_shift_in:
1953           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1954             goto label_invalid_code;
1955           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1956           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1957           continue;
1958
1959         case ISO_single_shift_2_7:
1960         case ISO_single_shift_2:
1961           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962             goto label_invalid_code;
1963           /* SS2 is handled as an escape sequence of ESC 'N' */
1964           c1 = 'N';
1965           goto label_escape_sequence;
1966
1967         case ISO_single_shift_3:
1968           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1969             goto label_invalid_code;
1970           /* SS2 is handled as an escape sequence of ESC 'O' */
1971           c1 = 'O';
1972           goto label_escape_sequence;
1973
1974         case ISO_control_sequence_introducer:
1975           /* CSI is handled as an escape sequence of ESC '[' ...  */
1976           c1 = '[';
1977           goto label_escape_sequence;
1978
1979         case ISO_escape:
1980           ONE_MORE_BYTE (c1);
1981         label_escape_sequence:
1982           /* Escape sequences handled by Emacs are invocation,
1983              designation, direction specification, and character
1984              composition specification.  */
1985           switch (c1)
1986             {
1987             case '&':           /* revision of following character set */
1988               ONE_MORE_BYTE (c1);
1989               if (!(c1 >= '@' && c1 <= '~'))
1990                 goto label_invalid_code;
1991               ONE_MORE_BYTE (c1);
1992               if (c1 != ISO_CODE_ESC)
1993                 goto label_invalid_code;
1994               ONE_MORE_BYTE (c1);
1995               goto label_escape_sequence;
1996
1997             case '$':           /* designation of 2-byte character set */
1998               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1999                 goto label_invalid_code;
2000               ONE_MORE_BYTE (c1);
2001               if (c1 >= '@' && c1 <= 'B')
2002                 {       /* designation of JISX0208.1978, GB2312.1980,
2003                            or JISX0208.1980 */
2004                   DECODE_DESIGNATION (0, 2, 94, c1);
2005                 }
2006               else if (c1 >= 0x28 && c1 <= 0x2B)
2007                 {       /* designation of DIMENSION2_CHARS94 character set */
2008                   ONE_MORE_BYTE (c2);
2009                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2010                 }
2011               else if (c1 >= 0x2C && c1 <= 0x2F)
2012                 {       /* designation of DIMENSION2_CHARS96 character set */
2013                   ONE_MORE_BYTE (c2);
2014                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2015                 }
2016               else
2017                 goto label_invalid_code;
2018               /* We must update these variables now.  */
2019               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2020               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2021               continue;
2022
2023             case 'n':           /* invocation of locking-shift-2 */
2024               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2025                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2026                 goto label_invalid_code;
2027               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2028               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2029               continue;
2030
2031             case 'o':           /* invocation of locking-shift-3 */
2032               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2033                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2034                 goto label_invalid_code;
2035               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2036               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2037               continue;
2038
2039             case 'N':           /* invocation of single-shift-2 */
2040               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2041                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2042                 goto label_invalid_code;
2043               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2044               ONE_MORE_BYTE (c1);
2045               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2046                 goto label_invalid_code;
2047               break;
2048
2049             case 'O':           /* invocation of single-shift-3 */
2050               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2051                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2052                 goto label_invalid_code;
2053               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2054               ONE_MORE_BYTE (c1);
2055               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2056                 goto label_invalid_code;
2057               break;
2058
2059             case '0': case '2': case '3': case '4': /* start composition */
2060               DECODE_COMPOSITION_START (c1);
2061               continue;
2062
2063             case '1':           /* end composition */
2064               DECODE_COMPOSITION_END (c1);
2065               continue;
2066
2067             case '[':           /* specification of direction */
2068               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2069                 goto label_invalid_code;
2070               /* For the moment, nested direction is not supported.
2071                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2072                  left-to-right, and nonzero means right-to-left.  */
2073               ONE_MORE_BYTE (c1);
2074               switch (c1)
2075                 {
2076                 case ']':       /* end of the current direction */
2077                   coding->mode &= ~CODING_MODE_DIRECTION;
2078
2079                 case '0':       /* end of the current direction */
2080                 case '1':       /* start of left-to-right direction */
2081                   ONE_MORE_BYTE (c1);
2082                   if (c1 == ']')
2083                     coding->mode &= ~CODING_MODE_DIRECTION;
2084                   else
2085                     goto label_invalid_code;
2086                   break;
2087
2088                 case '2':       /* start of right-to-left direction */
2089                   ONE_MORE_BYTE (c1);
2090                   if (c1 == ']')
2091                     coding->mode |= CODING_MODE_DIRECTION;
2092                   else
2093                     goto label_invalid_code;
2094                   break;
2095
2096                 default:
2097                   goto label_invalid_code;
2098                 }
2099               continue;
2100
2101             case '%':
2102               if (COMPOSING_P (coding))
2103                 DECODE_COMPOSITION_END ('1');
2104               ONE_MORE_BYTE (c1);
2105               if (c1 == '/')
2106                 {
2107                   /* CTEXT extended segment:
2108                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2109                      We keep these bytes as is for the moment.
2110                      They may be decoded by post-read-conversion.  */
2111                   int dim, M, L;
2112                   int size, required;
2113                   int produced_chars;
2114
2115                   ONE_MORE_BYTE (dim);
2116                   ONE_MORE_BYTE (M);
2117                   ONE_MORE_BYTE (L);
2118                   size = ((M - 128) * 128) + (L - 128);
2119                   required = 8 + size * 2;
2120                   if (dst + required > (dst_bytes ? dst_end : src))
2121                     goto label_end_of_loop;
2122                   *dst++ = ISO_CODE_ESC;
2123                   *dst++ = '%';
2124                   *dst++ = '/';
2125                   *dst++ = dim;
2126                   produced_chars = 4;
2127                   dst += CHAR_STRING (M, dst), produced_chars++;
2128                   dst += CHAR_STRING (L, dst), produced_chars++;
2129                   while (size-- > 0)
2130                     {
2131                       ONE_MORE_BYTE (c1);
2132                       dst += CHAR_STRING (c1, dst), produced_chars++;
2133                     }
2134                   coding->produced_char += produced_chars;
2135                 }
2136               else if (c1 == 'G')
2137                 {
2138                   unsigned char *d = dst;
2139                   int produced_chars;
2140
2141                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2142                      ESC % G --UTF-8-BYTES-- ESC % @
2143                      We keep these bytes as is for the moment.
2144                      They may be decoded by post-read-conversion.  */
2145                   if (d + 6 > (dst_bytes ? dst_end : src))
2146                     goto label_end_of_loop;
2147                   *d++ = ISO_CODE_ESC;
2148                   *d++ = '%';
2149                   *d++ = 'G';
2150                   produced_chars = 3;
2151                   while (d + 1 < (dst_bytes ? dst_end : src))
2152                     {
2153                       ONE_MORE_BYTE (c1);
2154                       if (c1 == ISO_CODE_ESC
2155                           && src + 1 < src_end
2156                           && src[0] == '%'
2157                           && src[1] == '@')
2158                         {
2159                           src += 2;
2160                           break;
2161                         }
2162                       d += CHAR_STRING (c1, d), produced_chars++;
2163                     }
2164                   if (d + 3 > (dst_bytes ? dst_end : src))
2165                     goto label_end_of_loop;
2166                   *d++ = ISO_CODE_ESC;
2167                   *d++ = '%';
2168                   *d++ = '@';
2169                   dst = d;
2170                   coding->produced_char += produced_chars + 3;
2171                 }
2172               else
2173                 goto label_invalid_code;
2174               continue;
2175
2176             default:
2177               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2178                 goto label_invalid_code;
2179               if (c1 >= 0x28 && c1 <= 0x2B)
2180                 {       /* designation of DIMENSION1_CHARS94 character set */
2181                   ONE_MORE_BYTE (c2);
2182                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2183                 }
2184               else if (c1 >= 0x2C && c1 <= 0x2F)
2185                 {       /* designation of DIMENSION1_CHARS96 character set */
2186                   ONE_MORE_BYTE (c2);
2187                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2188                 }
2189               else
2190                 goto label_invalid_code;
2191               /* We must update these variables now.  */
2192               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2193               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2194               continue;
2195             }
2196         }
2197
2198       /* Now we know CHARSET and 1st position code C1 of a character.
2199          Produce a multibyte sequence for that character while getting
2200          2nd position code C2 if necessary.  */
2201       if (CHARSET_DIMENSION (charset) == 2)
2202         {
2203           ONE_MORE_BYTE (c2);
2204           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2205             /* C2 is not in a valid range.  */
2206             goto label_invalid_code;
2207         }
2208       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2209       EMIT_CHAR (c);
2210       continue;
2211
2212     label_invalid_code:
2213       coding->errors++;
2214       if (COMPOSING_P (coding))
2215         DECODE_COMPOSITION_END ('1');
2216       src = src_base;
2217       c = *src++;
2218       if (! NILP (translation_table))
2219         c = translate_char (translation_table, c, 0, 0, 0);
2220       EMIT_CHAR (c);
2221     }
2222
2223  label_end_of_loop:
2224   coding->consumed = coding->consumed_char = src_base - source;
2225   coding->produced = dst - destination;
2226   return;
2227 }
2228
2229
2230 /* ISO2022 encoding stuff.  */
2231
2232 /*
   For encoding, just saying "ISO2022" is not enough; we have to
   specify more details.  In Emacs, each ISO2022 coding system
   variant has the following specifications:
2236         1. Initial designation to G0 through G3.
2237         2. Allows short-form designation?
2238         3. ASCII should be designated to G0 before control characters?
2239         4. ASCII should be designated to G0 at end of line?
2240         5. 7-bit environment or 8-bit environment?
2241         6. Use locking-shift?
2242         7. Use Single-shift?
2243    And the following two are only for Japanese:
2244         8. Use ASCII in place of JIS0201-1976-Roman?
2245         9. Use JISX0208-1983 in place of JISX0208-1978?
2246    These specifications are encoded in `coding->flags' as flag bits
2247    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2248    details.
2249 */
2250
/* Write at DST the escape sequence designating CHARSET to graphic
   register REG, advance DST past the bytes written, and record
   CHARSET as the current designation of REG in CODING.

   If the revision number CODING holds for CHARSET is below 255, the
   three bytes ESC '&' ('@' + revision) are written first.  For a
   multi-dimension 94-char set going to register 0 whose <final-char>
   is '@', 'A', or 'B', the intermediate byte is left out (short
   form) when CODING has CODING_FLAG_ISO_SHORT_FORM set.

   The macro uses the variable `dst' of the expansion site.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + revision;                                        \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    /* Multi-dimension sets are introduced by '$'.  */                  \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
    /* Intermediate byte: ",-./" selects the register for 96-char       \
       sets, "()*+" for 94-char sets (omitted in short form).  */       \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (",-./"[reg]);                           \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_char < '@' || final_char > 'B')                   \
      *dst++ = (unsigned char) ("()*+"[reg]);                           \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2293
/* The two macros below emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3): ESC followed by a letter when
   CODING has CODING_FLAG_ISO_SEVEN_BITS set, a single control byte
   otherwise.  Both then flag CODING as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        dst[0] = ISO_CODE_ESC;                          \
        dst[1] = 'N';                                   \
        dst += 2;                                       \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
2306
/* Emit single-shift-3: ESC 'O' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the byte ISO_CODE_SS3 otherwise;
   then flag CODING as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        dst[0] = ISO_CODE_ESC;                          \
        dst[1] = 'O';                                   \
        dst += 2;                                       \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
2315
/* The four macros below emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) at DST
   and record in CODING which graphic register (0, 1, 2, or 3
   respectively) is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    dst[0] = ISO_CODE_SI;                       \
    dst += 1;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    dst[0] = ISO_CODE_SO;                       \
    dst += 1;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'n';                               \
    dst += 2;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'o';                               \
    dst += 2;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
2343
/* Emit at DST the single byte of a DIMENSION1 character of CHARSET
   whose position-code is C1.  When CHARSET is not reachable yet,
   designation and invocation sequences are emitted first.

   The body is a loop: each pass either writes the byte and leaves,
   or calls encode_invocation_designation and tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just emitted: the 7-bit flag alone        \
           decides whether the high bit is cleared or set.  */          \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* CHARSET is in graphic plane 0: high bit cleared.  */         \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* CHARSET is in graphic plane 1: high bit set.  */             \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is in neither graphic plane.  Invoke it (designating     \
       it to a graphic register first if needed), then go round the     \
       loop again to produce the character itself.  */                  \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2376
/* Emit at DST the two bytes of a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.  When CHARSET is not reachable
   yet, designation and invocation codes are emitted first.

   The body is a loop: each pass either writes the two bytes and
   leaves, or calls encode_invocation_designation and tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just emitted: the 7-bit flag alone        \
           decides whether the high bits are cleared or set.  */        \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            dst[0] = c1 & 0x7F;                                         \
            dst[1] = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            dst[0] = c1 | 0x80;                                         \
            dst[1] = c2 | 0x80;                                         \
          }                                                             \
        dst += 2;                                                       \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* CHARSET is in graphic plane 0: high bits cleared.  */        \
        dst[0] = c1 & 0x7F;                                             \
        dst[1] = c2 & 0x7F;                                             \
        dst += 2;                                                       \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* CHARSET is in graphic plane 1: high bits set.  */            \
        dst[0] = c1 | 0x80;                                             \
        dst[1] = c2 | 0x80;                                             \
        dst += 2;                                                       \
        break;                                                          \
      }                                                                 \
    /* CHARSET is in neither graphic plane.  Invoke it (designating     \
       it to a graphic register first if needed), then go round the     \
       loop again to produce the character itself.  */                  \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2409
/* Encode character C at DST.  C is split into a charset and position
   codes C1 and C2.  If that charset is not defined, C1 (and C2 when
   it is not negative) are written as they are.  Otherwise the
   character is handed to the dimension-1 or dimension-2 encoder,
   after replacing CHARSET_ASCII by charset_latin_jisx0201 when
   CODING_FLAG_ISO_USE_ROMAN is set, or charset_jisx0208 by
   charset_jisx0208_1978 when CODING_FLAG_ISO_USE_OLDJIS is set.  */
#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) != 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
  } while (0)
2439
2440
/* Do not encode character C itself; encode
   CODING_REPLACEMENT_CHARACTER in its place, once, or twice when the
   width of C's charset is greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                              \
  do {                                                          \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) <= 1)                  \
      break;                                                    \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
  } while (0)
2449
2450
2451 /* Produce designation and invocation codes at a place pointed by DST
2452    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2453    Return new DST.  */
2454
2455 unsigned char *
2456 encode_invocation_designation (charset, coding, dst)
2457      int charset;
2458      struct coding_system *coding;
2459      unsigned char *dst;
2460 {
2461   int reg;                      /* graphic register number */
2462
2463   /* At first, check designations.  */
2464   for (reg = 0; reg < 4; reg++)
2465     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2466       break;
2467
2468   if (reg >= 4)
2469     {
2470       /* CHARSET is not yet designated to any graphic registers.  */
2471       /* At first check the requested designation.  */
2472       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2473       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2474         /* Since CHARSET requests no special designation, designate it
2475            to graphic register 0.  */
2476         reg = 0;
2477
2478       ENCODE_DESIGNATION (charset, reg, coding);
2479     }
2480
2481   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2482       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2483     {
2484       /* Since the graphic register REG is not invoked to any graphic
2485          planes, invoke it to graphic plane 0.  */
2486       switch (reg)
2487         {
2488         case 0:                 /* graphic register 0 */
2489           ENCODE_SHIFT_IN;
2490           break;
2491
2492         case 1:                 /* graphic register 1 */
2493           ENCODE_SHIFT_OUT;
2494           break;
2495
2496         case 2:                 /* graphic register 2 */
2497           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2498             ENCODE_SINGLE_SHIFT_2;
2499           else
2500             ENCODE_LOCKING_SHIFT_2;
2501           break;
2502
2503         case 3:                 /* graphic register 3 */
2504           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2505             ENCODE_SINGLE_SHIFT_3;
2506           else
2507             ENCODE_LOCKING_SHIFT_3;
2508           break;
2509         }
2510     }
2511
2512   return dst;
2513 }
2514
/* Write at DST the two bytes that stand for the encoded composition
   rule RULE: RULE is decoded into GREF and NREF, which are written
   as 32 + 81 + GREF and 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
                                                \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref;                    \
    dst[1] = 32 + nref;                         \
    dst += 2;                                   \
  } while (0)
2524
/* Write at DST the two-byte code that opens a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   describing the composition (see the comment in coding.h for its
   format); DATA[3] becomes CODING's composing method.  For any
   method other than COMPOSITION_RELATIVE, CODING is also set up to
   read the components that start 4 integers after cmp_data_start.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    dst[0] = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          dst[1] = '3';                                                 \
        else                                                            \
          dst[1] = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      dst[1] = '0';                                                     \
    dst += 2;                                                           \
  } while (0)
2544
/* Write at DST the code that closes the current composition (ESC 1),
   advance CODING's cmp_data_start by DATA[0], and mark CODING as not
   composing.  If that consumes everything used in the current
   composition data block and a next block exists, move on to the
   start of that block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->cmp_data_start += data[0];                          \
    coding->composing = COMPOSITION_NO;                         \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data = coding->cmp_data->next;          \
            coding->cmp_data_start = 0;                         \
          }                                                     \
      }                                                         \
  } while (0)
2560
/* Write the composition start sequence ESC 0 at DST and switch
   CODING to COMPOSITION_RELATIVE.  Here the sequence does not open a
   new composition: it says that the components (alternate chars and
   composition rules) of the composition have just been produced and
   that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
    coding->composing = COMPOSITION_RELATIVE;   \
  } while (0)
2572
/* The following three macros produce codes for indicating direction
   of text.  All of them write at DST and advance it.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes ESC '[' when CODING has
   the CODING_FLAG_ISO_SEVEN_BITS bit set, and the single byte
   ISO_CODE_CSI otherwise.  The bit is tested with `&', as in the
   other macros of this section; `coding->flags' holds several flag
   bits, so comparing the whole word with `==' would pick the 8-bit
   form whenever any other flag is set too.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  Each is a complete
   statement to be followed by a semicolon.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2588
/* Write at DST the codes that bring the graphic planes and registers
   back to their initial state: a shift-in if graphic plane 0 does
   not have register 0 invoked, then, for each of the four registers
   that has an initial designation (a value >= 0) differing from its
   current one, the designation of that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))              \
          ENCODE_DESIGNATION                                                \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
        reg++;                                                              \
      }                                                                     \
  } while (0)
2603
2604 /* Produce designation sequences of charsets in the line started from
2605    SRC to a place pointed by DST, and return updated DST.
2606
2607    If the current block ends before any end-of-line, we may fail to
2608    find all the necessary designations.  */
2609
2610 static unsigned char *
2611 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2612      struct coding_system *coding;
2613      Lisp_Object translation_table;
2614      const unsigned char *src, *src_end;
2615      unsigned char *dst;
2616 {
2617   int charset, c, found = 0, reg;
2618   /* Table of charsets to be designated to each graphic register.  */
2619   int r[4];
2620
2621   for (reg = 0; reg < 4; reg++)
2622     r[reg] = -1;
2623
2624   while (found < 4)
2625     {
2626       ONE_MORE_CHAR (c);
2627       if (c == '\n')
2628         break;
2629
2630       charset = CHAR_CHARSET (c);
2631       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2632       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2633         {
2634           found++;
2635           r[reg] = charset;
2636         }
2637     }
2638
2639  label_end_of_loop:
2640   if (found)
2641     {
2642       for (reg = 0; reg < 4; reg++)
2643         if (r[reg] >= 0
2644             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2645           ENCODE_DESIGNATION (r[reg], reg, coding);
2646     }
2647
2648   return dst;
2649 }
2650
2651 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2652
2653 static void
2654 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2655      struct coding_system *coding;
2656      const unsigned char *source;
2657      unsigned char *destination;
2658      int src_bytes, dst_bytes;
2659 {
2660   const unsigned char *src = source;
2661   const unsigned char *src_end = source + src_bytes;
2662   unsigned char *dst = destination;
2663   unsigned char *dst_end = destination + dst_bytes;
2664   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2665      from DST_END to assure overflow checking is necessary only at the
2666      head of loop.  */
2667   unsigned char *adjusted_dst_end = dst_end - 19;
2668   /* SRC_BASE remembers the start position in source in each loop.
2669      The loop will be exited when there's not enough source text to
2670      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2671      there's not enough destination area to produce encoded codes
2672      (within macro EMIT_BYTES).  */
2673   const unsigned char *src_base;
2674   int c;
2675   Lisp_Object translation_table;
2676   Lisp_Object safe_chars;
2677
2678   if (coding->flags & CODING_FLAG_ISO_SAFE)
2679     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2680
2681   safe_chars = coding_safe_chars (coding->symbol);
2682
2683   if (NILP (Venable_character_translation))
2684     translation_table = Qnil;
2685   else
2686     {
2687       translation_table = coding->translation_table_for_encode;
2688       if (NILP (translation_table))
2689         translation_table = Vstandard_translation_table_for_encode;
2690     }
2691
2692   coding->consumed_char = 0;
2693   coding->errors = 0;
2694   while (1)
2695     {
2696       src_base = src;
2697
2698       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2699         {
2700           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2701           break;
2702         }
2703
2704       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2705           && CODING_SPEC_ISO_BOL (coding))
2706         {
2707           /* We have to produce designation sequences if any now.  */
2708           dst = encode_designation_at_bol (coding, translation_table,
2709                                            src, src_end, dst);
2710           CODING_SPEC_ISO_BOL (coding) = 0;
2711         }
2712
2713       /* Check composition start and end.  */
2714       if (coding->composing != COMPOSITION_DISABLED
2715           && coding->cmp_data_start < coding->cmp_data->used)
2716         {
2717           struct composition_data *cmp_data = coding->cmp_data;
2718           int *data = cmp_data->data + coding->cmp_data_start;
2719           int this_pos = cmp_data->char_offset + coding->consumed_char;
2720
2721           if (coding->composing == COMPOSITION_RELATIVE)
2722             {
2723               if (this_pos == data[2])
2724                 {
2725                   ENCODE_COMPOSITION_END (coding, data);
2726                   cmp_data = coding->cmp_data;
2727                   data = cmp_data->data + coding->cmp_data_start;
2728                 }
2729             }
2730           else if (COMPOSING_P (coding))
2731             {
2732               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2733               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2734                 /* We have consumed components of the composition.
2735                    What follows in SRC is the composition's base
2736                    text.  */
2737                 ENCODE_COMPOSITION_FAKE_START (coding);
2738               else
2739                 {
2740                   int c = cmp_data->data[coding->cmp_data_index++];
2741                   if (coding->composition_rule_follows)
2742                     {
2743                       ENCODE_COMPOSITION_RULE (c);
2744                       coding->composition_rule_follows = 0;
2745                     }
2746                   else
2747                     {
2748                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2749                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2750                         ENCODE_UNSAFE_CHARACTER (c);
2751                       else
2752                         ENCODE_ISO_CHARACTER (c);
2753                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2754                         coding->composition_rule_follows = 1;
2755                     }
2756                   continue;
2757                 }
2758             }
2759           if (!COMPOSING_P (coding))
2760             {
2761               if (this_pos == data[1])
2762                 {
2763                   ENCODE_COMPOSITION_START (coding, data);
2764                   continue;
2765                 }
2766             }
2767         }
2768
2769       ONE_MORE_CHAR (c);
2770
2771       /* Now encode the character C.  */
2772       if (c < 0x20 || c == 0x7F)
2773         {
2774           if (c == '\r')
2775             {
2776               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2777                 {
2778                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2779                     ENCODE_RESET_PLANE_AND_REGISTER;
2780                   *dst++ = c;
2781                   continue;
2782                 }
2783               /* fall down to treat '\r' as '\n' ...  */
2784               c = '\n';
2785             }
2786           if (c == '\n')
2787             {
2788               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2789                 ENCODE_RESET_PLANE_AND_REGISTER;
2790               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2791                 bcopy (coding->spec.iso2022.initial_designation,
2792                        coding->spec.iso2022.current_designation,
2793                        sizeof coding->spec.iso2022.initial_designation);
2794               if (coding->eol_type == CODING_EOL_LF
2795                   || coding->eol_type == CODING_EOL_UNDECIDED)
2796                 *dst++ = ISO_CODE_LF;
2797               else if (coding->eol_type == CODING_EOL_CRLF)
2798                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2799               else
2800                 *dst++ = ISO_CODE_CR;
2801               CODING_SPEC_ISO_BOL (coding) = 1;
2802             }
2803           else
2804             {
2805               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2806                 ENCODE_RESET_PLANE_AND_REGISTER;
2807               *dst++ = c;
2808             }
2809         }
2810       else if (ASCII_BYTE_P (c))
2811         ENCODE_ISO_CHARACTER (c);
2812       else if (SINGLE_BYTE_CHAR_P (c))
2813         {
2814           *dst++ = c;
2815           coding->errors++;
2816         }
2817       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2818                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2819         ENCODE_UNSAFE_CHARACTER (c);
2820       else
2821         ENCODE_ISO_CHARACTER (c);
2822
2823       coding->consumed_char++;
2824     }
2825
2826  label_end_of_loop:
2827   coding->consumed = src_base - source;
2828   coding->produced = coding->produced_char = dst - destination;
2829 }
2830
2831 \f
2832 /*** 4. SJIS and BIG5 handlers ***/
2833
2834 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2835    quite widely.  So, for the moment, Emacs supports them in the bare
2836    C code.  But, in the future, they may be supported only by CCL.  */
2837
2838 /* SJIS is a coding system encoding three character sets: ASCII, right
2839    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2840    as is.  A character of charset katakana-jisx0201 is encoded by
2841    "position-code + 0x80".  A character of charset japanese-jisx0208
2842    is encoded in 2-byte but two position-codes are divided and shifted
2843    so that it fits in the range below.
2844
2845    --- CODE RANGE of SJIS ---
2846    (character set)      (range)
2847    ASCII                0x00 .. 0x7F
2848    KATAKANA-JISX0201    0xA1 .. 0xDF
2849    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2850             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2851    -------------------------------
2852
2853 */
2854
2855 /* BIG5 is a coding system encoding two character sets: ASCII and
2856    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2857    character set and is encoded in two bytes.
2858
2859    --- CODE RANGE of BIG5 ---
2860    (character set)      (range)
2861    ASCII                0x00 .. 0x7F
2862    Big5 (1st byte)      0xA1 .. 0xFE
2863         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2864    --------------------------
2865
2866    Since the number of characters in Big5 is larger than maximum
2867    characters in Emacs' charset (96x96), it can't be handled as one
2868    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2869    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2870    contains frequently used characters and the latter contains less
2871    frequently used characters.  */
2872
2873 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2874    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2875    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2876    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2877
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0xA1..0xFE (0xFF - 0xA1 codes) or 0x40..0x7E
   (0x7F - 0x40 codes).  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  Store `charset_big5_1' in
   CHARSET if B1 is less than 0xC9, else `charset_big5_2', and store
   the two position-codes in C1 and C2.

   The linear index TEMP is computed from B1 and B2 before anything is
   stored, and CHARSET is selected before C1 is assigned, so a caller
   may pass the same variables for B1/C1 and for B2/C2.  All arguments
   are parenthesized so that expressions can be passed safely; note
   that B1 and B2 are still evaluated more than once.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Encode the position-codes C1, C2 of CHARSET (`charset_big5_1' or
   `charset_big5_2') into the BIG5 byte pair B1, B2.  This is the
   inverse of DECODE_BIG5.

   TEMP is computed from C1 and C2 before B1 or B2 is stored, so a
   caller may pass the same variables for C1/B1 and for C2/B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Indexes 0..0x3E map to 0x40..0x7E, the rest to 0xA1..0xFE.  */   \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2905
2906 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2907    Check if a text is encoded in SJIS.  If it is, return
2908    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2909
2910 static int
2911 detect_coding_sjis (src, src_end, multibytep)
2912      unsigned char *src, *src_end;
2913      int multibytep;
2914 {
2915   int c;
2916   /* Dummy for ONE_MORE_BYTE.  */
2917   struct coding_system dummy_coding;
2918   struct coding_system *coding = &dummy_coding;
2919
2920   while (1)
2921     {
2922       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2923       if (c < 0x80)
2924         continue;
2925       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2926         return 0;
2927       if (c <= 0x9F || c >= 0xE0)
2928         {
2929           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2930           if (c < 0x40 || c == 0x7F || c > 0xFC)
2931             return 0;
2932         }
2933     }
2934  label_end_of_loop:
2935   return CODING_CATEGORY_MASK_SJIS;
2936 }
2937
2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2939    Check if a text is encoded in BIG5.  If it is, return
2940    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2941
2942 static int
2943 detect_coding_big5 (src, src_end, multibytep)
2944      unsigned char *src, *src_end;
2945      int multibytep;
2946 {
2947   int c;
2948   /* Dummy for ONE_MORE_BYTE.  */
2949   struct coding_system dummy_coding;
2950   struct coding_system *coding = &dummy_coding;
2951
2952   while (1)
2953     {
2954       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2955       if (c < 0x80)
2956         continue;
2957       if (c < 0xA1 || c > 0xFE)
2958         return 0;
2959       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2960       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2961         return 0;
2962     }
2963  label_end_of_loop:
2964   return CODING_CATEGORY_MASK_BIG5;
2965 }
2966
2967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2968    Check if a text is encoded in UTF-8.  If it is, return
2969    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2970
/* Nonzero if the bits of C selected by MASK are exactly TAG.  */
#define UTF_8_MASKED_EQ_P(c, mask, tag) (((c) & (mask)) == (tag))

/* Classification of a single octet C of a UTF-8 sequence: a
   one-octet character, a trailing octet (10xxxxxx), or the leading
   octet of a 2- to 6-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_MASKED_EQ_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xFE, 0xFC)
2978
2979 static int
2980 detect_coding_utf_8 (src, src_end, multibytep)
2981      unsigned char *src, *src_end;
2982      int multibytep;
2983 {
2984   unsigned char c;
2985   int seq_maybe_bytes;
2986   /* Dummy for ONE_MORE_BYTE.  */
2987   struct coding_system dummy_coding;
2988   struct coding_system *coding = &dummy_coding;
2989
2990   while (1)
2991     {
2992       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2993       if (UTF_8_1_OCTET_P (c))
2994         continue;
2995       else if (UTF_8_2_OCTET_LEADING_P (c))
2996         seq_maybe_bytes = 1;
2997       else if (UTF_8_3_OCTET_LEADING_P (c))
2998         seq_maybe_bytes = 2;
2999       else if (UTF_8_4_OCTET_LEADING_P (c))
3000         seq_maybe_bytes = 3;
3001       else if (UTF_8_5_OCTET_LEADING_P (c))
3002         seq_maybe_bytes = 4;
3003       else if (UTF_8_6_OCTET_LEADING_P (c))
3004         seq_maybe_bytes = 5;
3005       else
3006         return 0;
3007
3008       do
3009         {
3010           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3011           if (!UTF_8_EXTRA_OCTET_P (c))
3012             return 0;
3013           seq_maybe_bytes--;
3014         }
3015       while (seq_maybe_bytes > 0);
3016     }
3017
3018  label_end_of_loop:
3019   return CODING_CATEGORY_MASK_UTF_8;
3020 }
3021
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by examining its first two
   bytes for a byte order mark.  If they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; if they are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; else return 0.  */
3027
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high surrogate, i.e. in 0xD800..0xDBFF.  The
   top six bits identify the surrogate kind, so the mask is 0xFC00.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3037
3038 static int
3039 detect_coding_utf_16 (src, src_end, multibytep)
3040      unsigned char *src, *src_end;
3041      int multibytep;
3042 {
3043   unsigned char c1, c2;
3044   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3045   struct coding_system dummy_coding;
3046   struct coding_system *coding = &dummy_coding;
3047
3048   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3049   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3050
3051   if ((c1 == 0xFF) && (c2 == 0xFE))
3052     return CODING_CATEGORY_MASK_UTF_16_LE;
3053   else if ((c1 == 0xFE) && (c2 == 0xFF))
3054     return CODING_CATEGORY_MASK_UTF_16_BE;
3055
3056  label_end_of_loop:
3057   return 0;
3058 }
3059
3060 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3061    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3062
3063 static void
3064 decode_coding_sjis_big5 (coding, source, destination,
3065                          src_bytes, dst_bytes, sjis_p)
3066      struct coding_system *coding;
3067      const unsigned char *source;
3068      unsigned char  *destination;
3069      int src_bytes, dst_bytes;
3070      int sjis_p;
3071 {
3072   const unsigned char *src = source;
3073   const unsigned char *src_end = source + src_bytes;
3074   unsigned char *dst = destination;
3075   unsigned char *dst_end = destination + dst_bytes;
3076   /* SRC_BASE remembers the start position in source in each loop.
3077      The loop will be exited when there's not enough source code
3078      (within macro ONE_MORE_BYTE), or when there's not enough
3079      destination area to produce a character (within macro
3080      EMIT_CHAR).  */
3081   const unsigned char *src_base;
3082   Lisp_Object translation_table;
3083
3084   if (NILP (Venable_character_translation))
3085     translation_table = Qnil;
3086   else
3087     {
3088       translation_table = coding->translation_table_for_decode;
3089       if (NILP (translation_table))
3090         translation_table = Vstandard_translation_table_for_decode;
3091     }
3092
3093   coding->produced_char = 0;
3094   while (1)
3095     {
3096       int c, charset, c1, c2 = 0;
3097
3098       src_base = src;
3099       ONE_MORE_BYTE (c1);
3100
3101       if (c1 < 0x80)
3102         {
3103           charset = CHARSET_ASCII;
3104           if (c1 < 0x20)
3105             {
3106               if (c1 == '\r')
3107                 {
3108                   if (coding->eol_type == CODING_EOL_CRLF)
3109                     {
3110                       ONE_MORE_BYTE (c2);
3111                       if (c2 == '\n')
3112                         c1 = c2;
3113                       else
3114                         /* To process C2 again, SRC is subtracted by 1.  */
3115                         src--;
3116                     }
3117                   else if (coding->eol_type == CODING_EOL_CR)
3118                     c1 = '\n';
3119                 }
3120               else if (c1 == '\n'
3121                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3122                        && (coding->eol_type == CODING_EOL_CR
3123                            || coding->eol_type == CODING_EOL_CRLF))
3124                 {
3125                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3126                   goto label_end_of_loop;
3127                 }
3128             }
3129         }
3130       else
3131         {
3132           if (sjis_p)
3133             {
3134               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3135                 goto label_invalid_code;
3136               if (c1 <= 0x9F || c1 >= 0xE0)
3137                 {
3138                   /* SJIS -> JISX0208 */
3139                   ONE_MORE_BYTE (c2);
3140                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3141                     goto label_invalid_code;
3142                   DECODE_SJIS (c1, c2, c1, c2);
3143                   charset = charset_jisx0208;
3144                 }
3145               else
3146                 /* SJIS -> JISX0201-Kana */
3147                 charset = charset_katakana_jisx0201;
3148             }
3149           else
3150             {
3151               /* BIG5 -> Big5 */
3152               if (c1 < 0xA0 || c1 > 0xFE)
3153                 goto label_invalid_code;
3154               ONE_MORE_BYTE (c2);
3155               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3156                 goto label_invalid_code;
3157               DECODE_BIG5 (c1, c2, charset, c1, c2);
3158             }
3159         }
3160
3161       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3162       EMIT_CHAR (c);
3163       continue;
3164
3165     label_invalid_code:
3166       coding->errors++;
3167       src = src_base;
3168       c = *src++;
3169       EMIT_CHAR (c);
3170     }
3171
3172  label_end_of_loop:
3173   coding->consumed = coding->consumed_char = src_base - source;
3174   coding->produced = dst - destination;
3175   return;
3176 }
3177
3178 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3179    This function can encode charsets `ascii', `katakana-jisx0201',
3180    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3181    are sure that all these charsets are registered as official charset
3182    (i.e. do not have extended leading-codes).  Characters of other
3183    charsets are produced without any encoding.  If SJIS_P is 1, encode
3184    SJIS text, else encode BIG5 text.  */
3185
3186 static void
3187 encode_coding_sjis_big5 (coding, source, destination,
3188                          src_bytes, dst_bytes, sjis_p)
3189      struct coding_system *coding;
3190      unsigned char *source, *destination;
3191      int src_bytes, dst_bytes;
3192      int sjis_p;
3193 {
3194   unsigned char *src = source;
3195   unsigned char *src_end = source + src_bytes;
3196   unsigned char *dst = destination;
3197   unsigned char *dst_end = destination + dst_bytes;
3198   /* SRC_BASE remembers the start position in source in each loop.
3199      The loop will be exited when there's not enough source text to
3200      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3201      there's not enough destination area to produce encoded codes
3202      (within macro EMIT_BYTES).  */
3203   unsigned char *src_base;
3204   Lisp_Object translation_table;
3205
3206   if (NILP (Venable_character_translation))
3207     translation_table = Qnil;
3208   else
3209     {
3210       translation_table = coding->translation_table_for_encode;
3211       if (NILP (translation_table))
3212         translation_table = Vstandard_translation_table_for_encode;
3213     }
3214
3215   while (1)
3216     {
3217       int c, charset, c1, c2;
3218
3219       src_base = src;
3220       ONE_MORE_CHAR (c);
3221
3222       /* Now encode the character C.  */
3223       if (SINGLE_BYTE_CHAR_P (c))
3224         {
3225           switch (c)
3226             {
3227             case '\r':
3228               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3229                 {
3230                   EMIT_ONE_BYTE (c);
3231                   break;
3232                 }
3233               c = '\n';
3234             case '\n':
3235               if (coding->eol_type == CODING_EOL_CRLF)
3236                 {
3237                   EMIT_TWO_BYTES ('\r', c);
3238                   break;
3239                 }
3240               else if (coding->eol_type == CODING_EOL_CR)
3241                 c = '\r';
3242             default:
3243               EMIT_ONE_BYTE (c);
3244             }
3245         }
3246       else
3247         {
3248           SPLIT_CHAR (c, charset, c1, c2);
3249           if (sjis_p)
3250             {
3251               if (charset == charset_jisx0208
3252                   || charset == charset_jisx0208_1978)
3253                 {
3254                   ENCODE_SJIS (c1, c2, c1, c2);
3255                   EMIT_TWO_BYTES (c1, c2);
3256                 }
3257               else if (charset == charset_katakana_jisx0201)
3258                 EMIT_ONE_BYTE (c1 | 0x80);
3259               else if (charset == charset_latin_jisx0201)
3260                 EMIT_ONE_BYTE (c1);
3261               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3262                 {
3263                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3264                   if (CHARSET_WIDTH (charset) > 1)
3265                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3266                 }
3267               else
3268                 /* There's no way other than producing the internal
3269                    codes as is.  */
3270                 EMIT_BYTES (src_base, src);
3271             }
3272           else
3273             {
3274               if (charset == charset_big5_1 || charset == charset_big5_2)
3275                 {
3276                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3277                   EMIT_TWO_BYTES (c1, c2);
3278                 }
3279               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3280                 {
3281                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3282                   if (CHARSET_WIDTH (charset) > 1)
3283                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3284                 }
3285               else
3286                 /* There's no way other than producing the internal
3287                    codes as is.  */
3288                 EMIT_BYTES (src_base, src);
3289             }
3290         }
3291       coding->consumed_char++;
3292     }
3293
3294  label_end_of_loop:
3295   coding->consumed = src_base - source;
3296   coding->produced = coding->produced_char = dst - destination;
3297 }
3298
3299 \f
3300 /*** 5. CCL handlers ***/
3301
3302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3303    Check if a text is encoded in a coding system of which
3304    encoder/decoder are written in CCL program.  If it is, return
3305    CODING_CATEGORY_MASK_CCL, else return 0.  */
3306
3307 static int
3308 detect_coding_ccl (src, src_end, multibytep)
3309      unsigned char *src, *src_end;
3310      int multibytep;
3311 {
3312   unsigned char *valid;
3313   int c;
3314   /* Dummy for ONE_MORE_BYTE.  */
3315   struct coding_system dummy_coding;
3316   struct coding_system *coding = &dummy_coding;
3317
3318   /* No coding system is assigned to coding-category-ccl.  */
3319   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3320     return 0;
3321
3322   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3323   while (1)
3324     {
3325       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3326       if (! valid[c])
3327         return 0;
3328     }
3329  label_end_of_loop:
3330   return CODING_CATEGORY_MASK_CCL;
3331 }
3332
3333 \f
3334 /*** 6. End-of-line handlers ***/
3335
3336 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3337
3338 static void
3339 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3340      struct coding_system *coding;
3341      const unsigned char *source;
3342      unsigned char *destination;
3343      int src_bytes, dst_bytes;
3344 {
3345   const unsigned char *src = source;
3346   unsigned char *dst = destination;
3347   const unsigned char *src_end = src + src_bytes;
3348   unsigned char *dst_end = dst + dst_bytes;
3349   Lisp_Object translation_table;
3350   /* SRC_BASE remembers the start position in source in each loop.
3351      The loop will be exited when there's not enough source code
3352      (within macro ONE_MORE_BYTE), or when there's not enough
3353      destination area to produce a character (within macro
3354      EMIT_CHAR).  */
3355   const unsigned char *src_base;
3356   int c;
3357
3358   translation_table = Qnil;
3359   switch (coding->eol_type)
3360     {
3361     case CODING_EOL_CRLF:
3362       while (1)
3363         {
3364           src_base = src;
3365           ONE_MORE_BYTE (c);
3366           if (c == '\r')
3367             {
3368               ONE_MORE_BYTE (c);
3369               if (c != '\n')
3370                 {
3371                   src--;
3372                   c = '\r';
3373                 }
3374             }
3375           else if (c == '\n'
3376                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3377             {
3378               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3379               goto label_end_of_loop;
3380             }
3381           EMIT_CHAR (c);
3382         }
3383       break;
3384
3385     case CODING_EOL_CR:
3386       while (1)
3387         {
3388           src_base = src;
3389           ONE_MORE_BYTE (c);
3390           if (c == '\n')
3391             {
3392               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3393                 {
3394                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3395                   goto label_end_of_loop;
3396                 }
3397             }
3398           else if (c == '\r')
3399             c = '\n';
3400           EMIT_CHAR (c);
3401         }
3402       break;
3403
3404     default:                    /* no need for EOL handling */
3405       while (1)
3406         {
3407           src_base = src;
3408           ONE_MORE_BYTE (c);
3409           EMIT_CHAR (c);
3410         }
3411     }
3412
3413  label_end_of_loop:
3414   coding->consumed = coding->consumed_char = src_base - source;
3415   coding->produced = dst - destination;
3416   return;
3417 }
3418
3419 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3420    format of end-of-line according to `coding->eol_type'.  It also
3421    convert multibyte form 8-bit characters to unibyte if
3422    CODING->src_multibyte is nonzero.  If `coding->mode &
3423    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3424    also means end-of-line.  */
3425
3426 static void
3427 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3428      struct coding_system *coding;
3429      const unsigned char *source;
3430      unsigned char *destination;
3431      int src_bytes, dst_bytes;
3432 {
3433   const unsigned char *src = source;
3434   unsigned char *dst = destination;
3435   const unsigned char *src_end = src + src_bytes;
3436   unsigned char *dst_end = dst + dst_bytes;
3437   Lisp_Object translation_table;
3438   /* SRC_BASE remembers the start position in source in each loop.
3439      The loop will be exited when there's not enough source text to
3440      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3441      there's not enough destination area to produce encoded codes
3442      (within macro EMIT_BYTES).  */
3443   const unsigned char *src_base;
3444   unsigned char *tmp;
3445   int c;
3446   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3447
3448   translation_table = Qnil;
3449   if (coding->src_multibyte
3450       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3451     {
3452       src_end--;
3453       src_bytes--;
3454       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3455     }
3456
3457   if (coding->eol_type == CODING_EOL_CRLF)
3458     {
3459       while (src < src_end)
3460         {
3461           src_base = src;
3462           c = *src++;
3463           if (c >= 0x20)
3464             EMIT_ONE_BYTE (c);
3465           else if (c == '\n' || (c == '\r' && selective_display))
3466             EMIT_TWO_BYTES ('\r', '\n');
3467           else
3468             EMIT_ONE_BYTE (c);
3469         }
3470       src_base = src;
3471     label_end_of_loop:
3472       ;
3473     }
3474   else
3475     {
3476       if (!dst_bytes || src_bytes <= dst_bytes)
3477         {
3478           safe_bcopy (src, dst, src_bytes);
3479           src_base = src_end;
3480           dst += src_bytes;
3481         }
3482       else
3483         {
3484           if (coding->src_multibyte
3485               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3486             dst_bytes--;
3487           safe_bcopy (src, dst, dst_bytes);
3488           src_base = src + dst_bytes;
3489           dst = destination + dst_bytes;
3490           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3491         }
3492       if (coding->eol_type == CODING_EOL_CR)
3493         {
3494           for (tmp = destination; tmp < dst; tmp++)
3495             if (*tmp == '\n') *tmp = '\r';
3496         }
3497       else if (selective_display)
3498         {
3499           for (tmp = destination; tmp < dst; tmp++)
3500             if (*tmp == '\r') *tmp = '\n';
3501         }
3502     }
3503   if (coding->src_multibyte)
3504     dst = destination + str_as_unibyte (destination, dst - destination);
3505
3506   coding->consumed = src_base - source;
3507   coding->produced = dst - destination;
3508   coding->produced_char = coding->produced;
3509 }
3510
3511 \f
3512 /*** 7. C library functions ***/
3513
3514 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3515    has a property `coding-system'.  The value of this property is a
3516    vector of length 5 (called the coding-vector).  Among elements of
3517    this vector, the first (element[0]) and the fifth (element[4])
3518    carry important information for decoding/encoding.  Before
3519    decoding/encoding, this information should be set in fields of a
3520    structure of type `coding_system'.
3521
3522    The value of the property `coding-system' can be a symbol of another
3523    subsidiary coding-system.  In that case, Emacs gets coding-vector
3524    from that symbol.
3525
3526    `element[0]' contains information to be set in `coding->type'.  The
3527    value and its meaning is as follows:
3528
3529    0 -- coding_type_emacs_mule
3530    1 -- coding_type_sjis
3531    2 -- coding_type_iso2022
3532    3 -- coding_type_big5
3533    4 -- coding_type_ccl encoder/decoder written in CCL
3534    nil -- coding_type_no_conversion
3535    t -- coding_type_undecided (automatic conversion on decoding,
3536                                no-conversion on encoding)
3537
3538    `element[4]' contains information to be set in `coding->flags' and
3539    `coding->spec'.  The meaning varies by `coding->type'.
3540
3541    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3542    of length 32 (of which the first 13 sub-elements are used now).
3543    Meanings of these sub-elements are:
3544
3545    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3546         If the value is an integer of valid charset, the charset is
3547         assumed to be designated to graphic register N initially.
3548
3549         If the value is minus, it is a minus value of charset which
3550         reserves graphic register N, which means that the charset is
3551         not designated initially but should be designated to graphic
3552         register N just before encoding a character in that charset.
3553
3554         If the value is nil, graphic register N is never used on
3555         encoding.
3556
3557    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3558         Each value takes t or nil.  See the section ISO2022 of
3559         `coding.h' for more information.
3560
3561    If `coding->type' is `coding_type_big5', element[4] is t to denote
3562    BIG5-ETen or nil to denote BIG5-HKU.
3563
3564    If `coding->type' takes the other value, element[4] is ignored.
3565
3566    Emacs Lisp's coding systems also carry information about format of
3567    end-of-line in a value of property `eol-type'.  If the value is
3568    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3569    means CODING_EOL_CR.  If it is not integer, it should be a vector
3570    of subsidiary coding systems of which property `eol-type' has one
3571    of the above values.
3572
3573 */
3574
3575 /* Extract information for decoding/encoding from the Lisp symbol
3576    CODING_SYSTEM and set it in CODING.  If CODING_SYSTEM is nil or
3577    invalid, CODING is set up so that no conversion is necessary.  Return
3578    -1 if CODING_SYSTEM is non-nil but invalid, else return 0.  */
3579
3580 int
3581 setup_coding_system (coding_system, coding)
3582      Lisp_Object coding_system;
3583      struct coding_system *coding;
3584 {
3585   Lisp_Object coding_spec, coding_type, eol_type, plist;
3586   Lisp_Object val;
3587
3588   /* At first, zero clear all members.  */
3589   bzero (coding, sizeof (struct coding_system));
3590
3591   /* Initialize some fields required for all kinds of coding systems.  */
3592   coding->symbol = coding_system;
3593   coding->heading_ascii = -1;
3594   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3595   coding->composing = COMPOSITION_DISABLED;
3596   coding->cmp_data = NULL;
3597
3598   if (NILP (coding_system))
3599     goto label_invalid_coding_system;
3600
3601   coding_spec = Fget (coding_system, Qcoding_system);
3602
       /* The spec must be a vector of 5 elements whose element 3 (the
          property list read further below) is a cons.  */
3603   if (!VECTORP (coding_spec)
3604       || XVECTOR (coding_spec)->size != 5
3605       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3606     goto label_invalid_coding_system;
3607
3608   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
       /* NOTE(review): when EOL_TYPE is not a vector, the tests below apply
          XFASTINT without an INTEGERP check (EOL_TYPE is nil when
          inhibit_eol_conversion is set) -- confirm that this is safe.  */
3609   if (VECTORP (eol_type))
3610     {
3611       coding->eol_type = CODING_EOL_UNDECIDED;
3612       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3613       if (system_eol_type != CODING_EOL_LF)
3614         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3615     }
3616   else if (XFASTINT (eol_type) == 1)
3617     {
3618       coding->eol_type = CODING_EOL_CRLF;
3619       coding->common_flags
3620         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3621     }
3622   else if (XFASTINT (eol_type) == 2)
3623     {
3624       coding->eol_type = CODING_EOL_CR;
3625       coding->common_flags
3626         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3627     }
3628   else
3629     {
3630       coding->common_flags = 0;
3631       coding->eol_type = CODING_EOL_LF;
3632     }
3633
3634   coding_type = XVECTOR (coding_spec)->contents[0];
3635   /* Try short cut: t means undecided, any other symbol no conversion.  */
3636   if (SYMBOLP (coding_type))
3637     {
3638       if (EQ (coding_type, Qt))
3639         {
3640           coding->type = coding_type_undecided;
3641           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3642         }
3643       else
3644         coding->type = coding_type_no_conversion;
3645       /* Initialize this member.  Any thing other than
3646          CODING_CATEGORY_IDX_UTF_16_BE and
3647          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3648          special treatment in detect_eol.  */
3649       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3650
3651       return 0;
3652     }
3653
3654   /* Get values of coding system properties:
3655      `post-read-conversion', `pre-write-conversion',
3656      `translation-table-for-decode', `translation-table-for-encode'.  */
3657   plist = XVECTOR (coding_spec)->contents[3];
3658   /* Pre & post conversion functions should be disabled if
3659      inhibit_pre_post_conversion is nonzero.  This is the case that a code
3660      conversion function is called while those functions are running.  */
3661   if (! inhibit_pre_post_conversion)
3662     {
3663       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3664       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3665     }
       /* A translation table property may be a symbol, in which case the
          table is taken from that symbol's property of the same name.
          Anything that is not a char-table in the end is treated as nil.  */
3666   val = Fplist_get (plist, Qtranslation_table_for_decode);
3667   if (SYMBOLP (val))
3668     val = Fget (val, Qtranslation_table_for_decode);
3669   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3670   val = Fplist_get (plist, Qtranslation_table_for_encode);
3671   if (SYMBOLP (val))
3672     val = Fget (val, Qtranslation_table_for_encode);
3673   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
       /* The `coding-category' property is mandatory from here on, and its
          value must carry an integer `coding-category-index' property;
          otherwise the coding system is invalid.  */
3674   val = Fplist_get (plist, Qcoding_category);
3675   if (!NILP (val))
3676     {
3677       val = Fget (val, Qcoding_category_index);
3678       if (INTEGERP (val))
3679         coding->category_idx = XINT (val);
3680       else
3681         goto label_invalid_coding_system;
3682     }
3683   else
3684     goto label_invalid_coding_system;
3685
3686   /* If the coding system has non-nil `composition' property, enable
3687      composition handling.  */
3688   val = Fplist_get (plist, Qcomposition);
3689   if (!NILP (val))
3690     coding->composing = COMPOSITION_NO;
3691
3692   /* If the coding system is ascii-incompatible, record it in
3693      common_flags.   */
3694   val = Fplist_get (plist, Qascii_incompatible);
3695   if (! NILP (val))
3696     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3697
       /* CODING_TYPE is known not to be a symbol here.  NOTE(review): it
          is passed to XFASTINT without an INTEGERP check -- confirm that
          other object types cannot reach this point.  */
3698   switch (XFASTINT (coding_type))
3699     {
3700     case 0:
3701       coding->type = coding_type_emacs_mule;
3702       coding->common_flags
3703         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
           /* NOTE(review): both masks were already set just above, so
              the two tests below cannot change common_flags.  */
3704       if (!NILP (coding->post_read_conversion))
3705         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3706       if (!NILP (coding->pre_write_conversion))
3707         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3708       break;
3709
3710     case 1:
3711       coding->type = coding_type_sjis;
3712       coding->common_flags
3713         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3714       break;
3715
3716     case 2:
3717       coding->type = coding_type_iso2022;
3718       coding->common_flags
3719         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3720       {
3721         Lisp_Object val, temp;  /* This VAL shadows the outer one.  */
3722         Lisp_Object *flags;
3723         int i, charset, reg_bits = 0;
3724
3725         val = XVECTOR (coding_spec)->contents[4];
3726
3727         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3728           goto label_invalid_coding_system;
3729
             /* Sub-elements 4 through 16 are boolean flags; each non-nil
                one turns on the corresponding CODING_FLAG_ISO_XXX bit.  */
3730         flags = XVECTOR (val)->contents;
3731         coding->flags
3732           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3733              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3734              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3735              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3736              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3737              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3738              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3739              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3740              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3741              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3742              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3743              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3744              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3745              );
3746
3747         /* Invoke graphic register 0 to plane 0.  */
3748         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3749         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3750         CODING_SPEC_ISO_INVOCATION (coding, 1)
3751           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3752         /* Not single shifting at first.  */
3753         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3754         /* Beginning of buffer should also be regarded as bol. */
3755         CODING_SPEC_ISO_BOL (coding) = 1;
3756
             /* Default every charset's revision number to 255, then take
                the entries of Vcharset_revision_alist whose revision I
                satisfies 0 <= I and I + '@' < 128.  */
3757         for (charset = 0; charset <= MAX_CHARSET; charset++)
3758           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3759         val = Vcharset_revision_alist;
3760         while (CONSP (val))
3761           {
3762             charset = get_charset_id (Fcar_safe (XCAR (val)));
3763             if (charset >= 0
3764                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3765                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3766               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3767             val = XCDR (val);
3768           }
3769
3770         /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3771            FLAGS[REG] can be one of below:
3772                 integer CHARSET: CHARSET occupies register REG,
3773                 t: designate nothing to REG initially, but can be used
3774                   by any charsets,
3775                 list of integer, nil, or t: designate the first
3776                   element (if integer) to REG initially, the remaining
3777                   elements (if integer) is designated to REG on request,
3778                   if an element is t, REG can be used by any charsets,
3779                 nil: REG is never used.  */
3780         for (charset = 0; charset <= MAX_CHARSET; charset++)
3781           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3782             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3783         for (i = 0; i < 4; i++)
3784           {
3785             if ((INTEGERP (flags[i])
3786                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3787                 || (charset = get_charset_id (flags[i])) >= 0)
3788               {
3789                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3790                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3791               }
3792             else if (EQ (flags[i], Qt))
3793               {
3794                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3795                 reg_bits |= 1 << i;
3796                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3797               }
3798             else if (CONSP (flags[i]))
3799               {
3800                 Lisp_Object tail;
3801                 tail = flags[i];
3802
3803                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3804                 if ((INTEGERP (XCAR (tail))
3805                      && (charset = XINT (XCAR (tail)),
3806                          CHARSET_VALID_P (charset)))
3807                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3808                   {
3809                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3810                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3811                   }
3812                 else
3813                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3814                 tail = XCDR (tail);
3815                 while (CONSP (tail))
3816                   {
3817                     if ((INTEGERP (XCAR (tail))
3818                          && (charset = XINT (XCAR (tail)),
3819                              CHARSET_VALID_P (charset)))
3820                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3821                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3822                         = i;
3823                     else if (EQ (XCAR (tail), Qt))
3824                       reg_bits |= 1 << i;
3825                     tail = XCDR (tail);
3826                   }
3827               }
3828             else
3829               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3830
3831             CODING_SPEC_ISO_DESIGNATION (coding, i)
3832               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3833           }
3834
             /* Bit I of REG_BITS is set when register I may be used by
                any charset (a t was seen for it above).  */
3835         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3836           {
3837             /* REG 1 can be used only by locking shift in 7-bit env.  */
3838             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3839               reg_bits &= ~2;
3840             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3841               /* Without any shifting, only REG 0 and 1 can be used.  */
3842               reg_bits &= 3;
3843           }
3844
3845         if (reg_bits)
3846           for (charset = 0; charset <= MAX_CHARSET; charset++)
3847             {
3848               if (CHARSET_DEFINED_P (charset)
3849                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3850                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3851                 {
3852                   /* There exist some default graphic registers to be
3853                      used by CHARSET.  */
3854
3855                   /* We had better avoid designating a charset of
3856                      CHARS96 to REG 0 as far as possible.  */
3857                   if (CHARSET_CHARS (charset) == 96)
3858                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3859                       = (reg_bits & 2
3860                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3861                   else
3862                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3863                       = (reg_bits & 1
3864                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3865                 }
3866             }
3867       }
3868       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3869       coding->spec.iso2022.last_invalid_designation_register = -1;
3870       break;
3871
3872     case 3:
3873       coding->type = coding_type_big5;
3874       coding->common_flags
3875         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3876       coding->flags
3877         = (NILP (XVECTOR (coding_spec)->contents[4])
3878            ? CODING_FLAG_BIG5_HKU
3879            : CODING_FLAG_BIG5_ETEN);
3880       break;
3881
3882     case 4:
3883       coding->type = coding_type_ccl;
3884       coding->common_flags
3885         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3886       {
             /* Element 4 must be a cons; its car is handed to
                setup_ccl_program for the decoder and its cdr for the
                encoder.  A negative result from either call makes the
                coding system invalid.  */
3887         val = XVECTOR (coding_spec)->contents[4];
3888         if (! CONSP (val)
3889             || setup_ccl_program (&(coding->spec.ccl.decoder),
3890                                   XCAR (val)) < 0
3891             || setup_ccl_program (&(coding->spec.ccl.encoder),
3892                                   XCDR (val)) < 0)
3893           goto label_invalid_coding_system;
3894
             /* Mark the byte values listed in the `valid-codes' property.
                Each list element is an integer in 0..255 or a cons
                (START . END) giving an inclusive range within 0..255;
                any other element is silently skipped.  */
3895         bzero (coding->spec.ccl.valid_codes, 256);
3896         val = Fplist_get (plist, Qvalid_codes);
3897         if (CONSP (val))
3898           {
3899             Lisp_Object this;
3900
3901             for (; CONSP (val); val = XCDR (val))
3902               {
3903                 this = XCAR (val);
3904                 if (INTEGERP (this)
3905                     && XINT (this) >= 0 && XINT (this) < 256)
3906                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3907                 else if (CONSP (this)
3908                          && INTEGERP (XCAR (this))
3909                          && INTEGERP (XCDR (this)))
3910                   {
3911                     int start = XINT (XCAR (this));
3912                     int end = XINT (XCDR (this));
3913
3914                     if (start >= 0 && start <= end && end < 256)
3915                       while (start <= end)
3916                         coding->spec.ccl.valid_codes[start++] = 1;
3917                   }
3918               }
3919           }
3920       }
3921       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3922       coding->spec.ccl.cr_carryover = 0;
3923       coding->spec.ccl.eight_bit_carryover[0] = 0;
3924       break;
3925
3926     case 5:
3927       coding->type = coding_type_raw_text;
3928       break;
3929
3930     default:
3931       goto label_invalid_coding_system;
3932     }
3933   return 0;
3934
       /* Common exit for nil and for malformed coding systems: CODING is
          reset to no-conversion with an undecided end-of-line format.  */
3935  label_invalid_coding_system:
3936   coding->type = coding_type_no_conversion;
3937   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3938   coding->common_flags = 0;
3939   coding->eol_type = CODING_EOL_UNDECIDED;
3940   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3941   return NILP (coding_system) ? 0 : -1;  /* nil is not an error.  */
3942 }
3943
3944 /* Free memory blocks allocated for storing composition information.  */
3945
3946 void
3947 coding_free_composition_data (coding)
3948      struct coding_system *coding;
3949 {
3950   struct composition_data *cmp_data = coding->cmp_data, *next;
3951
3952   if (!cmp_data)
3953     return;
3954   /* Memory blocks are chained.  At first, rewind to the first, then,
3955      free blocks one by one.  */
3956   while (cmp_data->prev)
3957     cmp_data = cmp_data->prev;
3958   while (cmp_data)
3959     {
3960       next = cmp_data->next;
3961       xfree (cmp_data);
3962       cmp_data = next;
3963     }
3964   coding->cmp_data = NULL;
3965 }
3966
3967 /* Set `char_offset' member of all memory blocks pointed by
3968    coding->cmp_data to POS.  */
3969
3970 void
3971 coding_adjust_composition_offset (coding, pos)
3972      struct coding_system *coding;
3973      int pos;
3974 {
3975   struct composition_data *cmp_data;
3976
3977   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3978     cmp_data->char_offset = pos;
3979 }
3980
3981 /* Setup raw-text or one of its subsidiaries in the structure
3982    coding_system CODING according to the already setup value eol_type
3983    in CODING.  CODING should be setup for some coding system in
3984    advance.  */
3985
3986 void
3987 setup_raw_text_coding_system (coding)
3988      struct coding_system *coding;
3989 {
3990   if (coding->type != coding_type_raw_text)
3991     {
3992       coding->symbol = Qraw_text;
3993       coding->type = coding_type_raw_text;
3994       if (coding->eol_type != CODING_EOL_UNDECIDED)
3995         {
3996           Lisp_Object subsidiaries;
3997           subsidiaries = Fget (Qraw_text, Qeol_type);
3998
3999           if (VECTORP (subsidiaries)
4000               && XVECTOR (subsidiaries)->size == 3)
4001             coding->symbol
4002               = XVECTOR (subsidiaries)->contents[coding->eol_type];
4003         }
4004       setup_coding_system (coding->symbol, coding);
4005     }
4006   return;
4007 }
4008
4009 /* Emacs has a mechanism to automatically detect a coding system if it
4010    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4011    it's impossible to distinguish some coding systems accurately
4012    because they use the same range of codes.  So, at first, coding
4013    systems are grouped into the following categories:
4014
4015    o coding-category-emacs-mule
4016
4017         The category for a coding system which has the same code range
4018         as Emacs' internal format.  Assigned the coding-system (Lisp
4019         symbol) `emacs-mule' by default.
4020
4021    o coding-category-sjis
4022
4023         The category for a coding system which has the same code range
4024         as SJIS.  Assigned the coding-system (Lisp
4025         symbol) `japanese-shift-jis' by default.
4026
4027    o coding-category-iso-7
4028
4029         The category for a coding system which has the same code range
4030         as ISO2022 of 7-bit environment.  This doesn't use any locking
4031         shift and single shift functions.  This can encode/decode all
4032         charsets.  Assigned the coding-system (Lisp symbol)
4033         `iso-2022-7bit' by default.
4034
4035    o coding-category-iso-7-tight
4036
4037         Same as coding-category-iso-7 except that this can
4038         encode/decode only the specified charsets.
4039
4040    o coding-category-iso-8-1
4041
4042         The category for a coding system which has the same code range
4043         as ISO2022 of 8-bit environment and graphic plane 1 used only
4044         for DIMENSION1 charset.  This doesn't use any locking shift
4045         and single shift functions.  Assigned the coding-system (Lisp
4046         symbol) `iso-latin-1' by default.
4047
4048    o coding-category-iso-8-2
4049
4050         The category for a coding system which has the same code range
4051         as ISO2022 of 8-bit environment and graphic plane 1 used only
4052         for DIMENSION2 charset.  This doesn't use any locking shift
4053         and single shift functions.  Assigned the coding-system (Lisp
4054         symbol) `japanese-iso-8bit' by default.
4055
4056    o coding-category-iso-7-else
4057
4058         The category for a coding system which has the same code range
4059         as ISO2022 of 7-bit environment but uses locking shift or
4060         single shift functions.  Assigned the coding-system (Lisp
4061         symbol) `iso-2022-7bit-lock' by default.
4062
4063    o coding-category-iso-8-else
4064
4065         The category for a coding system which has the same code range
4066         as ISO2022 of 8-bit environment but uses locking shift or
4067         single shift functions.  Assigned the coding-system (Lisp
4068         symbol) `iso-2022-8bit-ss2' by default.
4069
4070    o coding-category-big5
4071
4072         The category for a coding system which has the same code range
4073         as BIG5.  Assigned the coding-system (Lisp symbol)
4074         `cn-big5' by default.
4075
4076    o coding-category-utf-8
4077
4078         The category for a coding system which has the same code range
4079         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4080         symbol) `utf-8' by default.
4081
4082    o coding-category-utf-16-be
4083
4084         The category for a coding system in which a text has a
4085         Unicode signature (cf. Unicode Standard) in the order of BIG
4086         endian at the head.  Assigned the coding-system (Lisp symbol)
4087         `utf-16-be' by default.
4088
4089    o coding-category-utf-16-le
4090
4091         The category for a coding system in which a text has a
4092         Unicode signature (cf. Unicode Standard) in the order of
4093         LITTLE endian at the head.  Assigned the coding-system (Lisp
4094         symbol) `utf-16-le' by default.
4095
4096    o coding-category-ccl
4097
4098         The category for a coding system of which encoder/decoder is
4099         written in CCL programs.  The default value is nil, i.e., no
4100         coding system is assigned.
4101
4102    o coding-category-binary
4103
4104         The category for a coding system not categorized in any of the
4105         above.  Assigned the coding-system (Lisp symbol)
4106         `no-conversion' by default.
4107
4108    Each of them is a Lisp symbol and the value is an actual
4109    `coding-system' (this is also a Lisp symbol) assigned by a user.
4110    What Emacs does actually is to detect a category of coding system.
4111    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4112    decide a single possible category, it selects a category of the
4113    highest priority.  Priorities of categories are also specified by a
4114    user in a Lisp variable `coding-category-list'.
4115
4116 */
4117
4118 static
4119 int ascii_skip_code[256];  /* Indexed by byte value; detect_coding_mask
                                   skips leading bytes whose entry is
                                   nonzero.  That function rewrites the
                                   entries for ISO_CODE_SO, ISO_CODE_SI and
                                   ISO_CODE_ESC on every call.
                                   NOTE(review): the other entries are set
                                   up outside this part of the file --
                                   confirm where.  */
4120
4121 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4122    If it detects possible coding systems, return an integer in which
4123    appropriate flag bits are set.  Flag bits are defined by macros
4124    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4125    it should point the table `coding_priorities'.  In that case, only
4126    the flag bit for a coding system of the highest priority is set in
4127    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4128    range 0x80..0x9F are in multibyte form.
4129
4130    How many ASCII characters are at the head is returned as *SKIP.  */
4131
4132 static int
4133 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4134      unsigned char *source;
4135      int src_bytes, *priorities, *skip;
4136      int multibytep;
4137 {
4138   register unsigned char c;
4139   unsigned char *src = source, *src_end = source + src_bytes;
4140   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4141   int i;
4142
4143   /* At first, skip all ASCII characters and control characters except
4144      for three ISO2022 specific control characters.  */
4145   ascii_skip_code[ISO_CODE_SO] = 0;
4146   ascii_skip_code[ISO_CODE_SI] = 0;
4147   ascii_skip_code[ISO_CODE_ESC] = 0;
4148
4149  label_loop_detect_coding:
4150   while (src < src_end && ascii_skip_code[*src]) src++;
4151   *skip = src - source;
4152
4153   if (src >= src_end)
4154     /* We found nothing other than ASCII.  There's nothing to do.  */
4155     return 0;
4156
4157   c = *src;
4158   /* The text seems to be encoded in some multilingual coding system.
4159      Now, try to find in which coding system the text is encoded.  */
4160   if (c < 0x80)
4161     {
4162       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4163       /* C is an ISO2022 specific control code of C0.  */
4164       mask = detect_coding_iso2022 (src, src_end, multibytep);
4165       if (mask == 0)
4166         {
4167           /* No valid ISO2022 code follows C.  Try again.  */
4168           src++;
4169           if (c == ISO_CODE_ESC)
4170             ascii_skip_code[ISO_CODE_ESC] = 1;
4171           else
4172             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4173           goto label_loop_detect_coding;
4174         }
4175       if (priorities)
4176         {
4177           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4178             {
4179               if (mask & priorities[i])
4180                 return priorities[i];
4181             }
4182           return CODING_CATEGORY_MASK_RAW_TEXT;
4183         }
4184     }
4185   else
4186     {
4187       int try;
4188
4189       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4190         c = src[1] - 0x20;
4191
4192       if (c < 0xA0)
4193         {
4194           /* C is the first byte of SJIS character code,
4195              or a leading-code of Emacs' internal format (emacs-mule),
4196              or the first byte of UTF-16.  */
4197           try = (CODING_CATEGORY_MASK_SJIS
4198                   | CODING_CATEGORY_MASK_EMACS_MULE
4199                   | CODING_CATEGORY_MASK_UTF_16_BE
4200                   | CODING_CATEGORY_MASK_UTF_16_LE);
4201
4202           /* Or, if C is a special latin extra code,
4203              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4204              or is an ISO2022 control-sequence-introducer (CSI),
4205              we should also consider the possibility of ISO2022 codings.  */
4206           if ((VECTORP (Vlatin_extra_code_table)
4207                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4208               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4209               || (c == ISO_CODE_CSI
4210                   && (src < src_end
4211                       && (*src == ']'
4212                           || ((*src == '0' || *src == '1' || *src == '2')
4213                               && src + 1 < src_end
4214                               && src[1] == ']')))))
4215             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4216                      | CODING_CATEGORY_MASK_ISO_8BIT);
4217         }
4218       else
4219         /* C is a character of ISO2022 in graphic plane right,
4220            or a SJIS's 1-byte character code (i.e. JISX0201),
4221            or the first byte of BIG5's 2-byte code,
4222            or the first byte of UTF-8/16.  */
4223         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4224                 | CODING_CATEGORY_MASK_ISO_8BIT
4225                 | CODING_CATEGORY_MASK_SJIS
4226                 | CODING_CATEGORY_MASK_BIG5
4227                 | CODING_CATEGORY_MASK_UTF_8
4228                 | CODING_CATEGORY_MASK_UTF_16_BE
4229                 | CODING_CATEGORY_MASK_UTF_16_LE);
4230
4231       /* Or, we may have to consider the possibility of CCL.  */
4232       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4233           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4234               ->spec.ccl.valid_codes)[c])
4235         try |= CODING_CATEGORY_MASK_CCL;
4236
4237       mask = 0;
4238       utf16_examined_p = iso2022_examined_p = 0;
4239       if (priorities)
4240         {
4241           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4242             {
4243               if (!iso2022_examined_p
4244                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4245                 {
4246                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4247                   iso2022_examined_p = 1;
4248                 }
4249               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4250                 mask |= detect_coding_sjis (src, src_end, multibytep);
4251               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4252                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4253               else if (!utf16_examined_p
4254                        && (priorities[i] & try &
4255                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4256                 {
4257                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4258                   utf16_examined_p = 1;
4259                 }
4260               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4261                 mask |= detect_coding_big5 (src, src_end, multibytep);
4262               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4263                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4264               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4265                 mask |= detect_coding_ccl (src, src_end, multibytep);
4266               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4267                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4268               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4269                 mask |= CODING_CATEGORY_MASK_BINARY;
4270               if (mask & priorities[i])
4271                 return priorities[i];
4272             }
4273           return CODING_CATEGORY_MASK_RAW_TEXT;
4274         }
4275       if (try & CODING_CATEGORY_MASK_ISO)
4276         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4277       if (try & CODING_CATEGORY_MASK_SJIS)
4278         mask |= detect_coding_sjis (src, src_end, multibytep);
4279       if (try & CODING_CATEGORY_MASK_BIG5)
4280         mask |= detect_coding_big5 (src, src_end, multibytep);
4281       if (try & CODING_CATEGORY_MASK_UTF_8)
4282         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4283       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4284         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4285       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4286         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4287       if (try & CODING_CATEGORY_MASK_CCL)
4288         mask |= detect_coding_ccl (src, src_end, multibytep);
4289     }
4290   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4291 }
4292
4293 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4294    The information of the detected coding system is set in CODING.  */
4295
4296 void
4297 detect_coding (coding, src, src_bytes)
4298      struct coding_system *coding;
4299      const unsigned char *src;
4300      int src_bytes;
4301 {
4302   unsigned int idx;
4303   int skip, mask;
4304   Lisp_Object val;
4305
4306   val = Vcoding_category_list;
4307   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4308                              coding->src_multibyte);
4309   coding->heading_ascii = skip;
4310
4311   if (!mask) return;
4312
4313   /* We found a single coding system of the highest priority in MASK.  */
4314   idx = 0;
4315   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4316   if (! mask)
4317     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4318
4319   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4320
4321   if (coding->eol_type != CODING_EOL_UNDECIDED)
4322     {
4323       Lisp_Object tmp;
4324
4325       tmp = Fget (val, Qeol_type);
4326       if (VECTORP (tmp))
4327         val = XVECTOR (tmp)->contents[coding->eol_type];
4328     }
4329
4330   /* Setup this new coding system while preserving some slots.  */
4331   {
4332     int src_multibyte = coding->src_multibyte;
4333     int dst_multibyte = coding->dst_multibyte;
4334
4335     setup_coding_system (val, coding);
4336     coding->src_multibyte = src_multibyte;
4337     coding->dst_multibyte = dst_multibyte;
4338     coding->heading_ascii = skip;
4339   }
4340 }
4341
4342 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4343    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4344    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4345
4346    How many non-eol characters are at the head is returned as *SKIP.  */
4347
4348 #define MAX_EOL_CHECK_COUNT 3
4349
4350 static int
4351 detect_eol_type (source, src_bytes, skip)
4352      unsigned char *source;
4353      int src_bytes, *skip;
4354 {
4355   unsigned char *src = source, *src_end = src + src_bytes;
4356   unsigned char c;
4357   int total = 0;                /* How many end-of-lines are found so far.  */
4358   int eol_type = CODING_EOL_UNDECIDED;
4359   int this_eol_type;
4360
4361   *skip = 0;
4362
4363   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4364     {
4365       c = *src++;
4366       if (c == '\n' || c == '\r')
4367         {
4368           if (*skip == 0)
4369             *skip = src - 1 - source;
4370           total++;
4371           if (c == '\n')
4372             this_eol_type = CODING_EOL_LF;
4373           else if (src >= src_end || *src != '\n')
4374             this_eol_type = CODING_EOL_CR;
4375           else
4376             this_eol_type = CODING_EOL_CRLF, src++;
4377
4378           if (eol_type == CODING_EOL_UNDECIDED)
4379             /* This is the first end-of-line.  */
4380             eol_type = this_eol_type;
4381           else if (eol_type != this_eol_type)
4382             {
4383               /* The found type is different from what found before.  */
4384               eol_type = CODING_EOL_INCONSISTENT;
4385               break;
4386             }
4387         }
4388     }
4389
4390   if (*skip == 0)
4391     *skip = src_end - source;
4392   return eol_type;
4393 }
4394
4395 /* Like detect_eol_type, but detect EOL type in 2-octet
4396    big-endian/little-endian format for coding systems utf-16-be and
4397    utf-16-le.  */
4398
4399 static int
4400 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4401      unsigned char *source;
4402      int src_bytes, *skip, big_endian_p;
4403 {
4404   unsigned char *src = source, *src_end = src + src_bytes;
4405   unsigned int c1, c2;
4406   int total = 0;                /* How many end-of-lines are found so far.  */
4407   int eol_type = CODING_EOL_UNDECIDED;
4408   int this_eol_type;
4409   int msb, lsb;
4410
4411   if (big_endian_p)
4412     msb = 0, lsb = 1;
4413   else
4414     msb = 1, lsb = 0;
4415
4416   *skip = 0;
4417
4418   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4419     {
4420       c1 = (src[msb] << 8) | (src[lsb]);
4421       src += 2;
4422
4423       if (c1 == '\n' || c1 == '\r')
4424         {
4425           if (*skip == 0)
4426             *skip = src - 2 - source;
4427           total++;
4428           if (c1 == '\n')
4429             {
4430               this_eol_type = CODING_EOL_LF;
4431             }
4432           else
4433             {
4434               if ((src + 1) >= src_end)
4435                 {
4436                   this_eol_type = CODING_EOL_CR;
4437                 }
4438               else
4439                 {
4440                   c2 = (src[msb] << 8) | (src[lsb]);
4441                   if (c2 == '\n')
4442                     this_eol_type = CODING_EOL_CRLF, src += 2;
4443                   else
4444                     this_eol_type = CODING_EOL_CR;
4445                 }
4446             }
4447
4448           if (eol_type == CODING_EOL_UNDECIDED)
4449             /* This is the first end-of-line.  */
4450             eol_type = this_eol_type;
4451           else if (eol_type != this_eol_type)
4452             {
4453               /* The found type is different from what found before.  */
4454               eol_type = CODING_EOL_INCONSISTENT;
4455               break;
4456             }
4457         }
4458     }
4459
4460   if (*skip == 0)
4461     *skip = src_end - source;
4462   return eol_type;
4463 }
4464
4465 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4466    is encoded.  If it detects an appropriate format of end-of-line, it
4467    sets the information in *CODING.  */
4468
4469 void
4470 detect_eol (coding, src, src_bytes)
4471      struct coding_system *coding;
4472      const unsigned char *src;
4473      int src_bytes;
4474 {
4475   Lisp_Object val;
4476   int skip;
4477   int eol_type;
4478
4479   switch (coding->category_idx)
4480     {
4481     case CODING_CATEGORY_IDX_UTF_16_BE:
4482       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4483       break;
4484     case CODING_CATEGORY_IDX_UTF_16_LE:
4485       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4486       break;
4487     default:
4488       eol_type = detect_eol_type (src, src_bytes, &skip);
4489       break;
4490     }
4491
4492   if (coding->heading_ascii > skip)
4493     coding->heading_ascii = skip;
4494   else
4495     skip = coding->heading_ascii;
4496
4497   if (eol_type == CODING_EOL_UNDECIDED)
4498     return;
4499   if (eol_type == CODING_EOL_INCONSISTENT)
4500     {
4501 #if 0
4502       /* This code is suppressed until we find a better way to
4503          distinguish raw text file and binary file.  */
4504
4505       /* If we have already detected that the coding is raw-text, the
4506          coding should actually be no-conversion.  */
4507       if (coding->type == coding_type_raw_text)
4508         {
4509           setup_coding_system (Qno_conversion, coding);
4510           return;
4511         }
4512       /* Else, let's decode only text code anyway.  */
4513 #endif /* 0 */
4514       eol_type = CODING_EOL_LF;
4515     }
4516
4517   val = Fget (coding->symbol, Qeol_type);
4518   if (VECTORP (val) && XVECTOR (val)->size == 3)
4519     {
4520       int src_multibyte = coding->src_multibyte;
4521       int dst_multibyte = coding->dst_multibyte;
4522       struct composition_data *cmp_data = coding->cmp_data;
4523
4524       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4525       coding->src_multibyte = src_multibyte;
4526       coding->dst_multibyte = dst_multibyte;
4527       coding->heading_ascii = skip;
4528       coding->cmp_data = cmp_data;
4529     }
4530 }
4531
/* Number of bytes added on top of the scaled source size when sizing
   a conversion buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied when sizing a
   decoding buffer for CODING (a pointer to struct coding_system): 3
   for ISO 2022, the decoder's buf_magnification for CCL, 2 otherwise.
   CODING is parenthesized so that any pointer expression may be
   passed, but note that it may be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4540
4541 /* Return maximum size (bytes) of a buffer enough for decoding
4542    SRC_BYTES of text encoded in CODING.  */
4543
4544 int
4545 decoding_buffer_size (coding, src_bytes)
4546      struct coding_system *coding;
4547      int src_bytes;
4548 {
4549   return (src_bytes * DECODING_BUFFER_MAG (coding)
4550           + CONVERSION_BUFFER_EXTRA_ROOM);
4551 }
4552
4553 /* Return maximum size (bytes) of a buffer enough for encoding
4554    SRC_BYTES of text to CODING.  */
4555
4556 int
4557 encoding_buffer_size (coding, src_bytes)
4558      struct coding_system *coding;
4559      int src_bytes;
4560 {
4561   int magnification;
4562
4563   if (coding->type == coding_type_ccl)
4564     {
4565       magnification = coding->spec.ccl.encoder.buf_magnification;
4566       if (coding->eol_type == CODING_EOL_CRLF)
4567         magnification *= 2;
4568     }
4569   else if (CODING_REQUIRE_ENCODING (coding))
4570     magnification = 3;
4571   else
4572     magnification = 1;
4573
4574   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4575 }
4576
/* Scratch buffer used while converting text.  It is filled in by
   allocate_conversion_buffer, grown by extend_conversion_buffer and
   released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Number of bytes available at DATA.  */
  int size;
  /* 1 if DATA was obtained from alloca, 0 if from xmalloc/xrealloc.  */
  int on_stack;
  /* The storage itself.  */
  unsigned char *data;
};
4584
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served with alloca, so this has
   to stay a macro: the memory must live in the caller's stack frame.
   Larger requests use xmalloc.

   BUF and LEN are parenthesized so that arbitrary expressions may be
   passed, but both are evaluated more than once; don't pass
   expressions with side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4600
4601 /* Double the allocated memory for *BUF.  */
4602 static void
4603 extend_conversion_buffer (buf)
4604      struct conversion_buffer *buf;
4605 {
4606   if (buf->on_stack)
4607     {
4608       unsigned char *save = buf->data;
4609       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4610       bcopy (save, buf->data, buf->size);
4611       buf->on_stack = 0;
4612     }
4613   else
4614     {
4615       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4616     }
4617   buf->size *= 2;
4618 }
4619
4620 /* Free the allocated memory for BUF if it is not on stack.  */
4621 static void
4622 free_conversion_buffer (buf)
4623      struct conversion_buffer *buf;
4624 {
4625   if (!buf->on_stack)
4626     xfree (buf->data);
4627 }
4628
4629 int
4630 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4631      struct coding_system *coding;
4632      unsigned char *source, *destination;
4633      int src_bytes, dst_bytes, encodep;
4634 {
4635   struct ccl_program *ccl
4636     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4637   unsigned char *dst = destination;
4638
4639   ccl->suppress_error = coding->suppress_error;
4640   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4641   if (encodep)
4642     {
4643       /* On encoding, EOL format is converted within ccl_driver.  For
4644          that, setup proper information in the structure CCL.  */
4645       ccl->eol_type = coding->eol_type;
4646       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4647         ccl->eol_type = CODING_EOL_LF;
4648       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4649       ccl->eight_bit_control = coding->dst_multibyte;
4650     }
4651   else
4652     ccl->eight_bit_control = 1;
4653   ccl->multibyte = coding->src_multibyte;
4654   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4655     {
4656       /* Move carryover bytes to DESTINATION.  */
4657       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4658       while (*p)
4659         *dst++ = *p++;
4660       coding->spec.ccl.eight_bit_carryover[0] = 0;
4661       if (dst_bytes)
4662         dst_bytes -= dst - destination;
4663     }
4664
4665   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4666                                   &(coding->consumed))
4667                       + dst - destination);
4668
4669   if (encodep)
4670     {
4671       coding->produced_char = coding->produced;
4672       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4673     }
4674   else if (!ccl->eight_bit_control)
4675     {
4676       /* The produced bytes forms a valid multibyte sequence. */
4677       coding->produced_char
4678         = multibyte_chars_in_text (destination, coding->produced);
4679       coding->spec.ccl.eight_bit_carryover[0] = 0;
4680     }
4681   else
4682     {
4683       /* On decoding, the destination should always multibyte.  But,
4684          CCL program might have been generated an invalid multibyte
4685          sequence.  Here we make such a sequence valid as
4686          multibyte.  */
4687       int bytes
4688         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4689
4690       if ((coding->consumed < src_bytes
4691            || !ccl->last_block)
4692           && coding->produced >= 1
4693           && destination[coding->produced - 1] >= 0x80)
4694         {
4695           /* We should not convert the tailing 8-bit codes to
4696              multibyte form even if they doesn't form a valid
4697              multibyte sequence.  They may form a valid sequence in
4698              the next call.  */
4699           int carryover = 0;
4700
4701           if (destination[coding->produced - 1] < 0xA0)
4702             carryover = 1;
4703           else if (coding->produced >= 2)
4704             {
4705               if (destination[coding->produced - 2] >= 0x80)
4706                 {
4707                   if (destination[coding->produced - 2] < 0xA0)
4708                     carryover = 2;
4709                   else if (coding->produced >= 3
4710                            && destination[coding->produced - 3] >= 0x80
4711                            && destination[coding->produced - 3] < 0xA0)
4712                     carryover = 3;
4713                 }
4714             }
4715           if (carryover > 0)
4716             {
4717               BCOPY_SHORT (destination + coding->produced - carryover,
4718                            coding->spec.ccl.eight_bit_carryover,
4719                            carryover);
4720               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4721               coding->produced -= carryover;
4722             }
4723         }
4724       coding->produced = str_as_multibyte (destination, bytes,
4725                                            coding->produced,
4726                                            &(coding->produced_char));
4727     }
4728
4729   switch (ccl->status)
4730     {
4731     case CCL_STAT_SUSPEND_BY_SRC:
4732       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4733       break;
4734     case CCL_STAT_SUSPEND_BY_DST:
4735       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4736       break;
4737     case CCL_STAT_QUIT:
4738     case CCL_STAT_INVALID_CMD:
4739       coding->result = CODING_FINISH_INTERRUPT;
4740       break;
4741     default:
4742       coding->result = CODING_FINISH_NORMAL;
4743       break;
4744     }
4745   return coding->result;
4746 }
4747
4748 /* Decode EOL format of the text at PTR of BYTES length destructively
4749    according to CODING->eol_type.  This is called after the CCL
4750    program produced a decoded text at PTR.  If we do CRLF->LF
4751    conversion, update CODING->produced and CODING->produced_char.  */
4752
4753 static void
4754 decode_eol_post_ccl (coding, ptr, bytes)
4755      struct coding_system *coding;
4756      unsigned char *ptr;
4757      int bytes;
4758 {
4759   Lisp_Object val, saved_coding_symbol;
4760   unsigned char *pend = ptr + bytes;
4761   int dummy;
4762
4763   /* Remember the current coding system symbol.  We set it back when
4764      an inconsistent EOL is found so that `last-coding-system-used' is
4765      set to the coding system that doesn't specify EOL conversion.  */
4766   saved_coding_symbol = coding->symbol;
4767
4768   coding->spec.ccl.cr_carryover = 0;
4769   if (coding->eol_type == CODING_EOL_UNDECIDED)
4770     {
4771       /* Here, to avoid the call of setup_coding_system, we directly
4772          call detect_eol_type.  */
4773       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4774       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4775         coding->eol_type = CODING_EOL_LF;
4776       if (coding->eol_type != CODING_EOL_UNDECIDED)
4777         {
4778           val = Fget (coding->symbol, Qeol_type);
4779           if (VECTORP (val) && XVECTOR (val)->size == 3)
4780             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4781         }
4782       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4783     }
4784
4785   if (coding->eol_type == CODING_EOL_LF
4786       || coding->eol_type == CODING_EOL_UNDECIDED)
4787     {
4788       /* We have nothing to do.  */
4789       ptr = pend;
4790     }
4791   else if (coding->eol_type == CODING_EOL_CRLF)
4792     {
4793       unsigned char *pstart = ptr, *p = ptr;
4794
4795       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4796           && *(pend - 1) == '\r')
4797         {
4798           /* If the last character is CR, we can't handle it here
4799              because LF will be in the not-yet-decoded source text.
4800              Record that the CR is not yet processed.  */
4801           coding->spec.ccl.cr_carryover = 1;
4802           coding->produced--;
4803           coding->produced_char--;
4804           pend--;
4805         }
4806       while (ptr < pend)
4807         {
4808           if (*ptr == '\r')
4809             {
4810               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4811                 {
4812                   *p++ = '\n';
4813                   ptr += 2;
4814                 }
4815               else
4816                 {
4817                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4818                     goto undo_eol_conversion;
4819                   *p++ = *ptr++;
4820                 }
4821             }
4822           else if (*ptr == '\n'
4823                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4824             goto undo_eol_conversion;
4825           else
4826             *p++ = *ptr++;
4827           continue;
4828
4829         undo_eol_conversion:
4830           /* We have faced with inconsistent EOL format at PTR.
4831              Convert all LFs before PTR back to CRLFs.  */
4832           for (p--, ptr--; p >= pstart; p--)
4833             {
4834               if (*p == '\n')
4835                 *ptr-- = '\n', *ptr-- = '\r';
4836               else
4837                 *ptr-- = *p;
4838             }
4839           /*  If carryover is recorded, cancel it because we don't
4840               convert CRLF anymore.  */
4841           if (coding->spec.ccl.cr_carryover)
4842             {
4843               coding->spec.ccl.cr_carryover = 0;
4844               coding->produced++;
4845               coding->produced_char++;
4846               pend++;
4847             }
4848           p = ptr = pend;
4849           coding->eol_type = CODING_EOL_LF;
4850           coding->symbol = saved_coding_symbol;
4851         }
4852       if (p < pend)
4853         {
4854           /* As each two-byte sequence CRLF was converted to LF, (PEND
4855              - P) is the number of deleted characters.  */
4856           coding->produced -= pend - p;
4857           coding->produced_char -= pend - p;
4858         }
4859     }
4860   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4861     {
4862       unsigned char *p = ptr;
4863
4864       for (; ptr < pend; ptr++)
4865         {
4866           if (*ptr == '\r')
4867             *ptr = '\n';
4868           else if (*ptr == '\n'
4869                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4870             {
4871               for (; p < ptr; p++)
4872                 {
4873                   if (*p == '\n')
4874                     *p = '\r';
4875                 }
4876               ptr = pend;
4877               coding->eol_type = CODING_EOL_LF;
4878               coding->symbol = saved_coding_symbol;
4879             }
4880         }
4881     }
4882 }
4883
4884 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4885    decoding, it may detect coding system and format of end-of-line if
4886    those are not yet decided.  The source should be unibyte, the
4887    result is multibyte if CODING->dst_multibyte is nonzero, else
4888    unibyte.  */
4889
4890 int
4891 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4892      struct coding_system *coding;
4893      const unsigned char *source;
4894      unsigned char *destination;
4895      int src_bytes, dst_bytes;
4896 {
4897   int extra = 0;
4898
4899   if (coding->type == coding_type_undecided)
4900     detect_coding (coding, source, src_bytes);
4901
4902   if (coding->eol_type == CODING_EOL_UNDECIDED
4903       && coding->type != coding_type_ccl)
4904     {
4905       detect_eol (coding, source, src_bytes);
4906       /* We had better recover the original eol format if we
4907          encounter an inconsistent eol format while decoding.  */
4908       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4909     }
4910
4911   coding->produced = coding->produced_char = 0;
4912   coding->consumed = coding->consumed_char = 0;
4913   coding->errors = 0;
4914   coding->result = CODING_FINISH_NORMAL;
4915
4916   switch (coding->type)
4917     {
4918     case coding_type_sjis:
4919       decode_coding_sjis_big5 (coding, source, destination,
4920                                src_bytes, dst_bytes, 1);
4921       break;
4922
4923     case coding_type_iso2022:
4924       decode_coding_iso2022 (coding, source, destination,
4925                              src_bytes, dst_bytes);
4926       break;
4927
4928     case coding_type_big5:
4929       decode_coding_sjis_big5 (coding, source, destination,
4930                                src_bytes, dst_bytes, 0);
4931       break;
4932
4933     case coding_type_emacs_mule:
4934       decode_coding_emacs_mule (coding, source, destination,
4935                                 src_bytes, dst_bytes);
4936       break;
4937
4938     case coding_type_ccl:
4939       if (coding->spec.ccl.cr_carryover)
4940         {
4941           /* Put the CR which was not processed by the previous call
4942              of decode_eol_post_ccl in DESTINATION.  It will be
4943              decoded together with the following LF by the call to
4944              decode_eol_post_ccl below.  */
4945           *destination = '\r';
4946           coding->produced++;
4947           coding->produced_char++;
4948           dst_bytes--;
4949           extra = coding->spec.ccl.cr_carryover;
4950         }
4951       ccl_coding_driver (coding, source, destination + extra,
4952                          src_bytes, dst_bytes, 0);
4953       if (coding->eol_type != CODING_EOL_LF)
4954         {
4955           coding->produced += extra;
4956           coding->produced_char += extra;
4957           decode_eol_post_ccl (coding, destination, coding->produced);
4958         }
4959       break;
4960
4961     default:
4962       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4963     }
4964
4965   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4966       && coding->mode & CODING_MODE_LAST_BLOCK
4967       && coding->consumed == src_bytes)
4968     coding->result = CODING_FINISH_NORMAL;
4969
4970   if (coding->mode & CODING_MODE_LAST_BLOCK
4971       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4972     {
4973       const unsigned char *src = source + coding->consumed;
4974       unsigned char *dst = destination + coding->produced;
4975
4976       src_bytes -= coding->consumed;
4977       coding->errors++;
4978       if (COMPOSING_P (coding))
4979         DECODE_COMPOSITION_END ('1');
4980       while (src_bytes--)
4981         {
4982           int c = *src++;
4983           dst += CHAR_STRING (c, dst);
4984           coding->produced_char++;
4985         }
4986       coding->consumed = coding->consumed_char = src - source;
4987       coding->produced = dst - destination;
4988       coding->result = CODING_FINISH_NORMAL;
4989     }
4990
4991   if (!coding->dst_multibyte)
4992     {
4993       coding->produced = str_as_unibyte (destination, coding->produced);
4994       coding->produced_char = coding->produced;
4995     }
4996
4997   return coding->result;
4998 }
4999
5000 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
5001    multibyteness of the source is CODING->src_multibyte, the
5002    multibyteness of the result is always unibyte.  */
5003
5004 int
5005 encode_coding (coding, source, destination, src_bytes, dst_bytes)
5006      struct coding_system *coding;
5007      const unsigned char *source;
5008      unsigned char *destination;
5009      int src_bytes, dst_bytes;
5010 {
5011   coding->produced = coding->produced_char = 0;
5012   coding->consumed = coding->consumed_char = 0;
5013   coding->errors = 0;
5014   coding->result = CODING_FINISH_NORMAL;
5015   if (coding->eol_type == CODING_EOL_UNDECIDED)
5016     coding->eol_type = CODING_EOL_LF;
5017
5018   switch (coding->type)
5019     {
5020     case coding_type_sjis:
5021       encode_coding_sjis_big5 (coding, source, destination,
5022                                src_bytes, dst_bytes, 1);
5023       break;
5024
5025     case coding_type_iso2022:
5026       encode_coding_iso2022 (coding, source, destination,
5027                              src_bytes, dst_bytes);
5028       break;
5029
5030     case coding_type_big5:
5031       encode_coding_sjis_big5 (coding, source, destination,
5032                                src_bytes, dst_bytes, 0);
5033       break;
5034
5035     case coding_type_emacs_mule:
5036       encode_coding_emacs_mule (coding, source, destination,
5037                                 src_bytes, dst_bytes);
5038       break;
5039
5040     case coding_type_ccl:
5041       ccl_coding_driver (coding, source, destination,
5042                          src_bytes, dst_bytes, 1);
5043       break;
5044
5045     default:
5046       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5047     }
5048
5049   if (coding->mode & CODING_MODE_LAST_BLOCK
5050       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5051     {
5052       const unsigned char *src = source + coding->consumed;
5053       unsigned char *dst = destination + coding->produced;
5054
5055       if (coding->type == coding_type_iso2022)
5056         ENCODE_RESET_PLANE_AND_REGISTER;
5057       if (COMPOSING_P (coding))
5058         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5059       if (coding->consumed < src_bytes)
5060         {
5061           int len = src_bytes - coding->consumed;
5062
5063           BCOPY_SHORT (src, dst, len);
5064           if (coding->src_multibyte)
5065             len = str_as_unibyte (dst, len);
5066           dst += len;
5067           coding->consumed = src_bytes;
5068         }
5069       coding->produced = coding->produced_char = dst - destination;
5070       coding->result = CODING_FINISH_NORMAL;
5071     }
5072
5073   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5074       && coding->consumed == src_bytes)
5075     coding->result = CODING_FINISH_NORMAL;
5076
5077   return coding->result;
5078 }
5079
5080 /* Scan text in the region between *BEG and *END (byte positions),
5081    skip characters which we don't have to decode by coding system
5082    CODING at the head and tail, then set *BEG and *END to the region
5083    of the text we actually have to convert.  The caller should move
5084    the gap out of the region in advance if the region is from a
5085    buffer.
5086
5087    If STR is not NULL, *BEG and *END are indices into STR.  */
5088
5089 static void
5090 shrink_decoding_region (beg, end, coding, str)
5091      int *beg, *end;
5092      struct coding_system *coding;
5093      unsigned char *str;
5094 {
5095   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5096   int eol_conversion;
5097   Lisp_Object translation_table;
5098
5099   if (coding->type == coding_type_ccl
5100       || coding->type == coding_type_undecided
5101       || coding->eol_type != CODING_EOL_LF
5102       || !NILP (coding->post_read_conversion)
5103       || coding->composing != COMPOSITION_DISABLED)
5104     {
5105       /* We can't skip any data.  */
5106       return;
5107     }
5108   if (coding->type == coding_type_no_conversion
5109       || coding->type == coding_type_raw_text
5110       || coding->type == coding_type_emacs_mule)
5111     {
5112       /* We need no conversion, but don't have to skip any data here.
5113          Decoding routine handles them effectively anyway.  */
5114       return;
5115     }
5116
5117   translation_table = coding->translation_table_for_decode;
5118   if (NILP (translation_table) && !NILP (Venable_character_translation))
5119     translation_table = Vstandard_translation_table_for_decode;
5120   if (CHAR_TABLE_P (translation_table))
5121     {
5122       int i;
5123       for (i = 0; i < 128; i++)
5124         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5125           break;
5126       if (i < 128)
5127         /* Some ASCII character should be translated.  We give up
5128            shrinking.  */
5129         return;
5130     }
5131
5132   if (coding->heading_ascii >= 0)
5133     /* Detection routine has already found how much we can skip at the
5134        head.  */
5135     *beg += coding->heading_ascii;
5136
5137   if (str)
5138     {
5139       begp_orig = begp = str + *beg;
5140       endp_orig = endp = str + *end;
5141     }
5142   else
5143     {
5144       begp_orig = begp = BYTE_POS_ADDR (*beg);
5145       endp_orig = endp = begp + *end - *beg;
5146     }
5147
5148   eol_conversion = (coding->eol_type == CODING_EOL_CR
5149                     || coding->eol_type == CODING_EOL_CRLF);
5150
5151   switch (coding->type)
5152     {
5153     case coding_type_sjis:
5154     case coding_type_big5:
5155       /* We can skip all ASCII characters at the head.  */
5156       if (coding->heading_ascii < 0)
5157         {
5158           if (eol_conversion)
5159             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5160           else
5161             while (begp < endp && *begp < 0x80) begp++;
5162         }
5163       /* We can skip all ASCII characters at the tail except for the
5164          second byte of SJIS or BIG5 code.  */
5165       if (eol_conversion)
5166         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5167       else
5168         while (begp < endp && endp[-1] < 0x80) endp--;
5169       /* Do not consider LF as ascii if preceded by CR, since that
5170          confuses eol decoding. */
5171       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5172         endp++;
5173       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5174         endp++;
5175       break;
5176
5177     case coding_type_iso2022:
5178       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5179         /* We can't skip any data.  */
5180         break;
5181       if (coding->heading_ascii < 0)
5182         {
5183           /* We can skip all ASCII characters at the head except for a
5184              few control codes.  */
5185           while (begp < endp && (c = *begp) < 0x80
5186                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5187                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5188                  && (!eol_conversion || c != ISO_CODE_LF))
5189             begp++;
5190         }
5191       switch (coding->category_idx)
5192         {
5193         case CODING_CATEGORY_IDX_ISO_8_1:
5194         case CODING_CATEGORY_IDX_ISO_8_2:
5195           /* We can skip all ASCII characters at the tail.  */
5196           if (eol_conversion)
5197             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5198           else
5199             while (begp < endp && endp[-1] < 0x80) endp--;
5200           /* Do not consider LF as ascii if preceded by CR, since that
5201              confuses eol decoding. */
5202           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5203             endp++;
5204           break;
5205
5206         case CODING_CATEGORY_IDX_ISO_7:
5207         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5208           {
5209             /* We can skip all characters at the tail except for 8-bit
5210                codes and ESC and the following 2-byte at the tail.  */
5211             unsigned char *eight_bit = NULL;
5212
5213             if (eol_conversion)
5214               while (begp < endp
5215                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5216                 {
5217                   if (!eight_bit && c & 0x80) eight_bit = endp;
5218                   endp--;
5219                 }
5220             else
5221               while (begp < endp
5222                      && (c = endp[-1]) != ISO_CODE_ESC)
5223                 {
5224                   if (!eight_bit && c & 0x80) eight_bit = endp;
5225                   endp--;
5226                 }
5227             /* Do not consider LF as ascii if preceded by CR, since that
5228                confuses eol decoding. */
5229             if (begp < endp && endp < endp_orig
5230                 && endp[-1] == '\r' && endp[0] == '\n')
5231               endp++;
5232             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5233               {
5234                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5235                   /* This is an ASCII designation sequence.  We can
5236                      surely skip the tail.  But, if we have
5237                      encountered an 8-bit code, skip only the codes
5238                      after that.  */
5239                   endp = eight_bit ? eight_bit : endp + 2;
5240                 else
5241                   /* Hmmm, we can't skip the tail.  */
5242                   endp = endp_orig;
5243               }
5244             else if (eight_bit)
5245               endp = eight_bit;
5246           }
5247         }
5248       break;
5249
5250     default:
5251       abort ();
5252     }
5253   *beg += begp - begp_orig;
5254   *end += endp - endp_orig;
5255   return;
5256 }
5257
5258 /* Like shrink_decoding_region but for encoding.  */
5259
5260 static void
5261 shrink_encoding_region (beg, end, coding, str)
5262      int *beg, *end;
5263      struct coding_system *coding;
5264      unsigned char *str;
5265 {
5266   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5267   int eol_conversion;
5268   Lisp_Object translation_table;
5269
5270   if (coding->type == coding_type_ccl
5271       || coding->eol_type == CODING_EOL_CRLF
5272       || coding->eol_type == CODING_EOL_CR
5273       || (coding->cmp_data && coding->cmp_data->used > 0))
5274     {
5275       /* We can't skip any data.  */
5276       return;
5277     }
5278   if (coding->type == coding_type_no_conversion
5279       || coding->type == coding_type_raw_text
5280       || coding->type == coding_type_emacs_mule
5281       || coding->type == coding_type_undecided)
5282     {
5283       /* We need no conversion, but don't have to skip any data here.
5284          Encoding routine handles them effectively anyway.  */
5285       return;
5286     }
5287
5288   translation_table = coding->translation_table_for_encode;
5289   if (NILP (translation_table) && !NILP (Venable_character_translation))
5290     translation_table = Vstandard_translation_table_for_encode;
5291   if (CHAR_TABLE_P (translation_table))
5292     {
5293       int i;
5294       for (i = 0; i < 128; i++)
5295         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5296           break;
5297       if (i < 128)
5298         /* Some ASCII character should be translated.  We give up
5299            shrinking.  */
5300         return;
5301     }
5302
5303   if (str)
5304     {
5305       begp_orig = begp = str + *beg;
5306       endp_orig = endp = str + *end;
5307     }
5308   else
5309     {
5310       begp_orig = begp = BYTE_POS_ADDR (*beg);
5311       endp_orig = endp = begp + *end - *beg;
5312     }
5313
5314   eol_conversion = (coding->eol_type == CODING_EOL_CR
5315                     || coding->eol_type == CODING_EOL_CRLF);
5316
5317   /* Here, we don't have to check coding->pre_write_conversion because
5318      the caller is expected to have handled it already.  */
5319   switch (coding->type)
5320     {
5321     case coding_type_iso2022:
5322       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5323         /* We can't skip any data.  */
5324         break;
5325       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5326         {
5327           unsigned char *bol = begp;
5328           while (begp < endp && *begp < 0x80)
5329             {
5330               begp++;
5331               if (begp[-1] == '\n')
5332                 bol = begp;
5333             }
5334           begp = bol;
5335           goto label_skip_tail;
5336         }
5337       /* fall down ... */
5338
5339     case coding_type_sjis:
5340     case coding_type_big5:
5341       /* We can skip all ASCII characters at the head and tail.  */
5342       if (eol_conversion)
5343         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5344       else
5345         while (begp < endp && *begp < 0x80) begp++;
5346     label_skip_tail:
5347       if (eol_conversion)
5348         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5349       else
5350         while (begp < endp && *(endp - 1) < 0x80) endp--;
5351       break;
5352
5353     default:
5354       abort ();
5355     }
5356
5357   *beg += begp - begp_orig;
5358   *end += endp - endp_orig;
5359   return;
5360 }
5361
/* Shrinking the conversion region has some overhead of its own, so
   a region whose length does not exceed this value is not shrunk.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END that is to be converted by CODING,
   provided it is longer than shrink_conversion_region_threshhold.
   The encoding variant is used if ENCODEP is nonzero, the decoding
   variant otherwise.  STR is handed through to the shrinking
   function unchanged.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5375
5376 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5377    Vlast_coding_system_used and the remaining elements are buffers to
5378    kill.  */
5379 static Lisp_Object
5380 code_convert_region_unwind (arg)
5381      Lisp_Object arg;
5382 {
5383   struct gcpro gcpro1;
5384   GCPRO1 (arg);
5385
5386   inhibit_pre_post_conversion = 0;
5387   Vlast_coding_system_used = XCAR (arg);
5388   for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5389     Fkill_buffer (XCAR (arg));
5390
5391   UNGCPRO;
5392   return Qnil;
5393 }
5394
5395 /* Store information about all compositions in the range FROM and TO
5396    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5397    buffer or a string, defaults to the current buffer.  */
5398
5399 void
5400 coding_save_composition (coding, from, to, obj)
5401      struct coding_system *coding;
5402      int from, to;
5403      Lisp_Object obj;
5404 {
5405   Lisp_Object prop;
5406   int start, end;
5407
5408   if (coding->composing == COMPOSITION_DISABLED)
5409     return;
5410   if (!coding->cmp_data)
5411     coding_allocate_composition_data (coding, from);
5412   if (!find_composition (from, to, &start, &end, &prop, obj)
5413       || end > to)
5414     return;
5415   if (start < from
5416       && (!find_composition (end, to, &start, &end, &prop, obj)
5417           || end > to))
5418     return;
5419   coding->composing = COMPOSITION_NO;
5420   do
5421     {
5422       if (COMPOSITION_VALID_P (start, end, prop))
5423         {
5424           enum composition_method method = COMPOSITION_METHOD (prop);
5425           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5426               >= COMPOSITION_DATA_SIZE)
5427             coding_allocate_composition_data (coding, from);
5428           /* For relative composition, we remember start and end
5429              positions, for the other compositions, we also remember
5430              components.  */
5431           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5432           if (method != COMPOSITION_RELATIVE)
5433             {
5434               /* We must store a*/
5435               Lisp_Object val, ch;
5436
5437               val = COMPOSITION_COMPONENTS (prop);
5438               if (CONSP (val))
5439                 while (CONSP (val))
5440                   {
5441                     ch = XCAR (val), val = XCDR (val);
5442                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5443                   }
5444               else if (VECTORP (val) || STRINGP (val))
5445                 {
5446                   int len = (VECTORP (val)
5447                              ? XVECTOR (val)->size : SCHARS (val));
5448                   int i;
5449                   for (i = 0; i < len; i++)
5450                     {
5451                       ch = (STRINGP (val)
5452                             ? Faref (val, make_number (i))
5453                             : XVECTOR (val)->contents[i]);
5454                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5455                     }
5456                 }
5457               else              /* INTEGERP (val) */
5458                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5459             }
5460           CODING_ADD_COMPOSITION_END (coding, end - from);
5461         }
5462       start = end;
5463     }
5464   while (start < to
5465          && find_composition (start, to, &start, &end, &prop, obj)
5466          && end <= to);
5467
5468   /* Make coding->cmp_data point to the first memory block.  */
5469   while (coding->cmp_data->prev)
5470     coding->cmp_data = coding->cmp_data->prev;
5471   coding->cmp_data_start = 0;
5472 }
5473
5474 /* Reflect the saved information about compositions to OBJ.
5475    CODING->cmp_data points to a memory block for the information.  OBJ
5476    is a buffer or a string, defaults to the current buffer.  */
5477
5478 void
5479 coding_restore_composition (coding, obj)
5480      struct coding_system *coding;
5481      Lisp_Object obj;
5482 {
5483   struct composition_data *cmp_data = coding->cmp_data;
5484
5485   if (!cmp_data)
5486     return;
5487
5488   while (cmp_data->prev)
5489     cmp_data = cmp_data->prev;
5490
5491   while (cmp_data)
5492     {
5493       int i;
5494
5495       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5496            i += cmp_data->data[i])
5497         {
5498           int *data = cmp_data->data + i;
5499           enum composition_method method = (enum composition_method) data[3];
5500           Lisp_Object components;
5501
5502           if (data[0] < 0 || i + data[0] > cmp_data->used)
5503             /* Invalid composition data.  */
5504             break;
5505
5506           if (method == COMPOSITION_RELATIVE)
5507             components = Qnil;
5508           else
5509             {
5510               int len = data[0] - 4, j;
5511               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5512
5513               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5514                   && len % 2 == 0)
5515                 len --;
5516               if (len < 1)
5517                 /* Invalid composition data.  */
5518                 break;
5519               for (j = 0; j < len; j++)
5520                 args[j] = make_number (data[4 + j]);
5521               components = (method == COMPOSITION_WITH_ALTCHARS
5522                             ? Fstring (len, args)
5523                             : Fvector (len, args));
5524             }
5525           compose_text (data[1], data[2], components, Qnil, obj);
5526         }
5527       cmp_data = cmp_data->next;
5528     }
5529 }
5530
5531 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5532    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5533    coding system CODING, and return the status code of code conversion
5534    (currently, this value has no meaning; every return path below
        returns 0).
5535
5536    How many characters (and bytes) are converted to how many
5537    characters (and bytes) are recorded in members of the structure
5538    CODING (consumed, consumed_char, produced and produced_char).
5539
5540    If REPLACE is nonzero, we do various things as if the original text
5541    is deleted and a new text is inserted.  See the comments in
5542    replace_range (insdel.c) to know what we are doing.
5543
5544    If REPLACE is zero, it is assumed that the source text is unibyte.
5545    Otherwise, it is assumed that the source text is multibyte.  */
5546
5547 int
5548 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5549      int from, from_byte, to, to_byte, encodep, replace;
5550      struct coding_system *coding;
5551 {
5552   int len = to - from, len_byte = to_byte - from_byte;
5553   int nchars_del = 0, nbytes_del = 0;
5554   int require, inserted, inserted_byte;
5555   int head_skip, tail_skip, total_skip = 0;
5556   Lisp_Object saved_coding_symbol;
5557   int first = 1;
5558   unsigned char *src, *dst;
5559   Lisp_Object deletion;
5560   int orig_point = PT, orig_len = len;
5561   int prev_Z;
5562   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5563
5564   deletion = Qnil;
5565   saved_coding_symbol = coding->symbol;
5566
       /* If point is strictly inside the region, move it to FROM and
          remember FROM as the position to restore at the end.  */
5567   if (from < PT && PT < to)
5568     {
5569       TEMP_SET_PT_BOTH (from, from_byte);
5570       orig_point = from;
5571     }
5572
5573   if (replace)
5574     {
5575       int saved_from = from;
5576       int saved_inhibit_modification_hooks;
5577
5578       prepare_to_modify_buffer (from, to, &from);
           /* If FROM was changed by the call above, recompute TO and
              the byte positions from it.  */
5579       if (saved_from != from)
5580         {
5581           to = from + len;
5582           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5583           len_byte = to_byte - from_byte;
5584         }
5585
5586       /* The code conversion routine can not preserve text properties
5587          for now.  So, we must remove all text properties in the
5588          region.  Here, we must suppress all modification hooks.  */
5589       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5590       inhibit_modification_hooks = 1;
5591       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5592       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5593     }
5594
5595   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5596     {
5597       /* We must detect encoding of text and eol format.  */
5598
           /* Move the gap out of the region so that the detectors can
              be given one contiguous run of LEN_BYTE bytes.  */
5599       if (from < GPT && to > GPT)
5600         move_gap_both (from, from_byte);
5601       if (coding->type == coding_type_undecided)
5602         {
5603           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5604           if (coding->type == coding_type_undecided)
5605             {
5606               /* It seems that the text contains only ASCII, but we
5607                  should not leave it undecided because the deeper
5608                  decoding routine (decode_coding) tries to detect the
5609                  encodings again in vain.  */
5610               coding->type = coding_type_emacs_mule;
5611               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5612               /* As emacs-mule decoder will handle composition, we
5613                  need this setting to allocate coding->cmp_data
5614                  later.  */
5615               coding->composing = COMPOSITION_NO;
5616             }
5617         }
5618       if (coding->eol_type == CODING_EOL_UNDECIDED
5619           && coding->type != coding_type_ccl)
5620         {
5621           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5622           if (coding->eol_type == CODING_EOL_UNDECIDED)
5623             coding->eol_type = CODING_EOL_LF;
5624           /* We had better recover the original eol format if we
5625              encounter an inconsistent eol format while decoding.  */
5626           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5627         }
5628     }
5629
5630   /* Now we convert the text.  */
5631
5632   /* For encoding, we must process pre-write-conversion in advance.  */
5633   if (! inhibit_pre_post_conversion
5634       && encodep
5635       && SYMBOLP (coding->pre_write_conversion)
5636       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5637     {
5638       /* The function in pre-write-conversion may put a new text in a
5639          new buffer.  */
5640       struct buffer *prev = current_buffer;
5641       Lisp_Object new;
5642
5643       record_unwind_protect (code_convert_region_unwind,
5644                              Fcons (Vlast_coding_system_used, Qnil));
5645       /* We should not call any more pre-write/post-read-conversion
5646          functions while this pre-write-conversion is running.  */
5647       inhibit_pre_post_conversion = 1;
5648       call2 (coding->pre_write_conversion,
5649              make_number (from), make_number (to));
5650       inhibit_pre_post_conversion = 0;
5651       /* Discard the unwind protect.  */
5652       specpdl_ptr--;
5653
           /* The function left us in another buffer: replace the
              region in PREV with LEN (= ZV - BEGV of that buffer)
              characters taken from it starting at position 1, kill
              it, and recompute the region and the saved point.  */
5654       if (current_buffer != prev)
5655         {
5656           len = ZV - BEGV;
5657           new = Fcurrent_buffer ();
5658           set_buffer_internal_1 (prev);
5659           del_range_2 (from, from_byte, to, to_byte, 0);
5660           TEMP_SET_PT_BOTH (from, from_byte);
5661           insert_from_buffer (XBUFFER (new), 1, len, 0);
5662           Fkill_buffer (new);
5663           if (orig_point >= to)
5664             orig_point += len - orig_len;
5665           else if (orig_point > from)
5666             orig_point = from;
5667           orig_len = len;
5668           to = from + len;
5669           from_byte = CHAR_TO_BYTE (from);
5670           to_byte = CHAR_TO_BYTE (to);
5671           len_byte = to_byte - from_byte;
5672           TEMP_SET_PT_BOTH (from, from_byte);
5673         }
5674     }
5675
       /* When replacing, keep either a copy of the original text (when
          the undo list is not t) or just its size in characters and
          bytes; whichever is kept is handed to adjust_after_replace or
          adjust_after_replace_noundo after the conversion.  */
5676   if (replace)
5677     {
5678       if (! EQ (current_buffer->undo_list, Qt))
5679         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5680       else
5681         {
5682           nchars_del = to - from;
5683           nbytes_del = to_byte - from_byte;
5684         }
5685     }
5686
       /* Unless composition handling is disabled, collect the
          compositions of the region when encoding, or just allocate
          the composition data when decoding.  */
5687   if (coding->composing != COMPOSITION_DISABLED)
5688     {
5689       if (encodep)
5690         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5691       else
5692         coding_allocate_composition_data (coding, from);
5693     }
5694
5695   /* Try to skip the leading and trailing ASCII characters.  We can't
5696      skip them if we must run CCL program or there are compositions to
5697      encode.  */
5698   if (coding->type != coding_type_ccl
5699       && (! coding->cmp_data || coding->cmp_data->used == 0))
5700     {
5701       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5702
5703       if (from < GPT && GPT < to)
5704         move_gap_both (from, from_byte);
5705       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* The region shrank to nothing and there is neither a
              post-read-conversion to run (when decoding) nor anything
              to flush: report the text as converted unchanged.  */
5706       if (from_byte == to_byte
5707           && (encodep || NILP (coding->post_read_conversion))
5708           && ! CODING_REQUIRE_FLUSHING (coding))
5709         {
5710           coding->produced = len_byte;
5711           coding->produced_char = len;
5712           if (!replace)
5713             /* We must record and adjust for this new text now.  */
5714             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5715           coding_free_composition_data (coding);
5716           return 0;
5717         }
5718
           /* The skip counts are measured in bytes, and the same
              counts are applied to the character positions and
              lengths below.  */
5719       head_skip = from_byte - from_byte_orig;
5720       tail_skip = to_byte_orig - to_byte;
5721       total_skip = head_skip + tail_skip;
5722       from += head_skip;
5723       to -= tail_skip;
5724       len -= total_skip; len_byte -= total_skip;
5725     }
5726
5727   /* For conversion, we must put the gap before the text in addition to
5728      making the gap larger for efficient decoding.  The required gap
5729      size starts from 2000 which is the magic number used in make_gap.
5730      But, after one batch of conversion, it will be incremented if we
5731      find that it is not enough.  */
5732   require = 2000;
5733
5734   if (GAP_SIZE  < require)
5735     make_gap (require - GAP_SIZE);
5736   move_gap_both (from, from_byte);
5737
5738   inserted = inserted_byte = 0;
5739
       /* Count the source text as part of the gap and shrink the
          buffer bounds accordingly; see the memory layout pictures in
          the loop below.  */
5740   GAP_SIZE += len_byte;
5741   ZV -= len;
5742   Z -= len;
5743   ZV_BYTE -= len_byte;
5744   Z_BYTE -= len_byte;
5745
5746   if (GPT - BEG < BEG_UNCHANGED)
5747     BEG_UNCHANGED = GPT - BEG;
5748   if (Z - GPT < END_UNCHANGED)
5749     END_UNCHANGED = Z - GPT;
5750
5751   if (!encodep && coding->src_multibyte)
5752     {
5753       /* Decoding routines expect that the source text is unibyte.
5754          We must convert 8-bit characters of multibyte form to
5755          unibyte.  */
5756       int len_byte_orig = len_byte;
5757       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, move it so that it again ends
              at GAP_END_ADDR.  */
5758       if (len_byte < len_byte_orig)
5759         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5760                     len_byte);
5761       coding->src_multibyte = 0;
5762     }
5763
       /* Convert batch by batch until the source is used up or the
          conversion routine reports a status that ends the loop.  */
5764   for (;;)
5765     {
5766       int result;
5767
5768       /* The buffer memory is now:
5769          +--------+converted-text+---------+-------original-text-------+---+
5770          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5771                   |<---------------------- GAP ----------------------->|  */
5772       src = GAP_END_ADDR - len_byte;
5773       dst = GPT_ADDR + inserted_byte;
5774
5775       if (encodep)
5776         result = encode_coding (coding, src, dst, len_byte, 0);
5777       else
5778         {
5779           if (coding->composing != COMPOSITION_DISABLED)
5780             coding->cmp_data->char_offset = from + inserted;
5781           result = decode_coding (coding, src, dst, len_byte, 0);
5782         }
5783
5784       /* The buffer memory is now:
5785          +--------+-------converted-text----+--+------original-text----+---+
5786          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5787                   |<---------------------- GAP ----------------------->|  */
5788
5789       inserted += coding->produced_char;
5790       inserted_byte += coding->produced;
5791       len_byte -= coding->consumed;
5792
           /* The composition data is full: allocate more and go on
              with the rest of the source.  */
5793       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5794         {
5795           coding_allocate_composition_data (coding, from + inserted);
5796           continue;
5797         }
5798
5799       src += coding->consumed;
5800       dst += coding->produced;
5801
5802       if (result == CODING_FINISH_NORMAL)
5803         {
5804           src += len_byte;
5805           break;
5806         }
5807       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5808         {
5809           unsigned char *pend = dst, *p = pend - inserted_byte;
5810           Lisp_Object eol_type;
5811
5812           /* Encode LFs back to the original eol format (CR or CRLF).  */
5813           if (coding->eol_type == CODING_EOL_CR)
5814             {
5815               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5816             }
5817           else
5818             {
5819               int count = 0;
5820
                   /* Count the LFs; each one needs one extra byte
                      for its CR.  */
5821               while (p < pend) if (*p++ == '\n') count++;
5822               if (src - dst < count)
5823                 {
5824                   /* We don't have sufficient room for encoding LFs
5825                      back to CRLF.  We must record converted and
5826                      not-yet-converted text back to the buffer
5827                      content, enlarge the gap, then record them out of
5828                      the buffer contents again.  */
5829                   int add = len_byte + inserted_byte;
5830
5831                   GAP_SIZE -= add;
5832                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5833                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5834                   make_gap (count - GAP_SIZE);
5835                   GAP_SIZE += add;
5836                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5837                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5838                   /* Don't forget to update SRC, DST, and PEND.  */
5839                   src = GAP_END_ADDR - len_byte;
5840                   dst = GPT_ADDR + inserted_byte;
5841                   pend = dst;
5842                 }
5843               inserted += count;
5844               inserted_byte += count;
5845               coding->produced += count;
                   /* Copy the converted text backward into its
                      widened place, putting a CR in front of every
                      LF.  */
5846               p = dst = pend + count;
5847               while (count)
5848                 {
5849                   *--p = *--pend;
5850                   if (*p == '\n') count--, *--p = '\r';
5851                 }
5852             }
5853
5854           /* Suppress eol-format conversion in the further conversion.  */
5855           coding->eol_type = CODING_EOL_LF;
5856
5857           /* Set the coding system symbol to that for Unix-like EOL.  */
5858           eol_type = Fget (saved_coding_symbol, Qeol_type);
5859           if (VECTORP (eol_type)
5860               && XVECTOR (eol_type)->size == 3
5861               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5862             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5863           else
5864             coding->symbol = saved_coding_symbol;
5865
5866           continue;
5867         }
           /* No source is left.  We are done, except that a CCL-based
              coding system gets one more call with
              CODING_MODE_LAST_BLOCK set if it was not set already.  */
5868       if (len_byte <= 0)
5869         {
5870           if (coding->type != coding_type_ccl
5871               || coding->mode & CODING_MODE_LAST_BLOCK)
5872             break;
5873           coding->mode |= CODING_MODE_LAST_BLOCK;
5874           continue;
5875         }
5876       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5877         {
5878           /* The source text ends in invalid codes.  Let's just
5879              make them valid buffer contents, and finish conversion.  */
5880           if (multibyte_p)
5881             {
5882               unsigned char *start = dst;
5883
5884               inserted += len_byte;
5885               while (len_byte--)
5886                 {
5887                   int c = *src++;
5888                   dst += CHAR_STRING (c, dst);
5889                 }
5890
5891               inserted_byte += dst - start;
5892             }
5893           else
5894             {
5895               inserted += len_byte;
5896               inserted_byte += len_byte;
5897               while (len_byte--)
5898                 *dst++ = *src++;
5899             }
5900           break;
5901         }
5902       if (result == CODING_FINISH_INTERRUPT)
5903         {
5904           /* The conversion procedure was interrupted by a user.  */
5905           break;
5906         }
5907       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5908       if (coding->consumed < 1)
5909         {
5910           /* It's quite strange to require more memory without
5911              consuming any bytes.  Perhaps CCL program bug.  */
5912           break;
5913         }
5914       if (first)
5915         {
5916           /* We have just done the first batch of conversion which was
5917              stopped because of insufficient gap.  Let's reconsider the
5918              required gap size (i.e. SRC - DST) now.
5919
5920              We have converted ORIG bytes (== coding->consumed) into
5921              NEW bytes (coding->produced).  To convert the remaining
5922              LEN bytes, we may need REQUIRE bytes of gap, where:
5923                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5924                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5925              Here, we are sure that NEW >= ORIG.  */
5926
5927           if (coding->produced <= coding->consumed)
5928             {
5929               /* This happens because of CCL-based coding system with
5930                  eol-type CRLF.  */
5931               require = 0;
5932             }
5933           else
5934             {
5935               float ratio = coding->produced - coding->consumed;
5936               ratio /= coding->consumed;
5937               require = len_byte * ratio;
5938             }
5939           first = 0;
5940         }
5941       if ((src - dst) < (require + 2000))
5942         {
5943           /* See the comment above the previous call of make_gap.  */
5944           int add = len_byte + inserted_byte;
5945
5946           GAP_SIZE -= add;
5947           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5948           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5949           make_gap (require + 2000);
5950           GAP_SIZE += add;
5951           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5952           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5953         }
5954     }
5955   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5956
5957   if (encodep && coding->dst_multibyte)
5958     {
5959       /* The output is unibyte.  We must convert 8-bit characters to
5960          multibyte form.  */
5961       if (inserted_byte * 2 > GAP_SIZE)
5962         {
               /* Enlarge the gap first, using the same temporary
                  bookkeeping as in the conversion loop above.  */
5963           GAP_SIZE -= inserted_byte;
5964           ZV += inserted_byte; Z += inserted_byte;
5965           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5966           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5967           make_gap (inserted_byte - GAP_SIZE);
5968           GAP_SIZE += inserted_byte;
5969           ZV -= inserted_byte; Z -= inserted_byte;
5970           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5971           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5972         }
5973       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5974     }
5975
5976   /* If we shrank the conversion area, adjust it now.  */
5977   if (total_skip > 0)
5978     {
           /* Bring the skipped tail next to the converted text, then
              widen all the counts and positions so that they cover
              the skipped head and tail again.  */
5979       if (tail_skip > 0)
5980         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5981       inserted += total_skip; inserted_byte += total_skip;
5982       GAP_SIZE += total_skip;
5983       GPT -= head_skip; GPT_BYTE -= head_skip;
5984       ZV -= total_skip; ZV_BYTE -= total_skip;
5985       Z -= total_skip; Z_BYTE -= total_skip;
5986       from -= head_skip; from_byte -= head_skip;
5987       to += tail_skip; to_byte += tail_skip;
5988     }
5989
       /* Record the replacement, then recompute INSERTED from the
          change in Z across that call.  */
5990   prev_Z = Z;
5991   if (! EQ (current_buffer->undo_list, Qt))
5992     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5993   else
5994     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5995                                  inserted, inserted_byte);
5996   inserted = Z - prev_Z;
5997
5998   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5999     coding_restore_composition (coding, Fcurrent_buffer ());
6000   coding_free_composition_data (coding);
6001
       /* For decoding, run post-read-conversion on the inserted text.  */
6002   if (! inhibit_pre_post_conversion
6003       && ! encodep && ! NILP (coding->post_read_conversion))
6004     {
6005       Lisp_Object val;
6006       Lisp_Object saved_coding_system;
6007
6008       if (from != PT)
6009         TEMP_SET_PT_BOTH (from, from_byte);
6010       prev_Z = Z;
6011       record_unwind_protect (code_convert_region_unwind,
6012                              Fcons (Vlast_coding_system_used, Qnil));
           /* Bind Vlast_coding_system_used to CODING's symbol around
              the call, and take whatever the function leaves there as
              the new symbol of CODING.  */
6013       saved_coding_system = Vlast_coding_system_used;
6014       Vlast_coding_system_used = coding->symbol;
6015       /* We should not call any more pre-write/post-read-conversion
6016          functions while this post-read-conversion is running.  */
6017       inhibit_pre_post_conversion = 1;
6018       val = call1 (coding->post_read_conversion, make_number (inserted));
6019       inhibit_pre_post_conversion = 0;
6020       coding->symbol = Vlast_coding_system_used;
6021       Vlast_coding_system_used = saved_coding_system;
6022       /* Discard the unwind protect.  */
6023       specpdl_ptr--;
6024       CHECK_NUMBER (val);
           /* Account for the change of buffer size made by the
              function.  */
6025       inserted += Z - prev_Z;
6026     }
6027
       /* Restore point: shift it by the change in length if it was at
          or after the end of the original text, otherwise put it at
          FROM.  */
6028   if (orig_point >= from)
6029     {
6030       if (orig_point >= from + orig_len)
6031         orig_point += inserted - orig_len;
6032       else
6033         orig_point = from;
6034       TEMP_SET_PT (orig_point);
6035     }
6036
6037   if (replace)
6038     {
6039       signal_after_change (from, to - from, inserted);
6040       update_compositions (from, from + inserted, CHECK_BORDER);
6041     }
6042
       /* Report the totals of the whole conversion in CODING.  */
6043   {
6044     coding->consumed = to_byte - from_byte;
6045     coding->consumed_char = to - from;
6046     coding->produced = inserted_byte;
6047     coding->produced_char = inserted;
6048   }
6049
6050   return 0;
6051 }
6052
6053 /* Name (or base name) of work buffer for code conversion.  */
6054 static Lisp_Object Vcode_conversion_workbuf_name;
6055
6056 /* Set the current buffer to the working buffer prepared for
6057    code-conversion.  MULTIBYTE specifies the multibyteness of the
6058    buffer.  Return the buffer we set if it must be killed after use.
6059    Otherwise return Qnil.  */
6060
6061 static Lisp_Object
6062 set_conversion_work_buffer (multibyte)
6063      int multibyte;
6064 {
6065   Lisp_Object buffer, buffer_to_kill;
6066   struct buffer *buf;
6067
6068   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6069   buf = XBUFFER (buffer);
6070   if (buf == current_buffer)
6071     {
6072       /* As we are already in the work buffer, we must generate a new
6073          buffer for the work.  */
6074       Lisp_Object name;
6075
6076       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6077       buffer = buffer_to_kill = Fget_buffer_create (name);
6078       buf = XBUFFER (buffer);
6079     }
6080   else
6081     buffer_to_kill = Qnil;
6082
6083   delete_all_overlays (buf);
6084   buf->directory = current_buffer->directory;
6085   buf->read_only = Qnil;
6086   buf->filename = Qnil;
6087   buf->undo_list = Qt;
6088   eassert (buf->overlays_before == NULL);
6089   eassert (buf->overlays_after == NULL);
6090   set_buffer_internal (buf);
6091   if (BEG != BEGV || Z != ZV)
6092     Fwiden ();
6093   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6094   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6095   return buffer_to_kill;
6096 }
6097
6098 Lisp_Object
6099 run_pre_post_conversion_on_str (str, coding, encodep)
6100      Lisp_Object str;
6101      struct coding_system *coding;
6102      int encodep;
6103 {
6104   int count = SPECPDL_INDEX ();
6105   struct gcpro gcpro1, gcpro2;
6106   int multibyte = STRING_MULTIBYTE (str);
6107   Lisp_Object old_deactivate_mark;
6108   Lisp_Object buffer_to_kill;
6109   Lisp_Object unwind_arg;
6110
6111   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6112   /* It is not crucial to specbind this.  */
6113   old_deactivate_mark = Vdeactivate_mark;
6114   GCPRO2 (str, old_deactivate_mark);
6115
6116   /* We must insert the contents of STR as is without
6117      unibyte<->multibyte conversion.  For that, we adjust the
6118      multibyteness of the working buffer to that of STR.  */
6119   buffer_to_kill = set_conversion_work_buffer (multibyte);
6120   if (NILP (buffer_to_kill))
6121     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6122   else
6123     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6124   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6125
6126   insert_from_string (str, 0, 0,
6127                       SCHARS (str), SBYTES (str), 0);
6128   UNGCPRO;
6129   inhibit_pre_post_conversion = 1;
6130   if (encodep)
6131     {
6132       struct buffer *prev = current_buffer;
6133
6134       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6135       if (prev != current_buffer)
6136         /* We must kill the current buffer too.  */
6137         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6138     }
6139   else
6140     {
6141       Vlast_coding_system_used = coding->symbol;
6142       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6143       call1 (coding->post_read_conversion, make_number (Z - BEG));
6144       coding->symbol = Vlast_coding_system_used;
6145     }
6146   inhibit_pre_post_conversion = 0;
6147   Vdeactivate_mark = old_deactivate_mark;
6148   str = make_buffer_string (BEG, Z, 1);
6149   return unbind_to (count, str);
6150 }
6151
6152
6153 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6154    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6155    is intended that this function is called from encode_terminal_code,
6156    the pre-write-conversion function is run by safe_call and thus
6157    "Error during redisplay: ..." is logged when an error occurs.
6158
6159    Store the resulting text in *STR and set CODING->produced_char and
6160    CODING->produced to the number of characters and bytes
6161    respectively.  If the size of *STR is too small, enlarge it by
6162    xrealloc and update *STR and *SIZE.  */
6163
6164 void
6165 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6166      unsigned char **str;
6167      int *size, nchars, nbytes;
6168      struct coding_system *coding;
6169 {
6170   struct gcpro gcpro1, gcpro2;
6171   struct buffer *cur = current_buffer;
6172   struct buffer *prev;
6173   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6174   Lisp_Object args[3];
6175   Lisp_Object buffer_to_kill;
6176
6177   /* It is not crucial to specbind this.  */
6178   old_deactivate_mark = Vdeactivate_mark;
6179   old_last_coding_system_used = Vlast_coding_system_used;
6180   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6181
6182   /* We must insert the contents of STR as is without
6183      unibyte<->multibyte conversion.  For that, we adjust the
6184      multibyteness of the working buffer to that of STR.  */
6185   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6186   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6187   UNGCPRO;
6188   inhibit_pre_post_conversion = 1;
6189   prev = current_buffer;
6190   args[0] = coding->pre_write_conversion;
6191   args[1] = make_number (BEG);
6192   args[2] = make_number (Z);
6193   safe_call (3, args);
6194   inhibit_pre_post_conversion = 0;
6195   Vdeactivate_mark = old_deactivate_mark;
6196   Vlast_coding_system_used = old_last_coding_system_used;
6197   coding->produced_char = Z - BEG;
6198   coding->produced = Z_BYTE - BEG_BYTE;
6199   if (coding->produced > *size)
6200     {
6201       *size = coding->produced;
6202       *str = xrealloc (*str, *size);
6203     }
6204   if (BEG < GPT && GPT < Z)
6205     move_gap (BEG);
6206   bcopy (BEG_ADDR, *str, coding->produced);
6207   coding->src_multibyte
6208     = ! NILP (current_buffer->enable_multibyte_characters);
6209   if (prev != current_buffer)
6210     Fkill_buffer (Fcurrent_buffer ());
6211   set_buffer_internal (cur);
6212   if (! NILP (buffer_to_kill))
6213     Fkill_buffer (buffer_to_kill);
6214 }
6215
6216
6217 Lisp_Object
6218 decode_coding_string (str, coding, nocopy)
6219      Lisp_Object str;
6220      struct coding_system *coding;
6221      int nocopy;
6222 {
6223   int len;
6224   struct conversion_buffer buf;
6225   int from, to_byte;
6226   Lisp_Object saved_coding_symbol;
6227   int result;
6228   int require_decoding;
6229   int shrinked_bytes = 0;
6230   Lisp_Object newstr;
6231   int consumed, consumed_char, produced, produced_char;
6232
6233   from = 0;
6234   to_byte = SBYTES (str);
6235
6236   saved_coding_symbol = coding->symbol;
6237   coding->src_multibyte = STRING_MULTIBYTE (str);
6238   coding->dst_multibyte = 1;
6239   if (CODING_REQUIRE_DETECTION (coding))
6240     {
6241       /* See the comments in code_convert_region.  */
6242       if (coding->type == coding_type_undecided)
6243         {
6244           detect_coding (coding, SDATA (str), to_byte);
6245           if (coding->type == coding_type_undecided)
6246             {
6247               coding->type = coding_type_emacs_mule;
6248               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6249               /* As emacs-mule decoder will handle composition, we
6250                  need this setting to allocate coding->cmp_data
6251                  later.  */
6252               coding->composing = COMPOSITION_NO;
6253             }
6254         }
6255       if (coding->eol_type == CODING_EOL_UNDECIDED
6256           && coding->type != coding_type_ccl)
6257         {
6258           saved_coding_symbol = coding->symbol;
6259           detect_eol (coding, SDATA (str), to_byte);
6260           if (coding->eol_type == CODING_EOL_UNDECIDED)
6261             coding->eol_type = CODING_EOL_LF;
6262           /* We had better recover the original eol format if we
6263              encounter an inconsistent eol format while decoding.  */
6264           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6265         }
6266     }
6267
6268   if (coding->type == coding_type_no_conversion
6269       || coding->type == coding_type_raw_text)
6270     coding->dst_multibyte = 0;
6271
6272   require_decoding = CODING_REQUIRE_DECODING (coding);
6273
6274   if (STRING_MULTIBYTE (str))
6275     {
6276       /* Decoding routines expect the source text to be unibyte.  */
6277       str = Fstring_as_unibyte (str);
6278       to_byte = SBYTES (str);
6279       nocopy = 1;
6280       coding->src_multibyte = 0;
6281     }
6282
6283   /* Try to skip the heading and tailing ASCIIs.  */
6284   if (require_decoding && coding->type != coding_type_ccl)
6285     {
6286       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6287                                 0);
6288       if (from == to_byte)
6289         require_decoding = 0;
6290       shrinked_bytes = from + (SBYTES (str) - to_byte);
6291     }
6292
6293   if (!require_decoding
6294       && !(SYMBOLP (coding->post_read_conversion)
6295            && !NILP (Ffboundp (coding->post_read_conversion))))
6296     {
6297       coding->consumed = SBYTES (str);
6298       coding->consumed_char = SCHARS (str);
6299       if (coding->dst_multibyte)
6300         {
6301           str = Fstring_as_multibyte (str);
6302           nocopy = 1;
6303         }
6304       coding->produced = SBYTES (str);
6305       coding->produced_char = SCHARS (str);
6306       return (nocopy ? str : Fcopy_sequence (str));
6307     }
6308
6309   if (coding->composing != COMPOSITION_DISABLED)
6310     coding_allocate_composition_data (coding, from);
6311   len = decoding_buffer_size (coding, to_byte - from);
6312   allocate_conversion_buffer (buf, len);
6313
6314   consumed = consumed_char = produced = produced_char = 0;
6315   while (1)
6316     {
6317       result = decode_coding (coding, SDATA (str) + from + consumed,
6318                               buf.data + produced, to_byte - from - consumed,
6319                               buf.size - produced);
6320       consumed += coding->consumed;
6321       consumed_char += coding->consumed_char;
6322       produced += coding->produced;
6323       produced_char += coding->produced_char;
6324       if (result == CODING_FINISH_NORMAL
6325           || result == CODING_FINISH_INTERRUPT
6326           || (result == CODING_FINISH_INSUFFICIENT_SRC
6327               && coding->consumed == 0))
6328         break;
6329       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6330         coding_allocate_composition_data (coding, from + produced_char);
6331       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6332         extend_conversion_buffer (&buf);
6333       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6334         {
6335           Lisp_Object eol_type;
6336
6337           /* Recover the original EOL format.  */
6338           if (coding->eol_type == CODING_EOL_CR)
6339             {
6340               unsigned char *p;
6341               for (p = buf.data; p < buf.data + produced; p++)
6342                 if (*p == '\n') *p = '\r';
6343             }
6344           else if (coding->eol_type == CODING_EOL_CRLF)
6345             {
6346               int num_eol = 0;
6347               unsigned char *p0, *p1;
6348               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6349                 if (*p0 == '\n') num_eol++;
6350               if (produced + num_eol >= buf.size)
6351                 extend_conversion_buffer (&buf);
6352               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6353                 {
6354                   *--p1 = *--p0;
6355                   if (*p0 == '\n') *--p1 = '\r';
6356                 }
6357               produced += num_eol;
6358               produced_char += num_eol;
6359             }
6360           /* Suppress eol-format conversion in the further conversion.  */
6361           coding->eol_type = CODING_EOL_LF;
6362
6363           /* Set the coding system symbol to that for Unix-like EOL.  */
6364           eol_type = Fget (saved_coding_symbol, Qeol_type);
6365           if (VECTORP (eol_type)
6366               && XVECTOR (eol_type)->size == 3
6367               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6368             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6369           else
6370             coding->symbol = saved_coding_symbol;
6371
6372
6373         }
6374     }
6375
6376   coding->consumed = consumed;
6377   coding->consumed_char = consumed_char;
6378   coding->produced = produced;
6379   coding->produced_char = produced_char;
6380
6381   if (coding->dst_multibyte)
6382     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6383                                            produced + shrinked_bytes);
6384   else
6385     newstr = make_uninit_string (produced + shrinked_bytes);
6386   if (from > 0)
6387     STRING_COPYIN (newstr, 0, SDATA (str), from);
6388   STRING_COPYIN (newstr, from, buf.data, produced);
6389   if (shrinked_bytes > from)
6390     STRING_COPYIN (newstr, from + produced,
6391                    SDATA (str) + to_byte,
6392                    shrinked_bytes - from);
6393   free_conversion_buffer (&buf);
6394
6395   coding->consumed += shrinked_bytes;
6396   coding->consumed_char += shrinked_bytes;
6397   coding->produced += shrinked_bytes;
6398   coding->produced_char += shrinked_bytes;
6399
6400   if (coding->cmp_data && coding->cmp_data->used)
6401     coding_restore_composition (coding, newstr);
6402   coding_free_composition_data (coding);
6403
6404   if (SYMBOLP (coding->post_read_conversion)
6405       && !NILP (Ffboundp (coding->post_read_conversion)))
6406     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6407
6408   return newstr;
6409 }
6410
6411 Lisp_Object
6412 encode_coding_string (str, coding, nocopy)
6413      Lisp_Object str;
6414      struct coding_system *coding;
6415      int nocopy;
6416 {
6417   int len;
6418   struct conversion_buffer buf;
6419   int from, to, to_byte;
6420   int result;
6421   int shrinked_bytes = 0;
6422   Lisp_Object newstr;
6423   int consumed, consumed_char, produced, produced_char;
6424
6425   if (SYMBOLP (coding->pre_write_conversion)
6426       && !NILP (Ffboundp (coding->pre_write_conversion)))
6427     {
6428       str = run_pre_post_conversion_on_str (str, coding, 1);
6429       /* As STR is just newly generated, we don't have to copy it
6430          anymore.  */
6431       nocopy = 1;
6432     }
6433
6434   from = 0;
6435   to = SCHARS (str);
6436   to_byte = SBYTES (str);
6437
6438   /* Encoding routines determine the multibyteness of the source text
6439      by coding->src_multibyte.  */
6440   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6441   coding->dst_multibyte = 0;
6442   if (! CODING_REQUIRE_ENCODING (coding))
6443     goto no_need_of_encoding;
6444
6445   if (coding->composing != COMPOSITION_DISABLED)
6446     coding_save_composition (coding, from, to, str);
6447
6448   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6449      if we must run CCL program or there are compositions to
6450      encode.  */
6451   if (coding->type != coding_type_ccl
6452       && (! coding->cmp_data || coding->cmp_data->used == 0))
6453     {
6454       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6455                                 1);
6456       if (from == to_byte)
6457         {
6458           coding_free_composition_data (coding);
6459           goto no_need_of_encoding;
6460         }
6461       shrinked_bytes = from + (SBYTES (str) - to_byte);
6462     }
6463
6464   len = encoding_buffer_size (coding, to_byte - from);
6465   allocate_conversion_buffer (buf, len);
6466
6467   consumed = consumed_char = produced = produced_char = 0;
6468   while (1)
6469     {
6470       result = encode_coding (coding, SDATA (str) + from + consumed,
6471                               buf.data + produced, to_byte - from - consumed,
6472                               buf.size - produced);
6473       consumed += coding->consumed;
6474       consumed_char += coding->consumed_char;
6475       produced += coding->produced;
6476       produced_char += coding->produced_char;
6477       if (result == CODING_FINISH_NORMAL
6478           || result == CODING_FINISH_INTERRUPT
6479           || (result == CODING_FINISH_INSUFFICIENT_SRC
6480               && coding->consumed == 0))
6481         break;
6482       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6483       extend_conversion_buffer (&buf);
6484     }
6485
6486   coding->consumed = consumed;
6487   coding->consumed_char = consumed_char;
6488   coding->produced = produced;
6489   coding->produced_char = produced_char;
6490
6491   newstr = make_uninit_string (produced + shrinked_bytes);
6492   if (from > 0)
6493     STRING_COPYIN (newstr, 0, SDATA (str), from);
6494   STRING_COPYIN (newstr, from, buf.data, produced);
6495   if (shrinked_bytes > from)
6496     STRING_COPYIN (newstr, from + produced,
6497                    SDATA (str) + to_byte,
6498                    shrinked_bytes - from);
6499
6500   free_conversion_buffer (&buf);
6501   coding_free_composition_data (coding);
6502
6503   return newstr;
6504
6505  no_need_of_encoding:
6506   coding->consumed = SBYTES (str);
6507   coding->consumed_char = SCHARS (str);
6508   if (STRING_MULTIBYTE (str))
6509     {
6510       if (nocopy)
6511         /* We are sure that STR doesn't contain a multibyte
6512            character.  */
6513         STRING_SET_UNIBYTE (str);
6514       else
6515         {
6516           str = Fstring_as_unibyte (str);
6517           nocopy = 1;
6518         }
6519     }
6520   coding->produced = SBYTES (str);
6521   coding->produced_char = SCHARS (str);
6522   return (nocopy ? str : Fcopy_sequence (str));
6523 }
6524
6525 \f
6526 #ifdef emacs
6527 /*** 8. Emacs Lisp library functions ***/
6528
6529 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6530        doc: /* Return t if OBJECT is nil or a coding-system.
6531 See the documentation of `make-coding-system' for information
6532 about coding-system objects.  */)
6533      (obj)
6534      Lisp_Object obj;
6535 {
6536   if (NILP (obj))
6537     return Qt;
6538   if (!SYMBOLP (obj))
6539     return Qnil;
6540   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6541     return Qt;
6542   /* Get coding-spec vector for OBJ.  */
6543   obj = Fget (obj, Qcoding_system);
6544   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6545           ? Qt : Qnil);
6546 }
6547
6548 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6549        Sread_non_nil_coding_system, 1, 1, 0,
6550        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6551      (prompt)
6552      Lisp_Object prompt;
6553 {
6554   Lisp_Object val;
6555   do
6556     {
6557       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6558                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6559     }
6560   while (SCHARS (val) == 0);
6561   return (Fintern (val, Qnil));
6562 }
6563
6564 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6565        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6566 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6567      (prompt, default_coding_system)
6568      Lisp_Object prompt, default_coding_system;
6569 {
6570   Lisp_Object val;
6571   if (SYMBOLP (default_coding_system))
6572     default_coding_system = SYMBOL_NAME (default_coding_system);
6573   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6574                           Qt, Qnil, Qcoding_system_history,
6575                           default_coding_system, Qnil);
6576   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6577 }
6578
6579 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6580        1, 1, 0,
6581        doc: /* Check validity of CODING-SYSTEM.
6582 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6583 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6584 The value of this property should be a vector of length 5.  */)
6585      (coding_system)
6586      Lisp_Object coding_system;
6587 {
6588   Lisp_Object define_form;
6589
6590   define_form = Fget (coding_system, Qcoding_system_define_form);
6591   if (! NILP (define_form))
6592     {
6593       Fput (coding_system, Qcoding_system_define_form, Qnil);
6594       safe_eval (define_form);
6595     }
6596   if (!NILP (Fcoding_system_p (coding_system)))
6597     return coding_system;
6598   while (1)
6599     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6600 }
6601 \f
6602 Lisp_Object
6603 detect_coding_system (src, src_bytes, highest, multibytep)
6604      const unsigned char *src;
6605      int src_bytes, highest;
6606      int multibytep;
6607 {
6608   int coding_mask, eol_type;
6609   Lisp_Object val, tmp;
6610   int dummy;
6611
6612   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6613   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6614   if (eol_type == CODING_EOL_INCONSISTENT)
6615     eol_type = CODING_EOL_UNDECIDED;
6616
6617   if (!coding_mask)
6618     {
6619       val = Qundecided;
6620       if (eol_type != CODING_EOL_UNDECIDED)
6621         {
6622           Lisp_Object val2;
6623           val2 = Fget (Qundecided, Qeol_type);
6624           if (VECTORP (val2))
6625             val = XVECTOR (val2)->contents[eol_type];
6626         }
6627       return (highest ? val : Fcons (val, Qnil));
6628     }
6629
6630   /* At first, gather possible coding systems in VAL.  */
6631   val = Qnil;
6632   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6633     {
6634       Lisp_Object category_val, category_index;
6635
6636       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6637       category_val = Fsymbol_value (XCAR (tmp));
6638       if (!NILP (category_val)
6639           && NATNUMP (category_index)
6640           && (coding_mask & (1 << XFASTINT (category_index))))
6641         {
6642           val = Fcons (category_val, val);
6643           if (highest)
6644             break;
6645         }
6646     }
6647   if (!highest)
6648     val = Fnreverse (val);
6649
6650   /* Then, replace the elements with subsidiary coding systems.  */
6651   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6652     {
6653       if (eol_type != CODING_EOL_UNDECIDED
6654           && eol_type != CODING_EOL_INCONSISTENT)
6655         {
6656           Lisp_Object eol;
6657           eol = Fget (XCAR (tmp), Qeol_type);
6658           if (VECTORP (eol))
6659             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6660         }
6661     }
6662   return (highest ? XCAR (val) : val);
6663 }
6664
6665 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6666        2, 3, 0,
6667        doc: /* Detect how the byte sequence in the region is encoded.
6668 Return a list of possible coding systems used on decoding a byte
6669 sequence containing the bytes in the region between START and END when
6670 the coding system `undecided' is specified.  The list is ordered by
6671 priority decided in the current language environment.
6672
6673 If only ASCII characters are found, it returns a list of single element
6674 `undecided' or its subsidiary coding system according to a detected
6675 end-of-line format.
6676
6677 If optional argument HIGHEST is non-nil, return the coding system of
6678 highest priority.  */)
6679      (start, end, highest)
6680      Lisp_Object start, end, highest;
6681 {
6682   int from, to;
6683   int from_byte, to_byte;
6684   int include_anchor_byte = 0;
6685
6686   CHECK_NUMBER_COERCE_MARKER (start);
6687   CHECK_NUMBER_COERCE_MARKER (end);
6688
6689   validate_region (&start, &end);
6690   from = XINT (start), to = XINT (end);
6691   from_byte = CHAR_TO_BYTE (from);
6692   to_byte = CHAR_TO_BYTE (to);
6693
6694   if (from < GPT && to >= GPT)
6695     move_gap_both (to, to_byte);
6696   /* If we an anchor byte `\0' follows the region, we include it in
6697      the detecting source.  Then code detectors can handle the tailing
6698      byte sequence more accurately.
6699
6700      Fix me: This is not a perfect solution.  It is better that we
6701      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6702   */
6703   if (to == Z || (to == GPT && GAP_SIZE > 0))
6704     include_anchor_byte = 1;
6705   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6706                                to_byte - from_byte + include_anchor_byte,
6707                                !NILP (highest),
6708                                !NILP (current_buffer
6709                                       ->enable_multibyte_characters));
6710 }
6711
6712 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6713        1, 2, 0,
6714        doc: /* Detect how the byte sequence in STRING is encoded.
6715 Return a list of possible coding systems used on decoding a byte
6716 sequence containing the bytes in STRING when the coding system
6717 `undecided' is specified.  The list is ordered by priority decided in
6718 the current language environment.
6719
6720 If only ASCII characters are found, it returns a list of single element
6721 `undecided' or its subsidiary coding system according to a detected
6722 end-of-line format.
6723
6724 If optional argument HIGHEST is non-nil, return the coding system of
6725 highest priority.  */)
6726      (string, highest)
6727      Lisp_Object string, highest;
6728 {
6729   CHECK_STRING (string);
6730
6731   return detect_coding_system (SDATA (string),
6732                                /* "+ 1" is to include the anchor byte
6733                                   `\0'.  With this, code detectors can
6734                                   handle the tailing bytes more
6735                                   accurately.  */
6736                                SBYTES (string) + 1,
6737                                !NILP (highest),
6738                                STRING_MULTIBYTE (string));
6739 }
6740
6741 /*  Subroutine for Ffind_coding_systems_region_internal.
6742
6743     Return a list of coding systems that safely encode the multibyte
6744     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6745     possible coding systems.  If it is nil, it means that we have not
6746     yet found any coding systems.
6747
6748     WORK_TABLE a char-table of which element is set to t once the
6749     element is looked up.
6750
6751     If a non-ASCII single byte char is found, set
6752     *single_byte_char_found to 1.  */
6753
6754 static Lisp_Object
6755 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6756      unsigned char *p, *pend;
6757      Lisp_Object safe_codings, work_table;
6758      int *single_byte_char_found;
6759 {
6760   int c, len;
6761   Lisp_Object val, ch;
6762   Lisp_Object prev, tail;
6763
6764   if (NILP (safe_codings))
6765     goto done_safe_codings;
6766   while (p < pend)
6767     {
6768       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6769       p += len;
6770       if (ASCII_BYTE_P (c))
6771         /* We can ignore ASCII characters here.  */
6772         continue;
6773       if (SINGLE_BYTE_CHAR_P (c))
6774         *single_byte_char_found = 1;
6775       /* Check the safe coding systems for C.  */
6776       ch = make_number (c);
6777       val = Faref (work_table, ch);
6778       if (EQ (val, Qt))
6779         /* This element was already checked.  Ignore it.  */
6780         continue;
6781       /* Remember that we checked this element.  */
6782       Faset (work_table, ch, Qt);
6783
6784       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6785         {
6786           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6787           int encodable;
6788
6789           elt = XCAR (tail);
6790           if (CONSP (XCDR (elt)))
6791             {
6792               /* This entry has this format now:
6793                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6794                           ACCEPT-LATIN-EXTRA ) */
6795               val = XCDR (elt);
6796               encodable = ! NILP (Faref (XCAR (val), ch));
6797               if (! encodable)
6798                 {
6799                   val = XCDR (val);
6800                   translation_table = XCAR (val);
6801                   hash_table = XCAR (XCDR (val));
6802                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6803                 }
6804             }
6805           else
6806             {
6807               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6808               encodable = ! NILP (Faref (XCDR (elt), ch));
6809               if (! encodable)
6810                 {
6811                   /* Transform the format to:
6812                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6813                        ACCEPT-LATIN-EXTRA )  */
6814                   val = Fget (XCAR (elt), Qcoding_system);
6815                   translation_table
6816                     = Fplist_get (AREF (val, 3),
6817                                   Qtranslation_table_for_encode);
6818                   if (SYMBOLP (translation_table))
6819                     translation_table = Fget (translation_table,
6820                                               Qtranslation_table);
6821                   hash_table
6822                     = (CHAR_TABLE_P (translation_table)
6823                        ? XCHAR_TABLE (translation_table)->extras[1]
6824                        : Qnil);
6825                   accept_latin_extra
6826                     = ((EQ (AREF (val, 0), make_number (2))
6827                         && VECTORP (AREF (val, 4)))
6828                        ? AREF (AREF (val, 4), 16)
6829                        : Qnil);
6830                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6831                                         translation_table, hash_table,
6832                                         accept_latin_extra));
6833                 }
6834             }
6835
6836           if (! encodable
6837               && ((CHAR_TABLE_P (translation_table)
6838                    && ! NILP (Faref (translation_table, ch)))
6839                   || (HASH_TABLE_P (hash_table)
6840                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6841                   || (SINGLE_BYTE_CHAR_P (c)
6842                       && ! NILP (accept_latin_extra)
6843                       && VECTORP (Vlatin_extra_code_table)
6844                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6845             encodable = 1;
6846           if (encodable)
6847             prev = tail;
6848           else
6849             {
6850               /* Exclude this coding system from SAFE_CODINGS.  */
6851               if (EQ (tail, safe_codings))
6852                 {
6853                   safe_codings = XCDR (safe_codings);
6854                   if (NILP (safe_codings))
6855                     goto done_safe_codings;
6856                 }
6857               else
6858                 XSETCDR (prev, XCDR (tail));
6859             }
6860         }
6861     }
6862
6863  done_safe_codings:
6864   /* If the above loop was terminated before P reaches PEND, it means
6865      SAFE_CODINGS was set to nil.  If we have not yet found an
6866      non-ASCII single-byte char, check it now.  */
6867   if (! *single_byte_char_found)
6868     while (p < pend)
6869       {
6870         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6871         p += len;
6872         if (! ASCII_BYTE_P (c)
6873             && SINGLE_BYTE_CHAR_P (c))
6874           {
6875             *single_byte_char_found = 1;
6876             break;
6877           }
6878       }
6879   return safe_codings;
6880 }
6881
6882 DEFUN ("find-coding-systems-region-internal",
6883        Ffind_coding_systems_region_internal,
6884        Sfind_coding_systems_region_internal, 2, 2, 0,
6885        doc: /* Internal use only.  */)
6886      (start, end)
6887      Lisp_Object start, end;
6888 {
6889   Lisp_Object work_table, safe_codings;
6890   int non_ascii_p = 0;
6891   int single_byte_char_found = 0;
6892   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6893
6894   if (STRINGP (start))
6895     {
6896       if (!STRING_MULTIBYTE (start))
6897         return Qt;
6898       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6899       p2 = p2end = p1end;
6900       if (SCHARS (start) != SBYTES (start))
6901         non_ascii_p = 1;
6902     }
6903   else
6904     {
6905       int from, to, stop;
6906
6907       CHECK_NUMBER_COERCE_MARKER (start);
6908       CHECK_NUMBER_COERCE_MARKER (end);
6909       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6910         args_out_of_range (start, end);
6911       if (NILP (current_buffer->enable_multibyte_characters))
6912         return Qt;
6913       from = CHAR_TO_BYTE (XINT (start));
6914       to = CHAR_TO_BYTE (XINT (end));
6915       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6916       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6917       if (stop == to)
6918         p2 = p2end = p1end;
6919       else
6920         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6921       if (XINT (end) - XINT (start) != to - from)
6922         non_ascii_p = 1;
6923     }
6924
6925   if (!non_ascii_p)
6926     {
6927       /* We are sure that the text contains no multibyte character.
6928          Check if it contains eight-bit-graphic.  */
6929       p = p1;
6930       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6931       if (p == p1end)
6932         {
6933           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6934           if (p == p2end)
6935             return Qt;
6936         }
6937     }
6938
6939   /* The text contains non-ASCII characters.  */
6940
6941   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6942   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6943
6944   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6945                                     &single_byte_char_found);
6946   if (p2 < p2end)
6947     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6948                                       &single_byte_char_found);
6949   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6950     safe_codings = Qt;
6951   else
6952     {
6953       /* Turn safe_codings to a list of coding systems... */
6954       Lisp_Object val;
6955
6956       if (single_byte_char_found)
6957         /* ... and append these for eight-bit chars.  */
6958         val = Fcons (Qraw_text,
6959                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6960       else
6961         /* ... and append generic coding systems.  */
6962         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6963
6964       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6965         val = Fcons (XCAR (XCAR (safe_codings)), val);
6966       safe_codings = val;
6967     }
6968
6969   return safe_codings;
6970 }
6971
6972
6973 /* Search from position POS for such characters that are unencodable
6974    accoding to SAFE_CHARS, and return a list of their positions.  P
6975    points where in the memory the character at POS exists.  Limit the
6976    search at PEND or when Nth unencodable characters are found.
6977
6978    If SAFE_CHARS is a char table, an element for an unencodable
6979    character is nil.
6980
6981    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6982
6983    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6984    eight-bit-graphic characters are unencodable.  */
6985
6986 static Lisp_Object
6987 unencodable_char_position (safe_chars, pos, p, pend, n)
6988      Lisp_Object safe_chars;
6989      int pos;
6990      unsigned char *p, *pend;
6991      int n;
6992 {
6993   Lisp_Object pos_list;
6994
6995   pos_list = Qnil;
6996   while (p < pend)
6997     {
6998       int len;
6999       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7000
7001       if (c >= 128
7002           && (CHAR_TABLE_P (safe_chars)
7003               ? NILP (CHAR_TABLE_REF (safe_chars, c))
7004               : (NILP (safe_chars) || c < 256)))
7005         {
7006           pos_list = Fcons (make_number (pos), pos_list);
7007           if (--n <= 0)
7008             break;
7009         }
7010       pos++;
7011       p += len;
7012     }
7013   return Fnreverse (pos_list);
7014 }
7015
7016
7017 DEFUN ("unencodable-char-position", Funencodable_char_position,
7018        Sunencodable_char_position, 3, 5, 0,
7019        doc: /*
7020 Return position of first un-encodable character in a region.
7021 START and END specfiy the region and CODING-SYSTEM specifies the
7022 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7023
7024 If optional 4th argument COUNT is non-nil, it specifies at most how
7025 many un-encodable characters to search.  In this case, the value is a
7026 list of positions.
7027
7028 If optional 5th argument STRING is non-nil, it is a string to search
7029 for un-encodable characters.  In that case, START and END are indexes
7030 to the string.  */)
7031      (start, end, coding_system, count, string)
7032      Lisp_Object start, end, coding_system, count, string;
7033 {
7034   int n;
7035   Lisp_Object safe_chars;
7036   struct coding_system coding;
7037   Lisp_Object positions;
7038   int from, to;
7039   unsigned char *p, *pend;
7040
7041   if (NILP (string))
7042     {
7043       validate_region (&start, &end);
7044       from = XINT (start);
7045       to = XINT (end);
7046       if (NILP (current_buffer->enable_multibyte_characters))
7047         return Qnil;
7048       p = CHAR_POS_ADDR (from);
7049       if (to == GPT)
7050         pend = GPT_ADDR;
7051       else
7052         pend = CHAR_POS_ADDR (to);
7053     }
7054   else
7055     {
7056       CHECK_STRING (string);
7057       CHECK_NATNUM (start);
7058       CHECK_NATNUM (end);
7059       from = XINT (start);
7060       to = XINT (end);
7061       if (from > to
7062           || to > SCHARS (string))
7063         args_out_of_range_3 (string, start, end);
7064       if (! STRING_MULTIBYTE (string))
7065         return Qnil;
7066       p = SDATA (string) + string_char_to_byte (string, from);
7067       pend = SDATA (string) + string_char_to_byte (string, to);
7068     }
7069
7070   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7071
7072   if (NILP (count))
7073     n = 1;
7074   else
7075     {
7076       CHECK_NATNUM (count);
7077       n = XINT (count);
7078     }
7079
7080   if (coding.type == coding_type_no_conversion
7081       || coding.type == coding_type_raw_text)
7082     return Qnil;
7083
7084   if (coding.type == coding_type_undecided)
7085     safe_chars = Qnil;
7086   else
7087     safe_chars = coding_safe_chars (coding_system);
7088
7089   if (STRINGP (string)
7090       || from >= GPT || to <= GPT)
7091     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7092   else
7093     {
7094       Lisp_Object args[2];
7095
7096       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7097       n -= XINT (Flength (args[0]));
7098       if (n <= 0)
7099         positions = args[0];
7100       else
7101         {
7102           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7103                                                pend, n);
7104           positions = Fappend (2, args);
7105         }
7106     }
7107
7108   return  (NILP (count) ? Fcar (positions) : positions);
7109 }
7110
7111
7112 Lisp_Object
7113 code_convert_region1 (start, end, coding_system, encodep)
7114      Lisp_Object start, end, coding_system;
7115      int encodep;
7116 {
7117   struct coding_system coding;
7118   int from, to;
7119
7120   CHECK_NUMBER_COERCE_MARKER (start);
7121   CHECK_NUMBER_COERCE_MARKER (end);
7122   CHECK_SYMBOL (coding_system);
7123
7124   validate_region (&start, &end);
7125   from = XFASTINT (start);
7126   to = XFASTINT (end);
7127
7128   if (NILP (coding_system))
7129     return make_number (to - from);
7130
7131   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7132     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7133
7134   coding.mode |= CODING_MODE_LAST_BLOCK;
7135   coding.src_multibyte = coding.dst_multibyte
7136     = !NILP (current_buffer->enable_multibyte_characters);
7137   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7138                        &coding, encodep, 1);
7139   Vlast_coding_system_used = coding.symbol;
7140   return make_number (coding.produced_char);
7141 }
7142
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Decode the current region from the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM.  START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the decoded text.  */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* All the work is done by code_convert_region1; its last argument
     is ENCODEP, and 0 selects decoding.  */
  return code_convert_region1 (start, end, coding_system, 0);
}
7157
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Encode the current region into the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM.  START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the encoded text.  */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* All the work is done by code_convert_region1; its last argument
     is ENCODEP, and 1 selects encoding.  */
  return code_convert_region1 (start, end, coding_system, 1);
}
7172
7173 Lisp_Object
7174 code_convert_string1 (string, coding_system, nocopy, encodep)
7175      Lisp_Object string, coding_system, nocopy;
7176      int encodep;
7177 {
7178   struct coding_system coding;
7179
7180   CHECK_STRING (string);
7181   CHECK_SYMBOL (coding_system);
7182
7183   if (NILP (coding_system))
7184     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7185
7186   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7187     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7188
7189   coding.mode |= CODING_MODE_LAST_BLOCK;
7190   string = (encodep
7191             ? encode_coding_string (string, &coding, !NILP (nocopy))
7192             : decode_coding_string (string, &coding, !NILP (nocopy)));
7193   Vlast_coding_system_used = coding.symbol;
7194
7195   return string;
7196 }
7197
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
       2, 3, 0,
       doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
Optional arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
     (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  /* All the work is done by code_convert_string1; its last argument
     is ENCODEP, and 0 selects decoding.  */
  return code_convert_string1 (string, coding_system, nocopy, 0);
}
7211
DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
       2, 3, 0,
       doc: /* Encode STRING to CODING-SYSTEM, and return the result.
Optional arg NOCOPY non-nil means it is OK to return STRING itself
if the encoding operation is trivial.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
     (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  /* All the work is done by code_convert_string1; its last argument
     is ENCODEP, and 1 selects encoding.  */
  return code_convert_string1 (string, coding_system, nocopy, 1);
}
7225
7226 /* Encode or decode STRING according to CODING_SYSTEM.
7227    Do not set Vlast_coding_system_used.
7228
7229    This function is called only from macros DECODE_FILE and
7230    ENCODE_FILE, thus we ignore character composition.  */
7231
7232 Lisp_Object
7233 code_convert_string_norecord (string, coding_system, encodep)
7234      Lisp_Object string, coding_system;
7235      int encodep;
7236 {
7237   struct coding_system coding;
7238
7239   CHECK_STRING (string);
7240   CHECK_SYMBOL (coding_system);
7241
7242   if (NILP (coding_system))
7243     return string;
7244
7245   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7246     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7247
7248   coding.composing = COMPOSITION_DISABLED;
7249   coding.mode |= CODING_MODE_LAST_BLOCK;
7250   return (encodep
7251           ? encode_coding_string (string, &coding, 1)
7252           : decode_coding_string (string, &coding, 1));
7253 }
7254 \f
7255 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7256        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7257 Return the corresponding character.  */)
7258      (code)
7259      Lisp_Object code;
7260 {
7261   unsigned char c1, c2, s1, s2;
7262   Lisp_Object val;
7263
7264   CHECK_NUMBER (code);
7265   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7266   if (s1 == 0)
7267     {
7268       if (s2 < 0x80)
7269         XSETFASTINT (val, s2);
7270       else if (s2 >= 0xA0 || s2 <= 0xDF)
7271         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7272       else
7273         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7274     }
7275   else
7276     {
7277       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7278           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7279         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7280       DECODE_SJIS (s1, s2, c1, c2);
7281       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7282     }
7283   return val;
7284 }
7285
7286 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7287        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7288 Return the corresponding code in SJIS.  */)
7289      (ch)
7290      Lisp_Object ch;
7291 {
7292   int charset, c1, c2, s1, s2;
7293   Lisp_Object val;
7294
7295   CHECK_NUMBER (ch);
7296   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7297   if (charset == CHARSET_ASCII)
7298     {
7299       val = ch;
7300     }
7301   else if (charset == charset_jisx0208
7302            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7303     {
7304       ENCODE_SJIS (c1, c2, s1, s2);
7305       XSETFASTINT (val, (s1 << 8) | s2);
7306     }
7307   else if (charset == charset_katakana_jisx0201
7308            && c1 > 0x20 && c2 < 0xE0)
7309     {
7310       XSETFASTINT (val, c1 | 0x80);
7311     }
7312   else
7313     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7314   return val;
7315 }
7316
7317 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7318        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7319 Return the corresponding character.  */)
7320      (code)
7321      Lisp_Object code;
7322 {
7323   int charset;
7324   unsigned char b1, b2, c1, c2;
7325   Lisp_Object val;
7326
7327   CHECK_NUMBER (code);
7328   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7329   if (b1 == 0)
7330     {
7331       if (b2 >= 0x80)
7332         error ("Invalid BIG5 code: %x", XFASTINT (code));
7333       val = code;
7334     }
7335   else
7336     {
7337       if ((b1 < 0xA1 || b1 > 0xFE)
7338           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7339         error ("Invalid BIG5 code: %x", XFASTINT (code));
7340       DECODE_BIG5 (b1, b2, charset, c1, c2);
7341       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7342     }
7343   return val;
7344 }
7345
7346 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7347        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7348 Return the corresponding character code in Big5.  */)
7349      (ch)
7350      Lisp_Object ch;
7351 {
7352   int charset, c1, c2, b1, b2;
7353   Lisp_Object val;
7354
7355   CHECK_NUMBER (ch);
7356   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7357   if (charset == CHARSET_ASCII)
7358     {
7359       val = ch;
7360     }
7361   else if ((charset == charset_big5_1
7362             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7363            || (charset == charset_big5_2
7364                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7365     {
7366       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7367       XSETFASTINT (val, (b1 << 8) | b2);
7368     }
7369   else
7370     error ("Can't encode to Big5: %d", XFASTINT (ch));
7371   return val;
7372 }
7373 \f
7374 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7375        Sset_terminal_coding_system_internal, 1, 1, 0,
7376        doc: /* Internal use only.  */)
7377      (coding_system)
7378      Lisp_Object coding_system;
7379 {
7380   CHECK_SYMBOL (coding_system);
7381   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7382   /* We had better not send unsafe characters to terminal.  */
7383   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7384   /* Character composition should be disabled.  */
7385   terminal_coding.composing = COMPOSITION_DISABLED;
7386   /* Error notification should be suppressed.  */
7387   terminal_coding.suppress_error = 1;
7388   terminal_coding.src_multibyte = 1;
7389   terminal_coding.dst_multibyte = 0;
7390   return Qnil;
7391 }
7392
7393 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7394        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7395        doc: /* Internal use only.  */)
7396      (coding_system)
7397      Lisp_Object coding_system;
7398 {
7399   CHECK_SYMBOL (coding_system);
7400   setup_coding_system (Fcheck_coding_system (coding_system),
7401                        &safe_terminal_coding);
7402   /* Character composition should be disabled.  */
7403   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7404   /* Error notification should be suppressed.  */
7405   safe_terminal_coding.suppress_error = 1;
7406   safe_terminal_coding.src_multibyte = 1;
7407   safe_terminal_coding.dst_multibyte = 0;
7408   return Qnil;
7409 }
7410
DEFUN ("terminal-coding-system", Fterminal_coding_system,
       Sterminal_coding_system, 0, 0, 0,
       doc: /* Return coding system specified for terminal output.  */)
     ()
{
  /* The symbol recorded in the file-level terminal_coding object.  */
  return terminal_coding.symbol;
}
7418
7419 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7420        Sset_keyboard_coding_system_internal, 1, 1, 0,
7421        doc: /* Internal use only.  */)
7422      (coding_system)
7423      Lisp_Object coding_system;
7424 {
7425   CHECK_SYMBOL (coding_system);
7426   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7427   /* Character composition should be disabled.  */
7428   keyboard_coding.composing = COMPOSITION_DISABLED;
7429   return Qnil;
7430 }
7431
DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
       Skeyboard_coding_system, 0, 0, 0,
       doc: /* Return coding system specified for decoding keyboard input.  */)
     ()
{
  /* The symbol recorded in the file-level keyboard_coding object.  */
  return keyboard_coding.symbol;
}
7439
7440 \f
7441 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7442        Sfind_operation_coding_system,  1, MANY, 0,
7443        doc: /* Choose a coding system for an operation based on the target name.
7444 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7445 DECODING-SYSTEM is the coding system to use for decoding
7446 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7447 for encoding (in case OPERATION does encoding).
7448
7449 The first argument OPERATION specifies an I/O primitive:
7450   For file I/O, `insert-file-contents' or `write-region'.
7451   For process I/O, `call-process', `call-process-region', or `start-process'.
7452   For network I/O, `open-network-stream'.
7453
7454 The remaining arguments should be the same arguments that were passed
7455 to the primitive.  Depending on which primitive, one of those arguments
7456 is selected as the TARGET.  For example, if OPERATION does file I/O,
7457 whichever argument specifies the file name is TARGET.
7458
7459 TARGET has a meaning which depends on OPERATION:
7460   For file I/O, TARGET is a file name (except for the special case below).
7461   For process I/O, TARGET is a process name.
7462   For network I/O, TARGET is a service name or a port number
7463
7464 This function looks up what specified for TARGET in,
7465 `file-coding-system-alist', `process-coding-system-alist',
7466 or `network-coding-system-alist' depending on OPERATION.
7467 They may specify a coding system, a cons of coding systems,
7468 or a function symbol to call.
7469 In the last case, we call the function with one argument,
7470 which is a list of all the arguments given to this function.
7471
7472 If OPERATION is `insert-file-contents', the argument corresponding to
7473 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7474 file name to look up, and BUFFER is a buffer that already contains the
7475 file (but not yet decoded).  If a function is found as above, the
7476 function must pay attention to this format of TARGET.
7477
7478 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7479      (nargs, args)
7480      int nargs;
7481      Lisp_Object *args;
7482 {
7483   Lisp_Object operation, target_idx, target, val;
7484   register Lisp_Object chain;
7485
7486   if (nargs < 2)
7487     error ("Too few arguments");
7488   operation = args[0];
7489   if (!SYMBOLP (operation)
7490       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7491     error ("Invalid first argument");
7492   if (nargs < 1 + XINT (target_idx))
7493     error ("Too few arguments for operation: %s",
7494            SDATA (SYMBOL_NAME (operation)));
7495   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7496      argument to write-region) is string, it must be treated as a
7497      target file name.  */
7498   if (EQ (operation, Qwrite_region)
7499       && nargs > 5
7500       && STRINGP (args[5]))
7501     target_idx = make_number (4);
7502   target = args[XINT (target_idx) + 1];
7503   if (!(STRINGP (target)
7504         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7505             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7506         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7507     error ("Invalid argument %d", XINT (target_idx) + 1);
7508   if (CONSP (target))
7509     target = XCAR (target);
7510
7511   chain = ((EQ (operation, Qinsert_file_contents)
7512             || EQ (operation, Qwrite_region))
7513            ? Vfile_coding_system_alist
7514            : (EQ (operation, Qopen_network_stream)
7515               ? Vnetwork_coding_system_alist
7516               : Vprocess_coding_system_alist));
7517   if (NILP (chain))
7518     return Qnil;
7519
7520   for (; CONSP (chain); chain = XCDR (chain))
7521     {
7522       Lisp_Object elt;
7523       elt = XCAR (chain);
7524
7525       if (CONSP (elt)
7526           && ((STRINGP (target)
7527                && STRINGP (XCAR (elt))
7528                && fast_string_match (XCAR (elt), target) >= 0)
7529               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7530         {
7531           val = XCDR (elt);
7532           /* Here, if VAL is both a valid coding system and a valid
7533              function symbol, we return VAL as a coding system.  */
7534           if (CONSP (val))
7535             return val;
7536           if (! SYMBOLP (val))
7537             return Qnil;
7538           if (! NILP (Fcoding_system_p (val)))
7539             return Fcons (val, val);
7540           if (! NILP (Ffboundp (val)))
7541             {
7542               val = safe_call1 (val, Flist (nargs, args));
7543               if (CONSP (val))
7544                 return val;
7545               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7546                 return Fcons (val, val);
7547             }
7548           return Qnil;
7549         }
7550     }
7551   return Qnil;
7552 }
7553
7554 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7555        Supdate_coding_systems_internal, 0, 0, 0,
7556        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7557 When values of any coding categories are changed, you must
7558 call this function.  */)
7559      ()
7560 {
7561   int i;
7562
7563   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7564     {
7565       Lisp_Object val;
7566
7567       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7568       if (!NILP (val))
7569         {
7570           if (! coding_system_table[i])
7571             coding_system_table[i] = ((struct coding_system *)
7572                                       xmalloc (sizeof (struct coding_system)));
7573           setup_coding_system (val, coding_system_table[i]);
7574         }
7575       else if (coding_system_table[i])
7576         {
7577           xfree (coding_system_table[i]);
7578           coding_system_table[i] = NULL;
7579         }
7580     }
7581
7582   return Qnil;
7583 }
7584
7585 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7586        Sset_coding_priority_internal, 0, 0, 0,
7587        doc: /* Update internal database for the current value of `coding-category-list'.
7588 This function is internal use only.  */)
7589      ()
7590 {
7591   int i = 0, idx;
7592   Lisp_Object val;
7593
7594   val = Vcoding_category_list;
7595
7596   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7597     {
7598       if (! SYMBOLP (XCAR (val)))
7599         break;
7600       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7601       if (idx >= CODING_CATEGORY_IDX_MAX)
7602         break;
7603       coding_priorities[i++] = (1 << idx);
7604       val = XCDR (val);
7605     }
7606   /* If coding-category-list is valid and contains all coding
7607      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7608      the following code saves Emacs from crashing.  */
7609   while (i < CODING_CATEGORY_IDX_MAX)
7610     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7611
7612   return Qnil;
7613 }
7614
7615 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7616        Sdefine_coding_system_internal, 1, 1, 0,
7617        doc: /* Register CODING-SYSTEM as a base coding system.
7618 This function is internal use only.  */)
7619      (coding_system)
7620      Lisp_Object coding_system;
7621 {
7622   Lisp_Object safe_chars, slot;
7623
7624   if (NILP (Fcheck_coding_system (coding_system)))
7625     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7626   safe_chars = coding_safe_chars (coding_system);
7627   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7628     error ("No valid safe-chars property for %s",
7629            SDATA (SYMBOL_NAME (coding_system)));
7630   if (EQ (safe_chars, Qt))
7631     {
7632       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7633         XSETCAR (Vcoding_system_safe_chars,
7634                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7635     }
7636   else
7637     {
7638       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7639       if (NILP (slot))
7640         XSETCDR (Vcoding_system_safe_chars,
7641                  nconc2 (XCDR (Vcoding_system_safe_chars),
7642                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7643       else
7644         XSETCDR (slot, safe_chars);
7645     }
7646   return Qnil;
7647 }
7648
7649 #endif /* emacs */
7650
7651 \f
7652 /*** 9. Post-amble ***/
7653
/* Fill in the byte-classification tables and put the file-level
   coding objects into their initial state.

   The tables are built by assigning whole ranges first and then
   overriding individual entries, so the order of the statements
   matters.  */

void
init_coding_once ()
{
  int i;

  /* Emacs' internal format specific initialize routine.  */
  /* 0x00..0x20 start out as control codes; LF and CR are then given
     their own classes.  */
  for (i = 0; i <= 0x20; i++)
    emacs_code_class[i] = EMACS_control_code;
  emacs_code_class[0x0A] = EMACS_linefeed_code;
  emacs_code_class[0x0D] = EMACS_carriage_return_code;
  for (i = 0x21 ; i < 0x7F; i++)
    emacs_code_class[i] = EMACS_ascii_code;
  emacs_code_class[0x7F] = EMACS_control_code;
  /* 0x80..0xFE start out as invalid (0xFF is not assigned here); the
     four private leading codes are then overridden.  */
  for (i = 0x80; i < 0xFF; i++)
    emacs_code_class[i] = EMACS_invalid_code;
  emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
  emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;

  /* ISO2022 specific initialize routine.  */
  /* The range loops skip 0x20, 0x7F, 0xA0 and 0xFF, which get their
     own classes right after.  */
  for (i = 0; i < 0x20; i++)
    iso_code_class[i] = ISO_control_0;
  for (i = 0x21; i < 0x7F; i++)
    iso_code_class[i] = ISO_graphic_plane_0;
  for (i = 0x80; i < 0xA0; i++)
    iso_code_class[i] = ISO_control_1;
  for (i = 0xA1; i < 0xFF; i++)
    iso_code_class[i] = ISO_graphic_plane_1;
  iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
  iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
  /* Individual codes with a dedicated class; these override the
     range assignments above.  */
  iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
  iso_code_class[ISO_CODE_SO] = ISO_shift_out;
  iso_code_class[ISO_CODE_SI] = ISO_shift_in;
  iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
  iso_code_class[ISO_CODE_ESC] = ISO_escape;
  iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
  iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
  iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;

  /* Set up each file-level coding object with the nil coding
     system.  */
  setup_coding_system (Qnil, &keyboard_coding);
  setup_coding_system (Qnil, &terminal_coding);
  setup_coding_system (Qnil, &safe_terminal_coding);
  setup_coding_system (Qnil, &default_buffer_file_coding);

  /* No per-category coding system has been allocated yet.  */
  bzero (coding_system_table, sizeof coding_system_table);

  /* ascii_skip_code is 1 for the 128 ASCII bytes and 0 elsewhere.  */
  bzero (ascii_skip_code, sizeof ascii_skip_code);
  for (i = 0; i < 128; i++)
    ascii_skip_code[i] = 1;

  /* The native end-of-line convention: CRLF on MSDOS and Windows NT
     builds, LF otherwise.  */
#if defined (MSDOS) || defined (WINDOWSNT)
  system_eol_type = CODING_EOL_CRLF;
#else
  system_eol_type = CODING_EOL_LF;
#endif

  inhibit_pre_post_conversion = 0;
}
7713
7714 #ifdef emacs
7715
7716 void
7717 syms_of_coding ()
7718 {
7719   staticpro (&Vcode_conversion_workbuf_name);
7720   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7721
7722   Qtarget_idx = intern ("target-idx");
7723   staticpro (&Qtarget_idx);
7724
7725   Qcoding_system_history = intern ("coding-system-history");
7726   staticpro (&Qcoding_system_history);
7727   Fset (Qcoding_system_history, Qnil);
7728
7729   /* Target FILENAME is the first argument.  */
7730   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7731   /* Target FILENAME is the third argument.  */
7732   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7733
7734   Qcall_process = intern ("call-process");
7735   staticpro (&Qcall_process);
7736   /* Target PROGRAM is the first argument.  */
7737   Fput (Qcall_process, Qtarget_idx, make_number (0));
7738
7739   Qcall_process_region = intern ("call-process-region");
7740   staticpro (&Qcall_process_region);
7741   /* Target PROGRAM is the third argument.  */
7742   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7743
7744   Qstart_process = intern ("start-process");
7745   staticpro (&Qstart_process);
7746   /* Target PROGRAM is the third argument.  */
7747   Fput (Qstart_process, Qtarget_idx, make_number (2));
7748
7749   Qopen_network_stream = intern ("open-network-stream");
7750   staticpro (&Qopen_network_stream);
7751   /* Target SERVICE is the fourth argument.  */
7752   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7753
7754   Qcoding_system = intern ("coding-system");
7755   staticpro (&Qcoding_system);
7756
7757   Qeol_type = intern ("eol-type");
7758   staticpro (&Qeol_type);
7759
7760   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7761   staticpro (&Qbuffer_file_coding_system);
7762
7763   Qpost_read_conversion = intern ("post-read-conversion");
7764   staticpro (&Qpost_read_conversion);
7765
7766   Qpre_write_conversion = intern ("pre-write-conversion");
7767   staticpro (&Qpre_write_conversion);
7768
7769   Qno_conversion = intern ("no-conversion");
7770   staticpro (&Qno_conversion);
7771
7772   Qundecided = intern ("undecided");
7773   staticpro (&Qundecided);
7774
7775   Qcoding_system_p = intern ("coding-system-p");
7776   staticpro (&Qcoding_system_p);
7777
7778   Qcoding_system_error = intern ("coding-system-error");
7779   staticpro (&Qcoding_system_error);
7780
7781   Fput (Qcoding_system_error, Qerror_conditions,
7782         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7783   Fput (Qcoding_system_error, Qerror_message,
7784         build_string ("Invalid coding system"));
7785
7786   Qcoding_category = intern ("coding-category");
7787   staticpro (&Qcoding_category);
7788   Qcoding_category_index = intern ("coding-category-index");
7789   staticpro (&Qcoding_category_index);
7790
7791   Vcoding_category_table
7792     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7793   staticpro (&Vcoding_category_table);
7794   {
7795     int i;
7796     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7797       {
7798         XVECTOR (Vcoding_category_table)->contents[i]
7799           = intern (coding_category_name[i]);
7800         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7801               Qcoding_category_index, make_number (i));
7802       }
7803   }
7804
7805   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7806   staticpro (&Vcoding_system_safe_chars);
7807
7808   Qtranslation_table = intern ("translation-table");
7809   staticpro (&Qtranslation_table);
7810   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7811
7812   Qtranslation_table_id = intern ("translation-table-id");
7813   staticpro (&Qtranslation_table_id);
7814
7815   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7816   staticpro (&Qtranslation_table_for_decode);
7817
7818   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7819   staticpro (&Qtranslation_table_for_encode);
7820
7821   Qsafe_chars = intern ("safe-chars");
7822   staticpro (&Qsafe_chars);
7823
7824   Qchar_coding_system = intern ("char-coding-system");
7825   staticpro (&Qchar_coding_system);
7826
7827   /* Intern this now in case it isn't already done.
7828      Setting this variable twice is harmless.
7829      But don't staticpro it here--that is done in alloc.c.  */
7830   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7831   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7832   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7833
7834   Qvalid_codes = intern ("valid-codes");
7835   staticpro (&Qvalid_codes);
7836
7837   Qascii_incompatible = intern ("ascii-incompatible");
7838   staticpro (&Qascii_incompatible);
7839
7840   Qemacs_mule = intern ("emacs-mule");
7841   staticpro (&Qemacs_mule);
7842
7843   Qraw_text = intern ("raw-text");
7844   staticpro (&Qraw_text);
7845
7846   Qutf_8 = intern ("utf-8");
7847   staticpro (&Qutf_8);
7848
7849   Qcoding_system_define_form = intern ("coding-system-define-form");
7850   staticpro (&Qcoding_system_define_form);
7851
7852   defsubr (&Scoding_system_p);
7853   defsubr (&Sread_coding_system);
7854   defsubr (&Sread_non_nil_coding_system);
7855   defsubr (&Scheck_coding_system);
7856   defsubr (&Sdetect_coding_region);
7857   defsubr (&Sdetect_coding_string);
7858   defsubr (&Sfind_coding_systems_region_internal);
7859   defsubr (&Sunencodable_char_position);
7860   defsubr (&Sdecode_coding_region);
7861   defsubr (&Sencode_coding_region);
7862   defsubr (&Sdecode_coding_string);
7863   defsubr (&Sencode_coding_string);
7864   defsubr (&Sdecode_sjis_char);
7865   defsubr (&Sencode_sjis_char);
7866   defsubr (&Sdecode_big5_char);
7867   defsubr (&Sencode_big5_char);
7868   defsubr (&Sset_terminal_coding_system_internal);
7869   defsubr (&Sset_safe_terminal_coding_system_internal);
7870   defsubr (&Sterminal_coding_system);
7871   defsubr (&Sset_keyboard_coding_system_internal);
7872   defsubr (&Skeyboard_coding_system);
7873   defsubr (&Sfind_operation_coding_system);
7874   defsubr (&Supdate_coding_systems_internal);
7875   defsubr (&Sset_coding_priority_internal);
7876   defsubr (&Sdefine_coding_system_internal);
7877
7878   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7879                doc: /* List of coding systems.
7880
7881 Do not alter the value of this variable manually.  This variable should be
7882 updated by the functions `make-coding-system' and
7883 `define-coding-system-alias'.  */);
7884   Vcoding_system_list = Qnil;
7885
7886   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7887                doc: /* Alist of coding system names.
7888 Each element is one element list of coding system name.
7889 This variable is given to `completing-read' as TABLE argument.
7890
7891 Do not alter the value of this variable manually.  This variable should be
7892 updated by the functions `make-coding-system' and
7893 `define-coding-system-alias'.  */);
7894   Vcoding_system_alist = Qnil;
7895
7896   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7897                doc: /* List of coding-categories (symbols) ordered by priority.
7898
7899 On detecting a coding system, Emacs tries code detection algorithms
7900 associated with each coding-category one by one in this order.  When
7901 one algorithm agrees with a byte sequence of source text, the coding
7902 system bound to the corresponding coding-category is selected.
7903
7904 Don't modify this variable directly, but use `set-coding-priority'.  */);
7905   {
7906     int i;
7907
7908     Vcoding_category_list = Qnil;
7909     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7910       Vcoding_category_list
7911         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7912                  Vcoding_category_list);
7913   }
7914
7915   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7916                doc: /* Specify the coding system for read operations.
7917 It is useful to bind this variable with `let', but do not set it globally.
7918 If the value is a coding system, it is used for decoding on read operation.
7919 If not, an appropriate element is used from one of the coding system alists:
7920 There are three such tables, `file-coding-system-alist',
7921 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7922   Vcoding_system_for_read = Qnil;
7923
7924   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7925                doc: /* Specify the coding system for write operations.
7926 Programs bind this variable with `let', but you should not set it globally.
7927 If the value is a coding system, it is used for encoding of output,
7928 when writing it to a file and when sending it to a file or subprocess.
7929
7930 If this does not specify a coding system, an appropriate element
7931 is used from one of the coding system alists:
7932 There are three such tables, `file-coding-system-alist',
7933 `process-coding-system-alist', and `network-coding-system-alist'.
7934 For output to files, if the above procedure does not specify a coding system,
7935 the value of `buffer-file-coding-system' is used.  */);
7936   Vcoding_system_for_write = Qnil;
7937
7938   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7939                doc: /* Coding system used in the latest file or process I/O.
7940 Also set by `encode-coding-region', `decode-coding-region',
7941 `encode-coding-string' and `decode-coding-string'.  */);
7942   Vlast_coding_system_used = Qnil;
7943
7944   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7945                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7946 See info node `Coding Systems' and info node `Text and Binary' concerning
7947 such conversion.  */);
7948   inhibit_eol_conversion = 0;
7949
7950   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7951                doc: /* Non-nil means process buffer inherits coding system of process output.
7952 Bind it to t if the process output is to be treated as if it were a file
7953 read from some filesystem.  */);
7954   inherit_process_coding_system = 0;
7955
7956   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7957                doc: /* Alist to decide a coding system to use for a file I/O operation.
7958 The format is ((PATTERN . VAL) ...),
7959 where PATTERN is a regular expression matching a file name,
7960 VAL is a coding system, a cons of coding systems, or a function symbol.
7961 If VAL is a coding system, it is used for both decoding and encoding
7962 the file contents.
7963 If VAL is a cons of coding systems, the car part is used for decoding,
7964 and the cdr part is used for encoding.
7965 If VAL is a function symbol, the function must return a coding system
7966 or a cons of coding systems which are used as above.  The function gets
7967 the arguments with which `find-operation-coding-system' was called.
7968
7969 See also the function `find-operation-coding-system'
7970 and the variable `auto-coding-alist'.  */);
7971   Vfile_coding_system_alist = Qnil;
7972
7973   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7974     doc: /* Alist to decide a coding system to use for a process I/O operation.
7975 The format is ((PATTERN . VAL) ...),
7976 where PATTERN is a regular expression matching a program name,
7977 VAL is a coding system, a cons of coding systems, or a function symbol.
7978 If VAL is a coding system, it is used for both decoding what received
7979 from the program and encoding what sent to the program.
7980 If VAL is a cons of coding systems, the car part is used for decoding,
7981 and the cdr part is used for encoding.
7982 If VAL is a function symbol, the function must return a coding system
7983 or a cons of coding systems which are used as above.
7984
7985 See also the function `find-operation-coding-system'.  */);
7986   Vprocess_coding_system_alist = Qnil;
7987
7988   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7989     doc: /* Alist to decide a coding system to use for a network I/O operation.
7990 The format is ((PATTERN . VAL) ...),
7991 where PATTERN is a regular expression matching a network service name
7992 or is a port number to connect to,
7993 VAL is a coding system, a cons of coding systems, or a function symbol.
7994 If VAL is a coding system, it is used for both decoding what received
7995 from the network stream and encoding what sent to the network stream.
7996 If VAL is a cons of coding systems, the car part is used for decoding,
7997 and the cdr part is used for encoding.
7998 If VAL is a function symbol, the function must return a coding system
7999 or a cons of coding systems which are used as above.
8000
8001 See also the function `find-operation-coding-system'.  */);
8002   Vnetwork_coding_system_alist = Qnil;
8003
8004   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8005                doc: /* Coding system to use with system messages.
8006 Also used for decoding keyboard input on X Window system.  */);
8007   Vlocale_coding_system = Qnil;
8008
8009   /* The eol mnemonics are reset in startup.el system-dependently.  */
8010   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8011                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8012   eol_mnemonic_unix = build_string (":");
8013
8014   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8015                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8016   eol_mnemonic_dos = build_string ("\\");
8017
8018   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8019                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8020   eol_mnemonic_mac = build_string ("/");
8021
8022   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8023                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8024   eol_mnemonic_undecided = build_string (":");
8025
8026   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8027                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8028   Venable_character_translation = Qt;
8029
8030   DEFVAR_LISP ("standard-translation-table-for-decode",
8031                &Vstandard_translation_table_for_decode,
8032                doc: /* Table for translating characters while decoding.  */);
8033   Vstandard_translation_table_for_decode = Qnil;
8034
8035   DEFVAR_LISP ("standard-translation-table-for-encode",
8036                &Vstandard_translation_table_for_encode,
8037                doc: /* Table for translating characters while encoding.  */);
8038   Vstandard_translation_table_for_encode = Qnil;
8039
8040   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8041                doc: /* Alist of charsets vs revision numbers.
8042 While encoding, if a charset (car part of an element) is found,
8043 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8044   Vcharset_revision_alist = Qnil;
8045
8046   DEFVAR_LISP ("default-process-coding-system",
8047                &Vdefault_process_coding_system,
8048                doc: /* Cons of coding systems used for process I/O by default.
8049 The car part is used for decoding a process output,
8050 the cdr part is used for encoding a text to be sent to a process.  */);
8051   Vdefault_process_coding_system = Qnil;
8052
8053   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8054                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8055 This is a vector of length 256.
8056 If Nth element is non-nil, the existence of code N in a file
8057 \(or output of subprocess) doesn't prevent it to be detected as
8058 a coding system of ISO 2022 variant which has a flag
8059 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8060 or reading output of a subprocess.
8061 Only 128th through 159th elements has a meaning.  */);
8062   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8063
8064   DEFVAR_LISP ("select-safe-coding-system-function",
8065                &Vselect_safe_coding_system_function,
8066                doc: /* Function to call to select safe coding system for encoding a text.
8067
8068 If set, this function is called to force a user to select a proper
8069 coding system which can encode the text in the case that a default
8070 coding system used in each operation can't encode the text.
8071
8072 The default value is `select-safe-coding-system' (which see).  */);
8073   Vselect_safe_coding_system_function = Qnil;
8074
8075   DEFVAR_BOOL ("coding-system-require-warning",
8076                &coding_system_require_warning,
8077                doc: /* Internal use only.
8078 If non-nil, on writing a file, `select-safe-coding-system-function' is
8079 called even if `coding-system-for-write' is non-nil.  The command
8080 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8081   coding_system_require_warning = 0;
8082
8083
8084   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8085                &inhibit_iso_escape_detection,
8086                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8087
8088 By default, on reading a file, Emacs tries to detect how the text is
8089 encoded.  This code detection is sensitive to escape sequences.  If
8090 the sequence is valid as ISO2022, the code is determined as one of
8091 the ISO2022 encodings, and the file is decoded by the corresponding
8092 coding system (e.g. `iso-2022-7bit').
8093
8094 However, there may be a case that you want to read escape sequences in
8095 a file as is.  In such a case, you can set this variable to non-nil.
8096 Then, as the code detection ignores any escape sequences, no file is
8097 detected as encoded in some ISO2022 encoding.  The result is that all
8098 escape sequences become visible in a buffer.
8099
8100 The default value is nil, and it is strongly recommended not to change
8101 it.  That is because many Emacs Lisp source files that contain
8102 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8103 in Emacs's distribution, and they won't be decoded correctly on
8104 reading if you suppress escape sequence detection.
8105
8106 The other way to read escape sequences in a file without decoding is
8107 to explicitly specify some coding system that doesn't use ISO2022's
8108 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8109   inhibit_iso_escape_detection = 0;
8110
8111   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8112                doc: /* Char table for translating self-inserting characters.
8113 This is applied to the result of input methods, not their input.  See also
8114 `keyboard-translate-table'.  */);
8115     Vtranslation_table_for_input = Qnil;
8116 }
8117
8118 char *
8119 emacs_strerror (error_number)
8120      int error_number;
8121 {
8122   char *str;
8123
8124   synchronize_system_messages_locale ();
8125   str = strerror (error_number);
8126
8127   if (! NILP (Vlocale_coding_system))
8128     {
8129       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8130                                                       Vlocale_coding_system,
8131                                                       0);
8132       str = (char *) SDATA (dec);
8133     }
8134
8135   return str;
8136 }
8137
8138 #endif /* emacs */
8139
8140 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8141    (do not change this comment) */