code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7
   8 This file is part of GNU Emacs.
   9
  10 GNU Emacs is free software; you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation; either version 2, or (at your option)
  13 any later version.
  14
  15 GNU Emacs is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with GNU Emacs; see the file COPYING.  If not, write to
  22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  23 Boston, MA 02110-1301, USA.  */
  24
  25 /*** TABLE OF CONTENTS ***
  26
  27   0. General comments
  28   1. Preamble
  29   2. Emacs' internal format (emacs-mule) handlers
  30   3. ISO2022 handlers
  31   4. Shift-JIS and BIG5 handlers
  32   5. CCL handlers
  33   6. End-of-line handlers
  34   7. C library functions
  35   8. Emacs Lisp library functions
  36   9. Post-amble
  37
  38 */
  39
  40 /*** 0. General comments ***/
  41
  42
  43 /*** GENERAL NOTE on CODING SYSTEMS ***
  44
  45   A coding system is an encoding mechanism for one or more character
  46   sets.  Here's a list of coding systems which Emacs can handle.  When
  47   we say "decode", it means converting some other coding system to
  48   Emacs' internal format (emacs-mule), and when we say "encode",
  49   it means converting the coding system emacs-mule to some other
  50   coding system.
  51
  52   0. Emacs' internal format (emacs-mule)
  53
  54   Emacs itself holds a multi-lingual character in buffers and strings
  55   in a special format.  Details are described in section 2.
  56
  57   1. ISO2022
  58
  59   The most famous coding system for multiple character sets.  X's
  60   Compound Text, various EUCs (Extended Unix Code), and coding
  61   systems used in Internet communication such as ISO-2022-JP are
  62   all variants of ISO2022.  Details are described in section 3.
  63
  64   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  65
  66   A coding system to encode character sets: ASCII, JISX0201, and
  67   JISX0208.  Widely used for PC's in Japan.  Details are described in
  68   section 4.
  69
  70   3. BIG5
  71
  72   A coding system to encode the character sets ASCII and Big5.  Widely
  73   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  74   described in section 4.  In this file, when we write "BIG5"
  75   (all uppercase), we mean the coding system, and when we write
  76   "Big5" (capitalized), we mean the character set.
  77
  78   4. Raw text
  79
  80   A coding system for text containing random 8-bit code.  Emacs does
  81   no code conversion on such text except for end-of-line format.
  82
  83   5. Other
  84
  85   If a user wants to read/write text encoded in a coding system not
  86   listed above, he can supply a decoder and an encoder for it as CCL
  87   (Code Conversion Language) programs.  Emacs executes the CCL program
  88   while reading/writing.
  89
  90   Emacs represents a coding system by a Lisp symbol that has a property
  91   `coding-system'.  But, before actually using the coding system, the
  92   information about it is set in a structure of type `struct
  93   coding_system' for rapid processing.  See section 6 for more details.
  94
  95 */
  96
  97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  98
  99   How end-of-line of text is encoded depends on the operating system.
 100   For instance, Unix's format is just one byte of `line-feed' code,
 101   whereas DOS's format is two-byte sequence of `carriage-return' and
 102   `line-feed' codes.  MacOS's format is usually one byte of
 103   `carriage-return'.
 104
 105   Since text character encoding and end-of-line encoding are
 106   independent, any coding system described above can have any
 107   end-of-line format.  So Emacs has information about end-of-line
 108   format in each coding-system.  See section 6 for more details.
 109
 110 */
 111
 112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 113
 114   These functions check if a text between SRC and SRC_END is encoded
 115   in the coding system category XXX.  Each returns an integer value in
 116   which appropriate flag bits for the category XXX are set.  The flag
 117   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 118   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 119   of the range 0x80..0x9F are in multibyte form.  */
 120 #if 0
 121 int
 122 detect_coding_emacs_mule (src, src_end, multibytep)
 123      unsigned char *src, *src_end;
 124      int multibytep;
 125 {
 126   ...
 127 }
 128 #endif
 129
 130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 131
 132   These functions decode SRC_BYTES length of unibyte text at SOURCE
 133   encoded in CODING to Emacs' internal format.  The resulting
 134   multibyte text goes to a place pointed to by DESTINATION, the length
 135   of which should not exceed DST_BYTES.
 136
 137   These functions set the information about original and decoded texts
 138   in the members `produced', `produced_char', `consumed', and
 139   `consumed_char' of the structure *CODING.  They also set the member
 140   `result' to one of CODING_FINISH_XXX indicating how the decoding
 141   finished.
 142
 143   DST_BYTES zero means that the source area and destination area are
 144   overlapped, which means that we can produce a decoded text until it
 145   reaches the head of the not-yet-decoded source text.
 146
 147   Below is a template for these functions.  */
 148 #if 0
 149 static void
 150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 151      struct coding_system *coding;
 152      const unsigned char *source;
 153      unsigned char *destination;
 154      int src_bytes, dst_bytes;
 155 {
 156   ...
 157 }
 158 #endif
 159
 160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 161
 162   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 163   internal multibyte format to CODING.  The resulting unibyte text
 164   goes to a place pointed to by DESTINATION, the length of which
 165   should not exceed DST_BYTES.
 166
 167   These functions set the information about original and encoded texts
 168   in the members `produced', `produced_char', `consumed', and
 169   `consumed_char' of the structure *CODING.  They also set the member
 170   `result' to one of CODING_FINISH_XXX indicating how the encoding
 171   finished.
 172
 173   DST_BYTES zero means that the source area and destination area are
 174   overlapped, which means that we can produce encoded text until it
  175   reaches the head of the not-yet-encoded source text.
 176
 177   Below is a template for these functions.  */
 178 #if 0
 179 static void
 180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 181      struct coding_system *coding;
 182      unsigned char *source, *destination;
 183      int src_bytes, dst_bytes;
 184 {
 185   ...
 186 }
 187 #endif
 188
 189 /*** COMMONLY USED MACROS ***/
 190
  191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  192    get one and two bytes from the source text respectively.
 193    If there are not enough bytes in the source, they jump to
 194    `label_end_of_loop'.  The caller should set variables `coding',
 195    `src' and `src_end' to appropriate pointer in advance.  These
 196    macros are called from decoding routines `decode_coding_XXX', thus
 197    it is assumed that the source text is unibyte.  */
 198
 199 #define ONE_MORE_BYTE(c1)                                       \
 200   do {                                                          \
 201     if (src >= src_end)                                         \
 202       {                                                         \
 203         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 204         goto label_end_of_loop;                                 \
 205       }                                                         \
 206     c1 = *src++;                                                \
 207   } while (0)
 208
 209 #define TWO_MORE_BYTES(c1, c2)                                  \
 210   do {                                                          \
 211     if (src + 1 >= src_end)                                     \
 212       {                                                         \
 213         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 214         goto label_end_of_loop;                                 \
 215       }                                                         \
 216     c1 = *src++;                                                \
 217     c2 = *src++;                                                \
 218   } while (0)
 219
 220
 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 222    form if MULTIBYTEP is nonzero.  In addition, if SRC is not less
 223    than SRC_END, return with RET.  */
 224
 225 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
 226   do {                                                          \
 227     if (src >= src_end)                                         \
 228       {                                                         \
 229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 230         return ret;                                             \
 231       }                                                         \
 232     c1 = *src++;                                                \
 233     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 234       c1 = *src++ - 0x20;                                       \
 235   } while (0)
 236
  237 /* Set C to the next character at the source text pointed by `src'.
  238    If there are not enough characters in the source, jump to
  239    `label_end_of_loop'.  The caller should set variables `coding'
  240    `src', `src_end', and `translation_table' to appropriate pointers
  241    in advance.  This macro is used in encoding routines
  242    `encode_coding_XXX', thus it assumes that the source text is in
  243    multibyte form except for 8-bit characters.  8-bit characters are
  244    in multibyte form if coding->src_multibyte is nonzero, else they
  245    are represented by a single byte.
      
         The expansion declares block-local variables `len' and `bytes',
         so the argument C must not use those names.  When
         `translation_table' is non-nil, the character is passed through
         translate_char before it is left in C.  `src' is advanced past
         the character only after C has been set.  */
  246
  247 #define ONE_MORE_CHAR(c)                                        \
  248   do {                                                          \
  249     int len = src_end - src;                                    \
  250     int bytes;                                                  \
  251     if (len <= 0)                                               \
  252       {                                                         \
  253         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  254         goto label_end_of_loop;                                 \
  255       }                                                         \
  256     if (coding->src_multibyte                                   \
  257         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
  258       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
  259     else                                                        \
  260       c = *src, bytes = 1;                                      \
  261     if (!NILP (translation_table))                              \
  262       c = translate_char (translation_table, c, -1, 0, 0);      \
  263     src += bytes;                                               \
  264   } while (0)
 265
 266
  267 /* Produce a multibyte form of character C to `dst'.  Jump to
  268    `label_end_of_loop' if there's not enough space at `dst'.
  269
  270    If we are now in the middle of a composition sequence, the decoded
  271    character may be ALTCHAR (for the current composition).  In that
  272    case, the character goes to coding->cmp_data->data instead of
  273    `dst'.
  274
  275    This macro is used in decoding routines.
      
         The free space ends at `dst_end' when `dst_bytes' is nonzero,
         otherwise at `src' (source and destination areas overlap).  When
         the character is written to `dst', coding->produced_char is
         incremented; when there is no room, coding->result is set to
         CODING_FINISH_INSUFFICIENT_DST before the jump.
      
         When COMPOSING_P (coding) holds and the method is not
         COMPOSITION_RELATIVE, C is recorded with
         CODING_ADD_COMPOSITION_COMPONENT and
         coding->composition_rule_follows is set to nonzero unless the
         method is COMPOSITION_WITH_ALTCHARS.
      
         C appears several times in the expansion, so it must be free of
         side effects.  */
  276
  277 #define EMIT_CHAR(c)                                                    \
  278   do {                                                                  \
  279     if (! COMPOSING_P (coding)                                          \
  280         || coding->composing == COMPOSITION_RELATIVE                    \
  281         || coding->composing == COMPOSITION_WITH_RULE)                  \
  282       {                                                                 \
  283         int bytes = CHAR_BYTES (c);                                     \
  284         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  285           {                                                             \
  286             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  287             goto label_end_of_loop;                                     \
  288           }                                                             \
  289         dst += CHAR_STRING (c, dst);                                    \
  290         coding->produced_char++;                                        \
  291       }                                                                 \
  292                                                                         \
  293     if (COMPOSING_P (coding)                                            \
  294         && coding->composing != COMPOSITION_RELATIVE)                   \
  295       {                                                                 \
  296         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
  297         coding->composition_rule_follows                                \
  298           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  299       }                                                                 \
  300   } while (0)
 301
 302
 303 #define EMIT_ONE_BYTE(c)                                        \
 304   do {                                                          \
 305     if (dst >= (dst_bytes ? dst_end : src))                     \
 306       {                                                         \
 307         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 308         goto label_end_of_loop;                                 \
 309       }                                                         \
 310     *dst++ = c;                                                 \
 311   } while (0)
 312
 313 #define EMIT_TWO_BYTES(c1, c2)                                  \
 314   do {                                                          \
 315     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 316       {                                                         \
 317         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 318         goto label_end_of_loop;                                 \
 319       }                                                         \
 320     *dst++ = c1, *dst++ = c2;                                   \
 321   } while (0)
 322
 323 #define EMIT_BYTES(from, to)                                    \
 324   do {                                                          \
 325     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 326       {                                                         \
 327         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 328         goto label_end_of_loop;                                 \
 329       }                                                         \
 330     while (from < to)                                           \
 331       *dst++ = *from++;                                         \
 332   } while (0)
 333
 334 \f
 335 /*** 1. Preamble ***/
 336
 337 #ifdef emacs
 338 #include <config.h>
 339 #endif
 340
 341 #include <stdio.h>
 342
 343 #ifdef emacs
 344
 345 #include "lisp.h"
 346 #include "buffer.h"
 347 #include "charset.h"
 348 #include "composite.h"
 349 #include "ccl.h"
 350 #include "coding.h"
 351 #include "window.h"
 352 #include "intervals.h"
 353 #include "frame.h"
 354 #include "termhooks.h"
 355
 356 #else  /* not emacs */
 357
 358 #include "mulelib.h"
 359
 360 #endif /* not emacs */
 361
  362 Lisp_Object Qcoding_system, Qeol_type;
      /* Qcoding_system is the property that coding_safe_chars reads from
         a coding system symbol with Fget.
         NOTE(review): the other Q* variables in this group presumably
         hold the like-named Lisp symbols; they are initialized outside
         this part of the file -- confirm there.  */
  363 Lisp_Object Qbuffer_file_coding_system;
  364 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  365 Lisp_Object Qno_conversion, Qundecided;
  366 Lisp_Object Qcoding_system_history;
      /* Key that coding_safe_chars looks up with Fplist_get in the
         property list stored in a coding system's spec vector.  */
  367 Lisp_Object Qsafe_chars;
  368 Lisp_Object Qvalid_codes;
  369 Lisp_Object Qascii_incompatible;
  370
      /* Qinsert_file_contents and Qwrite_region are only declared here;
         they are defined in another file.  */
  371 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  372 Lisp_Object Qcall_process, Qcall_process_region;
  373 Lisp_Object Qstart_process, Qopen_network_stream;
  374 Lisp_Object Qtarget_idx;
  375
  376 /* If a symbol has this property, evaluate the value to define the
  377    symbol as a coding system.  */
  378 Lisp_Object Qcoding_system_define_form;
  379
      /* NOTE(review): presumably a Lisp function value consulted when a
         safe coding system has to be chosen -- confirm at its use.  */
  380 Lisp_Object Vselect_safe_coding_system_function;
  381
  382 int coding_system_require_warning;
  383
  384 /* Mnemonic string for each format of end-of-line.  */
  385 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  386 /* Mnemonic string to indicate format of end-of-line is not yet
  387    decided.  */
  388 Lisp_Object eol_mnemonic_undecided;
  389
  390 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
  391    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
  392    This has an effect only for external encoding (i.e. for output to
  393    file and process), not for in-buffer or Lisp string encoding.  */
  394 int system_eol_type;
 395
  396 #ifdef emacs
  397
  398 /* Information about which coding system is safe for which chars.
  399    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
  400
  401    GENERIC-LIST is a list of generic coding systems which can encode
  402    any characters.
  403
  404    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
  405    corresponding char table that contains safe chars.  */
  406 Lisp_Object Vcoding_system_safe_chars;
  407
      /* NOTE(review): presumably the list and the alist of all defined
         coding systems -- confirm where they are registered.  */
  408 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  409
  410 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  411
  412 /* Coding system emacs-mule and raw-text are for converting only
  413    end-of-line format.  */
  414 Lisp_Object Qemacs_mule, Qraw_text;
  415
  416 Lisp_Object Qutf_8;
  417
  418 /* Coding-systems are handed between Emacs Lisp programs and C internal
  419    routines by the following three variables.  */
  420 /* Coding-system for reading files and receiving data from process.  */
  421 Lisp_Object Vcoding_system_for_read;
  422 /* Coding-system for writing files and sending data to process.  */
  423 Lisp_Object Vcoding_system_for_write;
  424 /* Coding-system actually used in the latest I/O.  */
  425 Lisp_Object Vlast_coding_system_used;
  426
  427 /* A vector of length 256 which contains information about special
  428    Latin codes (especially for dealing with Microsoft codes).  */
  429 Lisp_Object Vlatin_extra_code_table;
  430
  431 /* Flag to inhibit code conversion of end-of-line format.  */
  432 int inhibit_eol_conversion;
  433
  434 /* Flag to inhibit ISO2022 escape sequence detection.  */
  435 int inhibit_iso_escape_detection;
  436
  437 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  438 int inherit_process_coding_system;
  439
  440 /* Coding system to be used to encode text for terminal display when
  441    terminal coding system is nil.  */
  442 struct coding_system safe_terminal_coding;
  443
  444 /* Default coding system to be used to write a file.  */
  445 struct coding_system default_buffer_file_coding;
  446
      /* NOTE(review): judging by the names, alists that select coding
         systems for file, process and network I/O respectively; their
         element format is not visible here -- confirm at their
         definitions as Lisp variables.  */
  447 Lisp_Object Vfile_coding_system_alist;
  448 Lisp_Object Vprocess_coding_system_alist;
  449 Lisp_Object Vnetwork_coding_system_alist;
  450
  451 Lisp_Object Vlocale_coding_system;
  452
  453 #endif /* emacs */
 454
  455 Lisp_Object Qcoding_category, Qcoding_category_index;
  456
  457 /* List of symbols `coding-category-xxx' ordered by priority.  */
  458 Lisp_Object Vcoding_category_list;
  459
  460 /* Table of coding categories (Lisp symbols).  */
  461 Lisp_Object Vcoding_category_table;
  462
  463 /* Table of names of symbol for each coding-category.
         The array is sized CODING_CATEGORY_IDX_MAX and fifteen names are
         listed.
         NOTE(review): the order presumably has to match the coding
         category index values declared in coding.h -- confirm before
         adding or reordering entries.  */
  464 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  465   "coding-category-emacs-mule",
  466   "coding-category-sjis",
  467   "coding-category-iso-7",
  468   "coding-category-iso-7-tight",
  469   "coding-category-iso-8-1",
  470   "coding-category-iso-8-2",
  471   "coding-category-iso-7-else",
  472   "coding-category-iso-8-else",
  473   "coding-category-ccl",
  474   "coding-category-big5",
  475   "coding-category-utf-8",
  476   "coding-category-utf-16-be",
  477   "coding-category-utf-16-le",
  478   "coding-category-raw-text",
  479   "coding-category-binary"
  480 };
  481
  482 /* Table of pointers to coding systems corresponding to each coding
  483    categories.  */
  484 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
  485
  486 /* Table of coding category masks.  Nth element is a mask for a coding
  487    category of which priority is Nth.  */
  488 static
  489 int coding_priorities[CODING_CATEGORY_IDX_MAX];
  490
  491 /* Flag to tell if we look up translation table on character code
  492    conversion.  */
  493 Lisp_Object Venable_character_translation;
  494 /* Standard translation table to look up on decoding (reading).  */
  495 Lisp_Object Vstandard_translation_table_for_decode;
  496 /* Standard translation table to look up on encoding (writing).  */
  497 Lisp_Object Vstandard_translation_table_for_encode;
  498
      /* NOTE(review): presumably the Lisp symbols naming the
         translation-table properties -- confirm where they are
         interned.  */
  499 Lisp_Object Qtranslation_table;
  500 Lisp_Object Qtranslation_table_id;
  501 Lisp_Object Qtranslation_table_for_decode;
  502 Lisp_Object Qtranslation_table_for_encode;
  503
  504 /* Alist of charsets vs revision number.  */
  505 Lisp_Object Vcharset_revision_alist;
  506
  507 /* Default coding systems used for process I/O.  */
  508 Lisp_Object Vdefault_process_coding_system;
  509
  510 /* Char table for translating Quail and self-inserting input.  */
  511 Lisp_Object Vtranslation_table_for_input;
  512
  513 /* Global flag to tell that we can't call post-read-conversion and
  514    pre-write-conversion functions.  Usually the value is zero, but it
  515    is set to 1 temporarily while such functions are running.  This is
  516    to avoid infinite recursive call.  */
  517 static int inhibit_pre_post_conversion;
  518
  519 Lisp_Object Qchar_coding_system;
 520
 521 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 522    its validity.  */
 523
 524 Lisp_Object
 525 coding_safe_chars (coding_system)
 526      Lisp_Object coding_system;
 527 {
 528   Lisp_Object coding_spec, plist, safe_chars;
 529
 530   coding_spec = Fget (coding_system, Qcoding_system);
 531   plist = XVECTOR (coding_spec)->contents[3];
 532   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 533   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 534 }
 535
 536 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 537   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 538
 539 \f
 540 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 541
 542 /* Emacs' internal format for representation of multiple character
 543    sets is a kind of multi-byte encoding, i.e. characters are
 544    represented by variable-length sequences of one-byte codes.
 545
 546    ASCII characters and control characters (e.g. `tab', `newline') are
 547    represented by one-byte sequences which are their ASCII codes, in
 548    the range 0x00 through 0x7F.
 549
 550    8-bit characters of the range 0x80..0x9F are represented by
 551    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 552    code + 0x20).
 553
 554    8-bit characters of the range 0xA0..0xFF are represented by
 555    one-byte sequences which are their 8-bit code.
 556
 557    The other characters are represented by a sequence of `base
 558    leading-code', optional `extended leading-code', and one or two
 559    `position-code's.  The length of the sequence is determined by the
 560    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 561    whereas extended leading-code and position-code take the range 0xA0
 562    through 0xFF.  See `charset.h' for more details about leading-code
 563    and position-code.
 564
 565    --- CODE RANGE of Emacs' internal format ---
 566    character set        range
 567    -------------        -----
 568    ascii                0x00..0x7F
 569    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  570    eight-bit-graphic    0xA0..0xFF
 571    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 572    ---------------------------------------------
 573
 574    As this is the internal character representation, the format is
 575    usually not used externally (i.e. in a file or in a data sent to a
 576    process).  But, it is possible to have a text externally in this
 577    format (i.e. by encoding by the coding system `emacs-mule').
 578
 579    In that case, a sequence of one-byte codes has a slightly different
 580    form.
 581
 582    Firstly, all characters in eight-bit-control are represented by
 583    one-byte sequences which are their 8-bit code.
 584
 585    Next, character composition data are represented by the byte
 586    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 587    where,
 588         METHOD is 0xF0 plus one of composition method (enum
 589         composition_method),
 590
 591         BYTES is 0xA0 plus the byte length of these composition data,
 592
 593         CHARS is 0xA0 plus the number of characters composed by these
 594         data,
 595
 596         COMPONENTs are characters of multibyte form or composition
 597         rules encoded by two-byte of ASCII codes.
 598
 599    In addition, for backward compatibility, the following formats are
 600    also recognized as composition data on decoding.
 601
 602    0x80 MSEQ ...
 603    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 604
 605    Here,
 606         MSEQ is a multibyte form but in these special format:
 607           ASCII: 0xA0 ASCII_CODE+0x80,
 608           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 609         RULE is a one byte code of the range 0xA0..0xF0 that
 610         represents a composition rule.
 611   */
 612
 613 enum emacs_code_class_type emacs_code_class[256];
 614
  615 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  616    Check if a text is encoded in Emacs' internal format.  If it is,
  617    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
      
         The scan returns CODING_CATEGORY_MASK_EMACS_MULE when the source
         is exhausted at the top of the loop.  It returns 0 on
         ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO, when the source ends
         right after a 0xA0 byte inside a composition sequence, or when a
         byte in 0x81..0x9F does not start a sequence accepted by
         UNIBYTE_STR_AS_MULTIBYTE_P.  */
  618
  619 static int
  620 detect_coding_emacs_mule (src, src_end, multibytep)
  621       unsigned char *src, *src_end;
  622       int multibytep;
  623 {
  624   unsigned char c;
  625   int composing = 0;
  626   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores into
           coding->result before returning.  */
  627   struct coding_system dummy_coding;
  628   struct coding_system *coding = &dummy_coding;
  629
  630   while (1)
  631     {
  632       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
  633                                      CODING_CATEGORY_MASK_EMACS_MULE);
            /* While in a composition sequence: a byte below 0xA0 ends
               it; 0xA0 is followed by a byte whose high bit is dropped;
               any other byte is reduced by 0x20.  The adjusted C is then
               classified below like any other byte.  */
  634       if (composing)
  635         {
  636           if (c < 0xA0)
  637             composing = 0;
  638           else if (c == 0xA0)
  639             {
  640               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
  641               c &= 0x7F;
  642             }
  643           else
  644             c -= 0x20;
  645         }
  646
  647       if (c < 0x20)
  648         {
                /* These control codes make the text fail the check.  */
  649           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  650             return 0;
  651         }
  652       else if (c >= 0x80 && c < 0xA0)
  653         {
  654           if (c == 0x80)
  655             /* Old leading code for a composite character.  */
  656             composing = 1;
  657           else
  658             {
                    /* SRC_BASE points at the last byte consumed.
                       NOTE(review): BYTES is presumably set by
                       UNIBYTE_STR_AS_MULTIBYTE_P to the length of the
                       sequence -- confirm in charset.h.  On success the
                       whole sequence is skipped.  */
  659               unsigned char *src_base = src - 1;
  660               int bytes;
  661
  662               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  663                                                bytes))
  664                 return 0;
  665               src = src_base + bytes;
  666             }
  667         }
  668     }
  669 }
 670
 671
  672 /* Record the starting position START and METHOD of one composition.
      
         A four-element record is opened at the current end of
         coding->cmp_data->data, and its offset is remembered in
         coding->cmp_data_start:
           data[0]  -1 for now; CODING_ADD_COMPOSITION_END stores the
                    record length here,
           data[1]  cmp_data->char_offset + START,
           data[2]  left unset here; CODING_ADD_COMPOSITION_END stores the
                    end position here,
           data[3]  METHOD.
         The expansion declares locals named `cmp_data' and `data'.  */
  673
  674 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  675   do {                                                          \
  676     struct composition_data *cmp_data = coding->cmp_data;       \
  677     int *data = cmp_data->data + cmp_data->used;                \
  678     coding->cmp_data_start = cmp_data->used;                    \
  679     data[0] = -1;                                               \
  680     data[1] = cmp_data->char_offset + start;                    \
  681     data[3] = (int) method;                                     \
  682     cmp_data->used += 4;                                        \
  683   } while (0)
 684
  685 /* Record the ending position END of the current composition.
      
         In the record that starts at coding->cmp_data_start, data[0]
         becomes the number of elements used since that start and data[2]
         becomes cmp_data->char_offset + END.  The expansion declares
         locals named `cmp_data' and `data'.  */
  686
  687 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
  688   do {                                                          \
  689     struct composition_data *cmp_data = coding->cmp_data;       \
  690     int *data = cmp_data->data + coding->cmp_data_start;        \
  691     data[0] = cmp_data->used - coding->cmp_data_start;          \
  692     data[2] = cmp_data->char_offset + end;                      \
  693   } while (0)
 694
  695 /* Record one COMPONENT (alternate character or composition rule).
      
         COMPONENT is appended to coding->cmp_data->data.  If the current
         record then holds exactly COMPOSITION_DATA_MAX_BUNCH_LENGTH
         elements, the composition is closed at coding->produced_char and
         coding->composing is reset to COMPOSITION_NO.  */
  696
  697 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  698   do {                                                                  \
  699     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
  700     if (coding->cmp_data->used - coding->cmp_data_start                 \
  701         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
  702       {                                                                 \
  703         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
  704         coding->composing = COMPOSITION_NO;                             \
  705       }                                                                 \
  706   } while (0)
 707
 708
 709 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 710    is not less than SRC_END, return -1 without incrementing Src.  */
 711
 712 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 713
 714
 715 /* Decode a character represented as a component of composition
 716    sequence of Emacs 20 style at SRC.  Set C to that character, store
 717    its multibyte form sequence at P, and set P to the end of that
 718    sequence.  If no valid character is found, set C to -1.  */
 719
 720 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 721   do {                                                          \
 722     int bytes;                                                  \
 723                                                                 \
 724     c = SAFE_ONE_MORE_BYTE ();                                  \
 725     if (c < 0)                                                  \
 726       break;                                                    \
 727     if (CHAR_HEAD_P (c))                                        \
 728       c = -1;                                                   \
 729     else if (c == 0xA0)                                         \
 730       {                                                         \
 731         c = SAFE_ONE_MORE_BYTE ();                              \
 732         if (c < 0xA0)                                           \
 733           c = -1;                                               \
 734         else                                                    \
 735           {                                                     \
 736             c -= 0x80;                                          \
 737             *p++ = c;                                           \
 738           }                                                     \
 739       }                                                         \
 740     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 741       {                                                         \
 742         unsigned char *p0 = p;                                  \
 743                                                                 \
 744         c -= 0x20;                                              \
 745         *p++ = c;                                               \
 746         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 747         while (--bytes)                                         \
 748           {                                                     \
 749             c = SAFE_ONE_MORE_BYTE ();                          \
 750             if (c < 0)                                          \
 751               break;                                            \
 752             *p++ = c;                                           \
 753           }                                                     \
 754         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 755             || (coding->flags /* We are recovering a file.  */  \
 756                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 757                 && ! CHAR_HEAD_P (p0[1])))                      \
 758           c = STRING_CHAR (p0, bytes);                          \
 759         else                                                    \
 760           c = -1;                                               \
 761       }                                                         \
 762     else                                                        \
 763       c = -1;                                                   \
 764   } while (0)
 765
 766
 767 /* Decode a composition rule represented as a component of composition
 768    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 769    valid rule is found, set C to -1.  */
 770
 771 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 772   do {                                                  \
 773     c = SAFE_ONE_MORE_BYTE ();                          \
 774     c -= 0xA0;                                          \
 775     if (c < 0 || c >= 81)                               \
 776       c = -1;                                           \
 777     else                                                \
 778       {                                                 \
 779         gref = c / 9, nref = c % 9;                     \
 780         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 781       }                                                 \
 782   } while (0)
 783
 784
 785 /* Decode composition sequence encoded by `emacs-mule' at the source
 786    pointed by SRC.  SRC_END is the end of source.  Store information
 787    of the composition in CODING->cmp_data.
 788
 789    For backward compatibility, decode also a composition sequence of
 790    Emacs 20 style.  In that case, the composition sequence contains
 791    characters that should be extracted into a buffer or string.  Store
 792    those characters at *DESTINATION in multibyte form.
 793
 794    If we encounter an invalid byte sequence, return 0.
 795    If we encounter an insufficient source or destination, or
 796    insufficient space in CODING->cmp_data, return 1.
 797    Otherwise, return consumed bytes in the source.
 798
 799 */
 800 static INLINE int
 801 decode_composition_emacs_mule (coding, src, src_end,
 802                                destination, dst_end, dst_bytes)
 803      struct coding_system *coding;
 804      const unsigned char *src, *src_end;
 805      unsigned char **destination, *dst_end;
 806      int dst_bytes;
 807 {
 808   unsigned char *dst = *destination;
 809   int method, data_len, nchars;
 810   const unsigned char *src_base = src++;
 811   /* Store components of composition.  */
 812   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 813   int ncomponent;
 814   /* Store multibyte form of characters to be composed.  This is for
 815      Emacs 20 style composition sequence.  */
 816   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 817   unsigned char *bufp = buf;
 818   int c, i, gref, nref;
 819
 820   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 821       >= COMPOSITION_DATA_SIZE)
 822     {
 823       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 824       return -1;
 825     }
 826
 827   ONE_MORE_BYTE (c);
 828   if (c - 0xF0 >= COMPOSITION_RELATIVE
 829            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 830     {
 831       int with_rule;
 832
 833       method = c - 0xF0;
 834       with_rule = (method == COMPOSITION_WITH_RULE
 835                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 836       ONE_MORE_BYTE (c);
 837       data_len = c - 0xA0;
 838       if (data_len < 4
 839           || src_base + data_len > src_end)
 840         return 0;
 841       ONE_MORE_BYTE (c);
 842       nchars = c - 0xA0;
 843       if (c < 1)
 844         return 0;
 845       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 846         {
 847           /* If it is longer than this, it can't be valid.  */
 848           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 849             return 0;
 850
 851           if (ncomponent % 2 && with_rule)
 852             {
 853               ONE_MORE_BYTE (gref);
 854               gref -= 32;
 855               ONE_MORE_BYTE (nref);
 856               nref -= 32;
 857               c = COMPOSITION_ENCODE_RULE (gref, nref);
 858             }
 859           else
 860             {
 861               int bytes;
 862               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 863                   || (coding->flags /* We are recovering a file.  */
 864                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 865                       && ! CHAR_HEAD_P (src[1])))
 866                 c = STRING_CHAR (src, bytes);
 867               else
 868                 c = *src, bytes = 1;
 869               src += bytes;
 870             }
 871           component[ncomponent] = c;
 872         }
 873     }
 874   else if (c >= 0x80)
 875     {
 876       /* This may be an old Emacs 20 style format.  See the comment at
 877          the section 2 of this file.  */
 878       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 879       if (src == src_end
 880           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 881         goto label_end_of_loop;
 882
 883       src_end = src;
 884       src = src_base + 1;
 885       if (c < 0xC0)
 886         {
 887           method = COMPOSITION_RELATIVE;
 888           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 889             {
 890               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 891               if (c < 0)
 892                 break;
 893               component[ncomponent++] = c;
 894             }
 895           if (ncomponent < 2)
 896             return 0;
 897           nchars = ncomponent;
 898         }
 899       else if (c == 0xFF)
 900         {
 901           method = COMPOSITION_WITH_RULE;
 902           src++;
 903           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 904           if (c < 0)
 905             return 0;
 906           component[0] = c;
 907           for (ncomponent = 1;
 908                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 909             {
 910               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 911               if (c < 0)
 912                 break;
 913               component[ncomponent++] = c;
 914               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 915               if (c < 0)
 916                 break;
 917               component[ncomponent++] = c;
 918             }
 919           if (ncomponent < 3)
 920             return 0;
 921           nchars = (ncomponent + 1) / 2;
 922         }
 923       else
 924         return 0;
 925     }
 926   else
 927     return 0;
 928
 929   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 930     {
 931       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 932       for (i = 0; i < ncomponent; i++)
 933         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 934       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 935       if (buf < bufp)
 936         {
 937           unsigned char *p = buf;
 938           EMIT_BYTES (p, bufp);
 939           *destination += bufp - buf;
 940           coding->produced_char += nchars;
 941         }
 942       return (src - src_base);
 943     }
 944  label_end_of_loop:
 945   return -1;
 946 }
 947
 948 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 949
 950 static void
 951 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 952      struct coding_system *coding;
 953      const unsigned char *source;
 954      unsigned char *destination;
 955      int src_bytes, dst_bytes;
 956 {
 957   const unsigned char *src = source;
 958   const unsigned char *src_end = source + src_bytes;
 959   unsigned char *dst = destination;
 960   unsigned char *dst_end = destination + dst_bytes;
 961   /* SRC_BASE remembers the start position in source in each loop.
 962      The loop will be exited when there's not enough source code, or
 963      when there's not enough destination area to produce a
 964      character.  */
 965   const unsigned char *src_base;
 966
 967   coding->produced_char = 0;
 968   while ((src_base = src) < src_end)
 969     {
 970       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 971       const unsigned char *p;
 972       int bytes;
 973
 974       if (*src == '\r')
 975         {
 976           int c = *src++;
 977
 978           if (coding->eol_type == CODING_EOL_CR)
 979             c = '\n';
 980           else if (coding->eol_type == CODING_EOL_CRLF)
 981             {
 982               ONE_MORE_BYTE (c);
 983               if (c != '\n')
 984                 {
 985                   src--;
 986                   c = '\r';
 987                 }
 988             }
 989           *dst++ = c;
 990           coding->produced_char++;
 991           continue;
 992         }
 993       else if (*src == '\n')
 994         {
 995           if ((coding->eol_type == CODING_EOL_CR
 996                || coding->eol_type == CODING_EOL_CRLF)
 997               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 998             {
 999               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1000               goto label_end_of_loop;
1001             }
1002           *dst++ = *src++;
1003           coding->produced_char++;
1004           continue;
1005         }
1006       else if (*src == 0x80 && coding->cmp_data)
1007         {
1008           /* Start of composition data.  */
1009           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1010                                                          &dst, dst_end,
1011                                                          dst_bytes);
1012           if (consumed < 0)
1013             goto label_end_of_loop;
1014           else if (consumed > 0)
1015             {
1016               src += consumed;
1017               continue;
1018             }
1019           bytes = CHAR_STRING (*src, tmp);
1020           p = tmp;
1021           src++;
1022         }
1023       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1024                || (coding->flags /* We are recovering a file.  */
1025                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1026                    && ! CHAR_HEAD_P (src[1])))
1027         {
1028           p = src;
1029           src += bytes;
1030         }
1031       else
1032         {
1033           int i, c;
1034
1035           bytes = BYTES_BY_CHAR_HEAD (*src);
1036           src++;
1037           for (i = 1; i < bytes; i++)
1038             {
1039               ONE_MORE_BYTE (c);
1040               if (CHAR_HEAD_P (c))
1041                 break;
1042             }
1043           if (i < bytes)
1044             {
1045               bytes = CHAR_STRING (*src_base, tmp);
1046               p = tmp;
1047               src = src_base + 1;
1048             }
1049           else
1050             {
1051               p = src_base;
1052             }
1053         }
1054       if (dst + bytes >= (dst_bytes ? dst_end : src))
1055         {
1056           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1057           break;
1058         }
1059       while (bytes--) *dst++ = *p++;
1060       coding->produced_char++;
1061     }
1062  label_end_of_loop:
1063   coding->consumed = coding->consumed_char = src_base - source;
1064   coding->produced = dst - destination;
1065 }
1066
1067
/* Encode the composition record at DATA into the byte sequence
        0x80 METHOD LENGTH NCHARS COMPONENT ...
   and append it at DST.  METHOD is 0xF0 + DATA[3], LENGTH is 0xA0 +
   the byte length of the sequence built here, and NCHARS is 0xA0 +
   (DATA[2] - DATA[1]).  For the rule-based methods each rule is
   written as two bytes, each offset by 0x20, between the characters.

   If the destination has too little room, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop.
   Otherwise advance CODING->cmp_data_start past this record, and
   move on to CODING->cmp_data->next when the current chunk is used
   up and a next one exists.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char seq[1024];                                            \
    unsigned char *tail = seq + 4;                                      \
    unsigned char *out;                                                 \
    int ndata = data[0];                                                \
    int with_rule = (data[3] == COMPOSITION_WITH_RULE                   \
                     || data[3] == COMPOSITION_WITH_RULE_ALTCHARS);     \
    int k;                                                              \
                                                                        \
    seq[0] = 0x80;                                                      \
    seq[1] = 0xF0 + data[3];    /* METHOD */                            \
    seq[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
    if (with_rule)                                                      \
      {                                                                 \
        tail += CHAR_STRING (data[4], tail);                            \
        for (k = 5; k < ndata; k += 2)                                  \
          {                                                             \
            int gref, nref;                                             \
            COMPOSITION_DECODE_RULE (data[k], gref, nref);              \
            *tail++ = 0x20 + gref;                                      \
            *tail++ = 0x20 + nref;                                      \
            tail += CHAR_STRING (data[k + 1], tail);                    \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        k = 4;                                                          \
        while (k < ndata)                                               \
          {                                                             \
            tail += CHAR_STRING (data[k], tail);                        \
            k++;                                                        \
          }                                                             \
      }                                                                 \
    seq[2] = 0xA0 + (tail - seq);  /* COMPONENTS-BYTES */               \
                                                                        \
    if (dst + (tail - seq) + 4 > (dst_bytes ? dst_end : src))           \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (out = seq; out < tail; out++)                                  \
      *dst++ = *out;                                                    \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used                \
        && coding->cmp_data->next)                                      \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1117
1118
1119 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1120                             unsigned char *, int, int));
1121
1122 static void
1123 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1124      struct coding_system *coding;
1125      const unsigned char *source;
1126      unsigned char *destination;
1127      int src_bytes, dst_bytes;
1128 {
1129   const unsigned char *src = source;
1130   const unsigned char *src_end = source + src_bytes;
1131   unsigned char *dst = destination;
1132   unsigned char *dst_end = destination + dst_bytes;
1133   const unsigned char *src_base;
1134   int c;
1135   int char_offset;
1136   int *data;
1137
1138   Lisp_Object translation_table;
1139
1140   translation_table = Qnil;
1141
1142   /* Optimization for the case that there's no composition.  */
1143   if (!coding->cmp_data || coding->cmp_data->used == 0)
1144     {
1145       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1146       return;
1147     }
1148
1149   char_offset = coding->cmp_data->char_offset;
1150   data = coding->cmp_data->data + coding->cmp_data_start;
1151   while (1)
1152     {
1153       src_base = src;
1154
1155       /* If SRC starts a composition, encode the information about the
1156          composition in advance.  */
1157       if (coding->cmp_data_start < coding->cmp_data->used
1158           && char_offset + coding->consumed_char == data[1])
1159         {
1160           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1161           char_offset = coding->cmp_data->char_offset;
1162           data = coding->cmp_data->data + coding->cmp_data_start;
1163         }
1164
1165       ONE_MORE_CHAR (c);
1166       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1167                         || coding->eol_type == CODING_EOL_CR))
1168         {
1169           if (coding->eol_type == CODING_EOL_CRLF)
1170             EMIT_TWO_BYTES ('\r', c);
1171           else
1172             EMIT_ONE_BYTE ('\r');
1173         }
1174       else if (SINGLE_BYTE_CHAR_P (c))
1175         {
1176           if (coding->flags && ! ASCII_BYTE_P (c))
1177             {
1178               /* As we are auto saving, retain the multibyte form for
1179                  8-bit chars.  */
1180               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1181               int bytes = CHAR_STRING (c, buf);
1182
1183               if (bytes == 1)
1184                 EMIT_ONE_BYTE (buf[0]);
1185               else
1186                 EMIT_TWO_BYTES (buf[0], buf[1]);
1187             }
1188           else
1189             EMIT_ONE_BYTE (c);
1190         }
1191       else
1192         EMIT_BYTES (src_base, src);
1193       coding->consumed_char++;
1194     }
1195  label_end_of_loop:
1196   coding->consumed = src_base - source;
1197   coding->produced = coding->produced_char = dst - destination;
1198   return;
1199 }
1200
1201 \f
1202 /*** 3. ISO2022 handlers ***/
1203
1204 /* The following note describes the coding system ISO2022 briefly.
1205    Since the intention of this note is to help understand the
1206    functions in this file, some parts are NOT ACCURATE or are OVERLY
1207    SIMPLIFIED.  For thorough understanding, please refer to the
1208    original document of ISO2022.  This is equivalent to the standard
1209    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1210
1211    ISO2022 provides many mechanisms to encode several character sets
1212    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1213    is encoded using bytes less than 128.  This may make the encoded
1214    text a little bit longer, but the text passes more easily through
1215    several types of gateway, some of which strip off the MSB (Most
1216    Significant Bit).
1217
1218    There are two kinds of character sets: control character sets and
1219    graphic character sets.  The former contain control characters such
1220    as `newline' and `escape' to provide control functions (control
1221    functions are also provided by escape sequences).  The latter
1222    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1223    two control character sets and many graphic character sets.
1224
1225    Graphic character sets are classified into one of the following
1226    four classes, according to the number of bytes (DIMENSION) and
1227    number of characters in one dimension (CHARS) of the set:
1228    - DIMENSION1_CHARS94
1229    - DIMENSION1_CHARS96
1230    - DIMENSION2_CHARS94
1231    - DIMENSION2_CHARS96
1232
1233    In addition, each character set is assigned an identification tag,
1234    unique for each set, called the "final character" (denoted as <F>
1235    hereafter).  The <F> of each character set is decided by ECMA(*)
1236    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1237    (0x30..0x3F are for private use only).
1238
1239    Note (*): ECMA = European Computer Manufacturers Association
1240
1241    Here are examples of graphic character sets [NAME(<F>)]:
1242         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1243         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1244         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1245         o DIMENSION2_CHARS96 -- none for the moment
1246
1247    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1248         C0 [0x00..0x1F] -- control character plane 0
1249         GL [0x20..0x7F] -- graphic character plane 0
1250         C1 [0x80..0x9F] -- control character plane 1
1251         GR [0xA0..0xFF] -- graphic character plane 1
1252
1253    A control character set is directly designated and invoked to C0 or
1254    C1 by an escape sequence.  The most common case is that:
1255    - ISO646's  control character set is designated/invoked to C0, and
1256    - ISO6429's control character set is designated/invoked to C1,
1257    and usually these designations/invocations are omitted in encoded
1258    text.  In a 7-bit environment, only C0 can be used, and a control
1259    character for C1 is encoded by an appropriate escape sequence to
1260    fit into the environment.  All control characters for C1 are
1261    defined to have corresponding escape sequences.
1262
1263    A graphic character set is at first designated to one of four
1264    graphic registers (G0 through G3), then these graphic registers are
1265    invoked to GL or GR.  These designations and invocations can be
1266    done independently.  The most common case is that G0 is invoked to
1267    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1268    these invocations and designations are omitted in encoded text.
1269    In a 7-bit environment, only GL can be used.
1270
1271    When a graphic character set of CHARS94 is invoked to GL, codes
1272    0x20 and 0x7F of the GL area work as control characters SPACE and
1273    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1274    be used.
1275
1276    There are two ways of invocation: locking-shift and single-shift.
1277    With locking-shift, the invocation lasts until the next different
1278    invocation, whereas with single-shift, the invocation affects the
1279    following character only and doesn't affect the locking-shift
1280    state.  Invocations are done by the following control characters or
1281    escape sequences:
1282
1283    ----------------------------------------------------------------------
1284    abbrev  function                  cntrl escape seq   description
1285    ----------------------------------------------------------------------
1286    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1287    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1288    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1289    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1290    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1291    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1292    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1293    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1294    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1295    ----------------------------------------------------------------------
1296    (*) These are not used by any known coding system.
1297
1298    Control characters for these functions are defined by macros
1299    ISO_CODE_XXX in `coding.h'.
1300
1301    Designations are done by the following escape sequences:
1302    ----------------------------------------------------------------------
1303    escape sequence      description
1304    ----------------------------------------------------------------------
1305    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1306    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1307    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1308    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1309    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1310    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1311    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1312    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1313    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1314    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1315    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1316    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1317    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1318    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1319    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1320    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1321    ----------------------------------------------------------------------
1322
1323    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1324    of dimension 1, chars 94, and final character <F>, etc...
1325
1326    Note (*): Although these designations are not allowed in ISO2022,
1327    Emacs accepts them on decoding, and produces them on encoding
1328    CHARS96 character sets in a coding system which is characterized as
1329    7-bit environment, non-locking-shift, and non-single-shift.
1330
1331    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1332    '(' can be omitted.  We refer to this as "short-form" hereafter.
1333
1334    Now you may notice that there are a lot of ways of encoding the
1335    same multilingual text in ISO2022.  Actually, there exist many
1336    coding systems such as Compound Text (used in X11's inter-client
1337    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1338    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1339    localized platforms), and all of these are variants of ISO2022.
1340
1341    In addition to the above, Emacs handles two more kinds of escape
1342    sequences: ISO6429's direction specification and Emacs' private
1343    sequence for specifying character composition.
1344
1345    ISO6429's direction specification takes the following form:
1346         o CSI ']'      -- end of the current direction
1347         o CSI '0' ']'  -- end of the current direction
1348         o CSI '1' ']'  -- start of left-to-right text
1349         o CSI '2' ']'  -- start of right-to-left text
1350    The control character CSI (0x9B: control sequence introducer) is
1351    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1352
1353    Character composition specification takes the following form:
1354         o ESC '0' -- start relative composition
1355         o ESC '1' -- end composition
1356         o ESC '2' -- start rule-base composition (*)
1357         o ESC '3' -- start relative composition with alternate chars  (**)
1358         o ESC '4' -- start rule-base composition with alternate chars  (**)
1359   Since these are not standard escape sequences of any ISO standard,
1360   the use of them with these meanings is restricted to Emacs only.
1361
1362   (*) This form is used only in Emacs 20.5 and older versions,
1363   but the newer versions can safely decode it.
1364   (**) This form is used only in Emacs 21.1 and newer versions,
1365   and the older versions can't decode it.
1366
1367   Here's a list of example usages of these composition escape
1368   sequences (categorized by `enum composition_method').
1369
1370   COMPOSITION_RELATIVE:
1371         ESC 0 CHAR [ CHAR ] ESC 1
1372   COMPOSITION_WITH_RULE:
1373         ESC 2 CHAR [ RULE CHAR ] ESC 1
1374   COMPOSITION_WITH_ALTCHARS:
1375         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1376   COMPOSITION_WITH_RULE_ALTCHARS:
1377         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1378
1379 enum iso_code_class_type iso_code_class[256];
1380
1381 #define CHARSET_OK(idx, charset, c)                                     \
1382   (coding_system_table[idx]                                             \
1383    && (charset == CHARSET_ASCII                                         \
1384        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1385            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1386    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1387                                               charset)                  \
1388        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1389
1390 #define SHIFT_OUT_OK(idx) \
1391   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1392
1393 #define COMPOSITION_OK(idx)     \
1394   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1395
1396 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1397    Check whether the text between SRC and SRC_END is encoded in
1398    ISO2022.  If it may be, return an integer in which some of the flag bits:
1399         CODING_CATEGORY_MASK_ISO_7
1400         CODING_CATEGORY_MASK_ISO_7_TIGHT
1401         CODING_CATEGORY_MASK_ISO_8_1
1402         CODING_CATEGORY_MASK_ISO_8_2
1403         CODING_CATEGORY_MASK_ISO_7_ELSE
1404         CODING_CATEGORY_MASK_ISO_8_ELSE
1405    are set.  If a code which should never appear in ISO2022 is found,
1406    return 0.

   MULTIBYTEP is handed to ONE_MORE_BYTE_CHECK_MULTIBYTE for every
   byte that is fetched.

   While scanning, MASK holds the categories that have not been ruled
   out and MASK_FOUND the categories for which some supporting
   evidence was seen; the normal result is MASK & MASK_FOUND.  */
1407
1408 static int
1409 detect_coding_iso2022 (src, src_end, multibytep)
1410      unsigned char *src, *src_end;
1411      int multibytep;
1412 {
       /* Start with every ISO category possible and none confirmed.  */
1413   int mask = CODING_CATEGORY_MASK_ISO;
1414   int mask_found = 0;
       /* REG[n] is the charset most recently designated to register n
          by an escape sequence seen in the text, or -1 if none.
          SINGLE_SHIFTING is 1 right after SS2/SS3 and reset to 0 by most
          other codes.
          NOTE(review): SHIFT_OUT is initialized to 0 and never assigned
          in this function, so the `shift_out == 1' test in the
          ISO_CODE_SI case below cannot succeed as written.  */
1415   int reg[4], shift_out = 0, single_shifting = 0;
1416   int c, c1, charset;
1417   /* Dummy for ONE_MORE_BYTE.  */
1418   struct coding_system dummy_coding;
1419   struct coding_system *coding = &dummy_coding;
       /* Assigned as a side effect of the CHARSET_OK macro.  */
1420   Lisp_Object safe_chars;
1421
1422   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
       /* Scan until every category has been ruled out.
          NOTE(review): ONE_MORE_BYTE_CHECK_MULTIBYTE is defined elsewhere;
          presumably it makes this function return its third argument
          when SRC reaches SRC_END -- confirm against its definition.  */
1423   while (mask)
1424     {
1425       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
         /* Re-entered with C already loaded when the 8-bit run scan at
            the bottom of the switch has read one byte too many.  */
1426     retry:
1427       switch (c)
1428         {
1429         case ISO_CODE_ESC:
1430           if (inhibit_iso_escape_detection)
1431             break;
1432           single_shifting = 0;
1433           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1434           if (c >= '(' && c <= '/')
1435             {
1436               /* Designation sequence for a charset of dimension 1.  */
1437               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
                   /* `c >= ','' selects the second half of the table for
                      the intermediate bytes ',' .. '/'.  */
1438               if (c1 < ' ' || c1 >= 0x80
1439                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1440                 /* Invalid designation sequence.  Just ignore.  */
1441                 break;
                   /* '(' .. '+' and ',' .. '/' both map onto registers
                      0 .. 3.  */
1442               reg[(c - '(') % 4] = charset;
1443             }
1444           else if (c == '$')
1445             {
1446               /* Designation sequence for a charset of dimension 2.  */
1447               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1448               if (c >= '@' && c <= 'B')
1449                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1450                 reg[0] = charset = iso_charset_table[1][0][c];
1451               else if (c >= '(' && c <= '/')
1452                 {
1453                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1454                                                  mask & mask_found);
1455                   if (c1 < ' ' || c1 >= 0x80
1456                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1457                     /* Invalid designation sequence.  Just ignore.  */
1458                     break;
1459                   reg[(c - '(') % 4] = charset;
1460                 }
1461               else
1462                 /* Invalid designation sequence.  Just ignore.  */
1463                 break;
1464             }
1465           else if (c == 'N' || c == 'O')
1466             {
1467               /* ESC <Fe> for SS2 or SS3.  */
                   /* Only the ISO_7_ELSE category stays possible;
                      MASK_FOUND is not updated here.  */
1468               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1469               break;
1470             }
1471           else if (c >= '0' && c <= '4')
1472             {
1473               /* ESC <Fp> for start/end composition.  */
                   /* For each of the six categories: count it as found
                      if COMPOSITION_OK holds for it, otherwise rule it
                      out.  */
1474               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1475                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1476               else
1477                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1478               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1479                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1480               else
1481                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1482               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1483                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1484               else
1485                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1486               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1487                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1488               else
1489                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1490               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1491                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1492               else
1493                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1494               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1495                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1496               else
1497                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1498               break;
1499             }
1500           else
1501             /* Invalid escape sequence.  Just ignore.  */
1502             break;
1503
1504           /* We found a valid designation sequence for CHARSET.  */
               /* Only the two designation branches above reach this
                  point; every other branch ends in `break'.  The
                  categories in CODING_CATEGORY_MASK_ISO_8BIT are ruled
                  out, and the four categories tested below are each
                  checked against CHARSET using a representative
                  character C of it.  */
1505           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1506           c = MAKE_CHAR (charset, 0, 0);
1507           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1508             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1509           else
1510             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1511           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1512             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1513           else
1514             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1515           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1516             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1517           else
1518             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1519           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1520             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1521           else
1522             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1523           break;
1524
1525         case ISO_CODE_SO:
1526           if (inhibit_iso_escape_detection)
1527             break;
1528           single_shifting = 0;
               /* Treat SO as a locking shift only if something has been
                  designated to register 1 in the text, or one of the
                  two *_ELSE coding systems satisfies SHIFT_OUT_OK.  */
1529           if (shift_out == 0
1530               && (reg[1] >= 0
1531                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1532                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1533             {
1534               /* Locking shift out.  */
1535               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1536               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1537             }
1538           break;
1539
1540         case ISO_CODE_SI:
1541           if (inhibit_iso_escape_detection)
1542             break;
1543           single_shifting = 0;
               /* NOTE(review): SHIFT_OUT is never set to 1 anywhere in
                  this function, so this branch is not taken.  */
1544           if (shift_out == 1)
1545             {
1546               /* Locking shift in.  */
1547               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1548               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1549             }
1550           break;
1551
1552         case ISO_CODE_CSI:
1553           single_shifting = 0;
               /* Fall through: CSI shares the handling below, except for
                  the single-shift part guarded by `c != ISO_CODE_CSI'.  */
1554         case ISO_CODE_SS2:
1555         case ISO_CODE_SS3:
1556           {
                 /* Categories that remain possible after this code.  */
1557             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1558
1559             if (inhibit_iso_escape_detection)
1560               break;
                 /* NOTE(review): the `->flags' accesses below dereference
                    coding_system_table entries without the NULL check
                    that CHARSET_OK performs -- confirm the entries for
                    ISO_8_1 and ISO_8_2 can never be NULL here.  */
1561             if (c != ISO_CODE_CSI)
1562               {
1563                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1564                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1565                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1566                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1567                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1568                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1569                 single_shifting = 1;
1570               }
                 /* The code may also be an extra Latin code if
                    Vlatin_extra_code_table has a non-nil entry for it.  */
1571             if (VECTORP (Vlatin_extra_code_table)
1572                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1573               {
1574                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1575                     & CODING_FLAG_ISO_LATIN_EXTRA)
1576                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1577                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1578                     & CODING_FLAG_ISO_LATIN_EXTRA)
1579                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1580               }
1581             mask &= newmask;
1582             mask_found |= newmask;
1583           }
1584           break;
1585
1586         default:
1587           if (c < 0x80)
1588             {
                   /* Any other 7-bit code: no effect on the masks.  */
1589               single_shifting = 0;
1590               break;
1591             }
1592           else if (c < 0xA0)
1593             {
                   /* A code in 0x80..0x9F not handled by the cases above.
                      It is acceptable only as an extra Latin code, and
                      then only for ISO_8_1 / ISO_8_2 coding systems that
                      have CODING_FLAG_ISO_LATIN_EXTRA; otherwise the text
                      is not ISO2022.  The same NULL-check caveat as in
                      the CSI/SS2/SS3 case applies to `->flags'.  */
1594               single_shifting = 0;
1595               if (VECTORP (Vlatin_extra_code_table)
1596                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1597                 {
1598                   int newmask = 0;
1599
1600                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1601                       & CODING_FLAG_ISO_LATIN_EXTRA)
1602                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1603                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1604                       & CODING_FLAG_ISO_LATIN_EXTRA)
1605                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1606                   mask &= newmask;
1607                   mask_found |= newmask;
1608                 }
1609               else
1610                 return 0;
1611             }
1612           else
1613             {
1614               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1615                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1616               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1617               /* Check the length of succeeding codes of the range
1618                  0xA0..0xFF.  If the byte length is odd, we exclude
1619                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1620                  when we are not single shifting.  */
1621               if (!single_shifting
1622                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1623                 {
                       /* I counts the run of bytes >= 0xA0, starting at
                          1 for the byte already in hand.  */
1624                   int i = 1;
1625
                       /* C stays -1 if no further byte is read.  */
1626                   c = -1;
1627                   while (src < src_end)
1628                     {
1629                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1630                                                      mask & mask_found);
1631                       if (c < 0xA0)
1632                         break;
1633                       i++;
1634                     }
1635
                       /* An odd run rules out ISO_8_2 only when the run
                          ended before the end of the text.  */
1636                   if (i & 1 && src < src_end)
1637                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1638                   else
1639                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1640                   if (c >= 0)
1641                     /* This means that we have read one extra byte.  */
1642                     goto retry;
1643                 }
1644             }
1645           break;
1646         }
1647     }
1648   return (mask & mask_found);
1649 }
1650
1651 /* Decode a character whose charset is CHARSET, whose 1st position
1652    code is C1 and whose 2nd position code is C2, and return the
1653    decoded character code.  If the variable `translation_table' is
1654    non-nil, return the code produced by translate_char instead.
   `translation_table' is not a macro argument; it must be in scope
   where the macro is used.  */
1655
1656 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1657   (NILP (translation_table)                     \
1658    ? MAKE_CHAR (charset, c1, c2)                \
1659    : translate_char (translation_table, -1, charset, c1, c2))
1660
1661 /* Set designation state into CODING.
   REG is the register being designated, and DIMENSION, CHARS and
   FINAL_CHAR identify the charset through ISO_CHARSET_TABLE.

   The macro uses the variables `coding' and `safe_chars' and the
   label `label_invalid_code' from the scope where it is expanded.

   Control jumps to label_invalid_code when:
     - FINAL_CHAR is below '0' or is 128 or above;
     - no charset is found, or the charset is neither requested for
       REG by CODING nor safe according to `safe_chars' (REG is then
       remembered in last_invalid_designation_register);
     - ASCII is designated to register 0 while
       last_invalid_designation_register is 0 (the field is reset to
       -1 first).
   Otherwise last_invalid_designation_register is reset to -1 and the
   charset -- or its reverse charset, if CODING_MODE_DIRECTION is set
   in coding->mode and a reverse charset exists -- is stored as the
   designation of REG.  */
1662 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1663   do {                                                                     \
1664     int charset, c;                                                        \
1665                                                                            \
1666     if (final_char < '0' || final_char >= 128)                             \
1667       goto label_invalid_code;                                             \
1668     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1669                                  make_number (chars),                      \
1670                                  make_number (final_char));                \
1671     c = MAKE_CHAR (charset, 0, 0);                                         \
1672     if (charset >= 0                                                       \
1673         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1674             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1675       {                                                                    \
1676         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1677             && reg == 0                                                    \
1678             && charset == CHARSET_ASCII)                                   \
1679           {                                                                \
1680             /* We should insert this designation sequence as is so         \
1681                that it is surely written back to a file.  */               \
1682             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1683             goto label_invalid_code;                                       \
1684           }                                                                \
1685         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1686         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1687             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1688           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1689         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1690       }                                                                    \
1691     else                                                                   \
1692       {                                                                    \
1693         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1694         goto label_invalid_code;                                           \
1695       }                                                                    \
1696   } while (0)
1697
1698 /* Allocate a memory block for storing information about compositions.
1699    The block is chained to the already allocated blocks.  */
1700
1701 void
1702 coding_allocate_composition_data (coding, char_offset)
1703      struct coding_system *coding;
1704      int char_offset;
1705 {
1706   struct composition_data *cmp_data
1707     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1708
1709   cmp_data->char_offset = char_offset;
1710   cmp_data->used = 0;
1711   cmp_data->prev = coding->cmp_data;
1712   cmp_data->next = NULL;
1713   if (coding->cmp_data)
1714     coding->cmp_data->next = cmp_data;
1715   coding->cmp_data = cmp_data;
1716   coding->cmp_data_start = 0;
1717   coding->composing = COMPOSITION_NO;
1718 }
1719
1720 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1721    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1722    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1723    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1724    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte following ESC.  The macro uses the variables
   `coding' and `dst' and the label `label_end_of_loop' from the scope
   where it is expanded.

   - If composition is disabled for CODING, the two bytes ESC and
     C1 & 0x7f are written to DST and produced_char grows by 2.
     NOTE(review): the macro itself does not compare DST with the end
     of the destination; confirm the caller guarantees the room.
   - If no composition is in progress, one is started with the method
     selected by C1, unless coding->cmp_data is missing or nearly
     full, in which case coding->result is set to
     CODING_FINISH_INSUFFICIENT_CMP and control jumps to
     label_end_of_loop.
   - If a composition is already in progress and its method is one of
     the two ALTCHARS methods, the method is switched to
     COMPOSITION_RELATIVE; otherwise nothing is done.
1725   */
1726
1727 #define DECODE_COMPOSITION_START(c1)                                       \
1728   do {                                                                     \
1729     if (coding->composing == COMPOSITION_DISABLED)                         \
1730       {                                                                    \
1731         *dst++ = ISO_CODE_ESC;                                             \
1732         *dst++ = c1 & 0x7f;                                                \
1733         coding->produced_char += 2;                                        \
1734       }                                                                    \
1735     else if (!COMPOSING_P (coding))                                        \
1736       {                                                                    \
1737         /* This is surely the start of a composition.  We must be sure     \
1738            that coding->cmp_data has enough space to store the             \
1739            information about the composition.  If not, terminate the       \
1740            current decoding loop, allocate one more memory block for       \
1741            coding->cmp_data in the caller, then start the decoding         \
1742            loop again.  We can't allocate memory here directly because     \
1743            it may cause buffer/string relocation.  */                      \
1744         if (!coding->cmp_data                                              \
1745             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1746                 >= COMPOSITION_DATA_SIZE))                                 \
1747           {                                                                \
1748             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1749             goto label_end_of_loop;                                        \
1750           }                                                                \
1751         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1752                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1753                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1754                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1755         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1756                                       coding->composing);                  \
1757         coding->composition_rule_follows = 0;                              \
1758       }                                                                    \
1759     else                                                                   \
1760       {                                                                    \
1761         /* We are already handling a composition.  If the method is        \
1762            the following two, the codes following the current escape       \
1763            sequence are actual characters stored in a buffer.  */          \
1764         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1765             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1766           {                                                                \
1767             coding->composing = COMPOSITION_RELATIVE;                      \
1768             coding->composition_rule_follows = 0;                          \
1769           }                                                                \
1770       }                                                                    \
1771   } while (0)
1772
1773 /* Handle composition end sequence ESC 1.
   C1 is the byte following ESC.  The macro uses the variables
   `coding' and `dst' from the scope where it is expanded.
   If no composition is in progress, the two bytes ESC and C1 are
   written to DST unchanged and produced_char grows by 2.  Otherwise
   the end of the composition is recorded at the current
   produced_char position and the composing state returns to
   COMPOSITION_NO.  */
1774
1775 #define DECODE_COMPOSITION_END(c1)                                      \
1776   do {                                                                  \
1777     if (! COMPOSING_P (coding))                                         \
1778       {                                                                 \
1779         *dst++ = ISO_CODE_ESC;                                          \
1780         *dst++ = c1;                                                    \
1781         coding->produced_char += 2;                                     \
1782       }                                                                 \
1783     else                                                                \
1784       {                                                                 \
1785         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1786         coding->composing = COMPOSITION_NO;                             \
1787       }                                                                 \
1788   } while (0)
1789
1790 /* Decode a composition rule from the byte C1 (and maybe one more byte
1791    from SRC) and store one encoded composition rule in
1792    coding->cmp_data.
   C1 must be an lvalue: 32 is subtracted from it in place.  After
   the subtraction:
     - a value below 81 is an old-format rule: it is split into
       C1 / 9 and C1 % 9, and a component equal to 4 is replaced by 10;
     - a value from 81 to 92 is a new-format rule: one more byte is
       read with ONE_MORE_BYTE into the variable `c2' of the
       enclosing scope, and the rule is built from C1 - 81 and
       c2 - 32;
     - for any other value the rule stays 0 and is still stored.
   Finally coding->composition_rule_follows is cleared.  */
1793
1794 #define DECODE_COMPOSITION_RULE(c1)                                     \
1795   do {                                                                  \
1796     int rule = 0;                                                       \
1797     (c1) -= 32;                                                         \
1798     if (c1 < 81)                /* old format (before ver.21) */        \
1799       {                                                                 \
1800         int gref = (c1) / 9;                                            \
1801         int nref = (c1) % 9;                                            \
1802         if (gref == 4) gref = 10;                                       \
1803         if (nref == 4) nref = 10;                                       \
1804         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1805       }                                                                 \
1806     else if (c1 < 93)           /* new format (after ver.21) */         \
1807       {                                                                 \
1808         ONE_MORE_BYTE (c2);                                             \
1809         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1810       }                                                                 \
1811     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1812     coding->composition_rule_follows = 0;                               \
1813   } while (0)
1814
1815
1816 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1817
1818 static void
1819 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1820      struct coding_system *coding;
1821      const unsigned char *source;
1822      unsigned char *destination;
1823      int src_bytes, dst_bytes;
1824 {
1825   const unsigned char *src = source;
1826   const unsigned char *src_end = source + src_bytes;
1827   unsigned char *dst = destination;
1828   unsigned char *dst_end = destination + dst_bytes;
1829   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1830   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1831   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1832   /* SRC_BASE remembers the start position in source in each loop.
1833      The loop will be exited when there's not enough source code
1834      (within macro ONE_MORE_BYTE), or when there's not enough
1835      destination area to produce a character (within macro
1836      EMIT_CHAR).  */
1837   const unsigned char *src_base;
1838   int c, charset;
1839   Lisp_Object translation_table;
1840   Lisp_Object safe_chars;
1841
1842   safe_chars = coding_safe_chars (coding->symbol);
1843
1844   if (NILP (Venable_character_translation))
1845     translation_table = Qnil;
1846   else
1847     {
1848       translation_table = coding->translation_table_for_decode;
1849       if (NILP (translation_table))
1850         translation_table = Vstandard_translation_table_for_decode;
1851     }
1852
1853   coding->result = CODING_FINISH_NORMAL;
1854
1855   while (1)
1856     {
1857       int c1, c2 = 0;
1858
1859       src_base = src;
1860       ONE_MORE_BYTE (c1);
1861
1862       /* We produce no character or one character.  */
1863       switch (iso_code_class [c1])
1864         {
1865         case ISO_0x20_or_0x7F:
1866           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1867             {
1868               DECODE_COMPOSITION_RULE (c1);
1869               continue;
1870             }
1871           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1872             {
1873               /* This is SPACE or DEL.  */
1874               charset = CHARSET_ASCII;
1875               break;
1876             }
1877           /* This is a graphic character, we fall down ...  */
1878
1879         case ISO_graphic_plane_0:
1880           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1881             {
1882               DECODE_COMPOSITION_RULE (c1);
1883               continue;
1884             }
1885           charset = charset0;
1886           break;
1887
1888         case ISO_0xA0_or_0xFF:
1889           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1890               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1891             goto label_invalid_code;
1892           /* This is a graphic character, we fall down ... */
1893
1894         case ISO_graphic_plane_1:
1895           if (charset1 < 0)
1896             goto label_invalid_code;
1897           charset = charset1;
1898           break;
1899
1900         case ISO_control_0:
1901           if (COMPOSING_P (coding))
1902             DECODE_COMPOSITION_END ('1');
1903
1904           /* All ISO2022 control characters in this class have the
1905              same representation in Emacs internal format.  */
1906           if (c1 == '\n'
1907               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1908               && (coding->eol_type == CODING_EOL_CR
1909                   || coding->eol_type == CODING_EOL_CRLF))
1910             {
1911               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1912               goto label_end_of_loop;
1913             }
1914           charset = CHARSET_ASCII;
1915           break;
1916
1917         case ISO_control_1:
1918           if (COMPOSING_P (coding))
1919             DECODE_COMPOSITION_END ('1');
1920           goto label_invalid_code;
1921
1922         case ISO_carriage_return:
1923           if (COMPOSING_P (coding))
1924             DECODE_COMPOSITION_END ('1');
1925
1926           if (coding->eol_type == CODING_EOL_CR)
1927             c1 = '\n';
1928           else if (coding->eol_type == CODING_EOL_CRLF)
1929             {
1930               ONE_MORE_BYTE (c1);
1931               if (c1 != ISO_CODE_LF)
1932                 {
1933                   src--;
1934                   c1 = '\r';
1935                 }
1936             }
1937           charset = CHARSET_ASCII;
1938           break;
1939
1940         case ISO_shift_out:
1941           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1942               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1943             goto label_invalid_code;
1944           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1945           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1946           continue;
1947
1948         case ISO_shift_in:
1949           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1950             goto label_invalid_code;
1951           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1952           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1953           continue;
1954
1955         case ISO_single_shift_2_7:
1956         case ISO_single_shift_2:
1957           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1958             goto label_invalid_code;
1959           /* SS2 is handled as an escape sequence of ESC 'N' */
1960           c1 = 'N';
1961           goto label_escape_sequence;
1962
1963         case ISO_single_shift_3:
1964           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1965             goto label_invalid_code;
1966           /* SS3 is handled as an escape sequence of ESC 'O' */
1967           c1 = 'O';
1968           goto label_escape_sequence;
1969
1970         case ISO_control_sequence_introducer:
1971           /* CSI is handled as an escape sequence of ESC '[' ...  */
1972           c1 = '[';
1973           goto label_escape_sequence;
1974
1975         case ISO_escape:
1976           ONE_MORE_BYTE (c1);
1977         label_escape_sequence:
1978           /* Escape sequences handled by Emacs are invocation,
1979              designation, direction specification, and character
1980              composition specification.  */
1981           switch (c1)
1982             {
1983             case '&':           /* revision of following character set */
1984               ONE_MORE_BYTE (c1);
1985               if (!(c1 >= '@' && c1 <= '~'))
1986                 goto label_invalid_code;
1987               ONE_MORE_BYTE (c1);
1988               if (c1 != ISO_CODE_ESC)
1989                 goto label_invalid_code;
1990               ONE_MORE_BYTE (c1);
1991               goto label_escape_sequence;
1992
1993             case '$':           /* designation of 2-byte character set */
1994               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1995                 goto label_invalid_code;
1996               ONE_MORE_BYTE (c1);
1997               if (c1 >= '@' && c1 <= 'B')
1998                 {       /* designation of JISX0208.1978, GB2312.1980,
1999                            or JISX0208.1980 */
2000                   DECODE_DESIGNATION (0, 2, 94, c1);
2001                 }
2002               else if (c1 >= 0x28 && c1 <= 0x2B)
2003                 {       /* designation of DIMENSION2_CHARS94 character set */
2004                   ONE_MORE_BYTE (c2);
2005                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2006                 }
2007               else if (c1 >= 0x2C && c1 <= 0x2F)
2008                 {       /* designation of DIMENSION2_CHARS96 character set */
2009                   ONE_MORE_BYTE (c2);
2010                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2011                 }
2012               else
2013                 goto label_invalid_code;
2014               /* We must update these variables now.  */
2015               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2016               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2017               continue;
2018
2019             case 'n':           /* invocation of locking-shift-2 */
2020               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2021                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2022                 goto label_invalid_code;
2023               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2024               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2025               continue;
2026
2027             case 'o':           /* invocation of locking-shift-3 */
2028               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2029                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2030                 goto label_invalid_code;
2031               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2032               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2033               continue;
2034
2035             case 'N':           /* invocation of single-shift-2 */
2036               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2037                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2038                 goto label_invalid_code;
2039               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2040               ONE_MORE_BYTE (c1);
2041               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2042                 goto label_invalid_code;
2043               break;
2044
2045             case 'O':           /* invocation of single-shift-3 */
2046               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2047                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2048                 goto label_invalid_code;
2049               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2050               ONE_MORE_BYTE (c1);
2051               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2052                 goto label_invalid_code;
2053               break;
2054
2055             case '0': case '2': case '3': case '4': /* start composition */
2056               DECODE_COMPOSITION_START (c1);
2057               continue;
2058
2059             case '1':           /* end composition */
2060               DECODE_COMPOSITION_END (c1);
2061               continue;
2062
2063             case '[':           /* specification of direction */
2064               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2065                 goto label_invalid_code;
2066               /* For the moment, nested direction is not supported.
2067                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2068                  left-to-right, and nonzero means right-to-left.  */
2069               ONE_MORE_BYTE (c1);
2070               switch (c1)
2071                 {
2072                 case ']':       /* end of the current direction */
2073                   coding->mode &= ~CODING_MODE_DIRECTION;
2074
2075                 case '0':       /* end of the current direction */
2076                 case '1':       /* start of left-to-right direction */
2077                   ONE_MORE_BYTE (c1);
2078                   if (c1 == ']')
2079                     coding->mode &= ~CODING_MODE_DIRECTION;
2080                   else
2081                     goto label_invalid_code;
2082                   break;
2083
2084                 case '2':       /* start of right-to-left direction */
2085                   ONE_MORE_BYTE (c1);
2086                   if (c1 == ']')
2087                     coding->mode |= CODING_MODE_DIRECTION;
2088                   else
2089                     goto label_invalid_code;
2090                   break;
2091
2092                 default:
2093                   goto label_invalid_code;
2094                 }
2095               continue;
2096
2097             case '%':
2098               if (COMPOSING_P (coding))
2099                 DECODE_COMPOSITION_END ('1');
2100               ONE_MORE_BYTE (c1);
2101               if (c1 == '/')
2102                 {
2103                   /* CTEXT extended segment:
2104                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2105                      We keep these bytes as is for the moment.
2106                      They may be decoded by post-read-conversion.  */
2107                   int dim, M, L;
2108                   int size, required;
2109                   int produced_chars;
2110
2111                   ONE_MORE_BYTE (dim);
2112                   ONE_MORE_BYTE (M);
2113                   ONE_MORE_BYTE (L);
2114                   size = ((M - 128) * 128) + (L - 128);
2115                   required = 8 + size * 2;
2116                   if (dst + required > (dst_bytes ? dst_end : src))
2117                     goto label_end_of_loop;
2118                   *dst++ = ISO_CODE_ESC;
2119                   *dst++ = '%';
2120                   *dst++ = '/';
2121                   *dst++ = dim;
2122                   produced_chars = 4;
2123                   dst += CHAR_STRING (M, dst), produced_chars++;
2124                   dst += CHAR_STRING (L, dst), produced_chars++;
2125                   while (size-- > 0)
2126                     {
2127                       ONE_MORE_BYTE (c1);
2128                       dst += CHAR_STRING (c1, dst), produced_chars++;
2129                     }
2130                   coding->produced_char += produced_chars;
2131                 }
2132               else if (c1 == 'G')
2133                 {
2134                   unsigned char *d = dst;
2135                   int produced_chars;
2136
2137                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2138                      ESC % G --UTF-8-BYTES-- ESC % @
2139                      We keep these bytes as is for the moment.
2140                      They may be decoded by post-read-conversion.  */
2141                   if (d + 6 > (dst_bytes ? dst_end : src))
2142                     goto label_end_of_loop;
2143                   *d++ = ISO_CODE_ESC;
2144                   *d++ = '%';
2145                   *d++ = 'G';
2146                   produced_chars = 3;
2147                   while (d + 1 < (dst_bytes ? dst_end : src))
2148                     {
2149                       ONE_MORE_BYTE (c1);
2150                       if (c1 == ISO_CODE_ESC
2151                           && src + 1 < src_end
2152                           && src[0] == '%'
2153                           && src[1] == '@')
2154                         {
2155                           src += 2;
2156                           break;
2157                         }
2158                       d += CHAR_STRING (c1, d), produced_chars++;
2159                     }
2160                   if (d + 3 > (dst_bytes ? dst_end : src))
2161                     goto label_end_of_loop;
2162                   *d++ = ISO_CODE_ESC;
2163                   *d++ = '%';
2164                   *d++ = '@';
2165                   dst = d;
2166                   coding->produced_char += produced_chars + 3;
2167                 }
2168               else
2169                 goto label_invalid_code;
2170               continue;
2171
2172             default:
2173               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2174                 goto label_invalid_code;
2175               if (c1 >= 0x28 && c1 <= 0x2B)
2176                 {       /* designation of DIMENSION1_CHARS94 character set */
2177                   ONE_MORE_BYTE (c2);
2178                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2179                 }
2180               else if (c1 >= 0x2C && c1 <= 0x2F)
2181                 {       /* designation of DIMENSION1_CHARS96 character set */
2182                   ONE_MORE_BYTE (c2);
2183                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2184                 }
2185               else
2186                 goto label_invalid_code;
2187               /* We must update these variables now.  */
2188               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2189               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2190               continue;
2191             }
2192         }
2193
2194       /* Now we know CHARSET and 1st position code C1 of a character.
2195          Produce a multibyte sequence for that character while getting
2196          2nd position code C2 if necessary.  */
2197       if (CHARSET_DIMENSION (charset) == 2)
2198         {
2199           ONE_MORE_BYTE (c2);
2200           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2201             /* C2 is not in a valid range.  */
2202             goto label_invalid_code;
2203         }
2204       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2205       EMIT_CHAR (c);
2206       continue;
2207
2208     label_invalid_code:
2209       coding->errors++;
2210       if (COMPOSING_P (coding))
2211         DECODE_COMPOSITION_END ('1');
2212       src = src_base;
2213       c = *src++;
2214       if (! NILP (translation_table))
2215         c = translate_char (translation_table, c, 0, 0, 0);
2216       EMIT_CHAR (c);
2217     }
2218
2219  label_end_of_loop:
2220   coding->consumed = coding->consumed_char = src_base - source;
2221   coding->produced = dst - destination;
2222   return;
2223 }
2224
2225
2226 /* ISO2022 encoding stuff.  */
2227
2228 /*
2229    It is not enough to say just "ISO2022" on encoding, we have to
2230    specify more details.  In Emacs, each ISO2022 coding system
2231    variant has the following specifications:
2232         1. Initial designation to G0 through G3.
2233         2. Allows short-form designation?
2234         3. ASCII should be designated to G0 before control characters?
2235         4. ASCII should be designated to G0 at end of line?
2236         5. 7-bit environment or 8-bit environment?
2237         6. Use locking-shift?
2238         7. Use Single-shift?
2239    And the following two are only for Japanese:
2240         8. Use ASCII in place of JIS0201-1976-Roman?
2241         9. Use JISX0208-1983 in place of JISX0208-1978?
2242    These specifications are encoded in `coding->flags' as flag bits
2243    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2244    details.
2245 */
2246
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the new designation
   in CODING.

   The sequence is preceded by a revision prefix (ESC & <'@' +
   revision>) when the revision number registered for CHARSET in
   CODING is below 255.  For a charset whose dimension is not 1, `$'
   follows the ESC.  The intermediate character selecting REG is
   taken from "()*+" for 94-character sets and from ",-./" for the
   others; it is left out (short form) only for a 94-character set of
   dimension other than 1 that goes to register 0, has a final
   character of '@', 'A', or 'B', and is encoded by a CODING that has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    char *designators_94 = "()*+";                                      \
    char *designators_96 = ",-./";                                      \
    unsigned char iso_final = CHARSET_ISO_FINAL_CHAR (charset);         \
    int iso_revision                                                    \
      = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);              \
    int has_94_chars = CHARSET_CHARS (charset) == 94;                   \
    int multi_dimension = CHARSET_DIMENSION (charset) != 1;             \
                                                                        \
    if (iso_revision < 255)                                             \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + iso_revision;                                    \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (multi_dimension)                                                \
      *dst++ = '$';                                                     \
    /* Everything but the short form carries an intermediate.  */       \
    if (! (multi_dimension                                              \
           && has_94_chars                                              \
           && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)              \
           && reg == 0                                                  \
           && iso_final >= '@' && iso_final <= 'B'))                    \
      *dst++ = (unsigned char) (has_94_chars                            \
                                ? designators_94[reg]                   \
                                : designators_96[reg]);                 \
    *dst++ = iso_final;                                                 \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2289
/* Single-shift functions.  ENCODE_SINGLE_SHIFT_2 and
   ENCODE_SINGLE_SHIFT_3 emit at DST the code for single-shift-2 and
   single-shift-3 respectively: the escape sequence ESC N / ESC O
   when the coding system has CODING_FLAG_ISO_SEVEN_BITS set, and the
   control character ISO_CODE_SS2 / ISO_CODE_SS3 otherwise.  Both then
   mark the coding system as being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2311
/* Locking-shift functions.  Each of the four macros below emits at
   DST the code of one locking shift and records the corresponding
   graphic register as the one invoked to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC n         register 2
     ENCODE_LOCKING_SHIFT_3   ESC o         register 3  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    *dst = ISO_CODE_SI;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    *dst = ISO_CODE_SO;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'n';                                       \
    dst += 2;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'o';                                       \
    dst += 2;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
2339
/* Emit at DST the single byte encoding position code C1 of a
   character in the DIMENSION1 character set CHARSET.

   As long as the coding system is not in the single-shifting state
   and CHARSET is invoked to neither graphic plane 0 nor graphic
   plane 1, encode_invocation_designation is called to produce the
   missing designation and invocation codes.  After that the byte is
   written with the 8th bit cleared (plane 0, or single shift in a
   coding system with CODING_FLAG_ISO_SEVEN_BITS set) or set (plane 1,
   or single shift otherwise).  A pending single shift is cleared
   once the byte has been written.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    /* Designate/invoke until CHARSET can be written directly.  */      \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      *dst++ = c1 & 0x7F;                                               \
    else                                                                \
      /* The loop above left CHARSET invoked to plane 1.  */            \
      *dst++ = c1 | 0x80;                                               \
  } while (0)
2372
/* Emit at DST the two bytes encoding position codes C1 and C2 of a
   character in the DIMENSION2 character set CHARSET.

   As long as the coding system is not in the single-shifting state
   and CHARSET is invoked to neither graphic plane 0 nor graphic
   plane 1, encode_invocation_designation is called to produce the
   missing designation and invocation codes.  After that both bytes
   are written with the 8th bit cleared (plane 0, or single shift in
   a coding system with CODING_FLAG_ISO_SEVEN_BITS set) or set
   (plane 1, or single shift otherwise).  A pending single shift is
   cleared once the bytes have been written.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    /* Designate/invoke until CHARSET can be written directly.  */      \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* The loop above left CHARSET invoked to plane 1.  */          \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
  } while (0)
2405
/* Encode character C at DST.  C is split into its charset and
   position codes.  If the charset is not defined, the position codes
   are written as they are (the second one only when it is not
   negative).  Otherwise the character goes through
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to the dimension of its charset, after these
   substitutions requested by the flags of the coding system:
     CODING_FLAG_ISO_USE_ROMAN   ASCII -> charset_latin_jisx0201
     CODING_FLAG_ISO_USE_OLDJIS  charset_jisx0208
                                   -> charset_jisx0208_1978  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int iso_cs, pos1, pos2;                                             \
                                                                        \
    SPLIT_CHAR (c, iso_cs, pos1, pos2);                                 \
    if (! CHARSET_DEFINED_P (iso_cs))                                   \
      {                                                                 \
        *dst++ = pos1;                                                  \
        if (pos2 >= 0)                                                  \
          *dst++ = pos2;                                                \
      }                                                                 \
    else if (CHARSET_DIMENSION (iso_cs) == 1)                           \
      {                                                                 \
        if (iso_cs == CHARSET_ASCII                                     \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          iso_cs = charset_latin_jisx0201;                              \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_cs, pos1);                 \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (iso_cs == charset_jisx0208                                  \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          iso_cs = charset_jisx0208_1978;                               \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_cs, pos1, pos2);           \
      }                                                                 \
  } while (0)
2435
2436
/* Rather than encoding character C itself, encode
   CODING_REPLACEMENT_CHARACTER in its place: once, and a second time
   when the width of C's charset is greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_char_wide_p = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1;      \
                                                                        \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);                \
    if (unsafe_char_wide_p)                                             \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2445
2446
2447 /* Produce designation and invocation codes at a place pointed by DST
2448    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2449    Return new DST.  */
2450
2451 unsigned char *
2452 encode_invocation_designation (charset, coding, dst)
2453      int charset;
2454      struct coding_system *coding;
2455      unsigned char *dst;
2456 {
2457   int reg;                      /* graphic register number */
2458
2459   /* At first, check designations.  */
2460   for (reg = 0; reg < 4; reg++)
2461     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2462       break;
2463
2464   if (reg >= 4)
2465     {
2466       /* CHARSET is not yet designated to any graphic registers.  */
2467       /* At first check the requested designation.  */
2468       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2469       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2470         /* Since CHARSET requests no special designation, designate it
2471            to graphic register 0.  */
2472         reg = 0;
2473
2474       ENCODE_DESIGNATION (charset, reg, coding);
2475     }
2476
2477   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2478       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2479     {
2480       /* Since the graphic register REG is not invoked to any graphic
2481          planes, invoke it to graphic plane 0.  */
2482       switch (reg)
2483         {
2484         case 0:                 /* graphic register 0 */
2485           ENCODE_SHIFT_IN;
2486           break;
2487
2488         case 1:                 /* graphic register 1 */
2489           ENCODE_SHIFT_OUT;
2490           break;
2491
2492         case 2:                 /* graphic register 2 */
2493           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2494             ENCODE_SINGLE_SHIFT_2;
2495           else
2496             ENCODE_LOCKING_SHIFT_2;
2497           break;
2498
2499         case 3:                 /* graphic register 3 */
2500           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2501             ENCODE_SINGLE_SHIFT_3;
2502           else
2503             ENCODE_LOCKING_SHIFT_3;
2504           break;
2505         }
2506     }
2507
2508   return dst;
2509 }
2510
/* Emit at DST the two bytes that encode composition rule RULE.  RULE
   is decoded by COMPOSITION_DECODE_RULE into two reference values;
   the first is written offset by 32 + 81, the second offset by 32.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
2520
/* Emit at DST the escape sequence that opens a composition and set
   up CODING for it.  DATA points to an array of integers describing
   the composition (see the comment in coding.h for its format);
   DATA[3] is stored as the composition method of CODING and selects
   the sequence:
     COMPOSITION_RELATIVE        ESC 0
     COMPOSITION_WITH_ALTCHARS   ESC 3
     any other method            ESC 4
   For the last two, CODING is also made to read the composition
   components starting at index cmp_data_start + 4, with no
   composition rule expected next.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2540
/* Emit ESC 1 at DST to close the current composition.  CODING then
   steps over the composition's data (DATA[0] integers) and is put
   back to COMPOSITION_NO.  When that exhausts the current
   composition data block and another block follows, CODING moves on
   to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->cmp_data_start += data[0];                          \
    coding->composing = COMPOSITION_NO;                         \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data = coding->cmp_data->next;          \
            coding->cmp_data_start = 0;                         \
          }                                                     \
      }                                                         \
  } while (0)
2556
/* Emit the composition start sequence ESC 0 at DST and switch CODING
   to COMPOSITION_RELATIVE.  The sequence does not open a new
   composition here: it tells that the components (alternate chars
   and composition rules) of the composition have just been produced
   and that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
    coding->composing = COMPOSITION_RELATIVE;           \
  } while (0)
2568
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer at DST: the escape sequence ESC [ when the coding system
   has CODING_FLAG_ISO_SEVEN_BITS set, the control character
   ISO_CODE_CSI otherwise.  The flag is tested as a bit of
   `coding->flags' (the same way the single-shift macros test it), so
   a 7-bit coding system gets the 7-bit form whatever its other flags
   are.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by `2 ]' (start of right-to-left text) and `0 ]' (end of
   the current direction) respectively.  Each macro expands to a
   single statement.  */

#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2584
/* Emit at DST the codes that bring the graphic planes and registers
   back to their initial state: a shift-in when a register other than
   0 is invoked to graphic plane 0, then, for each of the four graphic
   registers that has a non-negative initial designation differing
   from its current one, the designation sequence of the initial
   charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg = 0;                                                      \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (; reset_reg < 4; reset_reg++)                                      \
      {                                                                     \
        /* Registers with no initial designation are left alone.  */        \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg) < 0)    \
          continue;                                                         \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)                 \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg))     \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg),         \
           reset_reg, coding);                                              \
      }                                                                     \
  } while (0)
2599
2600 /* Produce designation sequences of charsets in the line started from
2601    SRC to a place pointed by DST, and return updated DST.
2602
2603    If the current block ends before any end-of-line, we may fail to
2604    find all the necessary designations.  */
2605
2606 static unsigned char *
2607 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2608      struct coding_system *coding;
2609      Lisp_Object translation_table;
2610      const unsigned char *src, *src_end;
2611      unsigned char *dst;
2612 {
2613   int charset, c, found = 0, reg;
2614   /* Table of charsets to be designated to each graphic register.  */
2615   int r[4];
2616
2617   for (reg = 0; reg < 4; reg++)
2618     r[reg] = -1;
2619
2620   while (found < 4)
2621     {
2622       ONE_MORE_CHAR (c);
2623       if (c == '\n')
2624         break;
2625
2626       charset = CHAR_CHARSET (c);
2627       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2628       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2629         {
2630           found++;
2631           r[reg] = charset;
2632         }
2633     }
2634
2635  label_end_of_loop:
2636   if (found)
2637     {
2638       for (reg = 0; reg < 4; reg++)
2639         if (r[reg] >= 0
2640             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2641           ENCODE_DESIGNATION (r[reg], reg, coding);
2642     }
2643
2644   return dst;
2645 }
2646
2647 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2648
2649 static void
2650 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2651      struct coding_system *coding;
2652      const unsigned char *source;
2653      unsigned char *destination;
2654      int src_bytes, dst_bytes;
2655 {
2656   const unsigned char *src = source;
2657   const unsigned char *src_end = source + src_bytes;
2658   unsigned char *dst = destination;
2659   unsigned char *dst_end = destination + dst_bytes;
2660   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2661      from DST_END to assure overflow checking is necessary only at the
2662      head of loop.  */
2663   unsigned char *adjusted_dst_end = dst_end - 19;
2664   /* SRC_BASE remembers the start position in source in each loop.
2665      The loop will be exited when there's not enough source text to
2666      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2667      there's not enough destination area to produce encoded codes
2668      (within macro EMIT_BYTES).  */
2669   const unsigned char *src_base;
2670   int c;
2671   Lisp_Object translation_table;
2672   Lisp_Object safe_chars;
2673
2674   if (coding->flags & CODING_FLAG_ISO_SAFE)
2675     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2676
2677   safe_chars = coding_safe_chars (coding->symbol);
2678
2679   if (NILP (Venable_character_translation))
2680     translation_table = Qnil;
2681   else
2682     {
2683       translation_table = coding->translation_table_for_encode;
2684       if (NILP (translation_table))
2685         translation_table = Vstandard_translation_table_for_encode;
2686     }
2687
2688   coding->consumed_char = 0;
2689   coding->errors = 0;
2690   while (1)
2691     {
2692       src_base = src;
2693
2694       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2695         {
2696           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2697           break;
2698         }
2699
2700       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2701           && CODING_SPEC_ISO_BOL (coding))
2702         {
2703           /* We have to produce designation sequences if any now.  */
2704           dst = encode_designation_at_bol (coding, translation_table,
2705                                            src, src_end, dst);
2706           CODING_SPEC_ISO_BOL (coding) = 0;
2707         }
2708
2709       /* Check composition start and end.  */
2710       if (coding->composing != COMPOSITION_DISABLED
2711           && coding->cmp_data_start < coding->cmp_data->used)
2712         {
2713           struct composition_data *cmp_data = coding->cmp_data;
2714           int *data = cmp_data->data + coding->cmp_data_start;
2715           int this_pos = cmp_data->char_offset + coding->consumed_char;
2716
2717           if (coding->composing == COMPOSITION_RELATIVE)
2718             {
2719               if (this_pos == data[2])
2720                 {
2721                   ENCODE_COMPOSITION_END (coding, data);
2722                   cmp_data = coding->cmp_data;
2723                   data = cmp_data->data + coding->cmp_data_start;
2724                 }
2725             }
2726           else if (COMPOSING_P (coding))
2727             {
2728               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2729               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2730                 /* We have consumed components of the composition.
2731                    What follows in SRC is the composition's base
2732                    text.  */
2733                 ENCODE_COMPOSITION_FAKE_START (coding);
2734               else
2735                 {
2736                   int c = cmp_data->data[coding->cmp_data_index++];
2737                   if (coding->composition_rule_follows)
2738                     {
2739                       ENCODE_COMPOSITION_RULE (c);
2740                       coding->composition_rule_follows = 0;
2741                     }
2742                   else
2743                     {
2744                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2745                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2746                         ENCODE_UNSAFE_CHARACTER (c);
2747                       else
2748                         ENCODE_ISO_CHARACTER (c);
2749                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2750                         coding->composition_rule_follows = 1;
2751                     }
2752                   continue;
2753                 }
2754             }
2755           if (!COMPOSING_P (coding))
2756             {
2757               if (this_pos == data[1])
2758                 {
2759                   ENCODE_COMPOSITION_START (coding, data);
2760                   continue;
2761                 }
2762             }
2763         }
2764
2765       ONE_MORE_CHAR (c);
2766
2767       /* Now encode the character C.  */
2768       if (c < 0x20 || c == 0x7F)
2769         {
2770           if (c == '\r')
2771             {
2772               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2773                 {
2774                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2775                     ENCODE_RESET_PLANE_AND_REGISTER;
2776                   *dst++ = c;
2777                   continue;
2778                 }
2779               /* fall down to treat '\r' as '\n' ...  */
2780               c = '\n';
2781             }
2782           if (c == '\n')
2783             {
2784               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2785                 ENCODE_RESET_PLANE_AND_REGISTER;
2786               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2787                 bcopy (coding->spec.iso2022.initial_designation,
2788                        coding->spec.iso2022.current_designation,
2789                        sizeof coding->spec.iso2022.initial_designation);
2790               if (coding->eol_type == CODING_EOL_LF
2791                   || coding->eol_type == CODING_EOL_UNDECIDED)
2792                 *dst++ = ISO_CODE_LF;
2793               else if (coding->eol_type == CODING_EOL_CRLF)
2794                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2795               else
2796                 *dst++ = ISO_CODE_CR;
2797               CODING_SPEC_ISO_BOL (coding) = 1;
2798             }
2799           else
2800             {
2801               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2802                 ENCODE_RESET_PLANE_AND_REGISTER;
2803               *dst++ = c;
2804             }
2805         }
2806       else if (ASCII_BYTE_P (c))
2807         ENCODE_ISO_CHARACTER (c);
2808       else if (SINGLE_BYTE_CHAR_P (c))
2809         {
2810           *dst++ = c;
2811           coding->errors++;
2812         }
2813       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2814                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2815         ENCODE_UNSAFE_CHARACTER (c);
2816       else
2817         ENCODE_ISO_CHARACTER (c);
2818
2819       coding->consumed_char++;
2820     }
2821
2822  label_end_of_loop:
2823   coding->consumed = src_base - source;
2824   coding->produced = coding->produced_char = dst - destination;
2825 }
2826
2827 \f
2828 /*** 4. SJIS and BIG5 handlers ***/
2829
2830 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2831    quite widely.  So, for the moment, Emacs supports them in the bare
2832    C code.  But, in the future, they may be supported only by CCL.  */
2833
2834 /* SJIS is a coding system encoding three character sets: ASCII, right
2835    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2836    as is.  A character of charset katakana-jisx0201 is encoded by
2837    "position-code + 0x80".  A character of charset japanese-jisx0208
2838    is encoded in 2-byte but two position-codes are divided and shifted
2839    so that it fits in the range below.
2840
2841    --- CODE RANGE of SJIS ---
2842    (character set)      (range)
2843    ASCII                0x00 .. 0x7F
2844    KATAKANA-JISX0201    0xA1 .. 0xDF
2845    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2846             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2847    -------------------------------
2848
2849 */
2850
2851 /* BIG5 is a coding system encoding two character sets: ASCII and
2852    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2853    character set and is encoded in two bytes.
2854
2855    --- CODE RANGE of BIG5 ---
2856    (character set)      (range)
2857    ASCII                0x00 .. 0x7F
2858    Big5 (1st byte)      0xA1 .. 0xFE
2859         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2860    --------------------------
2861
2862    Since the number of characters in Big5 is larger than maximum
2863    characters in Emacs' charset (96x96), it can't be handled as one
2864    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2865    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2866    contains frequently used characters and the latter contains less
2867    frequently used characters.  */
2868
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   All arguments are parenthesized in the expansions, so the inputs
   may be arbitrary expressions.  They are still evaluated more than
   once and therefore must not have side effects.  The input
   position-codes are consumed before the output position-codes are
   stored, so the same variables may be given for both (e.g.
   DECODE_BIG5 (c1, c2, charset, c1, c2)).  */

/* Number of Big5 characters which have the same code in 1st byte.
   The 2nd byte ranges over 0x40..0x7E (63 codes) and 0xA1..0xFE
   (94 codes).  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET and the internal
   position-codes C1, C2.  Rows whose 1st byte is below 0xC9 go to
   `charset_big5_1', the remaining rows to `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character within the whole BIG5 table.  */  \
    unsigned int big5_temp                                              \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the start of the 2nd charset.  */ \
        big5_temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                     \
      }                                                                 \
    (c1) = big5_temp / (0xFF - 0xA1) + 0x21;                            \
    (c2) = big5_temp % (0xFF - 0xA1) + 0x21;                            \
  } while (0)

/* Convert CHARSET and the internal position-codes C1, C2 into the
   BIG5 byte pair B1, B2.  This is the inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_temp                                              \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                       \
    (b1) = big5_temp / BIG5_SAME_ROW + 0xA1;                            \
    (b2) = big5_temp % BIG5_SAME_ROW;                                   \
    /* The first 63 columns map to 0x40..0x7E, the rest to            \
       0xA1..0xFE.  */                                                  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2901
2902 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2903    Check if a text is encoded in SJIS.  If it is, return
2904    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2905
2906 static int
2907 detect_coding_sjis (src, src_end, multibytep)
2908      unsigned char *src, *src_end;
2909      int multibytep;
2910 {
2911   int c;
2912   /* Dummy for ONE_MORE_BYTE.  */
2913   struct coding_system dummy_coding;
2914   struct coding_system *coding = &dummy_coding;
2915
2916   while (1)
2917     {
2918       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2919       if (c < 0x80)
2920         continue;
2921       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2922         return 0;
2923       if (c <= 0x9F || c >= 0xE0)
2924         {
2925           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2926           if (c < 0x40 || c == 0x7F || c > 0xFC)
2927             return 0;
2928         }
2929     }
2930 }
2931
2932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2933    Check if a text is encoded in BIG5.  If it is, return
2934    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2935
2936 static int
2937 detect_coding_big5 (src, src_end, multibytep)
2938      unsigned char *src, *src_end;
2939      int multibytep;
2940 {
2941   int c;
2942   /* Dummy for ONE_MORE_BYTE.  */
2943   struct coding_system dummy_coding;
2944   struct coding_system *coding = &dummy_coding;
2945
2946   while (1)
2947     {
2948       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2949       if (c < 0x80)
2950         continue;
2951       if (c < 0xA1 || c > 0xFE)
2952         return 0;
2953       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2954       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2955         return 0;
2956     }
2957 }
2958
2959 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2960    Check if a text is encoded in UTF-8.  If it is, return
2961    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2962
/* Predicates classifying one octet C of a UTF-8 sequence by its
   high-order bits.  */

/* Non-zero if the bits of C selected by MASK are equal to PREFIX.  */
#define UTF_8_PREFIX_P(c, mask, prefix) (((c) & (mask)) == (prefix))

/* 0xxxxxxx: a character encoded in a single octet.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a non-leading octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_PREFIX_P (c, 0xC0, 0x80)
/* 110xxxxx .. 1111110x: the leading octet of a 2 .. 6 octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFE, 0xFC)
2970
2971 static int
2972 detect_coding_utf_8 (src, src_end, multibytep)
2973      unsigned char *src, *src_end;
2974      int multibytep;
2975 {
2976   unsigned char c;
2977   int seq_maybe_bytes;
2978   /* Dummy for ONE_MORE_BYTE.  */
2979   struct coding_system dummy_coding;
2980   struct coding_system *coding = &dummy_coding;
2981
2982   while (1)
2983     {
2984       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2985       if (UTF_8_1_OCTET_P (c))
2986         continue;
2987       else if (UTF_8_2_OCTET_LEADING_P (c))
2988         seq_maybe_bytes = 1;
2989       else if (UTF_8_3_OCTET_LEADING_P (c))
2990         seq_maybe_bytes = 2;
2991       else if (UTF_8_4_OCTET_LEADING_P (c))
2992         seq_maybe_bytes = 3;
2993       else if (UTF_8_5_OCTET_LEADING_P (c))
2994         seq_maybe_bytes = 4;
2995       else if (UTF_8_6_OCTET_LEADING_P (c))
2996         seq_maybe_bytes = 5;
2997       else
2998         return 0;
2999
3000       do
3001         {
3002           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3003           if (!UTF_8_EXTRA_OCTET_P (c))
3004             return 0;
3005           seq_maybe_bytes--;
3006         }
3007       while (seq_maybe_bytes > 0);
3008     }
3009 }
3010
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the
   first two bytes are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
3016
/* Non-zero if the 16-bit code unit VAL is one of the two values
   0xFFFE and 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high (leading) surrogate, i.e. is in the
   range 0xD800..0xDBFF.  The top six bits must be compared: masking
   with 0xD800 alone would also accept low surrogates and values such
   as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low (trailing) surrogate, i.e. is in the
   range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3026
3027 static int
3028 detect_coding_utf_16 (src, src_end, multibytep)
3029      unsigned char *src, *src_end;
3030      int multibytep;
3031 {
3032   unsigned char c1, c2;
3033   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3034   struct coding_system dummy_coding;
3035   struct coding_system *coding = &dummy_coding;
3036
3037   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3038   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3039
3040   if ((c1 == 0xFF) && (c2 == 0xFE))
3041     return CODING_CATEGORY_MASK_UTF_16_LE;
3042   else if ((c1 == 0xFE) && (c2 == 0xFF))
3043     return CODING_CATEGORY_MASK_UTF_16_BE;
3044   return 0;
3045 }
3046
3047 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3048    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3049
3050 static void
3051 decode_coding_sjis_big5 (coding, source, destination,
3052                          src_bytes, dst_bytes, sjis_p)
3053      struct coding_system *coding;
3054      const unsigned char *source;
3055      unsigned char  *destination;
3056      int src_bytes, dst_bytes;
3057      int sjis_p;
3058 {
3059   const unsigned char *src = source;
3060   const unsigned char *src_end = source + src_bytes;
3061   unsigned char *dst = destination;
3062   unsigned char *dst_end = destination + dst_bytes;
3063   /* SRC_BASE remembers the start position in source in each loop.
3064      The loop will be exited when there's not enough source code
3065      (within macro ONE_MORE_BYTE), or when there's not enough
3066      destination area to produce a character (within macro
3067      EMIT_CHAR).  */
3068   const unsigned char *src_base;
3069   Lisp_Object translation_table;
3070
3071   if (NILP (Venable_character_translation))
3072     translation_table = Qnil;
3073   else
3074     {
3075       translation_table = coding->translation_table_for_decode;
3076       if (NILP (translation_table))
3077         translation_table = Vstandard_translation_table_for_decode;
3078     }
3079
3080   coding->produced_char = 0;
3081   while (1)
3082     {
3083       int c, charset, c1, c2 = 0;
3084
3085       src_base = src;
3086       ONE_MORE_BYTE (c1);
3087
3088       if (c1 < 0x80)
3089         {
3090           charset = CHARSET_ASCII;
3091           if (c1 < 0x20)
3092             {
3093               if (c1 == '\r')
3094                 {
3095                   if (coding->eol_type == CODING_EOL_CRLF)
3096                     {
3097                       ONE_MORE_BYTE (c2);
3098                       if (c2 == '\n')
3099                         c1 = c2;
3100                       else
3101                         /* To process C2 again, SRC is subtracted by 1.  */
3102                         src--;
3103                     }
3104                   else if (coding->eol_type == CODING_EOL_CR)
3105                     c1 = '\n';
3106                 }
3107               else if (c1 == '\n'
3108                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3109                        && (coding->eol_type == CODING_EOL_CR
3110                            || coding->eol_type == CODING_EOL_CRLF))
3111                 {
3112                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3113                   goto label_end_of_loop;
3114                 }
3115             }
3116         }
3117       else
3118         {
3119           if (sjis_p)
3120             {
3121               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3122                 goto label_invalid_code;
3123               if (c1 <= 0x9F || c1 >= 0xE0)
3124                 {
3125                   /* SJIS -> JISX0208 */
3126                   ONE_MORE_BYTE (c2);
3127                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3128                     goto label_invalid_code;
3129                   DECODE_SJIS (c1, c2, c1, c2);
3130                   charset = charset_jisx0208;
3131                 }
3132               else
3133                 /* SJIS -> JISX0201-Kana */
3134                 charset = charset_katakana_jisx0201;
3135             }
3136           else
3137             {
3138               /* BIG5 -> Big5 */
3139               if (c1 < 0xA0 || c1 > 0xFE)
3140                 goto label_invalid_code;
3141               ONE_MORE_BYTE (c2);
3142               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3143                 goto label_invalid_code;
3144               DECODE_BIG5 (c1, c2, charset, c1, c2);
3145             }
3146         }
3147
3148       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3149       EMIT_CHAR (c);
3150       continue;
3151
3152     label_invalid_code:
3153       coding->errors++;
3154       src = src_base;
3155       c = *src++;
3156       EMIT_CHAR (c);
3157     }
3158
3159  label_end_of_loop:
3160   coding->consumed = coding->consumed_char = src_base - source;
3161   coding->produced = dst - destination;
3162   return;
3163 }
3164
3165 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3166    This function can encode charsets `ascii', `katakana-jisx0201',
3167    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3168    are sure that all these charsets are registered as official charset
3169    (i.e. do not have extended leading-codes).  Characters of other
3170    charsets are produced without any encoding.  If SJIS_P is 1, encode
3171    SJIS text, else encode BIG5 text.  */
3172
3173 static void
3174 encode_coding_sjis_big5 (coding, source, destination,
3175                          src_bytes, dst_bytes, sjis_p)
3176      struct coding_system *coding;
3177      unsigned char *source, *destination;
3178      int src_bytes, dst_bytes;
3179      int sjis_p;
3180 {
3181   unsigned char *src = source;
3182   unsigned char *src_end = source + src_bytes;
3183   unsigned char *dst = destination;
3184   unsigned char *dst_end = destination + dst_bytes;
3185   /* SRC_BASE remembers the start position in source in each loop.
3186      The loop will be exited when there's not enough source text to
3187      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3188      there's not enough destination area to produce encoded codes
3189      (within macro EMIT_BYTES).  */
3190   unsigned char *src_base;
3191   Lisp_Object translation_table;
3192
3193   if (NILP (Venable_character_translation))
3194     translation_table = Qnil;
3195   else
3196     {
3197       translation_table = coding->translation_table_for_encode;
3198       if (NILP (translation_table))
3199         translation_table = Vstandard_translation_table_for_encode;
3200     }
3201
3202   while (1)
3203     {
3204       int c, charset, c1, c2;
3205
3206       src_base = src;
3207       ONE_MORE_CHAR (c);
3208
3209       /* Now encode the character C.  */
3210       if (SINGLE_BYTE_CHAR_P (c))
3211         {
3212           switch (c)
3213             {
3214             case '\r':
3215               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3216                 {
3217                   EMIT_ONE_BYTE (c);
3218                   break;
3219                 }
3220               c = '\n';
3221             case '\n':
3222               if (coding->eol_type == CODING_EOL_CRLF)
3223                 {
3224                   EMIT_TWO_BYTES ('\r', c);
3225                   break;
3226                 }
3227               else if (coding->eol_type == CODING_EOL_CR)
3228                 c = '\r';
3229             default:
3230               EMIT_ONE_BYTE (c);
3231             }
3232         }
3233       else
3234         {
3235           SPLIT_CHAR (c, charset, c1, c2);
3236           if (sjis_p)
3237             {
3238               if (charset == charset_jisx0208
3239                   || charset == charset_jisx0208_1978)
3240                 {
3241                   ENCODE_SJIS (c1, c2, c1, c2);
3242                   EMIT_TWO_BYTES (c1, c2);
3243                 }
3244               else if (charset == charset_katakana_jisx0201)
3245                 EMIT_ONE_BYTE (c1 | 0x80);
3246               else if (charset == charset_latin_jisx0201)
3247                 EMIT_ONE_BYTE (c1);
3248               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3249                 {
3250                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3251                   if (CHARSET_WIDTH (charset) > 1)
3252                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3253                 }
3254               else
3255                 /* There's no way other than producing the internal
3256                    codes as is.  */
3257                 EMIT_BYTES (src_base, src);
3258             }
3259           else
3260             {
3261               if (charset == charset_big5_1 || charset == charset_big5_2)
3262                 {
3263                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3264                   EMIT_TWO_BYTES (c1, c2);
3265                 }
3266               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3267                 {
3268                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3269                   if (CHARSET_WIDTH (charset) > 1)
3270                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3271                 }
3272               else
3273                 /* There's no way other than producing the internal
3274                    codes as is.  */
3275                 EMIT_BYTES (src_base, src);
3276             }
3277         }
3278       coding->consumed_char++;
3279     }
3280
3281  label_end_of_loop:
3282   coding->consumed = src_base - source;
3283   coding->produced = coding->produced_char = dst - destination;
3284 }
3285
3286 \f
3287 /*** 5. CCL handlers ***/
3288
3289 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3290    Check if a text is encoded in a coding system of which
3291    encoder/decoder are written in CCL program.  If it is, return
3292    CODING_CATEGORY_MASK_CCL, else return 0.  */
3293
3294 static int
3295 detect_coding_ccl (src, src_end, multibytep)
3296      unsigned char *src, *src_end;
3297      int multibytep;
3298 {
3299   unsigned char *valid;
3300   int c;
3301   /* Dummy for ONE_MORE_BYTE.  */
3302   struct coding_system dummy_coding;
3303   struct coding_system *coding = &dummy_coding;
3304
3305   /* No coding system is assigned to coding-category-ccl.  */
3306   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3307     return 0;
3308
3309   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3310   while (1)
3311     {
3312       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3313       if (! valid[c])
3314         return 0;
3315     }
3316 }
3317
3318 \f
3319 /*** 6. End-of-line handlers ***/
3320
3321 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3322
3323 static void
3324 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3325      struct coding_system *coding;
3326      const unsigned char *source;
3327      unsigned char *destination;
3328      int src_bytes, dst_bytes;
3329 {
3330   const unsigned char *src = source;
3331   unsigned char *dst = destination;
3332   const unsigned char *src_end = src + src_bytes;
3333   unsigned char *dst_end = dst + dst_bytes;
3334   Lisp_Object translation_table;
3335   /* SRC_BASE remembers the start position in source in each loop.
3336      The loop will be exited when there's not enough source code
3337      (within macro ONE_MORE_BYTE), or when there's not enough
3338      destination area to produce a character (within macro
3339      EMIT_CHAR).  */
3340   const unsigned char *src_base;
3341   int c;
3342
3343   translation_table = Qnil;
3344   switch (coding->eol_type)
3345     {
3346     case CODING_EOL_CRLF:
3347       while (1)
3348         {
3349           src_base = src;
3350           ONE_MORE_BYTE (c);
3351           if (c == '\r')
3352             {
3353               ONE_MORE_BYTE (c);
3354               if (c != '\n')
3355                 {
3356                   src--;
3357                   c = '\r';
3358                 }
3359             }
3360           else if (c == '\n'
3361                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3362             {
3363               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3364               goto label_end_of_loop;
3365             }
3366           EMIT_CHAR (c);
3367         }
3368       break;
3369
3370     case CODING_EOL_CR:
3371       while (1)
3372         {
3373           src_base = src;
3374           ONE_MORE_BYTE (c);
3375           if (c == '\n')
3376             {
3377               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3378                 {
3379                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3380                   goto label_end_of_loop;
3381                 }
3382             }
3383           else if (c == '\r')
3384             c = '\n';
3385           EMIT_CHAR (c);
3386         }
3387       break;
3388
3389     default:                    /* no need for EOL handling */
3390       while (1)
3391         {
3392           src_base = src;
3393           ONE_MORE_BYTE (c);
3394           EMIT_CHAR (c);
3395         }
3396     }
3397
3398  label_end_of_loop:
3399   coding->consumed = coding->consumed_char = src_base - source;
3400   coding->produced = dst - destination;
3401   return;
3402 }
3403
3404 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3405    format of end-of-line according to `coding->eol_type'.  It also
3406    convert multibyte form 8-bit characters to unibyte if
3407    CODING->src_multibyte is nonzero.  If `coding->mode &
3408    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3409    also means end-of-line.  */
3410
3411 static void
3412 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3413      struct coding_system *coding;
3414      const unsigned char *source;
3415      unsigned char *destination;
3416      int src_bytes, dst_bytes;
3417 {
3418   const unsigned char *src = source;
3419   unsigned char *dst = destination;
3420   const unsigned char *src_end = src + src_bytes;
3421   unsigned char *dst_end = dst + dst_bytes;
3422   Lisp_Object translation_table;
3423   /* SRC_BASE remembers the start position in source in each loop.
3424      The loop will be exited when there's not enough source text to
3425      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3426      there's not enough destination area to produce encoded codes
3427      (within macro EMIT_BYTES).  */
3428   const unsigned char *src_base;
3429   unsigned char *tmp;
3430   int c;
3431   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3432
3433   translation_table = Qnil;
3434   if (coding->src_multibyte
3435       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3436     {
3437       src_end--;
3438       src_bytes--;
3439       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3440     }
3441
3442   if (coding->eol_type == CODING_EOL_CRLF)
3443     {
3444       while (src < src_end)
3445         {
3446           src_base = src;
3447           c = *src++;
3448           if (c >= 0x20)
3449             EMIT_ONE_BYTE (c);
3450           else if (c == '\n' || (c == '\r' && selective_display))
3451             EMIT_TWO_BYTES ('\r', '\n');
3452           else
3453             EMIT_ONE_BYTE (c);
3454         }
3455       src_base = src;
3456     label_end_of_loop:
3457       ;
3458     }
3459   else
3460     {
3461       if (!dst_bytes || src_bytes <= dst_bytes)
3462         {
3463           safe_bcopy (src, dst, src_bytes);
3464           src_base = src_end;
3465           dst += src_bytes;
3466         }
3467       else
3468         {
3469           if (coding->src_multibyte
3470               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3471             dst_bytes--;
3472           safe_bcopy (src, dst, dst_bytes);
3473           src_base = src + dst_bytes;
3474           dst = destination + dst_bytes;
3475           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3476         }
3477       if (coding->eol_type == CODING_EOL_CR)
3478         {
3479           for (tmp = destination; tmp < dst; tmp++)
3480             if (*tmp == '\n') *tmp = '\r';
3481         }
3482       else if (selective_display)
3483         {
3484           for (tmp = destination; tmp < dst; tmp++)
3485             if (*tmp == '\r') *tmp = '\n';
3486         }
3487     }
3488   if (coding->src_multibyte)
3489     dst = destination + str_as_unibyte (destination, dst - destination);
3490
3491   coding->consumed = src_base - source;
3492   coding->produced = dst - destination;
3493   coding->produced_char = coding->produced;
3494 }
3495
3496 \f
3497 /*** 7. C library functions ***/
3498
3499 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3500    has a property `coding-system'.  The value of this property is a
3501    vector of length 5 (called the coding-vector).  Among elements of
3502    this vector, the first (element[0]) and the fifth (element[4])
3503    carry important information for decoding/encoding.  Before
3504    decoding/encoding, this information should be set in fields of a
3505    structure of type `coding_system'.
3506
3507    The value of the property `coding-system' can be a symbol of another
3508    subsidiary coding-system.  In that case, Emacs gets coding-vector
3509    from that symbol.
3510
3511    `element[0]' contains information to be set in `coding->type'.  The
3512    value and its meaning is as follows:
3513
3514    0 -- coding_type_emacs_mule
3515    1 -- coding_type_sjis
3516    2 -- coding_type_iso2022
3517    3 -- coding_type_big5
3518    4 -- coding_type_ccl encoder/decoder written in CCL
3519    nil -- coding_type_no_conversion
3520    t -- coding_type_undecided (automatic conversion on decoding,
3521                                no-conversion on encoding)
3522
3523    `element[4]' contains information to be set in `coding->flags' and
3524    `coding->spec'.  The meaning varies by `coding->type'.
3525
3526    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3527    of length 32 (of which the first 13 sub-elements are used now).
3528    Meanings of these sub-elements are:
3529
3530    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3531         If the value is an integer of valid charset, the charset is
3532         assumed to be designated to graphic register N initially.
3533
        If the value is negative, it is the negated value of a
        charset which reserves graphic register N, which means that
        the charset is not designated initially but should be
        designated to graphic register N just before encoding a
        character in that charset.
3538
3539         If the value is nil, graphic register N is never used on
3540         encoding.
3541
3542    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3543         Each value takes t or nil.  See the section ISO2022 of
3544         `coding.h' for more information.
3545
3546    If `coding->type' is `coding_type_big5', element[4] is t to denote
3547    BIG5-ETen or nil to denote BIG5-HKU.
3548
3549    If `coding->type' takes the other value, element[4] is ignored.
3550
3551    Emacs Lisp's coding systems also carry information about format of
3552    end-of-line in a value of property `eol-type'.  If the value is
3553    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3554    means CODING_EOL_CR.  If it is not integer, it should be a vector
3555    of subsidiary coding systems of which property `eol-type' has one
3556    of the above values.
3557
3558 */
3559
3560 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3561    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3562    is setup so that no conversion is necessary and return -1, else
3563    return 0.  */
3564
3565 int
3566 setup_coding_system (coding_system, coding)
3567      Lisp_Object coding_system;
3568      struct coding_system *coding;
3569 {
3570   Lisp_Object coding_spec, coding_type, eol_type, plist;
3571   Lisp_Object val;
3572
3573   /* At first, zero clear all members.  */
3574   bzero (coding, sizeof (struct coding_system));
3575
3576   /* Initialize some fields required for all kinds of coding systems.  */
3577   coding->symbol = coding_system;
3578   coding->heading_ascii = -1;
3579   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3580   coding->composing = COMPOSITION_DISABLED;
3581   coding->cmp_data = NULL;
3582
3583   if (NILP (coding_system))
3584     goto label_invalid_coding_system;
3585
3586   coding_spec = Fget (coding_system, Qcoding_system);
3587
3588   if (!VECTORP (coding_spec)
3589       || XVECTOR (coding_spec)->size != 5
3590       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3591     goto label_invalid_coding_system;
3592
3593   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3594   if (VECTORP (eol_type))
3595     {
3596       coding->eol_type = CODING_EOL_UNDECIDED;
3597       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3598       if (system_eol_type != CODING_EOL_LF)
3599         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3600     }
3601   else if (XFASTINT (eol_type) == 1)
3602     {
3603       coding->eol_type = CODING_EOL_CRLF;
3604       coding->common_flags
3605         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3606     }
3607   else if (XFASTINT (eol_type) == 2)
3608     {
3609       coding->eol_type = CODING_EOL_CR;
3610       coding->common_flags
3611         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3612     }
3613   else
3614     {
3615       coding->common_flags = 0;
3616       coding->eol_type = CODING_EOL_LF;
3617     }
3618
3619   coding_type = XVECTOR (coding_spec)->contents[0];
3620   /* Try short cut.  */
3621   if (SYMBOLP (coding_type))
3622     {
3623       if (EQ (coding_type, Qt))
3624         {
3625           coding->type = coding_type_undecided;
3626           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3627         }
3628       else
3629         coding->type = coding_type_no_conversion;
3630       /* Initialize this member.  Any thing other than
3631          CODING_CATEGORY_IDX_UTF_16_BE and
3632          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3633          special treatment in detect_eol.  */
3634       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3635
3636       return 0;
3637     }
3638
3639   /* Get values of coding system properties:
3640      `post-read-conversion', `pre-write-conversion',
3641      `translation-table-for-decode', `translation-table-for-encode'.  */
3642   plist = XVECTOR (coding_spec)->contents[3];
3643   /* Pre & post conversion functions should be disabled if
3644      inhibit_eol_conversion is nonzero.  This is the case that a code
3645      conversion function is called while those functions are running.  */
3646   if (! inhibit_pre_post_conversion)
3647     {
3648       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3649       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3650     }
3651   val = Fplist_get (plist, Qtranslation_table_for_decode);
3652   if (SYMBOLP (val))
3653     val = Fget (val, Qtranslation_table_for_decode);
3654   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3655   val = Fplist_get (plist, Qtranslation_table_for_encode);
3656   if (SYMBOLP (val))
3657     val = Fget (val, Qtranslation_table_for_encode);
3658   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3659   val = Fplist_get (plist, Qcoding_category);
3660   if (!NILP (val))
3661     {
3662       val = Fget (val, Qcoding_category_index);
3663       if (INTEGERP (val))
3664         coding->category_idx = XINT (val);
3665       else
3666         goto label_invalid_coding_system;
3667     }
3668   else
3669     goto label_invalid_coding_system;
3670
3671   /* If the coding system has non-nil `composition' property, enable
3672      composition handling.  */
3673   val = Fplist_get (plist, Qcomposition);
3674   if (!NILP (val))
3675     coding->composing = COMPOSITION_NO;
3676
3677   /* If the coding system is ascii-incompatible, record it in
3678      common_flags.   */
3679   val = Fplist_get (plist, Qascii_incompatible);
3680   if (! NILP (val))
3681     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3682
3683   switch (XFASTINT (coding_type))
3684     {
3685     case 0:
3686       coding->type = coding_type_emacs_mule;
3687       coding->common_flags
3688         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3689       if (!NILP (coding->post_read_conversion))
3690         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3691       if (!NILP (coding->pre_write_conversion))
3692         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3693       break;
3694
3695     case 1:
3696       coding->type = coding_type_sjis;
3697       coding->common_flags
3698         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3699       break;
3700
3701     case 2:
3702       coding->type = coding_type_iso2022;
3703       coding->common_flags
3704         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3705       {
3706         Lisp_Object val, temp;
3707         Lisp_Object *flags;
3708         int i, charset, reg_bits = 0;
3709
3710         val = XVECTOR (coding_spec)->contents[4];
3711
3712         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3713           goto label_invalid_coding_system;
3714
3715         flags = XVECTOR (val)->contents;
3716         coding->flags
3717           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3718              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3719              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3720              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3721              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3722              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3723              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3724              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3725              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3726              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3727              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3728              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3729              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3730              );
3731
3732         /* Invoke graphic register 0 to plane 0.  */
3733         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3734         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3735         CODING_SPEC_ISO_INVOCATION (coding, 1)
3736           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3737         /* Not single shifting at first.  */
3738         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3739         /* Beginning of buffer should also be regarded as bol. */
3740         CODING_SPEC_ISO_BOL (coding) = 1;
3741
3742         for (charset = 0; charset <= MAX_CHARSET; charset++)
3743           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3744         val = Vcharset_revision_alist;
3745         while (CONSP (val))
3746           {
3747             charset = get_charset_id (Fcar_safe (XCAR (val)));
3748             if (charset >= 0
3749                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3750                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3751               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3752             val = XCDR (val);
3753           }
3754
3755         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3756            FLAGS[REG] can be one of below:
3757                 integer CHARSET: CHARSET occupies register I,
3758                 t: designate nothing to REG initially, but can be used
3759                   by any charsets,
3760                 list of integer, nil, or t: designate the first
3761                   element (if integer) to REG initially, the remaining
3762                   elements (if integer) is designated to REG on request,
3763                   if an element is t, REG can be used by any charsets,
3764                 nil: REG is never used.  */
3765         for (charset = 0; charset <= MAX_CHARSET; charset++)
3766           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3768         for (i = 0; i < 4; i++)
3769           {
3770             if ((INTEGERP (flags[i])
3771                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3772                 || (charset = get_charset_id (flags[i])) >= 0)
3773               {
3774                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3775                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3776               }
3777             else if (EQ (flags[i], Qt))
3778               {
3779                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3780                 reg_bits |= 1 << i;
3781                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3782               }
3783             else if (CONSP (flags[i]))
3784               {
3785                 Lisp_Object tail;
3786                 tail = flags[i];
3787
3788                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3789                 if ((INTEGERP (XCAR (tail))
3790                      && (charset = XINT (XCAR (tail)),
3791                          CHARSET_VALID_P (charset)))
3792                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3793                   {
3794                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3795                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3796                   }
3797                 else
3798                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3799                 tail = XCDR (tail);
3800                 while (CONSP (tail))
3801                   {
3802                     if ((INTEGERP (XCAR (tail))
3803                          && (charset = XINT (XCAR (tail)),
3804                              CHARSET_VALID_P (charset)))
3805                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3806                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3807                         = i;
3808                     else if (EQ (XCAR (tail), Qt))
3809                       reg_bits |= 1 << i;
3810                     tail = XCDR (tail);
3811                   }
3812               }
3813             else
3814               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3815
3816             CODING_SPEC_ISO_DESIGNATION (coding, i)
3817               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3818           }
3819
3820         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3821           {
3822             /* REG 1 can be used only by locking shift in 7-bit env.  */
3823             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3824               reg_bits &= ~2;
3825             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3826               /* Without any shifting, only REG 0 and 1 can be used.  */
3827               reg_bits &= 3;
3828           }
3829
3830         if (reg_bits)
3831           for (charset = 0; charset <= MAX_CHARSET; charset++)
3832             {
3833               if (CHARSET_DEFINED_P (charset)
3834                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3835                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3836                 {
3837                   /* There exist some default graphic registers to be
3838                      used by CHARSET.  */
3839
3840                   /* We had better avoid designating a charset of
3841                      CHARS96 to REG 0 as far as possible.  */
3842                   if (CHARSET_CHARS (charset) == 96)
3843                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3844                       = (reg_bits & 2
3845                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3846                   else
3847                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3848                       = (reg_bits & 1
3849                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3850                 }
3851             }
3852       }
3853       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3854       coding->spec.iso2022.last_invalid_designation_register = -1;
3855       break;
3856
3857     case 3:
3858       coding->type = coding_type_big5;
3859       coding->common_flags
3860         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3861       coding->flags
3862         = (NILP (XVECTOR (coding_spec)->contents[4])
3863            ? CODING_FLAG_BIG5_HKU
3864            : CODING_FLAG_BIG5_ETEN);
3865       break;
3866
3867     case 4:
3868       coding->type = coding_type_ccl;
3869       coding->common_flags
3870         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3871       {
3872         val = XVECTOR (coding_spec)->contents[4];
3873         if (! CONSP (val)
3874             || setup_ccl_program (&(coding->spec.ccl.decoder),
3875                                   XCAR (val)) < 0
3876             || setup_ccl_program (&(coding->spec.ccl.encoder),
3877                                   XCDR (val)) < 0)
3878           goto label_invalid_coding_system;
3879
3880         bzero (coding->spec.ccl.valid_codes, 256);
3881         val = Fplist_get (plist, Qvalid_codes);
3882         if (CONSP (val))
3883           {
3884             Lisp_Object this;
3885
3886             for (; CONSP (val); val = XCDR (val))
3887               {
3888                 this = XCAR (val);
3889                 if (INTEGERP (this)
3890                     && XINT (this) >= 0 && XINT (this) < 256)
3891                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3892                 else if (CONSP (this)
3893                          && INTEGERP (XCAR (this))
3894                          && INTEGERP (XCDR (this)))
3895                   {
3896                     int start = XINT (XCAR (this));
3897                     int end = XINT (XCDR (this));
3898
3899                     if (start >= 0 && start <= end && end < 256)
3900                       while (start <= end)
3901                         coding->spec.ccl.valid_codes[start++] = 1;
3902                   }
3903               }
3904           }
3905       }
3906       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3907       coding->spec.ccl.cr_carryover = 0;
3908       coding->spec.ccl.eight_bit_carryover[0] = 0;
3909       break;
3910
3911     case 5:
3912       coding->type = coding_type_raw_text;
3913       break;
3914
3915     default:
3916       goto label_invalid_coding_system;
3917     }
3918   return 0;
3919
3920  label_invalid_coding_system:
3921   coding->type = coding_type_no_conversion;
3922   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3923   coding->common_flags = 0;
3924   coding->eol_type = CODING_EOL_UNDECIDED;
3925   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3926   return NILP (coding_system) ? 0 : -1;
3927 }
3928
3929 /* Free memory blocks allocated for storing composition information.  */
3930
3931 void
3932 coding_free_composition_data (coding)
3933      struct coding_system *coding;
3934 {
3935   struct composition_data *cmp_data = coding->cmp_data, *next;
3936
3937   if (!cmp_data)
3938     return;
3939   /* Memory blocks are chained.  At first, rewind to the first, then,
3940      free blocks one by one.  */
3941   while (cmp_data->prev)
3942     cmp_data = cmp_data->prev;
3943   while (cmp_data)
3944     {
3945       next = cmp_data->next;
3946       xfree (cmp_data);
3947       cmp_data = next;
3948     }
3949   coding->cmp_data = NULL;
3950 }
3951
3952 /* Set `char_offset' member of all memory blocks pointed by
3953    coding->cmp_data to POS.  */
3954
3955 void
3956 coding_adjust_composition_offset (coding, pos)
3957      struct coding_system *coding;
3958      int pos;
3959 {
3960   struct composition_data *cmp_data;
3961
3962   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3963     cmp_data->char_offset = pos;
3964 }
3965
3966 /* Setup raw-text or one of its subsidiaries in the structure
3967    coding_system CODING according to the already setup value eol_type
3968    in CODING.  CODING should be setup for some coding system in
3969    advance.  */
3970
3971 void
3972 setup_raw_text_coding_system (coding)
3973      struct coding_system *coding;
3974 {
3975   if (coding->type != coding_type_raw_text)
3976     {
3977       coding->symbol = Qraw_text;
3978       coding->type = coding_type_raw_text;
3979       if (coding->eol_type != CODING_EOL_UNDECIDED)
3980         {
3981           Lisp_Object subsidiaries;
3982           subsidiaries = Fget (Qraw_text, Qeol_type);
3983
3984           if (VECTORP (subsidiaries)
3985               && XVECTOR (subsidiaries)->size == 3)
3986             coding->symbol
3987               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3988         }
3989       setup_coding_system (coding->symbol, coding);
3990     }
3991   return;
3992 }
3993
3994 /* Emacs has a mechanism to automatically detect a coding system if it
3995    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3996    it's impossible to distinguish some coding systems accurately
3997    because they use the same range of codes.  So, at first, coding
3998    systems are grouped into the following categories:
3999
4000    o coding-category-emacs-mule
4001
4002         The category for a coding system which has the same code range
4003         as Emacs' internal format.  Assigned the coding-system (Lisp
4004         symbol) `emacs-mule' by default.
4005
4006    o coding-category-sjis
4007
4008         The category for a coding system which has the same code range
4009         as SJIS.  Assigned the coding-system (Lisp
4010         symbol) `japanese-shift-jis' by default.
4011
4012    o coding-category-iso-7
4013
4014         The category for a coding system which has the same code range
4015         as ISO2022 of 7-bit environment.  This doesn't use any locking
4016         shift and single shift functions.  This can encode/decode all
4017         charsets.  Assigned the coding-system (Lisp symbol)
4018         `iso-2022-7bit' by default.
4019
4020    o coding-category-iso-7-tight
4021
4022         Same as coding-category-iso-7 except that this can
4023         encode/decode only the specified charsets.
4024
4025    o coding-category-iso-8-1
4026
4027         The category for a coding system which has the same code range
4028         as ISO2022 of 8-bit environment and graphic plane 1 used only
4029         for DIMENSION1 charset.  This doesn't use any locking shift
4030         and single shift functions.  Assigned the coding-system (Lisp
4031         symbol) `iso-latin-1' by default.
4032
4033    o coding-category-iso-8-2
4034
4035         The category for a coding system which has the same code range
4036         as ISO2022 of 8-bit environment and graphic plane 1 used only
4037         for DIMENSION2 charset.  This doesn't use any locking shift
4038         and single shift functions.  Assigned the coding-system (Lisp
4039         symbol) `japanese-iso-8bit' by default.
4040
4041    o coding-category-iso-7-else
4042
4043         The category for a coding system which has the same code range
4044         as ISO2022 of 7-bit environment but uses locking shift or
4045         single shift functions.  Assigned the coding-system (Lisp
4046         symbol) `iso-2022-7bit-lock' by default.
4047
4048    o coding-category-iso-8-else
4049
4050         The category for a coding system which has the same code range
4051         as ISO2022 of 8-bit environment but uses locking shift or
4052         single shift functions.  Assigned the coding-system (Lisp
4053         symbol) `iso-2022-8bit-ss2' by default.
4054
4055    o coding-category-big5
4056
4057         The category for a coding system which has the same code range
4058         as BIG5.  Assigned the coding-system (Lisp symbol)
4059         `cn-big5' by default.
4060
4061    o coding-category-utf-8
4062
4063         The category for a coding system which has the same code range
4064         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4065         symbol) `utf-8' by default.
4066
4067    o coding-category-utf-16-be
4068
4069         The category for a coding system in which a text has an
4070         Unicode signature (cf. Unicode Standard) in the order of BIG
4071         endian at the head.  Assigned the coding-system (Lisp symbol)
4072         `utf-16-be' by default.
4073
4074    o coding-category-utf-16-le
4075
4076         The category for a coding system in which a text has an
4077         Unicode signature (cf. Unicode Standard) in the order of
4078         LITTLE endian at the head.  Assigned the coding-system (Lisp
4079         symbol) `utf-16-le' by default.
4080
4081    o coding-category-ccl
4082
4083         The category for a coding system of which encoder/decoder is
4084         written in CCL programs.  The default value is nil, i.e., no
4085         coding system is assigned.
4086
4087    o coding-category-binary
4088
4089         The category for a coding system not categorized in any of the
4090         above.  Assigned the coding-system (Lisp symbol)
4091         `no-conversion' by default.
4092
4093    Each of them is a Lisp symbol and the value is an actual
4094    `coding-system' (this is also a Lisp symbol) assigned by a user.
4095    What Emacs does actually is to detect a category of coding system.
4096    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4097    decide a single possible category, it selects a category of the
4098    highest priority.  Priorities of categories are also specified by a
4099    user in a Lisp variable `coding-category-list'.
4100
4101 */
4102
4103 static
4104 int ascii_skip_code[256];
4105
4106 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4107    If it detects possible coding systems, return an integer in which
4108    appropriate flag bits are set.  Flag bits are defined by macros
4109    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4110    it should point the table `coding_priorities'.  In that case, only
4111    the flag bit for a coding system of the highest priority is set in
4112    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4113    range 0x80..0x9F are in multibyte form.
4114
4115    How many ASCII characters are at the head is returned as *SKIP.  */
4116
4117 static int
4118 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4119      unsigned char *source;
4120      int src_bytes, *priorities, *skip;
4121      int multibytep;
4122 {
4123   register unsigned char c;
4124   unsigned char *src = source, *src_end = source + src_bytes;
4125   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4126   int i;
4127
4128   /* At first, skip all ASCII characters and control characters except
4129      for three ISO2022 specific control characters.  */
4130   ascii_skip_code[ISO_CODE_SO] = 0;
4131   ascii_skip_code[ISO_CODE_SI] = 0;
4132   ascii_skip_code[ISO_CODE_ESC] = 0;
4133
4134  label_loop_detect_coding:
4135   while (src < src_end && ascii_skip_code[*src]) src++;
4136   *skip = src - source;
4137
4138   if (src >= src_end)
4139     /* We found nothing other than ASCII.  There's nothing to do.  */
4140     return 0;
4141
4142   c = *src;
4143   /* The text seems to be encoded in some multilingual coding system.
4144      Now, try to find in which coding system the text is encoded.  */
4145   if (c < 0x80)
4146     {
4147       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4148       /* C is an ISO2022 specific control code of C0.  */
4149       mask = detect_coding_iso2022 (src, src_end, multibytep);
4150       if (mask == 0)
4151         {
4152           /* No valid ISO2022 code follows C.  Try again.  */
4153           src++;
4154           if (c == ISO_CODE_ESC)
4155             ascii_skip_code[ISO_CODE_ESC] = 1;
4156           else
4157             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4158           goto label_loop_detect_coding;
4159         }
4160       if (priorities)
4161         {
4162           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4163             {
4164               if (mask & priorities[i])
4165                 return priorities[i];
4166             }
4167           return CODING_CATEGORY_MASK_RAW_TEXT;
4168         }
4169     }
4170   else
4171     {
4172       int try;
4173
4174       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4175         c = src[1] - 0x20;
4176
4177       if (c < 0xA0)
4178         {
4179           /* C is the first byte of SJIS character code,
4180              or a leading-code of Emacs' internal format (emacs-mule),
4181              or the first byte of UTF-16.  */
4182           try = (CODING_CATEGORY_MASK_SJIS
4183                   | CODING_CATEGORY_MASK_EMACS_MULE
4184                   | CODING_CATEGORY_MASK_UTF_16_BE
4185                   | CODING_CATEGORY_MASK_UTF_16_LE);
4186
4187           /* Or, if C is a special latin extra code,
4188              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4189              or is an ISO2022 control-sequence-introducer (CSI),
4190              we should also consider the possibility of ISO2022 codings.  */
4191           if ((VECTORP (Vlatin_extra_code_table)
4192                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4193               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4194               || (c == ISO_CODE_CSI
4195                   && (src < src_end
4196                       && (*src == ']'
4197                           || ((*src == '0' || *src == '1' || *src == '2')
4198                               && src + 1 < src_end
4199                               && src[1] == ']')))))
4200             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4201                      | CODING_CATEGORY_MASK_ISO_8BIT);
4202         }
4203       else
4204         /* C is a character of ISO2022 in graphic plane right,
4205            or a SJIS's 1-byte character code (i.e. JISX0201),
4206            or the first byte of BIG5's 2-byte code,
4207            or the first byte of UTF-8/16.  */
4208         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4209                 | CODING_CATEGORY_MASK_ISO_8BIT
4210                 | CODING_CATEGORY_MASK_SJIS
4211                 | CODING_CATEGORY_MASK_BIG5
4212                 | CODING_CATEGORY_MASK_UTF_8
4213                 | CODING_CATEGORY_MASK_UTF_16_BE
4214                 | CODING_CATEGORY_MASK_UTF_16_LE);
4215
4216       /* Or, we may have to consider the possibility of CCL.  */
4217       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4218           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4219               ->spec.ccl.valid_codes)[c])
4220         try |= CODING_CATEGORY_MASK_CCL;
4221
4222       mask = 0;
4223       utf16_examined_p = iso2022_examined_p = 0;
4224       if (priorities)
4225         {
4226           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4227             {
4228               if (!iso2022_examined_p
4229                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4230                 {
4231                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4232                   iso2022_examined_p = 1;
4233                 }
4234               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4235                 mask |= detect_coding_sjis (src, src_end, multibytep);
4236               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4237                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4238               else if (!utf16_examined_p
4239                        && (priorities[i] & try &
4240                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4241                 {
4242                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4243                   utf16_examined_p = 1;
4244                 }
4245               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4246                 mask |= detect_coding_big5 (src, src_end, multibytep);
4247               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4248                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4249               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4250                 mask |= detect_coding_ccl (src, src_end, multibytep);
4251               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4252                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4253               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4254                 mask |= CODING_CATEGORY_MASK_BINARY;
4255               if (mask & priorities[i])
4256                 return priorities[i];
4257             }
4258           return CODING_CATEGORY_MASK_RAW_TEXT;
4259         }
4260       if (try & CODING_CATEGORY_MASK_ISO)
4261         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4262       if (try & CODING_CATEGORY_MASK_SJIS)
4263         mask |= detect_coding_sjis (src, src_end, multibytep);
4264       if (try & CODING_CATEGORY_MASK_BIG5)
4265         mask |= detect_coding_big5 (src, src_end, multibytep);
4266       if (try & CODING_CATEGORY_MASK_UTF_8)
4267         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4268       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4269         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4270       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4271         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4272       if (try & CODING_CATEGORY_MASK_CCL)
4273         mask |= detect_coding_ccl (src, src_end, multibytep);
4274     }
4275   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4276 }
4277
4278 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4279    The information of the detected coding system is set in CODING.  */
4280
4281 void
4282 detect_coding (coding, src, src_bytes)
4283      struct coding_system *coding;
4284      const unsigned char *src;
4285      int src_bytes;
4286 {
4287   unsigned int idx;
4288   int skip, mask;
4289   Lisp_Object val;
4290
4291   val = Vcoding_category_list;
4292   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4293                              coding->src_multibyte);
4294   coding->heading_ascii = skip;
4295
4296   if (!mask) return;
4297
4298   /* We found a single coding system of the highest priority in MASK.  */
4299   idx = 0;
4300   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4301   if (! mask)
4302     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4303
4304   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4305
4306   if (coding->eol_type != CODING_EOL_UNDECIDED)
4307     {
4308       Lisp_Object tmp;
4309
4310       tmp = Fget (val, Qeol_type);
4311       if (VECTORP (tmp))
4312         val = XVECTOR (tmp)->contents[coding->eol_type];
4313     }
4314
4315   /* Setup this new coding system while preserving some slots.  */
4316   {
4317     int src_multibyte = coding->src_multibyte;
4318     int dst_multibyte = coding->dst_multibyte;
4319
4320     setup_coding_system (val, coding);
4321     coding->src_multibyte = src_multibyte;
4322     coding->dst_multibyte = dst_multibyte;
4323     coding->heading_ascii = skip;
4324   }
4325 }
4326
4327 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4328    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4329    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4330
4331    How many non-eol characters are at the head is returned as *SKIP.  */
4332
4333 #define MAX_EOL_CHECK_COUNT 3
4334
4335 static int
4336 detect_eol_type (source, src_bytes, skip)
4337      unsigned char *source;
4338      int src_bytes, *skip;
4339 {
4340   unsigned char *src = source, *src_end = src + src_bytes;
4341   unsigned char c;
4342   int total = 0;                /* How many end-of-lines are found so far.  */
4343   int eol_type = CODING_EOL_UNDECIDED;
4344   int this_eol_type;
4345
4346   *skip = 0;
4347
4348   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4349     {
4350       c = *src++;
4351       if (c == '\n' || c == '\r')
4352         {
4353           if (*skip == 0)
4354             *skip = src - 1 - source;
4355           total++;
4356           if (c == '\n')
4357             this_eol_type = CODING_EOL_LF;
4358           else if (src >= src_end || *src != '\n')
4359             this_eol_type = CODING_EOL_CR;
4360           else
4361             this_eol_type = CODING_EOL_CRLF, src++;
4362
4363           if (eol_type == CODING_EOL_UNDECIDED)
4364             /* This is the first end-of-line.  */
4365             eol_type = this_eol_type;
4366           else if (eol_type != this_eol_type)
4367             {
4368               /* The found type is different from what found before.  */
4369               eol_type = CODING_EOL_INCONSISTENT;
4370               break;
4371             }
4372         }
4373     }
4374
4375   if (*skip == 0)
4376     *skip = src_end - source;
4377   return eol_type;
4378 }
4379
4380 /* Like detect_eol_type, but detect EOL type in 2-octet
4381    big-endian/little-endian format for coding systems utf-16-be and
4382    utf-16-le.  */
4383
4384 static int
4385 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4386      unsigned char *source;
4387      int src_bytes, *skip, big_endian_p;
4388 {
4389   unsigned char *src = source, *src_end = src + src_bytes;
4390   unsigned int c1, c2;
4391   int total = 0;                /* How many end-of-lines are found so far.  */
4392   int eol_type = CODING_EOL_UNDECIDED;
4393   int this_eol_type;
4394   int msb, lsb;
4395
4396   if (big_endian_p)
4397     msb = 0, lsb = 1;
4398   else
4399     msb = 1, lsb = 0;
4400
4401   *skip = 0;
4402
4403   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4404     {
4405       c1 = (src[msb] << 8) | (src[lsb]);
4406       src += 2;
4407
4408       if (c1 == '\n' || c1 == '\r')
4409         {
4410           if (*skip == 0)
4411             *skip = src - 2 - source;
4412           total++;
4413           if (c1 == '\n')
4414             {
4415               this_eol_type = CODING_EOL_LF;
4416             }
4417           else
4418             {
4419               if ((src + 1) >= src_end)
4420                 {
4421                   this_eol_type = CODING_EOL_CR;
4422                 }
4423               else
4424                 {
4425                   c2 = (src[msb] << 8) | (src[lsb]);
4426                   if (c2 == '\n')
4427                     this_eol_type = CODING_EOL_CRLF, src += 2;
4428                   else
4429                     this_eol_type = CODING_EOL_CR;
4430                 }
4431             }
4432
4433           if (eol_type == CODING_EOL_UNDECIDED)
4434             /* This is the first end-of-line.  */
4435             eol_type = this_eol_type;
4436           else if (eol_type != this_eol_type)
4437             {
4438               /* The found type is different from what found before.  */
4439               eol_type = CODING_EOL_INCONSISTENT;
4440               break;
4441             }
4442         }
4443     }
4444
4445   if (*skip == 0)
4446     *skip = src_end - source;
4447   return eol_type;
4448 }
4449
4450 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4451    is encoded.  If it detects an appropriate format of end-of-line, it
4452    sets the information in *CODING.  */
4453
4454 void
4455 detect_eol (coding, src, src_bytes)
4456      struct coding_system *coding;
4457      const unsigned char *src;
4458      int src_bytes;
4459 {
4460   Lisp_Object val;
4461   int skip;
4462   int eol_type;
4463
4464   switch (coding->category_idx)
4465     {
4466     case CODING_CATEGORY_IDX_UTF_16_BE:
4467       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4468       break;
4469     case CODING_CATEGORY_IDX_UTF_16_LE:
4470       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4471       break;
4472     default:
4473       eol_type = detect_eol_type (src, src_bytes, &skip);
4474       break;
4475     }
4476
4477   if (coding->heading_ascii > skip)
4478     coding->heading_ascii = skip;
4479   else
4480     skip = coding->heading_ascii;
4481
4482   if (eol_type == CODING_EOL_UNDECIDED)
4483     return;
4484   if (eol_type == CODING_EOL_INCONSISTENT)
4485     {
4486 #if 0
4487       /* This code is suppressed until we find a better way to
4488          distinguish raw text file and binary file.  */
4489
4490       /* If we have already detected that the coding is raw-text, the
4491          coding should actually be no-conversion.  */
4492       if (coding->type == coding_type_raw_text)
4493         {
4494           setup_coding_system (Qno_conversion, coding);
4495           return;
4496         }
4497       /* Else, let's decode only text code anyway.  */
4498 #endif /* 0 */
4499       eol_type = CODING_EOL_LF;
4500     }
4501
4502   val = Fget (coding->symbol, Qeol_type);
4503   if (VECTORP (val) && XVECTOR (val)->size == 3)
4504     {
4505       int src_multibyte = coding->src_multibyte;
4506       int dst_multibyte = coding->dst_multibyte;
4507       struct composition_data *cmp_data = coding->cmp_data;
4508
4509       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4510       coding->src_multibyte = src_multibyte;
4511       coding->dst_multibyte = dst_multibyte;
4512       coding->heading_ascii = skip;
4513       coding->cmp_data = cmp_data;
4514     }
4515 }
4516
/* Fixed number of bytes added on top of every computed conversion
   buffer size.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding may grow the text, for the coding system
   pointed to by CODING: 3 for ISO2022, the decoder's own
   buf_magnification for CCL, and 2 otherwise.  CODING is evaluated
   more than once; it is parenthesized so that any pointer expression
   may be passed.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4525
4526 /* Return maximum size (bytes) of a buffer enough for decoding
4527    SRC_BYTES of text encoded in CODING.  */
4528
4529 int
4530 decoding_buffer_size (coding, src_bytes)
4531      struct coding_system *coding;
4532      int src_bytes;
4533 {
4534   return (src_bytes * DECODING_BUFFER_MAG (coding)
4535           + CONVERSION_BUFFER_EXTRA_ROOM);
4536 }
4537
4538 /* Return maximum size (bytes) of a buffer enough for encoding
4539    SRC_BYTES of text to CODING.  */
4540
4541 int
4542 encoding_buffer_size (coding, src_bytes)
4543      struct coding_system *coding;
4544      int src_bytes;
4545 {
4546   int magnification;
4547
4548   if (coding->type == coding_type_ccl)
4549     {
4550       magnification = coding->spec.ccl.encoder.buf_magnification;
4551       if (coding->eol_type == CODING_EOL_CRLF)
4552         magnification *= 2;
4553     }
4554   else if (CODING_REQUIRE_ENCODING (coding))
4555     magnification = 3;
4556   else
4557     magnification = 1;
4558
4559   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4560 }
4561
/* Working buffer for code conversion.  It is set up by
   allocate_conversion_buffer, grown by extend_conversion_buffer and
   released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Size of DATA, in bytes.  */
  int size;

  /* 1 if DATA was allocated by alloca, 0 if it came from the heap.  */
  int on_stack;

  /* The buffer itself.  */
  unsigned char *data;
};
4569
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, larger ones
   by xmalloc; BUF.on_stack records which.  This has to be a macro so
   that alloca runs in the caller's frame.  LEN is evaluated more than
   once, so it must not have side effects; both arguments are
   parenthesized so that arbitrary expressions may be passed.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4585
4586 /* Double the allocated memory for *BUF.  */
4587 static void
4588 extend_conversion_buffer (buf)
4589      struct conversion_buffer *buf;
4590 {
4591   if (buf->on_stack)
4592     {
4593       unsigned char *save = buf->data;
4594       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4595       bcopy (save, buf->data, buf->size);
4596       buf->on_stack = 0;
4597     }
4598   else
4599     {
4600       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4601     }
4602   buf->size *= 2;
4603 }
4604
4605 /* Free the allocated memory for BUF if it is not on stack.  */
4606 static void
4607 free_conversion_buffer (buf)
4608      struct conversion_buffer *buf;
4609 {
4610   if (!buf->on_stack)
4611     xfree (buf->data);
4612 }
4613
4614 int
4615 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4616      struct coding_system *coding;
4617      unsigned char *source, *destination;
4618      int src_bytes, dst_bytes, encodep;
4619 {
4620   struct ccl_program *ccl
4621     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4622   unsigned char *dst = destination;
4623
4624   ccl->suppress_error = coding->suppress_error;
4625   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4626   if (encodep)
4627     {
4628       /* On encoding, EOL format is converted within ccl_driver.  For
4629          that, setup proper information in the structure CCL.  */
4630       ccl->eol_type = coding->eol_type;
4631       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4632         ccl->eol_type = CODING_EOL_LF;
4633       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4634       ccl->eight_bit_control = coding->dst_multibyte;
4635     }
4636   else
4637     ccl->eight_bit_control = 1;
4638   ccl->multibyte = coding->src_multibyte;
4639   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4640     {
4641       /* Move carryover bytes to DESTINATION.  */
4642       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4643       while (*p)
4644         *dst++ = *p++;
4645       coding->spec.ccl.eight_bit_carryover[0] = 0;
4646       if (dst_bytes)
4647         dst_bytes -= dst - destination;
4648     }
4649
4650   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4651                                   &(coding->consumed))
4652                       + dst - destination);
4653
4654   if (encodep)
4655     {
4656       coding->produced_char = coding->produced;
4657       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4658     }
4659   else if (!ccl->eight_bit_control)
4660     {
4661       /* The produced bytes forms a valid multibyte sequence. */
4662       coding->produced_char
4663         = multibyte_chars_in_text (destination, coding->produced);
4664       coding->spec.ccl.eight_bit_carryover[0] = 0;
4665     }
4666   else
4667     {
4668       /* On decoding, the destination should always multibyte.  But,
4669          CCL program might have been generated an invalid multibyte
4670          sequence.  Here we make such a sequence valid as
4671          multibyte.  */
4672       int bytes
4673         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4674
4675       if ((coding->consumed < src_bytes
4676            || !ccl->last_block)
4677           && coding->produced >= 1
4678           && destination[coding->produced - 1] >= 0x80)
4679         {
4680           /* We should not convert the tailing 8-bit codes to
4681              multibyte form even if they doesn't form a valid
4682              multibyte sequence.  They may form a valid sequence in
4683              the next call.  */
4684           int carryover = 0;
4685
4686           if (destination[coding->produced - 1] < 0xA0)
4687             carryover = 1;
4688           else if (coding->produced >= 2)
4689             {
4690               if (destination[coding->produced - 2] >= 0x80)
4691                 {
4692                   if (destination[coding->produced - 2] < 0xA0)
4693                     carryover = 2;
4694                   else if (coding->produced >= 3
4695                            && destination[coding->produced - 3] >= 0x80
4696                            && destination[coding->produced - 3] < 0xA0)
4697                     carryover = 3;
4698                 }
4699             }
4700           if (carryover > 0)
4701             {
4702               BCOPY_SHORT (destination + coding->produced - carryover,
4703                            coding->spec.ccl.eight_bit_carryover,
4704                            carryover);
4705               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4706               coding->produced -= carryover;
4707             }
4708         }
4709       coding->produced = str_as_multibyte (destination, bytes,
4710                                            coding->produced,
4711                                            &(coding->produced_char));
4712     }
4713
4714   switch (ccl->status)
4715     {
4716     case CCL_STAT_SUSPEND_BY_SRC:
4717       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4718       break;
4719     case CCL_STAT_SUSPEND_BY_DST:
4720       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4721       break;
4722     case CCL_STAT_QUIT:
4723     case CCL_STAT_INVALID_CMD:
4724       coding->result = CODING_FINISH_INTERRUPT;
4725       break;
4726     default:
4727       coding->result = CODING_FINISH_NORMAL;
4728       break;
4729     }
4730   return coding->result;
4731 }
4732
4733 /* Decode EOL format of the text at PTR of BYTES length destructively
4734    according to CODING->eol_type.  This is called after the CCL
4735    program produced a decoded text at PTR.  If we do CRLF->LF
4736    conversion, update CODING->produced and CODING->produced_char.  */
4737
4738 static void
4739 decode_eol_post_ccl (coding, ptr, bytes)
4740      struct coding_system *coding;
4741      unsigned char *ptr;
4742      int bytes;
4743 {
4744   Lisp_Object val, saved_coding_symbol;
4745   unsigned char *pend = ptr + bytes;
4746   int dummy;
4747
4748   /* Remember the current coding system symbol.  We set it back when
4749      an inconsistent EOL is found so that `last-coding-system-used' is
4750      set to the coding system that doesn't specify EOL conversion.  */
4751   saved_coding_symbol = coding->symbol;
4752
4753   coding->spec.ccl.cr_carryover = 0;
4754   if (coding->eol_type == CODING_EOL_UNDECIDED)
4755     {
4756       /* Here, to avoid the call of setup_coding_system, we directly
4757          call detect_eol_type.  */
4758       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4759       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4760         coding->eol_type = CODING_EOL_LF;
4761       if (coding->eol_type != CODING_EOL_UNDECIDED)
4762         {
4763           val = Fget (coding->symbol, Qeol_type);
4764           if (VECTORP (val) && XVECTOR (val)->size == 3)
4765             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4766         }
4767       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4768     }
4769
4770   if (coding->eol_type == CODING_EOL_LF
4771       || coding->eol_type == CODING_EOL_UNDECIDED)
4772     {
4773       /* We have nothing to do.  */
4774       ptr = pend;
4775     }
4776   else if (coding->eol_type == CODING_EOL_CRLF)
4777     {
4778       unsigned char *pstart = ptr, *p = ptr;
4779
4780       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4781           && *(pend - 1) == '\r')
4782         {
4783           /* If the last character is CR, we can't handle it here
4784              because LF will be in the not-yet-decoded source text.
4785              Record that the CR is not yet processed.  */
4786           coding->spec.ccl.cr_carryover = 1;
4787           coding->produced--;
4788           coding->produced_char--;
4789           pend--;
4790         }
4791       while (ptr < pend)
4792         {
4793           if (*ptr == '\r')
4794             {
4795               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4796                 {
4797                   *p++ = '\n';
4798                   ptr += 2;
4799                 }
4800               else
4801                 {
4802                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4803                     goto undo_eol_conversion;
4804                   *p++ = *ptr++;
4805                 }
4806             }
4807           else if (*ptr == '\n'
4808                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4809             goto undo_eol_conversion;
4810           else
4811             *p++ = *ptr++;
4812           continue;
4813
4814         undo_eol_conversion:
4815           /* We have faced with inconsistent EOL format at PTR.
4816              Convert all LFs before PTR back to CRLFs.  */
4817           for (p--, ptr--; p >= pstart; p--)
4818             {
4819               if (*p == '\n')
4820                 *ptr-- = '\n', *ptr-- = '\r';
4821               else
4822                 *ptr-- = *p;
4823             }
4824           /*  If carryover is recorded, cancel it because we don't
4825               convert CRLF anymore.  */
4826           if (coding->spec.ccl.cr_carryover)
4827             {
4828               coding->spec.ccl.cr_carryover = 0;
4829               coding->produced++;
4830               coding->produced_char++;
4831               pend++;
4832             }
4833           p = ptr = pend;
4834           coding->eol_type = CODING_EOL_LF;
4835           coding->symbol = saved_coding_symbol;
4836         }
4837       if (p < pend)
4838         {
4839           /* As each two-byte sequence CRLF was converted to LF, (PEND
4840              - P) is the number of deleted characters.  */
4841           coding->produced -= pend - p;
4842           coding->produced_char -= pend - p;
4843         }
4844     }
4845   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4846     {
4847       unsigned char *p = ptr;
4848
4849       for (; ptr < pend; ptr++)
4850         {
4851           if (*ptr == '\r')
4852             *ptr = '\n';
4853           else if (*ptr == '\n'
4854                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4855             {
4856               for (; p < ptr; p++)
4857                 {
4858                   if (*p == '\n')
4859                     *p = '\r';
4860                 }
4861               ptr = pend;
4862               coding->eol_type = CODING_EOL_LF;
4863               coding->symbol = saved_coding_symbol;
4864             }
4865         }
4866     }
4867 }
4868
4869 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4870    decoding, it may detect coding system and format of end-of-line if
4871    those are not yet decided.  The source should be unibyte, the
4872    result is multibyte if CODING->dst_multibyte is nonzero, else
4873    unibyte.  */
4874
4875 int
4876 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4877      struct coding_system *coding;
4878      const unsigned char *source;
4879      unsigned char *destination;
4880      int src_bytes, dst_bytes;
4881 {
4882   int extra = 0;
4883
4884   if (coding->type == coding_type_undecided)
4885     detect_coding (coding, source, src_bytes);
4886
4887   if (coding->eol_type == CODING_EOL_UNDECIDED
4888       && coding->type != coding_type_ccl)
4889     {
4890       detect_eol (coding, source, src_bytes);
4891       /* We had better recover the original eol format if we
4892          encounter an inconsistent eol format while decoding.  */
4893       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4894     }
4895
4896   coding->produced = coding->produced_char = 0;
4897   coding->consumed = coding->consumed_char = 0;
4898   coding->errors = 0;
4899   coding->result = CODING_FINISH_NORMAL;
4900
4901   switch (coding->type)
4902     {
4903     case coding_type_sjis:
4904       decode_coding_sjis_big5 (coding, source, destination,
4905                                src_bytes, dst_bytes, 1);
4906       break;
4907
4908     case coding_type_iso2022:
4909       decode_coding_iso2022 (coding, source, destination,
4910                              src_bytes, dst_bytes);
4911       break;
4912
4913     case coding_type_big5:
4914       decode_coding_sjis_big5 (coding, source, destination,
4915                                src_bytes, dst_bytes, 0);
4916       break;
4917
4918     case coding_type_emacs_mule:
4919       decode_coding_emacs_mule (coding, source, destination,
4920                                 src_bytes, dst_bytes);
4921       break;
4922
4923     case coding_type_ccl:
4924       if (coding->spec.ccl.cr_carryover)
4925         {
4926           /* Put the CR which was not processed by the previous call
4927              of decode_eol_post_ccl in DESTINATION.  It will be
4928              decoded together with the following LF by the call to
4929              decode_eol_post_ccl below.  */
4930           *destination = '\r';
4931           coding->produced++;
4932           coding->produced_char++;
4933           dst_bytes--;
4934           extra = coding->spec.ccl.cr_carryover;
4935         }
4936       ccl_coding_driver (coding, source, destination + extra,
4937                          src_bytes, dst_bytes, 0);
4938       if (coding->eol_type != CODING_EOL_LF)
4939         {
4940           coding->produced += extra;
4941           coding->produced_char += extra;
4942           decode_eol_post_ccl (coding, destination, coding->produced);
4943         }
4944       break;
4945
4946     default:
4947       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4948     }
4949
4950   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4951       && coding->mode & CODING_MODE_LAST_BLOCK
4952       && coding->consumed == src_bytes)
4953     coding->result = CODING_FINISH_NORMAL;
4954
4955   if (coding->mode & CODING_MODE_LAST_BLOCK
4956       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4957     {
4958       const unsigned char *src = source + coding->consumed;
4959       unsigned char *dst = destination + coding->produced;
4960
4961       src_bytes -= coding->consumed;
4962       coding->errors++;
4963       if (COMPOSING_P (coding))
4964         DECODE_COMPOSITION_END ('1');
4965       while (src_bytes--)
4966         {
4967           int c = *src++;
4968           dst += CHAR_STRING (c, dst);
4969           coding->produced_char++;
4970         }
4971       coding->consumed = coding->consumed_char = src - source;
4972       coding->produced = dst - destination;
4973       coding->result = CODING_FINISH_NORMAL;
4974     }
4975
4976   if (!coding->dst_multibyte)
4977     {
4978       coding->produced = str_as_unibyte (destination, coding->produced);
4979       coding->produced_char = coding->produced;
4980     }
4981
4982   return coding->result;
4983 }
4984
4985 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4986    multibyteness of the source is CODING->src_multibyte, the
4987    multibyteness of the result is always unibyte.  */
4988
4989 int
4990 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4991      struct coding_system *coding;
4992      const unsigned char *source;
4993      unsigned char *destination;
4994      int src_bytes, dst_bytes;
4995 {
4996   coding->produced = coding->produced_char = 0;
4997   coding->consumed = coding->consumed_char = 0;
4998   coding->errors = 0;
4999   coding->result = CODING_FINISH_NORMAL;
5000   if (coding->eol_type == CODING_EOL_UNDECIDED)
5001     coding->eol_type = CODING_EOL_LF;
5002
5003   switch (coding->type)
5004     {
5005     case coding_type_sjis:
5006       encode_coding_sjis_big5 (coding, source, destination,
5007                                src_bytes, dst_bytes, 1);
5008       break;
5009
5010     case coding_type_iso2022:
5011       encode_coding_iso2022 (coding, source, destination,
5012                              src_bytes, dst_bytes);
5013       break;
5014
5015     case coding_type_big5:
5016       encode_coding_sjis_big5 (coding, source, destination,
5017                                src_bytes, dst_bytes, 0);
5018       break;
5019
5020     case coding_type_emacs_mule:
5021       encode_coding_emacs_mule (coding, source, destination,
5022                                 src_bytes, dst_bytes);
5023       break;
5024
5025     case coding_type_ccl:
5026       ccl_coding_driver (coding, source, destination,
5027                          src_bytes, dst_bytes, 1);
5028       break;
5029
5030     default:
5031       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5032     }
5033
5034   if (coding->mode & CODING_MODE_LAST_BLOCK
5035       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5036     {
5037       const unsigned char *src = source + coding->consumed;
5038       unsigned char *dst = destination + coding->produced;
5039
5040       if (coding->type == coding_type_iso2022)
5041         ENCODE_RESET_PLANE_AND_REGISTER;
5042       if (COMPOSING_P (coding))
5043         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5044       if (coding->consumed < src_bytes)
5045         {
5046           int len = src_bytes - coding->consumed;
5047
5048           BCOPY_SHORT (src, dst, len);
5049           if (coding->src_multibyte)
5050             len = str_as_unibyte (dst, len);
5051           dst += len;
5052           coding->consumed = src_bytes;
5053         }
5054       coding->produced = coding->produced_char = dst - destination;
5055       coding->result = CODING_FINISH_NORMAL;
5056     }
5057
5058   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5059       && coding->consumed == src_bytes)
5060     coding->result = CODING_FINISH_NORMAL;
5061
5062   return coding->result;
5063 }
5064
5065 /* Scan text in the region between *BEG and *END (byte positions),
5066    skip characters which we don't have to decode by coding system
5067    CODING at the head and tail, then set *BEG and *END to the region
5068    of the text we actually have to convert.  The caller should move
5069    the gap out of the region in advance if the region is from a
5070    buffer.
5071
5072    If STR is not NULL, *BEG and *END are indices into STR.  */
5073
5074 static void
5075 shrink_decoding_region (beg, end, coding, str)
5076      int *beg, *end;
5077      struct coding_system *coding;
5078      unsigned char *str;
5079 {
5080   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5081   int eol_conversion;
5082   Lisp_Object translation_table;
5083
5084   if (coding->type == coding_type_ccl
5085       || coding->type == coding_type_undecided
5086       || coding->eol_type != CODING_EOL_LF
5087       || !NILP (coding->post_read_conversion)
5088       || coding->composing != COMPOSITION_DISABLED)
5089     {
5090       /* We can't skip any data.  */
5091       return;
5092     }
5093   if (coding->type == coding_type_no_conversion
5094       || coding->type == coding_type_raw_text
5095       || coding->type == coding_type_emacs_mule)
5096     {
5097       /* We need no conversion, but don't have to skip any data here.
5098          Decoding routine handles them effectively anyway.  */
5099       return;
5100     }
5101
5102   translation_table = coding->translation_table_for_decode;
5103   if (NILP (translation_table) && !NILP (Venable_character_translation))
5104     translation_table = Vstandard_translation_table_for_decode;
5105   if (CHAR_TABLE_P (translation_table))
5106     {
5107       int i;
5108       for (i = 0; i < 128; i++)
5109         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5110           break;
5111       if (i < 128)
5112         /* Some ASCII character should be translated.  We give up
5113            shrinking.  */
5114         return;
5115     }
5116
5117   if (coding->heading_ascii >= 0)
5118     /* Detection routine has already found how much we can skip at the
5119        head.  */
5120     *beg += coding->heading_ascii;
5121
5122   if (str)
5123     {
5124       begp_orig = begp = str + *beg;
5125       endp_orig = endp = str + *end;
5126     }
5127   else
5128     {
5129       begp_orig = begp = BYTE_POS_ADDR (*beg);
5130       endp_orig = endp = begp + *end - *beg;
5131     }
5132
5133   eol_conversion = (coding->eol_type == CODING_EOL_CR
5134                     || coding->eol_type == CODING_EOL_CRLF);
5135
5136   switch (coding->type)
5137     {
5138     case coding_type_sjis:
5139     case coding_type_big5:
5140       /* We can skip all ASCII characters at the head.  */
5141       if (coding->heading_ascii < 0)
5142         {
5143           if (eol_conversion)
5144             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5145           else
5146             while (begp < endp && *begp < 0x80) begp++;
5147         }
5148       /* We can skip all ASCII characters at the tail except for the
5149          second byte of SJIS or BIG5 code.  */
5150       if (eol_conversion)
5151         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5152       else
5153         while (begp < endp && endp[-1] < 0x80) endp--;
5154       /* Do not consider LF as ascii if preceded by CR, since that
5155          confuses eol decoding. */
5156       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5157         endp++;
5158       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5159         endp++;
5160       break;
5161
5162     case coding_type_iso2022:
5163       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5164         /* We can't skip any data.  */
5165         break;
5166       if (coding->heading_ascii < 0)
5167         {
5168           /* We can skip all ASCII characters at the head except for a
5169              few control codes.  */
5170           while (begp < endp && (c = *begp) < 0x80
5171                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5172                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5173                  && (!eol_conversion || c != ISO_CODE_LF))
5174             begp++;
5175         }
5176       switch (coding->category_idx)
5177         {
5178         case CODING_CATEGORY_IDX_ISO_8_1:
5179         case CODING_CATEGORY_IDX_ISO_8_2:
5180           /* We can skip all ASCII characters at the tail.  */
5181           if (eol_conversion)
5182             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5183           else
5184             while (begp < endp && endp[-1] < 0x80) endp--;
5185           /* Do not consider LF as ascii if preceded by CR, since that
5186              confuses eol decoding. */
5187           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5188             endp++;
5189           break;
5190
5191         case CODING_CATEGORY_IDX_ISO_7:
5192         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5193           {
5194             /* We can skip all characters at the tail except for 8-bit
5195                codes and ESC and the following 2-byte at the tail.  */
5196             unsigned char *eight_bit = NULL;
5197
5198             if (eol_conversion)
5199               while (begp < endp
5200                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5201                 {
5202                   if (!eight_bit && c & 0x80) eight_bit = endp;
5203                   endp--;
5204                 }
5205             else
5206               while (begp < endp
5207                      && (c = endp[-1]) != ISO_CODE_ESC)
5208                 {
5209                   if (!eight_bit && c & 0x80) eight_bit = endp;
5210                   endp--;
5211                 }
5212             /* Do not consider LF as ascii if preceded by CR, since that
5213                confuses eol decoding. */
5214             if (begp < endp && endp < endp_orig
5215                 && endp[-1] == '\r' && endp[0] == '\n')
5216               endp++;
5217             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5218               {
5219                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5220                   /* This is an ASCII designation sequence.  We can
5221                      surely skip the tail.  But, if we have
5222                      encountered an 8-bit code, skip only the codes
5223                      after that.  */
5224                   endp = eight_bit ? eight_bit : endp + 2;
5225                 else
5226                   /* Hmmm, we can't skip the tail.  */
5227                   endp = endp_orig;
5228               }
5229             else if (eight_bit)
5230               endp = eight_bit;
5231           }
5232         }
5233       break;
5234
5235     default:
5236       abort ();
5237     }
5238   *beg += begp - begp_orig;
5239   *end += endp - endp_orig;
5240   return;
5241 }
5242
5243 /* Like shrink_decoding_region but for encoding.  */
5244
5245 static void
5246 shrink_encoding_region (beg, end, coding, str)
5247      int *beg, *end;
5248      struct coding_system *coding;
5249      unsigned char *str;
5250 {
5251   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5252   int eol_conversion;
5253   Lisp_Object translation_table;
5254
5255   if (coding->type == coding_type_ccl
5256       || coding->eol_type == CODING_EOL_CRLF
5257       || coding->eol_type == CODING_EOL_CR
5258       || (coding->cmp_data && coding->cmp_data->used > 0))
5259     {
5260       /* We can't skip any data.  */
5261       return;
5262     }
5263   if (coding->type == coding_type_no_conversion
5264       || coding->type == coding_type_raw_text
5265       || coding->type == coding_type_emacs_mule
5266       || coding->type == coding_type_undecided)
5267     {
5268       /* We need no conversion, but don't have to skip any data here.
5269          Encoding routine handles them effectively anyway.  */
5270       return;
5271     }
5272
5273   translation_table = coding->translation_table_for_encode;
5274   if (NILP (translation_table) && !NILP (Venable_character_translation))
5275     translation_table = Vstandard_translation_table_for_encode;
5276   if (CHAR_TABLE_P (translation_table))
5277     {
5278       int i;
5279       for (i = 0; i < 128; i++)
5280         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5281           break;
5282       if (i < 128)
5283         /* Some ASCII character should be translated.  We give up
5284            shrinking.  */
5285         return;
5286     }
5287
5288   if (str)
5289     {
5290       begp_orig = begp = str + *beg;
5291       endp_orig = endp = str + *end;
5292     }
5293   else
5294     {
5295       begp_orig = begp = BYTE_POS_ADDR (*beg);
5296       endp_orig = endp = begp + *end - *beg;
5297     }
5298
5299   eol_conversion = (coding->eol_type == CODING_EOL_CR
5300                     || coding->eol_type == CODING_EOL_CRLF);
5301
5302   /* Here, we don't have to check coding->pre_write_conversion because
5303      the caller is expected to have handled it already.  */
5304   switch (coding->type)
5305     {
5306     case coding_type_iso2022:
5307       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5308         /* We can't skip any data.  */
5309         break;
5310       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5311         {
5312           unsigned char *bol = begp;
5313           while (begp < endp && *begp < 0x80)
5314             {
5315               begp++;
5316               if (begp[-1] == '\n')
5317                 bol = begp;
5318             }
5319           begp = bol;
5320           goto label_skip_tail;
5321         }
5322       /* fall down ... */
5323
5324     case coding_type_sjis:
5325     case coding_type_big5:
5326       /* We can skip all ASCII characters at the head and tail.  */
5327       if (eol_conversion)
5328         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5329       else
5330         while (begp < endp && *begp < 0x80) begp++;
5331     label_skip_tail:
5332       if (eol_conversion)
5333         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5334       else
5335         while (begp < endp && *(endp - 1) < 0x80) endp--;
5336       break;
5337
5338     default:
5339       abort ();
5340     }
5341
5342   *beg += begp - begp_orig;
5343   *end += endp - endp_orig;
5344   return;
5345 }
5346
/* Shrinking the conversion region has a cost of its own, so it is
   attempted only when the region is longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region [*BEG, *END) for conversion by CODING when it is
   longer than the threshold above.  ENCODEP nonzero selects the
   encoding variant, zero the decoding variant.  STR is passed
   through to the chosen routine.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region ((beg), (end), (coding), (str));           \
    else                                                                \
      shrink_decoding_region ((beg), (end), (coding), (str));           \
  } while (0)
5360
5361 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5362    Vlast_coding_system_used and the remaining elements are buffers to
5363    kill.  */
5364 static Lisp_Object
5365 code_convert_region_unwind (arg)
5366      Lisp_Object arg;
5367 {
5368   struct gcpro gcpro1;
5369   GCPRO1 (arg);
5370
5371   inhibit_pre_post_conversion = 0;
5372   Vlast_coding_system_used = XCAR (arg);
5373   for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5374     Fkill_buffer (XCAR (arg));
5375
5376   UNGCPRO;
5377   return Qnil;
5378 }
5379
5380 /* Store information about all compositions in the range FROM and TO
5381    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5382    buffer or a string, defaults to the current buffer.  */
5383
5384 void
5385 coding_save_composition (coding, from, to, obj)
5386      struct coding_system *coding;
5387      int from, to;
5388      Lisp_Object obj;
5389 {
5390   Lisp_Object prop;
5391   int start, end;
5392
5393   if (coding->composing == COMPOSITION_DISABLED)
5394     return;
5395   if (!coding->cmp_data)
5396     coding_allocate_composition_data (coding, from);
5397   if (!find_composition (from, to, &start, &end, &prop, obj)
5398       || end > to)
5399     return;
5400   if (start < from
5401       && (!find_composition (end, to, &start, &end, &prop, obj)
5402           || end > to))
5403     return;
5404   coding->composing = COMPOSITION_NO;
5405   do
5406     {
5407       if (COMPOSITION_VALID_P (start, end, prop))
5408         {
5409           enum composition_method method = COMPOSITION_METHOD (prop);
5410           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5411               >= COMPOSITION_DATA_SIZE)
5412             coding_allocate_composition_data (coding, from);
5413           /* For relative composition, we remember start and end
5414              positions, for the other compositions, we also remember
5415              components.  */
5416           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5417           if (method != COMPOSITION_RELATIVE)
5418             {
5419               /* We must store a*/
5420               Lisp_Object val, ch;
5421
5422               val = COMPOSITION_COMPONENTS (prop);
5423               if (CONSP (val))
5424                 while (CONSP (val))
5425                   {
5426                     ch = XCAR (val), val = XCDR (val);
5427                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5428                   }
5429               else if (VECTORP (val) || STRINGP (val))
5430                 {
5431                   int len = (VECTORP (val)
5432                              ? XVECTOR (val)->size : SCHARS (val));
5433                   int i;
5434                   for (i = 0; i < len; i++)
5435                     {
5436                       ch = (STRINGP (val)
5437                             ? Faref (val, make_number (i))
5438                             : XVECTOR (val)->contents[i]);
5439                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5440                     }
5441                 }
5442               else              /* INTEGERP (val) */
5443                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5444             }
5445           CODING_ADD_COMPOSITION_END (coding, end - from);
5446         }
5447       start = end;
5448     }
5449   while (start < to
5450          && find_composition (start, to, &start, &end, &prop, obj)
5451          && end <= to);
5452
5453   /* Make coding->cmp_data point to the first memory block.  */
5454   while (coding->cmp_data->prev)
5455     coding->cmp_data = coding->cmp_data->prev;
5456   coding->cmp_data_start = 0;
5457 }
5458
5459 /* Reflect the saved information about compositions to OBJ.
5460    CODING->cmp_data points to a memory block for the information.  OBJ
5461    is a buffer or a string, defaults to the current buffer.  */
5462
5463 void
5464 coding_restore_composition (coding, obj)
5465      struct coding_system *coding;
5466      Lisp_Object obj;
5467 {
5468   struct composition_data *cmp_data = coding->cmp_data;
5469
5470   if (!cmp_data)
5471     return;
5472
5473   while (cmp_data->prev)
5474     cmp_data = cmp_data->prev;
5475
5476   while (cmp_data)
5477     {
5478       int i;
5479
5480       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5481            i += cmp_data->data[i])
5482         {
5483           int *data = cmp_data->data + i;
5484           enum composition_method method = (enum composition_method) data[3];
5485           Lisp_Object components;
5486
5487           if (data[0] < 0 || i + data[0] > cmp_data->used)
5488             /* Invalid composition data.  */
5489             break;
5490
5491           if (method == COMPOSITION_RELATIVE)
5492             components = Qnil;
5493           else
5494             {
5495               int len = data[0] - 4, j;
5496               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5497
5498               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5499                   && len % 2 == 0)
5500                 len --;
5501               if (len < 1)
5502                 /* Invalid composition data.  */
5503                 break;
5504               for (j = 0; j < len; j++)
5505                 args[j] = make_number (data[4 + j]);
5506               components = (method == COMPOSITION_WITH_ALTCHARS
5507                             ? Fstring (len, args)
5508                             : Fvector (len, args));
5509             }
5510           compose_text (data[1], data[2], components, Qnil, obj);
5511         }
5512       cmp_data = cmp_data->next;
5513     }
5514 }
5515
5516 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5517    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5518    coding system CODING, and return the status code of code conversion
5519    (currently always 0; this value has no meaning).
5520
5521    How many characters (and bytes) are converted to how many
5522    characters (and bytes) are recorded in members of the structure
5523    CODING.
5524
5525    If REPLACE is nonzero, we do various things as if the original text
5526    is deleted and a new text is inserted.  See the comments in
5527    replace_range (insdel.c) to know what we are doing.
5528
5529    If REPLACE is zero, it is assumed that the source text is unibyte.
5530    Otherwise, it is assumed that the source text is multibyte.

        The conversion is done in place in the current buffer: the source
        text is temporarily accounted as part of the gap and the converted
        text is written at the start of the gap (see the diagrams inside
        the conversion loop).

        On the normal exit path CODING->consumed, consumed_char, produced
        and produced_char are all set.  On the early exit taken when
        shrinking leaves nothing to convert, only produced and
        produced_char are set.  */
5531
5532 int
5533 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5534      int from, from_byte, to, to_byte, encodep, replace;
5535      struct coding_system *coding;
5536 {
5537   int len = to - from, len_byte = to_byte - from_byte;
5538   int nchars_del = 0, nbytes_del = 0;
5539   int require, inserted, inserted_byte;
       /* HEAD_SKIP and TAIL_SKIP are assigned only if we try to shrink the
          region below; TOTAL_SKIP stays 0 otherwise and guards their use.  */
5540   int head_skip, tail_skip, total_skip = 0;
5541   Lisp_Object saved_coding_symbol;
5542   int first = 1;
5543   unsigned char *src, *dst;
5544   Lisp_Object deletion;
5545   int orig_point = PT, orig_len = len;
5546   int prev_Z;
5547   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5548
5549   deletion = Qnil;
5550   saved_coding_symbol = coding->symbol;
5551
       /* If point is strictly inside the region, move it to FROM for the
          duration of the conversion.  */
5552   if (from < PT && PT < to)
5553     {
5554       TEMP_SET_PT_BOTH (from, from_byte);
5555       orig_point = from;
5556     }
5557
5558   if (replace)
5559     {
5560       int saved_from = from;
5561       int saved_inhibit_modification_hooks;
5562
           /* prepare_to_modify_buffer receives &FROM and may change it; if
              it did, recompute TO and the byte positions from LEN.  */
5563       prepare_to_modify_buffer (from, to, &from);
5564       if (saved_from != from)
5565         {
5566           to = from + len;
5567           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5568           len_byte = to_byte - from_byte;
5569         }
5570
5571       /* The code conversion routine can not preserve text properties
5572          for now.  So, we must remove all text properties in the
5573          region.  Here, we must suppress all modification hooks.  */
5574       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5575       inhibit_modification_hooks = 1;
5576       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5577       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5578     }
5579
5580   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5581     {
5582       /* We must detect encoding of text and eol format.  */
5583
           /* The detectors are handed one address and a length, so the gap
              must not lie inside the region.  */
5584       if (from < GPT && to > GPT)
5585         move_gap_both (from, from_byte);
5586       if (coding->type == coding_type_undecided)
5587         {
5588           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5589           if (coding->type == coding_type_undecided)
5590             {
5591               /* It seems that the text contains only ASCII, but we
5592                  should not leave it undecided because the deeper
5593                  decoding routine (decode_coding) tries to detect the
5594                  encodings again in vain.  */
5595               coding->type = coding_type_emacs_mule;
5596               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5597               /* As emacs-mule decoder will handle composition, we
5598                  need this setting to allocate coding->cmp_data
5599                  later.  */
5600               coding->composing = COMPOSITION_NO;
5601             }
5602         }
5603       if (coding->eol_type == CODING_EOL_UNDECIDED
5604           && coding->type != coding_type_ccl)
5605         {
5606           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5607           if (coding->eol_type == CODING_EOL_UNDECIDED)
5608             coding->eol_type = CODING_EOL_LF;
5609           /* We had better recover the original eol format if we
5610              encounter an inconsistent eol format while decoding.  */
5611           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5612         }
5613     }
5614
5615   /* Now we convert the text.  */
5616
5617   /* For encoding, we must process pre-write-conversion in advance.  */
5618   if (! inhibit_pre_post_conversion
5619       && encodep
5620       && SYMBOLP (coding->pre_write_conversion)
5621       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5622     {
5623       /* The function in pre-write-conversion may put a new text in a
5624          new buffer.  */
5625       struct buffer *prev = current_buffer;
5626       Lisp_Object new;
5627
5628       record_unwind_protect (code_convert_region_unwind,
5629                              Fcons (Vlast_coding_system_used, Qnil));
5630       /* We should not call any more pre-write/post-read-conversion
5631          functions while this pre-write-conversion is running.  */
5632       inhibit_pre_post_conversion = 1;
5633       call2 (coding->pre_write_conversion,
5634              make_number (from), make_number (to));
5635       inhibit_pre_post_conversion = 0;
5636       /* Discard the unwind protect.  */
5637       specpdl_ptr--;
5638
5639       if (current_buffer != prev)
5640         {
               /* The function left us in a different buffer.  Replace the
                  original region with LEN (= ZV - BEGV of that buffer)
                  characters taken from position 1 of that buffer, kill
                  that buffer, and recompute the region and ORIG_POINT.  */
5641           len = ZV - BEGV;
5642           new = Fcurrent_buffer ();
5643           set_buffer_internal_1 (prev);
5644           del_range_2 (from, from_byte, to, to_byte, 0);
5645           TEMP_SET_PT_BOTH (from, from_byte);
5646           insert_from_buffer (XBUFFER (new), 1, len, 0);
5647           Fkill_buffer (new);
5648           if (orig_point >= to)
5649             orig_point += len - orig_len;
5650           else if (orig_point > from)
5651             orig_point = from;
5652           orig_len = len;
5653           to = from + len;
5654           from_byte = CHAR_TO_BYTE (from);
5655           to_byte = CHAR_TO_BYTE (to);
5656           len_byte = to_byte - from_byte;
5657           TEMP_SET_PT_BOTH (from, from_byte);
5658         }
5659     }
5660
       /* Remember what is being replaced: the text itself when undo
          information is kept (undo_list is not t), otherwise only the
          character and byte counts.  */
5661   if (replace)
5662     {
5663       if (! EQ (current_buffer->undo_list, Qt))
5664         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5665       else
5666         {
5667           nchars_del = to - from;
5668           nbytes_del = to_byte - from_byte;
5669         }
5670     }
5671
5672   if (coding->composing != COMPOSITION_DISABLED)
5673     {
5674       if (encodep)
5675         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5676       else
5677         coding_allocate_composition_data (coding, from);
5678     }
5679
5680   /* Try to skip the heading and trailing ASCIIs.  We can't skip them
5681      if we must run CCL program or there are compositions to
5682      encode.  */
5683   if (coding->type != coding_type_ccl
5684       && (! coding->cmp_data || coding->cmp_data->used == 0))
5685     {
5686       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5687
5688       if (from < GPT && GPT < to)
5689         move_gap_both (from, from_byte);
5690       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* Nothing is left to convert: report the original sizes as
              produced and return early.  */
5691       if (from_byte == to_byte
5692           && (encodep || NILP (coding->post_read_conversion))
5693           && ! CODING_REQUIRE_FLUSHING (coding))
5694         {
5695           coding->produced = len_byte;
5696           coding->produced_char = len;
5697           if (!replace)
5698             /* We must record and adjust for this new text now.  */
5699             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5700           coding_free_composition_data (coding);
5701           return 0;
5702         }
5703
           /* The skip amounts are byte counts, but they are applied to the
              character positions FROM/TO and to LEN as well.
              NOTE(review): this presumably relies on all skipped bytes
              being ASCII (one byte per character) -- confirm against the
              shrink routines.  */
5704       head_skip = from_byte - from_byte_orig;
5705       tail_skip = to_byte_orig - to_byte;
5706       total_skip = head_skip + tail_skip;
5707       from += head_skip;
5708       to -= tail_skip;
5709       len -= total_skip; len_byte -= total_skip;
5710     }
5711
5712   /* For conversion, we must put the gap before the text in addition to
5713      making the gap larger for efficient decoding.  The required gap
5714      size starts from 2000 which is the magic number used in make_gap.
5715      But, after one batch of conversion, it will be incremented if we
5716      find that it is not enough.  */
5717   require = 2000;
5718
5719   if (GAP_SIZE  < require)
5720     make_gap (require - GAP_SIZE);
5721   move_gap_both (from, from_byte);
5722
5723   inserted = inserted_byte = 0;
5724
       /* From here on the source text is accounted as part of the gap (it
          occupies the last LEN_BYTE bytes of it), so the buffer sizes are
          reduced accordingly.  */
5725   GAP_SIZE += len_byte;
5726   ZV -= len;
5727   Z -= len;
5728   ZV_BYTE -= len_byte;
5729   Z_BYTE -= len_byte;
5730
5731   if (GPT - BEG < BEG_UNCHANGED)
5732     BEG_UNCHANGED = GPT - BEG;
5733   if (Z - GPT < END_UNCHANGED)
5734     END_UNCHANGED = Z - GPT;
5735
5736   if (!encodep && coding->src_multibyte)
5737     {
5738       /* Decoding routines expects that the source text is unibyte.
5739          We must convert 8-bit characters of multibyte form to
5740          unibyte.  */
5741       int len_byte_orig = len_byte;
5742       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, slide it so that it still ends at
              the end of the gap.  */
5743       if (len_byte < len_byte_orig)
5744         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5745                     len_byte);
5746       coding->src_multibyte = 0;
5747     }
5748
       /* Convert in batches until the source is exhausted or conversion
          has to stop.  SRC and DST are recomputed at the top of every
          iteration because make_gap may have been called.  */
5749   for (;;)
5750     {
5751       int result;
5752
5753       /* The buffer memory is now:
5754          +--------+converted-text+---------+-------original-text-------+---+
5755          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5756                   |<---------------------- GAP ----------------------->|  */
5757       src = GAP_END_ADDR - len_byte;
5758       dst = GPT_ADDR + inserted_byte;
5759
5760       if (encodep)
5761         result = encode_coding (coding, src, dst, len_byte, 0);
5762       else
5763         {
5764           if (coding->composing != COMPOSITION_DISABLED)
5765             coding->cmp_data->char_offset = from + inserted;
5766           result = decode_coding (coding, src, dst, len_byte, 0);
5767         }
5768
5769       /* The buffer memory is now:
5770          +--------+-------converted-text----+--+------original-text----+---+
5771          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5772                   |<---------------------- GAP ----------------------->|  */
5773
5774       inserted += coding->produced_char;
5775       inserted_byte += coding->produced;
5776       len_byte -= coding->consumed;
5777
5778       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5779         {
               /* Allocate more room for composition data and go on.  */
5780           coding_allocate_composition_data (coding, from + inserted);
5781           continue;
5782         }
5783
5784       src += coding->consumed;
5785       dst += coding->produced;
5786
5787       if (result == CODING_FINISH_NORMAL)
5788         {
5789           src += len_byte;
5790           break;
5791         }
5792       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5793         {
5794           unsigned char *pend = dst, *p = pend - inserted_byte;
5795           Lisp_Object eol_type;
5796
5797           /* Encode LFs back to the original eol format (CR or CRLF).  */
5798           if (coding->eol_type == CODING_EOL_CR)
5799             {
5800               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5801             }
5802           else
5803             {
5804               int count = 0;
5805
                   /* Each LF grows into CRLF, so COUNT extra bytes are
                      needed between DST and SRC.  */
5806               while (p < pend) if (*p++ == '\n') count++;
5807               if (src - dst < count)
5808                 {
5809                   /* We don't have sufficient room for encoding LFs
5810                      back to CRLF.  We must record converted and
5811                      not-yet-converted text back to the buffer
5812                      content, enlarge the gap, then record them out of
5813                      the buffer contents again.  */
5814                   int add = len_byte + inserted_byte;
5815
5816                   GAP_SIZE -= add;
5817                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5818                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5819                   make_gap (count - GAP_SIZE);
5820                   GAP_SIZE += add;
5821                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5822                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5823                   /* Don't forget to update SRC, DST, and PEND.  */
5824                   src = GAP_END_ADDR - len_byte;
5825                   dst = GPT_ADDR + inserted_byte;
5826                   pend = dst;
5827                 }
5828               inserted += count;
5829               inserted_byte += count;
5830               coding->produced += count;
5831               p = dst = pend + count;
                   /* Copy backwards from the end, inserting a CR in front
                      of every LF, until all COUNT CRs are placed.  */
5832               while (count)
5833                 {
5834                   *--p = *--pend;
5835                   if (*p == '\n') count--, *--p = '\r';
5836                 }
5837             }
5838
5839           /* Suppress eol-format conversion in the further conversion.  */
5840           coding->eol_type = CODING_EOL_LF;
5841
5842           /* Set the coding system symbol to that for Unix-like EOL.  */
5843           eol_type = Fget (saved_coding_symbol, Qeol_type);
5844           if (VECTORP (eol_type)
5845               && XVECTOR (eol_type)->size == 3
5846               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5847             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5848           else
5849             coding->symbol = saved_coding_symbol;
5850
5851           continue;
5852         }
5853       if (len_byte <= 0)
5854         {
               /* All source bytes are consumed.  A CCL-based coding system
                  gets one more pass with CODING_MODE_LAST_BLOCK set before
                  we stop.  */
5855           if (coding->type != coding_type_ccl
5856               || coding->mode & CODING_MODE_LAST_BLOCK)
5857             break;
5858           coding->mode |= CODING_MODE_LAST_BLOCK;
5859           continue;
5860         }
5861       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5862         {
5863           /* The source text ends in invalid codes.  Let's just
5864              make them valid buffer contents, and finish conversion.  */
5865           if (multibyte_p)
5866             {
5867               unsigned char *start = dst;
5868
5869               inserted += len_byte;
5870               while (len_byte--)
5871                 {
5872                   int c = *src++;
5873                   dst += CHAR_STRING (c, dst);
5874                 }
5875
5876               inserted_byte += dst - start;
5877             }
5878           else
5879             {
5880               inserted += len_byte;
5881               inserted_byte += len_byte;
5882               while (len_byte--)
5883                 *dst++ = *src++;
5884             }
5885           break;
5886         }
5887       if (result == CODING_FINISH_INTERRUPT)
5888         {
5889           /* The conversion procedure was interrupted by a user.  */
5890           break;
5891         }
5892       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5893       if (coding->consumed < 1)
5894         {
5895           /* It's quite strange to require more memory without
5896              consuming any bytes.  Perhaps CCL program bug.  */
5897           break;
5898         }
5899       if (first)
5900         {
5901           /* We have just done the first batch of conversion which was
5902              stopped because of insufficient gap.  Let's reconsider the
5903              required gap size (i.e. SRC - DST) now.
5904
5905              We have converted ORIG bytes (== coding->consumed) into
5906              NEW bytes (coding->produced).  To convert the remaining
5907              LEN bytes, we may need REQUIRE bytes of gap, where:
5908                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5909                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5910              Here, we are sure that NEW >= ORIG.  */
5911
5912           if (coding->produced <= coding->consumed)
5913             {
5914               /* This happens because of CCL-based coding system with
5915                  eol-type CRLF.  */
5916               require = 0;
5917             }
5918           else
5919             {
5920               float ratio = coding->produced - coding->consumed;
5921               ratio /= coding->consumed;
5922               require = len_byte * ratio;
5923             }
5924           first = 0;
5925         }
5926       if ((src - dst) < (require + 2000))
5927         {
5928           /* See the comment above the previous call of make_gap.  */
5929           int add = len_byte + inserted_byte;
5930
5931           GAP_SIZE -= add;
5932           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5933           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5934           make_gap (require + 2000);
5935           GAP_SIZE += add;
5936           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5937           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5938         }
5939     }
5940   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5941
5942   if (encodep && coding->dst_multibyte)
5943     {
5944       /* The output is unibyte.  We must convert 8-bit characters to
5945          multibyte form.  */
           /* Make sure the gap is large enough before converting in place;
              the same temporary bookkeeping as around the other make_gap
              calls is used.  */
5946       if (inserted_byte * 2 > GAP_SIZE)
5947         {
5948           GAP_SIZE -= inserted_byte;
5949           ZV += inserted_byte; Z += inserted_byte;
5950           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5951           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5952           make_gap (inserted_byte - GAP_SIZE);
5953           GAP_SIZE += inserted_byte;
5954           ZV -= inserted_byte; Z -= inserted_byte;
5955           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5956           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5957         }
5958       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5959     }
5960
5961   /* If we shrank the conversion area, adjust it now.  */
5962   if (total_skip > 0)
5963     {
           /* Bring the skipped tail (located just after the gap) next to
              the converted text, then count the skipped head and tail as
              inserted text and restore the original region bounds.  */
5964       if (tail_skip > 0)
5965         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5966       inserted += total_skip; inserted_byte += total_skip;
5967       GAP_SIZE += total_skip;
5968       GPT -= head_skip; GPT_BYTE -= head_skip;
5969       ZV -= total_skip; ZV_BYTE -= total_skip;
5970       Z -= total_skip; Z_BYTE -= total_skip;
5971       from -= head_skip; from_byte -= head_skip;
5972       to += tail_skip; to_byte += tail_skip;
5973     }
5974
5975   prev_Z = Z;
5976   if (! EQ (current_buffer->undo_list, Qt))
5977     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5978   else
5979     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5980                                  inserted, inserted_byte);
       /* Recompute INSERTED as the change in Z across the call above.  */
5981   inserted = Z - prev_Z;
5982
5983   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5984     coding_restore_composition (coding, Fcurrent_buffer ());
5985   coding_free_composition_data (coding);
5986
       /* For decoding, run post-read-conversion on the converted text.  */
5987   if (! inhibit_pre_post_conversion
5988       && ! encodep && ! NILP (coding->post_read_conversion))
5989     {
5990       Lisp_Object val;
5991       Lisp_Object saved_coding_system;
5992
5993       if (from != PT)
5994         TEMP_SET_PT_BOTH (from, from_byte);
5995       prev_Z = Z;
5996       record_unwind_protect (code_convert_region_unwind,
5997                              Fcons (Vlast_coding_system_used, Qnil));
5998       saved_coding_system = Vlast_coding_system_used;
5999       Vlast_coding_system_used = coding->symbol;
6000       /* We should not call any more pre-write/post-read-conversion
6001          functions while this post-read-conversion is running.  */
6002       inhibit_pre_post_conversion = 1;
6003       val = call1 (coding->post_read_conversion, make_number (inserted));
6004       inhibit_pre_post_conversion = 0;
           /* The function may have changed Vlast_coding_system_used; take
              that as the coding symbol and restore the saved value.  */
6005       coding->symbol = Vlast_coding_system_used;
6006       Vlast_coding_system_used = saved_coding_system;
6007       /* Discard the unwind protect.  */
6008       specpdl_ptr--;
6009       CHECK_NUMBER (val);
           /* Account for any size change made by the function.  Note that
              INSERTED_BYTE is not updated here.  */
6010       inserted += Z - prev_Z;
6011     }
6012
       /* Restore point: shift it by the size change if it was at or after
          the end of the original region, otherwise put it at FROM.  A
          point before FROM is left alone.  */
6013   if (orig_point >= from)
6014     {
6015       if (orig_point >= from + orig_len)
6016         orig_point += inserted - orig_len;
6017       else
6018         orig_point = from;
6019       TEMP_SET_PT (orig_point);
6020     }
6021
6022   if (replace)
6023     {
6024       signal_after_change (from, to - from, inserted);
6025       update_compositions (from, from + inserted, CHECK_BORDER);
6026     }
6027
       /* Report the totals for the whole region to the caller.  */
6028   {
6029     coding->consumed = to_byte - from_byte;
6030     coding->consumed_char = to - from;
6031     coding->produced = inserted_byte;
6032     coding->produced_char = inserted;
6033   }
6034
6035   return 0;
6036 }
6037
6038 /* Name (or base name) of work buffer for code conversion.  */
6039 static Lisp_Object Vcode_conversion_workbuf_name;
6040
6041 /* Set the current buffer to the working buffer prepared for
6042    code-conversion.  MULTIBYTE specifies the multibyteness of the
6043    buffer.  Return the buffer we set if it must be killed after use.
6044    Otherwise return Qnil.  */
6045
6046 static Lisp_Object
6047 set_conversion_work_buffer (multibyte)
6048      int multibyte;
6049 {
6050   Lisp_Object buffer, buffer_to_kill;
6051   struct buffer *buf;
6052
6053   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6054   buf = XBUFFER (buffer);
6055   if (buf == current_buffer)
6056     {
6057       /* As we are already in the work buffer, we must generate a new
6058          buffer for the work.  */
6059       Lisp_Object name;
6060
6061       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6062       buffer = buffer_to_kill = Fget_buffer_create (name);
6063       buf = XBUFFER (buffer);
6064     }
6065   else
6066     buffer_to_kill = Qnil;
6067
6068   delete_all_overlays (buf);
6069   buf->directory = current_buffer->directory;
6070   buf->read_only = Qnil;
6071   buf->filename = Qnil;
6072   buf->undo_list = Qt;
6073   eassert (buf->overlays_before == NULL);
6074   eassert (buf->overlays_after == NULL);
6075   set_buffer_internal (buf);
6076   if (BEG != BEGV || Z != ZV)
6077     Fwiden ();
6078   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6079   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6080   return buffer_to_kill;
6081 }
6082
6083 Lisp_Object
6084 run_pre_post_conversion_on_str (str, coding, encodep)
6085      Lisp_Object str;
6086      struct coding_system *coding;
6087      int encodep;
6088 {
6089   int count = SPECPDL_INDEX ();
6090   struct gcpro gcpro1, gcpro2;
6091   int multibyte = STRING_MULTIBYTE (str);
6092   Lisp_Object old_deactivate_mark;
6093   Lisp_Object buffer_to_kill;
6094   Lisp_Object unwind_arg;
6095
6096   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6097   /* It is not crucial to specbind this.  */
6098   old_deactivate_mark = Vdeactivate_mark;
6099   GCPRO2 (str, old_deactivate_mark);
6100
6101   /* We must insert the contents of STR as is without
6102      unibyte<->multibyte conversion.  For that, we adjust the
6103      multibyteness of the working buffer to that of STR.  */
6104   buffer_to_kill = set_conversion_work_buffer (multibyte);
6105   if (NILP (buffer_to_kill))
6106     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6107   else
6108     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6109   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6110
6111   insert_from_string (str, 0, 0,
6112                       SCHARS (str), SBYTES (str), 0);
6113   UNGCPRO;
6114   inhibit_pre_post_conversion = 1;
6115   if (encodep)
6116     {
6117       struct buffer *prev = current_buffer;
6118
6119       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6120       if (prev != current_buffer)
6121         /* We must kill the current buffer too.  */
6122         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6123     }
6124   else
6125     {
6126       Vlast_coding_system_used = coding->symbol;
6127       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6128       call1 (coding->post_read_conversion, make_number (Z - BEG));
6129       coding->symbol = Vlast_coding_system_used;
6130     }
6131   inhibit_pre_post_conversion = 0;
6132   Vdeactivate_mark = old_deactivate_mark;
6133   str = make_buffer_string (BEG, Z, 1);
6134   return unbind_to (count, str);
6135 }
6136
6137
6138 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6139    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6140    is intended that this function is called from encode_terminal_code,
6141    the pre-write-conversion function is run by safe_call and thus
6142    "Error during redisplay: ..." is logged when an error occurs.
6143
6144    Store the resulting text in *STR and set CODING->produced_char and
6145    CODING->produced to the number of characters and bytes
6146    respectively.  If the size of *STR is too small, enlarge it by
6147    xrealloc and update *STR and *SIZE.  */
6148
6149 void
6150 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6151      unsigned char **str;
6152      int *size, nchars, nbytes;
6153      struct coding_system *coding;
6154 {
6155   struct gcpro gcpro1, gcpro2;
6156   struct buffer *cur = current_buffer;
6157   struct buffer *prev;
6158   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6159   Lisp_Object args[3];
6160   Lisp_Object buffer_to_kill;
6161
6162   /* It is not crucial to specbind this.  */
6163   old_deactivate_mark = Vdeactivate_mark;
6164   old_last_coding_system_used = Vlast_coding_system_used;
6165   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6166
6167   /* We must insert the contents of STR as is without
6168      unibyte<->multibyte conversion.  For that, we adjust the
6169      multibyteness of the working buffer to that of STR.  */
6170   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6171   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6172   UNGCPRO;
6173   inhibit_pre_post_conversion = 1;
6174   prev = current_buffer;
6175   args[0] = coding->pre_write_conversion;
6176   args[1] = make_number (BEG);
6177   args[2] = make_number (Z);
6178   safe_call (3, args);
6179   inhibit_pre_post_conversion = 0;
6180   Vdeactivate_mark = old_deactivate_mark;
6181   Vlast_coding_system_used = old_last_coding_system_used;
6182   coding->produced_char = Z - BEG;
6183   coding->produced = Z_BYTE - BEG_BYTE;
6184   if (coding->produced > *size)
6185     {
6186       *size = coding->produced;
6187       *str = xrealloc (*str, *size);
6188     }
6189   if (BEG < GPT && GPT < Z)
6190     move_gap (BEG);
6191   bcopy (BEG_ADDR, *str, coding->produced);
6192   coding->src_multibyte
6193     = ! NILP (current_buffer->enable_multibyte_characters);
6194   if (prev != current_buffer)
6195     Fkill_buffer (Fcurrent_buffer ());
6196   set_buffer_internal (cur);
6197   if (! NILP (buffer_to_kill))
6198     Fkill_buffer (buffer_to_kill);
6199 }
6200
6201
6202 Lisp_Object
6203 decode_coding_string (str, coding, nocopy)
6204      Lisp_Object str;
6205      struct coding_system *coding;
6206      int nocopy;
6207 {
6208   int len;
6209   struct conversion_buffer buf;
6210   int from, to_byte;
6211   Lisp_Object saved_coding_symbol;
6212   int result;
6213   int require_decoding;
6214   int shrinked_bytes = 0;
6215   Lisp_Object newstr;
6216   int consumed, consumed_char, produced, produced_char;
6217
6218   from = 0;
6219   to_byte = SBYTES (str);
6220
6221   saved_coding_symbol = coding->symbol;
6222   coding->src_multibyte = STRING_MULTIBYTE (str);
6223   coding->dst_multibyte = 1;
6224   if (CODING_REQUIRE_DETECTION (coding))
6225     {
6226       /* See the comments in code_convert_region.  */
6227       if (coding->type == coding_type_undecided)
6228         {
6229           detect_coding (coding, SDATA (str), to_byte);
6230           if (coding->type == coding_type_undecided)
6231             {
6232               coding->type = coding_type_emacs_mule;
6233               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6234               /* As emacs-mule decoder will handle composition, we
6235                  need this setting to allocate coding->cmp_data
6236                  later.  */
6237               coding->composing = COMPOSITION_NO;
6238             }
6239         }
6240       if (coding->eol_type == CODING_EOL_UNDECIDED
6241           && coding->type != coding_type_ccl)
6242         {
6243           saved_coding_symbol = coding->symbol;
6244           detect_eol (coding, SDATA (str), to_byte);
6245           if (coding->eol_type == CODING_EOL_UNDECIDED)
6246             coding->eol_type = CODING_EOL_LF;
6247           /* We had better recover the original eol format if we
6248              encounter an inconsistent eol format while decoding.  */
6249           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6250         }
6251     }
6252
6253   if (coding->type == coding_type_no_conversion
6254       || coding->type == coding_type_raw_text)
6255     coding->dst_multibyte = 0;
6256
6257   require_decoding = CODING_REQUIRE_DECODING (coding);
6258
6259   if (STRING_MULTIBYTE (str))
6260     {
6261       /* Decoding routines expect the source text to be unibyte.  */
6262       str = Fstring_as_unibyte (str);
6263       to_byte = SBYTES (str);
6264       nocopy = 1;
6265       coding->src_multibyte = 0;
6266     }
6267
6268   /* Try to skip the heading and tailing ASCIIs.  */
6269   if (require_decoding && coding->type != coding_type_ccl)
6270     {
6271       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6272                                 0);
6273       if (from == to_byte)
6274         require_decoding = 0;
6275       shrinked_bytes = from + (SBYTES (str) - to_byte);
6276     }
6277
6278   if (!require_decoding
6279       && !(SYMBOLP (coding->post_read_conversion)
6280            && !NILP (Ffboundp (coding->post_read_conversion))))
6281     {
6282       coding->consumed = SBYTES (str);
6283       coding->consumed_char = SCHARS (str);
6284       if (coding->dst_multibyte)
6285         {
6286           str = Fstring_as_multibyte (str);
6287           nocopy = 1;
6288         }
6289       coding->produced = SBYTES (str);
6290       coding->produced_char = SCHARS (str);
6291       return (nocopy ? str : Fcopy_sequence (str));
6292     }
6293
6294   if (coding->composing != COMPOSITION_DISABLED)
6295     coding_allocate_composition_data (coding, from);
6296   len = decoding_buffer_size (coding, to_byte - from);
6297   allocate_conversion_buffer (buf, len);
6298
6299   consumed = consumed_char = produced = produced_char = 0;
6300   while (1)
6301     {
6302       result = decode_coding (coding, SDATA (str) + from + consumed,
6303                               buf.data + produced, to_byte - from - consumed,
6304                               buf.size - produced);
6305       consumed += coding->consumed;
6306       consumed_char += coding->consumed_char;
6307       produced += coding->produced;
6308       produced_char += coding->produced_char;
6309       if (result == CODING_FINISH_NORMAL
6310           || result == CODING_FINISH_INTERRUPT
6311           || (result == CODING_FINISH_INSUFFICIENT_SRC
6312               && coding->consumed == 0))
6313         break;
6314       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6315         coding_allocate_composition_data (coding, from + produced_char);
6316       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6317         extend_conversion_buffer (&buf);
6318       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6319         {
6320           Lisp_Object eol_type;
6321
6322           /* Recover the original EOL format.  */
6323           if (coding->eol_type == CODING_EOL_CR)
6324             {
6325               unsigned char *p;
6326               for (p = buf.data; p < buf.data + produced; p++)
6327                 if (*p == '\n') *p = '\r';
6328             }
6329           else if (coding->eol_type == CODING_EOL_CRLF)
6330             {
6331               int num_eol = 0;
6332               unsigned char *p0, *p1;
6333               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6334                 if (*p0 == '\n') num_eol++;
6335               if (produced + num_eol >= buf.size)
6336                 extend_conversion_buffer (&buf);
6337               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6338                 {
6339                   *--p1 = *--p0;
6340                   if (*p0 == '\n') *--p1 = '\r';
6341                 }
6342               produced += num_eol;
6343               produced_char += num_eol;
6344             }
6345           /* Suppress eol-format conversion in the further conversion.  */
6346           coding->eol_type = CODING_EOL_LF;
6347
6348           /* Set the coding system symbol to that for Unix-like EOL.  */
6349           eol_type = Fget (saved_coding_symbol, Qeol_type);
6350           if (VECTORP (eol_type)
6351               && XVECTOR (eol_type)->size == 3
6352               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6353             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6354           else
6355             coding->symbol = saved_coding_symbol;
6356
6357
6358         }
6359     }
6360
6361   coding->consumed = consumed;
6362   coding->consumed_char = consumed_char;
6363   coding->produced = produced;
6364   coding->produced_char = produced_char;
6365
6366   if (coding->dst_multibyte)
6367     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6368                                            produced + shrinked_bytes);
6369   else
6370     newstr = make_uninit_string (produced + shrinked_bytes);
6371   if (from > 0)
6372     STRING_COPYIN (newstr, 0, SDATA (str), from);
6373   STRING_COPYIN (newstr, from, buf.data, produced);
6374   if (shrinked_bytes > from)
6375     STRING_COPYIN (newstr, from + produced,
6376                    SDATA (str) + to_byte,
6377                    shrinked_bytes - from);
6378   free_conversion_buffer (&buf);
6379
6380   coding->consumed += shrinked_bytes;
6381   coding->consumed_char += shrinked_bytes;
6382   coding->produced += shrinked_bytes;
6383   coding->produced_char += shrinked_bytes;
6384
6385   if (coding->cmp_data && coding->cmp_data->used)
6386     coding_restore_composition (coding, newstr);
6387   coding_free_composition_data (coding);
6388
6389   if (SYMBOLP (coding->post_read_conversion)
6390       && !NILP (Ffboundp (coding->post_read_conversion)))
6391     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6392
6393   return newstr;
6394 }
6395
6396 Lisp_Object
6397 encode_coding_string (str, coding, nocopy)
6398      Lisp_Object str;
6399      struct coding_system *coding;
6400      int nocopy;
6401 {
6402   int len;
6403   struct conversion_buffer buf;
6404   int from, to, to_byte;
6405   int result;
6406   int shrinked_bytes = 0;
6407   Lisp_Object newstr;
6408   int consumed, consumed_char, produced, produced_char;
6409
6410   if (SYMBOLP (coding->pre_write_conversion)
6411       && !NILP (Ffboundp (coding->pre_write_conversion)))
6412     {
6413       str = run_pre_post_conversion_on_str (str, coding, 1);
6414       /* As STR is just newly generated, we don't have to copy it
6415          anymore.  */
6416       nocopy = 1;
6417     }
6418
6419   from = 0;
6420   to = SCHARS (str);
6421   to_byte = SBYTES (str);
6422
6423   /* Encoding routines determine the multibyteness of the source text
6424      by coding->src_multibyte.  */
6425   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6426   coding->dst_multibyte = 0;
6427   if (! CODING_REQUIRE_ENCODING (coding))
6428     goto no_need_of_encoding;
6429
6430   if (coding->composing != COMPOSITION_DISABLED)
6431     coding_save_composition (coding, from, to, str);
6432
6433   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6434      if we must run CCL program or there are compositions to
6435      encode.  */
6436   if (coding->type != coding_type_ccl
6437       && (! coding->cmp_data || coding->cmp_data->used == 0))
6438     {
6439       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6440                                 1);
6441       if (from == to_byte)
6442         {
6443           coding_free_composition_data (coding);
6444           goto no_need_of_encoding;
6445         }
6446       shrinked_bytes = from + (SBYTES (str) - to_byte);
6447     }
6448
6449   len = encoding_buffer_size (coding, to_byte - from);
6450   allocate_conversion_buffer (buf, len);
6451
6452   consumed = consumed_char = produced = produced_char = 0;
6453   while (1)
6454     {
6455       result = encode_coding (coding, SDATA (str) + from + consumed,
6456                               buf.data + produced, to_byte - from - consumed,
6457                               buf.size - produced);
6458       consumed += coding->consumed;
6459       consumed_char += coding->consumed_char;
6460       produced += coding->produced;
6461       produced_char += coding->produced_char;
6462       if (result == CODING_FINISH_NORMAL
6463           || result == CODING_FINISH_INTERRUPT
6464           || (result == CODING_FINISH_INSUFFICIENT_SRC
6465               && coding->consumed == 0))
6466         break;
6467       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6468       extend_conversion_buffer (&buf);
6469     }
6470
6471   coding->consumed = consumed;
6472   coding->consumed_char = consumed_char;
6473   coding->produced = produced;
6474   coding->produced_char = produced_char;
6475
6476   newstr = make_uninit_string (produced + shrinked_bytes);
6477   if (from > 0)
6478     STRING_COPYIN (newstr, 0, SDATA (str), from);
6479   STRING_COPYIN (newstr, from, buf.data, produced);
6480   if (shrinked_bytes > from)
6481     STRING_COPYIN (newstr, from + produced,
6482                    SDATA (str) + to_byte,
6483                    shrinked_bytes - from);
6484
6485   free_conversion_buffer (&buf);
6486   coding_free_composition_data (coding);
6487
6488   return newstr;
6489
6490  no_need_of_encoding:
6491   coding->consumed = SBYTES (str);
6492   coding->consumed_char = SCHARS (str);
6493   if (STRING_MULTIBYTE (str))
6494     {
6495       if (nocopy)
6496         /* We are sure that STR doesn't contain a multibyte
6497            character.  */
6498         STRING_SET_UNIBYTE (str);
6499       else
6500         {
6501           str = Fstring_as_unibyte (str);
6502           nocopy = 1;
6503         }
6504     }
6505   coding->produced = SBYTES (str);
6506   coding->produced_char = SCHARS (str);
6507   return (nocopy ? str : Fcopy_sequence (str));
6508 }
6509
6510 \f
6511 #ifdef emacs
6512 /*** 8. Emacs Lisp library functions ***/
6513
6514 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6515        doc: /* Return t if OBJECT is nil or a coding-system.
6516 See the documentation of `make-coding-system' for information
6517 about coding-system objects.  */)
6518      (obj)
6519      Lisp_Object obj;
6520 {
6521   if (NILP (obj))
6522     return Qt;
6523   if (!SYMBOLP (obj))
6524     return Qnil;
6525   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6526     return Qt;
6527   /* Get coding-spec vector for OBJ.  */
6528   obj = Fget (obj, Qcoding_system);
6529   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6530           ? Qt : Qnil);
6531 }
6532
6533 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6534        Sread_non_nil_coding_system, 1, 1, 0,
6535        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6536      (prompt)
6537      Lisp_Object prompt;
6538 {
6539   Lisp_Object val;
6540   do
6541     {
6542       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6543                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6544     }
6545   while (SCHARS (val) == 0);
6546   return (Fintern (val, Qnil));
6547 }
6548
6549 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6550        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6551 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6552      (prompt, default_coding_system)
6553      Lisp_Object prompt, default_coding_system;
6554 {
6555   Lisp_Object val;
6556   if (SYMBOLP (default_coding_system))
6557     default_coding_system = SYMBOL_NAME (default_coding_system);
6558   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6559                           Qt, Qnil, Qcoding_system_history,
6560                           default_coding_system, Qnil);
6561   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6562 }
6563
6564 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6565        1, 1, 0,
6566        doc: /* Check validity of CODING-SYSTEM.
6567 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6568 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6569 The value of this property should be a vector of length 5.  */)
6570      (coding_system)
6571      Lisp_Object coding_system;
6572 {
6573   Lisp_Object define_form;
6574
6575   define_form = Fget (coding_system, Qcoding_system_define_form);
6576   if (! NILP (define_form))
6577     {
6578       Fput (coding_system, Qcoding_system_define_form, Qnil);
6579       safe_eval (define_form);
6580     }
6581   if (!NILP (Fcoding_system_p (coding_system)))
6582     return coding_system;
6583   xsignal1 (Qcoding_system_error, coding_system);
6584 }
6585 \f
6586 Lisp_Object
6587 detect_coding_system (src, src_bytes, highest, multibytep)
6588      const unsigned char *src;
6589      int src_bytes, highest;
6590      int multibytep;
6591 {
6592   int coding_mask, eol_type;
6593   Lisp_Object val, tmp;
6594   int dummy;
6595
6596   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6597   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6598   if (eol_type == CODING_EOL_INCONSISTENT)
6599     eol_type = CODING_EOL_UNDECIDED;
6600
6601   if (!coding_mask)
6602     {
6603       val = Qundecided;
6604       if (eol_type != CODING_EOL_UNDECIDED)
6605         {
6606           Lisp_Object val2;
6607           val2 = Fget (Qundecided, Qeol_type);
6608           if (VECTORP (val2))
6609             val = XVECTOR (val2)->contents[eol_type];
6610         }
6611       return (highest ? val : Fcons (val, Qnil));
6612     }
6613
6614   /* At first, gather possible coding systems in VAL.  */
6615   val = Qnil;
6616   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6617     {
6618       Lisp_Object category_val, category_index;
6619
6620       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6621       category_val = Fsymbol_value (XCAR (tmp));
6622       if (!NILP (category_val)
6623           && NATNUMP (category_index)
6624           && (coding_mask & (1 << XFASTINT (category_index))))
6625         {
6626           val = Fcons (category_val, val);
6627           if (highest)
6628             break;
6629         }
6630     }
6631   if (!highest)
6632     val = Fnreverse (val);
6633
6634   /* Then, replace the elements with subsidiary coding systems.  */
6635   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6636     {
6637       if (eol_type != CODING_EOL_UNDECIDED
6638           && eol_type != CODING_EOL_INCONSISTENT)
6639         {
6640           Lisp_Object eol;
6641           eol = Fget (XCAR (tmp), Qeol_type);
6642           if (VECTORP (eol))
6643             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6644         }
6645     }
6646   return (highest ? XCAR (val) : val);
6647 }
6648
6649 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6650        2, 3, 0,
6651        doc: /* Detect how the byte sequence in the region is encoded.
6652 Return a list of possible coding systems used on decoding a byte
6653 sequence containing the bytes in the region between START and END when
6654 the coding system `undecided' is specified.  The list is ordered by
6655 priority decided in the current language environment.
6656
6657 If only ASCII characters are found (except for such ISO-2022 control
6658 characters ISO-2022 as ESC), it returns a list of single element
6659 `undecided' or its subsidiary coding system according to a detected
6660 end-of-line format.
6661
6662 If optional argument HIGHEST is non-nil, return the coding system of
6663 highest priority.  */)
6664      (start, end, highest)
6665      Lisp_Object start, end, highest;
6666 {
6667   int from, to;
6668   int from_byte, to_byte;
6669   int include_anchor_byte = 0;
6670
6671   CHECK_NUMBER_COERCE_MARKER (start);
6672   CHECK_NUMBER_COERCE_MARKER (end);
6673
6674   validate_region (&start, &end);
6675   from = XINT (start), to = XINT (end);
6676   from_byte = CHAR_TO_BYTE (from);
6677   to_byte = CHAR_TO_BYTE (to);
6678
6679   if (from < GPT && to >= GPT)
6680     move_gap_both (to, to_byte);
6681   /* If we an anchor byte `\0' follows the region, we include it in
6682      the detecting source.  Then code detectors can handle the tailing
6683      byte sequence more accurately.
6684
6685      Fix me: This is not a perfect solution.  It is better that we
6686      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6687   */
6688   if (to == Z || (to == GPT && GAP_SIZE > 0))
6689     include_anchor_byte = 1;
6690   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6691                                to_byte - from_byte + include_anchor_byte,
6692                                !NILP (highest),
6693                                !NILP (current_buffer
6694                                       ->enable_multibyte_characters));
6695 }
6696
6697 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6698        1, 2, 0,
6699        doc: /* Detect how the byte sequence in STRING is encoded.
6700 Return a list of possible coding systems used on decoding a byte
6701 sequence containing the bytes in STRING when the coding system
6702 `undecided' is specified.  The list is ordered by priority decided in
6703 the current language environment.
6704
6705 If only ASCII characters are found (except for such ISO-2022 control
6706 characters ISO-2022 as ESC), it returns a list of single element
6707 `undecided' or its subsidiary coding system according to a detected
6708 end-of-line format.
6709
6710 If optional argument HIGHEST is non-nil, return the coding system of
6711 highest priority.  */)
6712      (string, highest)
6713      Lisp_Object string, highest;
6714 {
6715   CHECK_STRING (string);
6716
6717   return detect_coding_system (SDATA (string),
6718                                /* "+ 1" is to include the anchor byte
6719                                   `\0'.  With this, code detectors can
6720                                   handle the tailing bytes more
6721                                   accurately.  */
6722                                SBYTES (string) + 1,
6723                                !NILP (highest),
6724                                STRING_MULTIBYTE (string));
6725 }
6726
/*  Subroutine for Ffind_coding_systems_region_internal.

    Return a list of coding systems that safely encode the multibyte
    text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
    possible coding systems.  If it is nil, it means that we have not
    yet found any coding systems.

    WORK_TABLE is a char-table whose element for a character is set
    to t once that character has been looked up.

    If a non-ASCII single-byte char is found, set
    *single_byte_char_found to 1.  */
6739
6740 static Lisp_Object
6741 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6742      unsigned char *p, *pend;
6743      Lisp_Object safe_codings, work_table;
6744      int *single_byte_char_found;
6745 {
6746   int c, len;
6747   Lisp_Object val, ch;
6748   Lisp_Object prev, tail;
6749
6750   if (NILP (safe_codings))
6751     goto done_safe_codings;
6752   while (p < pend)
6753     {
6754       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6755       p += len;
6756       if (ASCII_BYTE_P (c))
6757         /* We can ignore ASCII characters here.  */
6758         continue;
6759       if (SINGLE_BYTE_CHAR_P (c))
6760         *single_byte_char_found = 1;
6761       /* Check the safe coding systems for C.  */
6762       ch = make_number (c);
6763       val = Faref (work_table, ch);
6764       if (EQ (val, Qt))
6765         /* This element was already checked.  Ignore it.  */
6766         continue;
6767       /* Remember that we checked this element.  */
6768       Faset (work_table, ch, Qt);
6769
6770       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6771         {
6772           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6773           int encodable;
6774
6775           elt = XCAR (tail);
6776           if (CONSP (XCDR (elt)))
6777             {
6778               /* This entry has this format now:
6779                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6780                           ACCEPT-LATIN-EXTRA ) */
6781               val = XCDR (elt);
6782               encodable = ! NILP (Faref (XCAR (val), ch));
6783               if (! encodable)
6784                 {
6785                   val = XCDR (val);
6786                   translation_table = XCAR (val);
6787                   hash_table = XCAR (XCDR (val));
6788                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6789                 }
6790             }
6791           else
6792             {
6793               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6794               encodable = ! NILP (Faref (XCDR (elt), ch));
6795               if (! encodable)
6796                 {
6797                   /* Transform the format to:
6798                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6799                        ACCEPT-LATIN-EXTRA )  */
6800                   val = Fget (XCAR (elt), Qcoding_system);
6801                   translation_table
6802                     = Fplist_get (AREF (val, 3),
6803                                   Qtranslation_table_for_encode);
6804                   if (SYMBOLP (translation_table))
6805                     translation_table = Fget (translation_table,
6806                                               Qtranslation_table);
6807                   hash_table
6808                     = (CHAR_TABLE_P (translation_table)
6809                        ? XCHAR_TABLE (translation_table)->extras[1]
6810                        : Qnil);
6811                   accept_latin_extra
6812                     = ((EQ (AREF (val, 0), make_number (2))
6813                         && VECTORP (AREF (val, 4)))
6814                        ? AREF (AREF (val, 4), 16)
6815                        : Qnil);
6816                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6817                                         translation_table, hash_table,
6818                                         accept_latin_extra));
6819                 }
6820             }
6821
6822           if (! encodable
6823               && ((CHAR_TABLE_P (translation_table)
6824                    && ! NILP (Faref (translation_table, ch)))
6825                   || (HASH_TABLE_P (hash_table)
6826                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6827                   || (SINGLE_BYTE_CHAR_P (c)
6828                       && ! NILP (accept_latin_extra)
6829                       && VECTORP (Vlatin_extra_code_table)
6830                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6831             encodable = 1;
6832           if (encodable)
6833             prev = tail;
6834           else
6835             {
6836               /* Exclude this coding system from SAFE_CODINGS.  */
6837               if (EQ (tail, safe_codings))
6838                 {
6839                   safe_codings = XCDR (safe_codings);
6840                   if (NILP (safe_codings))
6841                     goto done_safe_codings;
6842                 }
6843               else
6844                 XSETCDR (prev, XCDR (tail));
6845             }
6846         }
6847     }
6848
6849  done_safe_codings:
6850   /* If the above loop was terminated before P reaches PEND, it means
6851      SAFE_CODINGS was set to nil.  If we have not yet found an
6852      non-ASCII single-byte char, check it now.  */
6853   if (! *single_byte_char_found)
6854     while (p < pend)
6855       {
6856         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6857         p += len;
6858         if (! ASCII_BYTE_P (c)
6859             && SINGLE_BYTE_CHAR_P (c))
6860           {
6861             *single_byte_char_found = 1;
6862             break;
6863           }
6864       }
6865   return safe_codings;
6866 }
6867
6868 DEFUN ("find-coding-systems-region-internal",
6869        Ffind_coding_systems_region_internal,
6870        Sfind_coding_systems_region_internal, 2, 2, 0,
6871        doc: /* Internal use only.  */)
6872      (start, end)
6873      Lisp_Object start, end;
6874 {
6875   Lisp_Object work_table, safe_codings;
6876   int non_ascii_p = 0;
6877   int single_byte_char_found = 0;
6878   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6879
6880   if (STRINGP (start))
6881     {
6882       if (!STRING_MULTIBYTE (start))
6883         return Qt;
6884       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6885       p2 = p2end = p1end;
6886       if (SCHARS (start) != SBYTES (start))
6887         non_ascii_p = 1;
6888     }
6889   else
6890     {
6891       int from, to, stop;
6892
6893       CHECK_NUMBER_COERCE_MARKER (start);
6894       CHECK_NUMBER_COERCE_MARKER (end);
6895       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6896         args_out_of_range (start, end);
6897       if (NILP (current_buffer->enable_multibyte_characters))
6898         return Qt;
6899       from = CHAR_TO_BYTE (XINT (start));
6900       to = CHAR_TO_BYTE (XINT (end));
6901       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6902       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6903       if (stop == to)
6904         p2 = p2end = p1end;
6905       else
6906         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6907       if (XINT (end) - XINT (start) != to - from)
6908         non_ascii_p = 1;
6909     }
6910
6911   if (!non_ascii_p)
6912     {
6913       /* We are sure that the text contains no multibyte character.
6914          Check if it contains eight-bit-graphic.  */
6915       p = p1;
6916       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6917       if (p == p1end)
6918         {
6919           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6920           if (p == p2end)
6921             return Qt;
6922         }
6923     }
6924
6925   /* The text contains non-ASCII characters.  */
6926
6927   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6928   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6929
6930   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6931                                     &single_byte_char_found);
6932   if (p2 < p2end)
6933     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6934                                       &single_byte_char_found);
6935   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6936     safe_codings = Qt;
6937   else
6938     {
6939       /* Turn safe_codings to a list of coding systems... */
6940       Lisp_Object val;
6941
6942       if (single_byte_char_found)
6943         /* ... and append these for eight-bit chars.  */
6944         val = Fcons (Qraw_text,
6945                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6946       else
6947         /* ... and append generic coding systems.  */
6948         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6949
6950       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6951         val = Fcons (XCAR (XCAR (safe_codings)), val);
6952       safe_codings = val;
6953     }
6954
6955   return safe_codings;
6956 }
6957
6958
6959 /* Search from position POS for such characters that are unencodable
6960    accoding to SAFE_CHARS, and return a list of their positions.  P
6961    points where in the memory the character at POS exists.  Limit the
6962    search at PEND or when Nth unencodable characters are found.
6963
6964    If SAFE_CHARS is a char table, an element for an unencodable
6965    character is nil.
6966
6967    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6968
6969    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6970    eight-bit-graphic characters are unencodable.  */
6971
6972 static Lisp_Object
6973 unencodable_char_position (safe_chars, pos, p, pend, n)
6974      Lisp_Object safe_chars;
6975      int pos;
6976      unsigned char *p, *pend;
6977      int n;
6978 {
6979   Lisp_Object pos_list;
6980
6981   pos_list = Qnil;
6982   while (p < pend)
6983     {
6984       int len;
6985       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6986
6987       if (c >= 128
6988           && (CHAR_TABLE_P (safe_chars)
6989               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6990               : (NILP (safe_chars) || c < 256)))
6991         {
6992           pos_list = Fcons (make_number (pos), pos_list);
6993           if (--n <= 0)
6994             break;
6995         }
6996       pos++;
6997       p += len;
6998     }
6999   return Fnreverse (pos_list);
7000 }
7001
7002
7003 DEFUN ("unencodable-char-position", Funencodable_char_position,
7004        Sunencodable_char_position, 3, 5, 0,
7005        doc: /*
7006 Return position of first un-encodable character in a region.
7007 START and END specfiy the region and CODING-SYSTEM specifies the
7008 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7009
7010 If optional 4th argument COUNT is non-nil, it specifies at most how
7011 many un-encodable characters to search.  In this case, the value is a
7012 list of positions.
7013
7014 If optional 5th argument STRING is non-nil, it is a string to search
7015 for un-encodable characters.  In that case, START and END are indexes
7016 to the string.  */)
7017      (start, end, coding_system, count, string)
7018      Lisp_Object start, end, coding_system, count, string;
7019 {
7020   int n;
7021   Lisp_Object safe_chars;
7022   struct coding_system coding;
7023   Lisp_Object positions;
7024   int from, to;
7025   unsigned char *p, *pend;
7026
7027   if (NILP (string))
7028     {
7029       validate_region (&start, &end);
7030       from = XINT (start);
7031       to = XINT (end);
7032       if (NILP (current_buffer->enable_multibyte_characters))
7033         return Qnil;
7034       p = CHAR_POS_ADDR (from);
7035       if (to == GPT)
7036         pend = GPT_ADDR;
7037       else
7038         pend = CHAR_POS_ADDR (to);
7039     }
7040   else
7041     {
7042       CHECK_STRING (string);
7043       CHECK_NATNUM (start);
7044       CHECK_NATNUM (end);
7045       from = XINT (start);
7046       to = XINT (end);
7047       if (from > to
7048           || to > SCHARS (string))
7049         args_out_of_range_3 (string, start, end);
7050       if (! STRING_MULTIBYTE (string))
7051         return Qnil;
7052       p = SDATA (string) + string_char_to_byte (string, from);
7053       pend = SDATA (string) + string_char_to_byte (string, to);
7054     }
7055
7056   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7057
7058   if (NILP (count))
7059     n = 1;
7060   else
7061     {
7062       CHECK_NATNUM (count);
7063       n = XINT (count);
7064     }
7065
7066   if (coding.type == coding_type_no_conversion
7067       || coding.type == coding_type_raw_text)
7068     return Qnil;
7069
7070   if (coding.type == coding_type_undecided)
7071     safe_chars = Qnil;
7072   else
7073     safe_chars = coding_safe_chars (coding_system);
7074
7075   if (STRINGP (string)
7076       || from >= GPT || to <= GPT)
7077     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7078   else
7079     {
7080       Lisp_Object args[2];
7081
7082       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7083       n -= XINT (Flength (args[0]));
7084       if (n <= 0)
7085         positions = args[0];
7086       else
7087         {
7088           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7089                                                pend, n);
7090           positions = Fappend (2, args);
7091         }
7092     }
7093
7094   return  (NILP (count) ? Fcar (positions) : positions);
7095 }
7096
7097
7098 Lisp_Object
7099 code_convert_region1 (start, end, coding_system, encodep)
7100      Lisp_Object start, end, coding_system;
7101      int encodep;
7102 {
7103   struct coding_system coding;
7104   int from, to;
7105
7106   CHECK_NUMBER_COERCE_MARKER (start);
7107   CHECK_NUMBER_COERCE_MARKER (end);
7108   CHECK_SYMBOL (coding_system);
7109
7110   validate_region (&start, &end);
7111   from = XFASTINT (start);
7112   to = XFASTINT (end);
7113
7114   if (NILP (coding_system))
7115     return make_number (to - from);
7116
7117   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7118     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7119
7120   coding.mode |= CODING_MODE_LAST_BLOCK;
7121   coding.src_multibyte = coding.dst_multibyte
7122     = !NILP (current_buffer->enable_multibyte_characters);
7123   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7124                        &coding, encodep, 1);
7125   Vlast_coding_system_used = coding.symbol;
7126   return make_number (coding.produced_char);
7127 }
7128
7129 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7130        3, 3, "r\nzCoding system: ",
7131        doc: /* Decode the current region from the specified coding system.
7132 When called from a program, takes three arguments:
7133 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7134 This function sets `last-coding-system-used' to the precise coding system
7135 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7136 not fully specified.)
7137 It returns the length of the decoded text.  */)
7138      (start, end, coding_system)
7139      Lisp_Object start, end, coding_system;
7140 {
7141   return code_convert_region1 (start, end, coding_system, 0);
7142 }
7143
7144 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7145        3, 3, "r\nzCoding system: ",
7146        doc: /* Encode the current region into the specified coding system.
7147 When called from a program, takes three arguments:
7148 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7149 This function sets `last-coding-system-used' to the precise coding system
7150 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7151 not fully specified.)
7152 It returns the length of the encoded text.  */)
7153      (start, end, coding_system)
7154      Lisp_Object start, end, coding_system;
7155 {
7156   return code_convert_region1 (start, end, coding_system, 1);
7157 }
7158
7159 Lisp_Object
7160 code_convert_string1 (string, coding_system, nocopy, encodep)
7161      Lisp_Object string, coding_system, nocopy;
7162      int encodep;
7163 {
7164   struct coding_system coding;
7165
7166   CHECK_STRING (string);
7167   CHECK_SYMBOL (coding_system);
7168
7169   if (NILP (coding_system))
7170     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7171
7172   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7173     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7174
7175   coding.mode |= CODING_MODE_LAST_BLOCK;
7176   string = (encodep
7177             ? encode_coding_string (string, &coding, !NILP (nocopy))
7178             : decode_coding_string (string, &coding, !NILP (nocopy)));
7179   Vlast_coding_system_used = coding.symbol;
7180
7181   return string;
7182 }
7183
7184 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7185        2, 3, 0,
7186        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7187 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7188 if the decoding operation is trivial.
7189 This function sets `last-coding-system-used' to the precise coding system
7190 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7191 not fully specified.)  */)
7192      (string, coding_system, nocopy)
7193      Lisp_Object string, coding_system, nocopy;
7194 {
7195   return code_convert_string1 (string, coding_system, nocopy, 0);
7196 }
7197
7198 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7199        2, 3, 0,
7200        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7201 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7202 if the encoding operation is trivial.
7203 This function sets `last-coding-system-used' to the precise coding system
7204 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7205 not fully specified.)  */)
7206      (string, coding_system, nocopy)
7207      Lisp_Object string, coding_system, nocopy;
7208 {
7209   return code_convert_string1 (string, coding_system, nocopy, 1);
7210 }
7211
7212 /* Encode or decode STRING according to CODING_SYSTEM.
7213    Do not set Vlast_coding_system_used.
7214
7215    This function is called only from macros DECODE_FILE and
7216    ENCODE_FILE, thus we ignore character composition.  */
7217
7218 Lisp_Object
7219 code_convert_string_norecord (string, coding_system, encodep)
7220      Lisp_Object string, coding_system;
7221      int encodep;
7222 {
7223   struct coding_system coding;
7224
7225   CHECK_STRING (string);
7226   CHECK_SYMBOL (coding_system);
7227
7228   if (NILP (coding_system))
7229     return string;
7230
7231   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7232     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7233
7234   coding.composing = COMPOSITION_DISABLED;
7235   coding.mode |= CODING_MODE_LAST_BLOCK;
7236   return (encodep
7237           ? encode_coding_string (string, &coding, 1)
7238           : decode_coding_string (string, &coding, 1));
7239 }
7240 \f
7241 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7242        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7243 Return the corresponding character.  */)
7244      (code)
7245      Lisp_Object code;
7246 {
7247   unsigned char c1, c2, s1, s2;
7248   Lisp_Object val;
7249
7250   CHECK_NUMBER (code);
7251   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7252   if (s1 == 0)
7253     {
7254       if (s2 < 0x80)
7255         XSETFASTINT (val, s2);
7256       else if (s2 >= 0xA0 || s2 <= 0xDF)
7257         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7258       else
7259         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7260     }
7261   else
7262     {
7263       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7264           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7265         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7266       DECODE_SJIS (s1, s2, c1, c2);
7267       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7268     }
7269   return val;
7270 }
7271
7272 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7273        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7274 Return the corresponding code in SJIS.  */)
7275      (ch)
7276      Lisp_Object ch;
7277 {
7278   int charset, c1, c2, s1, s2;
7279   Lisp_Object val;
7280
7281   CHECK_NUMBER (ch);
7282   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7283   if (charset == CHARSET_ASCII)
7284     {
7285       val = ch;
7286     }
7287   else if (charset == charset_jisx0208
7288            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7289     {
7290       ENCODE_SJIS (c1, c2, s1, s2);
7291       XSETFASTINT (val, (s1 << 8) | s2);
7292     }
7293   else if (charset == charset_katakana_jisx0201
7294            && c1 > 0x20 && c2 < 0xE0)
7295     {
7296       XSETFASTINT (val, c1 | 0x80);
7297     }
7298   else
7299     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7300   return val;
7301 }
7302
7303 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7304        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7305 Return the corresponding character.  */)
7306      (code)
7307      Lisp_Object code;
7308 {
7309   int charset;
7310   unsigned char b1, b2, c1, c2;
7311   Lisp_Object val;
7312
7313   CHECK_NUMBER (code);
7314   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7315   if (b1 == 0)
7316     {
7317       if (b2 >= 0x80)
7318         error ("Invalid BIG5 code: %x", XFASTINT (code));
7319       val = code;
7320     }
7321   else
7322     {
7323       if ((b1 < 0xA1 || b1 > 0xFE)
7324           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7325         error ("Invalid BIG5 code: %x", XFASTINT (code));
7326       DECODE_BIG5 (b1, b2, charset, c1, c2);
7327       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7328     }
7329   return val;
7330 }
7331
7332 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7333        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7334 Return the corresponding character code in Big5.  */)
7335      (ch)
7336      Lisp_Object ch;
7337 {
7338   int charset, c1, c2, b1, b2;
7339   Lisp_Object val;
7340
7341   CHECK_NUMBER (ch);
7342   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7343   if (charset == CHARSET_ASCII)
7344     {
7345       val = ch;
7346     }
7347   else if ((charset == charset_big5_1
7348             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7349            || (charset == charset_big5_2
7350                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7351     {
7352       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7353       XSETFASTINT (val, (b1 << 8) | b2);
7354     }
7355   else
7356     error ("Can't encode to Big5: %d", XFASTINT (ch));
7357   return val;
7358 }
7359 \f
7360 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7361        Sset_terminal_coding_system_internal, 1, 2, 0,
7362        doc: /* Internal use only.  */)
7363      (coding_system, terminal)
7364      Lisp_Object coding_system;
7365      Lisp_Object terminal;
7366 {
7367   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
7368   CHECK_SYMBOL (coding_system);
7369   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7370   /* We had better not send unsafe characters to terminal.  */
7371   terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7372   /* Character composition should be disabled.  */
7373   terminal_coding->composing = COMPOSITION_DISABLED;
7374   /* Error notification should be suppressed.  */
7375   terminal_coding->suppress_error = 1;
7376   terminal_coding->src_multibyte = 1;
7377   terminal_coding->dst_multibyte = 0;
7378   return Qnil;
7379 }
7380
7381 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7382        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7383        doc: /* Internal use only.  */)
7384      (coding_system)
7385      Lisp_Object coding_system;
7386 {
7387   CHECK_SYMBOL (coding_system);
7388   setup_coding_system (Fcheck_coding_system (coding_system),
7389                        &safe_terminal_coding);
7390   /* Character composition should be disabled.  */
7391   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7392   /* Error notification should be suppressed.  */
7393   safe_terminal_coding.suppress_error = 1;
7394   safe_terminal_coding.src_multibyte = 1;
7395   safe_terminal_coding.dst_multibyte = 0;
7396   return Qnil;
7397 }
7398
7399 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7400        Sterminal_coding_system, 0, 1, 0,
7401        doc: /* Return coding system specified for terminal output on the given terminal.
7402 TERMINAL may be a terminal id, a frame, or nil for the selected
7403 frame's terminal device.  */)
7404      (terminal)
7405      Lisp_Object terminal;
7406 {
7407   return TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
7408 }
7409
7410 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7411        Sset_keyboard_coding_system_internal, 1, 2, 0,
7412        doc: /* Internal use only.  */)
7413      (coding_system, terminal)
7414      Lisp_Object coding_system;
7415      Lisp_Object terminal;
7416 {
7417   struct terminal *t = get_terminal (terminal, 1);
7418   CHECK_SYMBOL (coding_system);
7419
7420   setup_coding_system (Fcheck_coding_system (coding_system),
7421                        TERMINAL_KEYBOARD_CODING (t));
7422   /* Character composition should be disabled.  */
7423   TERMINAL_KEYBOARD_CODING (t)->composing = COMPOSITION_DISABLED;
7424   return Qnil;
7425 }
7426
7427 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7428        Skeyboard_coding_system, 0, 1, 0,
7429        doc: /* Return coding system for decoding keyboard input on TERMINAL.
7430 TERMINAL may be a terminal id, a frame, or nil for the selected
7431 frame's terminal device.  */)
7432      (terminal)
7433      Lisp_Object terminal;
7434 {
7435   return TERMINAL_KEYBOARD_CODING (get_terminal (terminal, 1))->symbol;
7436 }
7437
7438 \f
7439 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7440        Sfind_operation_coding_system,  1, MANY, 0,
7441        doc: /* Choose a coding system for an operation based on the target name.
7442 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7443 DECODING-SYSTEM is the coding system to use for decoding
7444 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7445 for encoding (in case OPERATION does encoding).
7446
7447 The first argument OPERATION specifies an I/O primitive:
7448   For file I/O, `insert-file-contents' or `write-region'.
7449   For process I/O, `call-process', `call-process-region', or `start-process'.
7450   For network I/O, `open-network-stream'.
7451
7452 The remaining arguments should be the same arguments that were passed
7453 to the primitive.  Depending on which primitive, one of those arguments
7454 is selected as the TARGET.  For example, if OPERATION does file I/O,
7455 whichever argument specifies the file name is TARGET.
7456
7457 TARGET has a meaning which depends on OPERATION:
7458   For file I/O, TARGET is a file name (except for the special case below).
7459   For process I/O, TARGET is a process name.
7460   For network I/O, TARGET is a service name or a port number
7461
7462 This function looks up what specified for TARGET in,
7463 `file-coding-system-alist', `process-coding-system-alist',
7464 or `network-coding-system-alist' depending on OPERATION.
7465 They may specify a coding system, a cons of coding systems,
7466 or a function symbol to call.
7467 In the last case, we call the function with one argument,
7468 which is a list of all the arguments given to this function.
7469
7470 If OPERATION is `insert-file-contents', the argument corresponding to
7471 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7472 file name to look up, and BUFFER is a buffer that contains the file's
7473 contents (not yet decoded).  If `file-coding-system-alist' specifies a
7474 function to call for FILENAME, that function should examine the
7475 contents of BUFFER instead of reading the file.
7476
7477 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7478      (nargs, args)
7479      int nargs;
7480      Lisp_Object *args;
7481 {
7482   Lisp_Object operation, target_idx, target, val;
7483   register Lisp_Object chain;
7484
7485   if (nargs < 2)
7486     error ("Too few arguments");
7487   operation = args[0];
7488   if (!SYMBOLP (operation)
7489       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7490     error ("Invalid first argument");
7491   if (nargs < 1 + XINT (target_idx))
7492     error ("Too few arguments for operation: %s",
7493            SDATA (SYMBOL_NAME (operation)));
7494   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7495      argument to write-region) is string, it must be treated as a
7496      target file name.  */
7497   if (EQ (operation, Qwrite_region)
7498       && nargs > 5
7499       && STRINGP (args[5]))
7500     target_idx = make_number (4);
7501   target = args[XINT (target_idx) + 1];
7502   if (!(STRINGP (target)
7503         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7504             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7505         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7506     error ("Invalid argument %d", XINT (target_idx) + 1);
7507   if (CONSP (target))
7508     target = XCAR (target);
7509
7510   chain = ((EQ (operation, Qinsert_file_contents)
7511             || EQ (operation, Qwrite_region))
7512            ? Vfile_coding_system_alist
7513            : (EQ (operation, Qopen_network_stream)
7514               ? Vnetwork_coding_system_alist
7515               : Vprocess_coding_system_alist));
7516   if (NILP (chain))
7517     return Qnil;
7518
7519   for (; CONSP (chain); chain = XCDR (chain))
7520     {
7521       Lisp_Object elt;
7522       elt = XCAR (chain);
7523
7524       if (CONSP (elt)
7525           && ((STRINGP (target)
7526                && STRINGP (XCAR (elt))
7527                && fast_string_match (XCAR (elt), target) >= 0)
7528               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7529         {
7530           val = XCDR (elt);
7531           /* Here, if VAL is both a valid coding system and a valid
7532              function symbol, we return VAL as a coding system.  */
7533           if (CONSP (val))
7534             return val;
7535           if (! SYMBOLP (val))
7536             return Qnil;
7537           if (! NILP (Fcoding_system_p (val)))
7538             return Fcons (val, val);
7539           if (! NILP (Ffboundp (val)))
7540             {
7541               /* We use call1 rather than safe_call1
7542                  so as to get bug reports about functions called here
7543                  which don't handle the current interface.  */
7544               val = call1 (val, Flist (nargs, args));
7545               if (CONSP (val))
7546                 return val;
7547               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7548                 return Fcons (val, val);
7549             }
7550           return Qnil;
7551         }
7552     }
7553   return Qnil;
7554 }
7555
7556 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7557        Supdate_coding_systems_internal, 0, 0, 0,
7558        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7559 When values of any coding categories are changed, you must
7560 call this function.  */)
7561      ()
7562 {
7563   int i;
7564
7565   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7566     {
7567       Lisp_Object val;
7568
7569       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7570       if (!NILP (val))
7571         {
7572           if (! coding_system_table[i])
7573             coding_system_table[i] = ((struct coding_system *)
7574                                       xmalloc (sizeof (struct coding_system)));
7575           setup_coding_system (val, coding_system_table[i]);
7576         }
7577       else if (coding_system_table[i])
7578         {
7579           xfree (coding_system_table[i]);
7580           coding_system_table[i] = NULL;
7581         }
7582     }
7583
7584   return Qnil;
7585 }
7586
7587 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7588        Sset_coding_priority_internal, 0, 0, 0,
7589        doc: /* Update internal database for the current value of `coding-category-list'.
7590 This function is internal use only.  */)
7591      ()
7592 {
7593   int i = 0, idx;
7594   Lisp_Object val;
7595
7596   val = Vcoding_category_list;
7597
7598   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7599     {
7600       if (! SYMBOLP (XCAR (val)))
7601         break;
7602       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7603       if (idx >= CODING_CATEGORY_IDX_MAX)
7604         break;
7605       coding_priorities[i++] = (1 << idx);
7606       val = XCDR (val);
7607     }
7608   /* If coding-category-list is valid and contains all coding
7609      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7610      the following code saves Emacs from crashing.  */
7611   while (i < CODING_CATEGORY_IDX_MAX)
7612     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7613
7614   return Qnil;
7615 }
7616
7617 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7618        Sdefine_coding_system_internal, 1, 1, 0,
7619        doc: /* Register CODING-SYSTEM as a base coding system.
7620 This function is internal use only.  */)
7621      (coding_system)
7622      Lisp_Object coding_system;
7623 {
7624   Lisp_Object safe_chars, slot;
7625
7626   if (NILP (Fcheck_coding_system (coding_system)))
7627     xsignal1 (Qcoding_system_error, coding_system);
7628
7629   safe_chars = coding_safe_chars (coding_system);
7630   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7631     error ("No valid safe-chars property for %s",
7632            SDATA (SYMBOL_NAME (coding_system)));
7633
7634   if (EQ (safe_chars, Qt))
7635     {
7636       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7637         XSETCAR (Vcoding_system_safe_chars,
7638                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7639     }
7640   else
7641     {
7642       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7643       if (NILP (slot))
7644         XSETCDR (Vcoding_system_safe_chars,
7645                  nconc2 (XCDR (Vcoding_system_safe_chars),
7646                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7647       else
7648         XSETCDR (slot, safe_chars);
7649     }
7650   return Qnil;
7651 }
7652
7653 #endif /* emacs */
7654
7655 \f
7656 /*** 9. Post-amble ***/
7657
7658 void
7659 init_coding_once ()
7660 {
7661   int i;
7662
7663   /* Emacs' internal format specific initialize routine.  */
7664   for (i = 0; i <= 0x20; i++)
7665     emacs_code_class[i] = EMACS_control_code;
7666   emacs_code_class[0x0A] = EMACS_linefeed_code;
7667   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7668   for (i = 0x21 ; i < 0x7F; i++)
7669     emacs_code_class[i] = EMACS_ascii_code;
7670   emacs_code_class[0x7F] = EMACS_control_code;
7671   for (i = 0x80; i < 0xFF; i++)
7672     emacs_code_class[i] = EMACS_invalid_code;
7673   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7674   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7675   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7676   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7677
7678   /* ISO2022 specific initialize routine.  */
7679   for (i = 0; i < 0x20; i++)
7680     iso_code_class[i] = ISO_control_0;
7681   for (i = 0x21; i < 0x7F; i++)
7682     iso_code_class[i] = ISO_graphic_plane_0;
7683   for (i = 0x80; i < 0xA0; i++)
7684     iso_code_class[i] = ISO_control_1;
7685   for (i = 0xA1; i < 0xFF; i++)
7686     iso_code_class[i] = ISO_graphic_plane_1;
7687   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7688   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7689   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7690   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7691   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7692   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7693   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7694   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7695   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7696   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7697
7698   setup_coding_system (Qnil, &safe_terminal_coding);
7699   setup_coding_system (Qnil, &default_buffer_file_coding);
7700
7701   bzero (coding_system_table, sizeof coding_system_table);
7702
7703   bzero (ascii_skip_code, sizeof ascii_skip_code);
7704   for (i = 0; i < 128; i++)
7705     ascii_skip_code[i] = 1;
7706
7707 #if defined (MSDOS) || defined (WINDOWSNT)
7708   system_eol_type = CODING_EOL_CRLF;
7709 #else
7710   system_eol_type = CODING_EOL_LF;
7711 #endif
7712
7713   inhibit_pre_post_conversion = 0;
7714 }
7715
7716 #ifdef emacs
7717
7718 void
7719 syms_of_coding ()
7720 {
7721   staticpro (&Vcode_conversion_workbuf_name);
7722   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7723
7724   Qtarget_idx = intern ("target-idx");
7725   staticpro (&Qtarget_idx);
7726
7727   Qcoding_system_history = intern ("coding-system-history");
7728   staticpro (&Qcoding_system_history);
7729   Fset (Qcoding_system_history, Qnil);
7730
7731   /* Target FILENAME is the first argument.  */
7732   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7733   /* Target FILENAME is the third argument.  */
7734   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7735
7736   Qcall_process = intern ("call-process");
7737   staticpro (&Qcall_process);
7738   /* Target PROGRAM is the first argument.  */
7739   Fput (Qcall_process, Qtarget_idx, make_number (0));
7740
7741   Qcall_process_region = intern ("call-process-region");
7742   staticpro (&Qcall_process_region);
7743   /* Target PROGRAM is the third argument.  */
7744   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7745
7746   Qstart_process = intern ("start-process");
7747   staticpro (&Qstart_process);
7748   /* Target PROGRAM is the third argument.  */
7749   Fput (Qstart_process, Qtarget_idx, make_number (2));
7750
7751   Qopen_network_stream = intern ("open-network-stream");
7752   staticpro (&Qopen_network_stream);
7753   /* Target SERVICE is the fourth argument.  */
7754   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7755
7756   Qcoding_system = intern ("coding-system");
7757   staticpro (&Qcoding_system);
7758
7759   Qeol_type = intern ("eol-type");
7760   staticpro (&Qeol_type);
7761
7762   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7763   staticpro (&Qbuffer_file_coding_system);
7764
7765   Qpost_read_conversion = intern ("post-read-conversion");
7766   staticpro (&Qpost_read_conversion);
7767
7768   Qpre_write_conversion = intern ("pre-write-conversion");
7769   staticpro (&Qpre_write_conversion);
7770
7771   Qno_conversion = intern ("no-conversion");
7772   staticpro (&Qno_conversion);
7773
7774   Qundecided = intern ("undecided");
7775   staticpro (&Qundecided);
7776
7777   Qcoding_system_p = intern ("coding-system-p");
7778   staticpro (&Qcoding_system_p);
7779
7780   Qcoding_system_error = intern ("coding-system-error");
7781   staticpro (&Qcoding_system_error);
7782
7783   Fput (Qcoding_system_error, Qerror_conditions,
7784         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7785   Fput (Qcoding_system_error, Qerror_message,
7786         build_string ("Invalid coding system"));
7787
7788   Qcoding_category = intern ("coding-category");
7789   staticpro (&Qcoding_category);
7790   Qcoding_category_index = intern ("coding-category-index");
7791   staticpro (&Qcoding_category_index);
7792
7793   Vcoding_category_table
7794     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7795   staticpro (&Vcoding_category_table);
7796   {
7797     int i;
7798     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7799       {
7800         XVECTOR (Vcoding_category_table)->contents[i]
7801           = intern (coding_category_name[i]);
7802         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7803               Qcoding_category_index, make_number (i));
7804       }
7805   }
7806
7807   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7808   staticpro (&Vcoding_system_safe_chars);
7809
7810   Qtranslation_table = intern ("translation-table");
7811   staticpro (&Qtranslation_table);
7812   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7813
7814   Qtranslation_table_id = intern ("translation-table-id");
7815   staticpro (&Qtranslation_table_id);
7816
7817   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7818   staticpro (&Qtranslation_table_for_decode);
7819
7820   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7821   staticpro (&Qtranslation_table_for_encode);
7822
7823   Qsafe_chars = intern ("safe-chars");
7824   staticpro (&Qsafe_chars);
7825
7826   Qchar_coding_system = intern ("char-coding-system");
7827   staticpro (&Qchar_coding_system);
7828
7829   /* Intern this now in case it isn't already done.
7830      Setting this variable twice is harmless.
7831      But don't staticpro it here--that is done in alloc.c.  */
7832   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7833   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7834   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7835
7836   Qvalid_codes = intern ("valid-codes");
7837   staticpro (&Qvalid_codes);
7838
7839   Qascii_incompatible = intern ("ascii-incompatible");
7840   staticpro (&Qascii_incompatible);
7841
7842   Qemacs_mule = intern ("emacs-mule");
7843   staticpro (&Qemacs_mule);
7844
7845   Qraw_text = intern ("raw-text");
7846   staticpro (&Qraw_text);
7847
7848   Qutf_8 = intern ("utf-8");
7849   staticpro (&Qutf_8);
7850
7851   Qcoding_system_define_form = intern ("coding-system-define-form");
7852   staticpro (&Qcoding_system_define_form);
7853
7854   defsubr (&Scoding_system_p);
7855   defsubr (&Sread_coding_system);
7856   defsubr (&Sread_non_nil_coding_system);
7857   defsubr (&Scheck_coding_system);
7858   defsubr (&Sdetect_coding_region);
7859   defsubr (&Sdetect_coding_string);
7860   defsubr (&Sfind_coding_systems_region_internal);
7861   defsubr (&Sunencodable_char_position);
7862   defsubr (&Sdecode_coding_region);
7863   defsubr (&Sencode_coding_region);
7864   defsubr (&Sdecode_coding_string);
7865   defsubr (&Sencode_coding_string);
7866   defsubr (&Sdecode_sjis_char);
7867   defsubr (&Sencode_sjis_char);
7868   defsubr (&Sdecode_big5_char);
7869   defsubr (&Sencode_big5_char);
7870   defsubr (&Sset_terminal_coding_system_internal);
7871   defsubr (&Sset_safe_terminal_coding_system_internal);
7872   defsubr (&Sterminal_coding_system);
7873   defsubr (&Sset_keyboard_coding_system_internal);
7874   defsubr (&Skeyboard_coding_system);
7875   defsubr (&Sfind_operation_coding_system);
7876   defsubr (&Supdate_coding_systems_internal);
7877   defsubr (&Sset_coding_priority_internal);
7878   defsubr (&Sdefine_coding_system_internal);
7879
7880   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7881                doc: /* List of coding systems.
7882
7883 Do not alter the value of this variable manually.  This variable should be
7884 updated by the functions `make-coding-system' and
7885 `define-coding-system-alias'.  */);
7886   Vcoding_system_list = Qnil;
7887
7888   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7889                doc: /* Alist of coding system names.
7890 Each element is one element list of coding system name.
7891 This variable is given to `completing-read' as TABLE argument.
7892
7893 Do not alter the value of this variable manually.  This variable should be
7894 updated by the functions `make-coding-system' and
7895 `define-coding-system-alias'.  */);
7896   Vcoding_system_alist = Qnil;
7897
7898   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7899                doc: /* List of coding-categories (symbols) ordered by priority.
7900
7901 On detecting a coding system, Emacs tries code detection algorithms
7902 associated with each coding-category one by one in this order.  When
7903 one algorithm agrees with a byte sequence of source text, the coding
7904 system bound to the corresponding coding-category is selected.
7905
7906 Don't modify this variable directly, but use `set-coding-priority'.  */);
7907   {
7908     int i;
7909
7910     Vcoding_category_list = Qnil;
7911     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7912       Vcoding_category_list
7913         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7914                  Vcoding_category_list);
7915   }
7916
7917   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7918                doc: /* Specify the coding system for read operations.
7919 It is useful to bind this variable with `let', but do not set it globally.
7920 If the value is a coding system, it is used for decoding on read operation.
7921 If not, an appropriate element is used from one of the coding system alists:
7922 There are three such tables, `file-coding-system-alist',
7923 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7924   Vcoding_system_for_read = Qnil;
7925
7926   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7927                doc: /* Specify the coding system for write operations.
7928 Programs bind this variable with `let', but you should not set it globally.
7929 If the value is a coding system, it is used for encoding of output,
7930 when writing it to a file and when sending it to a file or subprocess.
7931
7932 If this does not specify a coding system, an appropriate element
7933 is used from one of the coding system alists:
7934 There are three such tables, `file-coding-system-alist',
7935 `process-coding-system-alist', and `network-coding-system-alist'.
7936 For output to files, if the above procedure does not specify a coding system,
7937 the value of `buffer-file-coding-system' is used.  */);
7938   Vcoding_system_for_write = Qnil;
7939
7940   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7941                doc: /* Coding system used in the latest file or process I/O.
7942 Also set by `encode-coding-region', `decode-coding-region',
7943 `encode-coding-string' and `decode-coding-string'.  */);
7944   Vlast_coding_system_used = Qnil;
7945
7946   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7947                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7948 See info node `Coding Systems' and info node `Text and Binary' concerning
7949 such conversion.  */);
7950   inhibit_eol_conversion = 0;
7951
7952   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7953                doc: /* Non-nil means process buffer inherits coding system of process output.
7954 Bind it to t if the process output is to be treated as if it were a file
7955 read from some filesystem.  */);
7956   inherit_process_coding_system = 0;
7957
7958   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7959                doc: /* Alist to decide a coding system to use for a file I/O operation.
7960 The format is ((PATTERN . VAL) ...),
7961 where PATTERN is a regular expression matching a file name,
7962 VAL is a coding system, a cons of coding systems, or a function symbol.
7963 If VAL is a coding system, it is used for both decoding and encoding
7964 the file contents.
7965 If VAL is a cons of coding systems, the car part is used for decoding,
7966 and the cdr part is used for encoding.
7967 If VAL is a function symbol, the function must return a coding system
7968 or a cons of coding systems which are used as above.  The function is
7969 called with an argument that is a list of the arguments with which
7970 `find-operation-coding-system' was called.
7971
7972 See also the function `find-operation-coding-system'
7973 and the variable `auto-coding-alist'.  */);
7974   Vfile_coding_system_alist = Qnil;
7975
7976   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7977     doc: /* Alist to decide a coding system to use for a process I/O operation.
7978 The format is ((PATTERN . VAL) ...),
7979 where PATTERN is a regular expression matching a program name,
7980 VAL is a coding system, a cons of coding systems, or a function symbol.
7981 If VAL is a coding system, it is used for both decoding what received
7982 from the program and encoding what sent to the program.
7983 If VAL is a cons of coding systems, the car part is used for decoding,
7984 and the cdr part is used for encoding.
7985 If VAL is a function symbol, the function must return a coding system
7986 or a cons of coding systems which are used as above.
7987
7988 See also the function `find-operation-coding-system'.  */);
7989   Vprocess_coding_system_alist = Qnil;
7990
7991   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7992     doc: /* Alist to decide a coding system to use for a network I/O operation.
7993 The format is ((PATTERN . VAL) ...),
7994 where PATTERN is a regular expression matching a network service name
7995 or is a port number to connect to,
7996 VAL is a coding system, a cons of coding systems, or a function symbol.
7997 If VAL is a coding system, it is used for both decoding what received
7998 from the network stream and encoding what sent to the network stream.
7999 If VAL is a cons of coding systems, the car part is used for decoding,
8000 and the cdr part is used for encoding.
8001 If VAL is a function symbol, the function must return a coding system
8002 or a cons of coding systems which are used as above.
8003
8004 See also the function `find-operation-coding-system'.  */);
8005   Vnetwork_coding_system_alist = Qnil;
8006
8007   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8008                doc: /* Coding system to use with system messages.
8009 Also used for decoding keyboard input on X Window system.  */);
8010   Vlocale_coding_system = Qnil;
8011
8012   /* The eol mnemonics are reset in startup.el system-dependently.  */
8013   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8014                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8015   eol_mnemonic_unix = build_string (":");
8016
8017   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8018                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8019   eol_mnemonic_dos = build_string ("\\");
8020
8021   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8022                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8023   eol_mnemonic_mac = build_string ("/");
8024
8025   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8026                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8027   eol_mnemonic_undecided = build_string (":");
8028
8029   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8030                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8031   Venable_character_translation = Qt;
8032
8033   DEFVAR_LISP ("standard-translation-table-for-decode",
8034                &Vstandard_translation_table_for_decode,
8035                doc: /* Table for translating characters while decoding.  */);
8036   Vstandard_translation_table_for_decode = Qnil;
8037
8038   DEFVAR_LISP ("standard-translation-table-for-encode",
8039                &Vstandard_translation_table_for_encode,
8040                doc: /* Table for translating characters while encoding.  */);
8041   Vstandard_translation_table_for_encode = Qnil;
8042
8043   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8044                doc: /* Alist of charsets vs revision numbers.
8045 While encoding, if a charset (car part of an element) is found,
8046 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8047   Vcharset_revision_alist = Qnil;
8048
8049   DEFVAR_LISP ("default-process-coding-system",
8050                &Vdefault_process_coding_system,
8051                doc: /* Cons of coding systems used for process I/O by default.
8052 The car part is used for decoding a process output,
8053 the cdr part is used for encoding a text to be sent to a process.  */);
8054   Vdefault_process_coding_system = Qnil;
8055
8056   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8057                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8058 This is a vector of length 256.
8059 If Nth element is non-nil, the existence of code N in a file
8060 \(or output of subprocess) doesn't prevent it to be detected as
8061 a coding system of ISO 2022 variant which has a flag
8062 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8063 or reading output of a subprocess.
8064 Only 128th through 159th elements has a meaning.  */);
8065   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8066
8067   DEFVAR_LISP ("select-safe-coding-system-function",
8068                &Vselect_safe_coding_system_function,
8069                doc: /* Function to call to select safe coding system for encoding a text.
8070
8071 If set, this function is called to force a user to select a proper
8072 coding system which can encode the text in the case that a default
8073 coding system used in each operation can't encode the text.
8074
8075 The default value is `select-safe-coding-system' (which see).  */);
8076   Vselect_safe_coding_system_function = Qnil;
8077
8078   DEFVAR_BOOL ("coding-system-require-warning",
8079                &coding_system_require_warning,
8080                doc: /* Internal use only.
8081 If non-nil, on writing a file, `select-safe-coding-system-function' is
8082 called even if `coding-system-for-write' is non-nil.  The command
8083 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8084   coding_system_require_warning = 0;
8085
8086
8087   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8088                &inhibit_iso_escape_detection,
8089                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8090
8091 By default, on reading a file, Emacs tries to detect how the text is
8092 encoded.  This code detection is sensitive to escape sequences.  If
8093 the sequence is valid as ISO2022, the code is determined as one of
8094 the ISO2022 encodings, and the file is decoded by the corresponding
8095 coding system (e.g. `iso-2022-7bit').
8096
8097 However, there may be a case that you want to read escape sequences in
8098 a file as is.  In such a case, you can set this variable to non-nil.
8099 Then, as the code detection ignores any escape sequences, no file is
8100 detected as encoded in some ISO2022 encoding.  The result is that all
8101 escape sequences become visible in a buffer.
8102
8103 The default value is nil, and it is strongly recommended not to change
8104 it.  That is because many Emacs Lisp source files that contain
8105 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8106 in Emacs's distribution, and they won't be decoded correctly on
8107 reading if you suppress escape sequence detection.
8108
8109 The other way to read escape sequences in a file without decoding is
8110 to explicitly specify some coding system that doesn't use ISO2022's
8111 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8112   inhibit_iso_escape_detection = 0;
8113
8114   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8115                doc: /* Char table for translating self-inserting characters.
8116 This is applied to the result of input methods, not their input.  See also
8117 `keyboard-translate-table'.  */);
8118     Vtranslation_table_for_input = Qnil;
8119 }
8120
8121 char *
8122 emacs_strerror (error_number)
8123      int error_number;
8124 {
8125   char *str;
8126
8127   synchronize_system_messages_locale ();
8128   str = strerror (error_number);
8129
8130   if (! NILP (Vlocale_coding_system))
8131     {
8132       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8133                                                       Vlocale_coding_system,
8134                                                       0);
8135       str = (char *) SDATA (dec);
8136     }
8137
8138   return str;
8139 }
8140
8141 #endif /* emacs */
8142
8143 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8144    (do not change this comment) */