code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
 172   reaches at the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
 188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 189    get one, two, and three bytes from the source text respectively.
 190    If there are not enough bytes in the source, they jump to
 191    `label_end_of_loop'.  The caller should set variables `coding',
 192    `src' and `src_end' to appropriate pointer in advance.  These
 193    macros are called from decoding routines `decode_coding_XXX', thus
 194    it is assumed that the source text is unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end)                                     \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 219    form if MULTIBYTEP is nonzero.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20;                                       \
 231   } while (0)
 232
 233 /* Set C to the next character at the source text pointed by `src'.
 234    If there are not enough characters in the source, jump to
 235    `label_end_of_loop'.  The caller should set variables `coding'
 236    `src', `src_end', and `translation_table' to appropriate pointers
 237    in advance.  This macro is used in encoding routines
 238    `encode_coding_XXX', thus it assumes that the source text is in
 239    multibyte form except for 8-bit characters.  8-bit characters are
 240    in multibyte form if coding->src_multibyte is nonzero, else they
 241    are represented by a single byte.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  Jump to
 264    `label_end_of_loop' if there's not enough space at `dst'.
 265
 266    If we are now in the middle of a composition sequence, the decoded
 267    character may be ALTCHAR (for the current composition).  In that
 268    case, the character goes to coding->cmp_data->data instead of
 269    `dst'.
 270
 271    This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348
 349 #else  /* not emacs */
 350
 351 #include "mulelib.h"
 352
 353 #endif /* not emacs */
 354
 355 Lisp_Object Qcoding_system, Qeol_type;
 356 Lisp_Object Qbuffer_file_coding_system;
 357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 358 Lisp_Object Qno_conversion, Qundecided;
 359 Lisp_Object Qcoding_system_history;
 360 Lisp_Object Qsafe_chars;
 361 Lisp_Object Qvalid_codes;
 362
 363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 365 Lisp_Object Qstart_process, Qopen_network_stream;
 366 Lisp_Object Qtarget_idx;
 367
 368 Lisp_Object Vselect_safe_coding_system_function;
 369
 370 int coding_system_require_warning;
 371
 372 /* Mnemonic string for each format of end-of-line.  */
 373 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 374 /* Mnemonic string to indicate format of end-of-line is not yet
 375    decided.  */
 376 Lisp_Object eol_mnemonic_undecided;
 377
 378 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 379    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 380 int system_eol_type;
 381
 382 #ifdef emacs
 383
 384 /* Information about which coding system is safe for which chars.
 385    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 386
 387    GENERIC-LIST is a list of generic coding systems which can encode
 388    any characters.
 389
 390    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 391    corresponding char table that contains safe chars.  */
 392 Lisp_Object Vcoding_system_safe_chars;
 393
 394 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 395
 396 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 397
 398 /* Coding system emacs-mule and raw-text are for converting only
 399    end-of-line format.  */
 400 Lisp_Object Qemacs_mule, Qraw_text;
 401
 402 /* Coding-systems are handed between Emacs Lisp programs and C internal
 403    routines by the following three variables.  */
 404 /* Coding-system for reading files and receiving data from process.  */
 405 Lisp_Object Vcoding_system_for_read;
 406 /* Coding-system for writing files and sending data to process.  */
 407 Lisp_Object Vcoding_system_for_write;
 408 /* Coding-system actually used in the latest I/O.  */
 409 Lisp_Object Vlast_coding_system_used;
 410
 411 /* A vector of length 256 which contains information about special
 412    Latin codes (especially for dealing with Microsoft codes).  */
 413 Lisp_Object Vlatin_extra_code_table;
 414
 415 /* Flag to inhibit code conversion of end-of-line format.  */
 416 int inhibit_eol_conversion;
 417
 418 /* Flag to inhibit ISO2022 escape sequence detection.  */
 419 int inhibit_iso_escape_detection;
 420
 421 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 422 int inherit_process_coding_system;
 423
 424 /* Coding system to be used to encode text for terminal display.  */
 425 struct coding_system terminal_coding;
 426
 427 /* Coding system to be used to encode text for terminal display when
 428    terminal coding system is nil.  */
 429 struct coding_system safe_terminal_coding;
 430
 431 /* Coding system of what is sent from terminal keyboard.  */
 432 struct coding_system keyboard_coding;
 433
 434 /* Default coding system to be used to write a file.  */
 435 struct coding_system default_buffer_file_coding;
 436
 437 Lisp_Object Vfile_coding_system_alist;
 438 Lisp_Object Vprocess_coding_system_alist;
 439 Lisp_Object Vnetwork_coding_system_alist;
 440
 441 Lisp_Object Vlocale_coding_system;
 442
 443 #endif /* emacs */
 444
 445 Lisp_Object Qcoding_category, Qcoding_category_index;
 446
 447 /* List of symbols `coding-category-xxx' ordered by priority.  */
 448 Lisp_Object Vcoding_category_list;
 449
 450 /* Table of coding categories (Lisp symbols).  */
 451 Lisp_Object Vcoding_category_table;
 452
 453 /* Table of names of symbol for each coding-category.  */
 454 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 455   "coding-category-emacs-mule",
 456   "coding-category-sjis",
 457   "coding-category-iso-7",
 458   "coding-category-iso-7-tight",
 459   "coding-category-iso-8-1",
 460   "coding-category-iso-8-2",
 461   "coding-category-iso-7-else",
 462   "coding-category-iso-8-else",
 463   "coding-category-ccl",
 464   "coding-category-big5",
 465   "coding-category-utf-8",
 466   "coding-category-utf-16-be",
 467   "coding-category-utf-16-le",
 468   "coding-category-raw-text",
 469   "coding-category-binary"
 470 };
 471
 472 /* Table of pointers to coding systems corresponding to each coding
 473    categories.  */
 474 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 475
 476 /* Table of coding category masks.  Nth element is a mask for a coding
 477    category of which priority is Nth.  */
 478 static
 479 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 480
 481 /* Flag to tell if we look up translation table on character code
 482    conversion.  */
 483 Lisp_Object Venable_character_translation;
 484 /* Standard translation table to look up on decoding (reading).  */
 485 Lisp_Object Vstandard_translation_table_for_decode;
 486 /* Standard translation table to look up on encoding (writing).  */
 487 Lisp_Object Vstandard_translation_table_for_encode;
 488
 489 Lisp_Object Qtranslation_table;
 490 Lisp_Object Qtranslation_table_id;
 491 Lisp_Object Qtranslation_table_for_decode;
 492 Lisp_Object Qtranslation_table_for_encode;
 493
 494 /* Alist of charsets vs revision number.  */
 495 Lisp_Object Vcharset_revision_alist;
 496
 497 /* Default coding systems used for process I/O.  */
 498 Lisp_Object Vdefault_process_coding_system;
 499
 500 /* Char table for translating Quail and self-inserting input.  */
 501 Lisp_Object Vtranslation_table_for_input;
 502
 503 /* Global flag to tell that we can't call post-read-conversion and
 504    pre-write-conversion functions.  Usually the value is zero, but it
 505    is set to 1 temporarily while such functions are running.  This is
 506    to avoid infinite recursive call.  */
 507 static int inhibit_pre_post_conversion;
 508
 509 /* Char-table containing safe coding systems of each character.  */
 510 Lisp_Object Vchar_coding_system_table;
 511 Lisp_Object Qchar_coding_system;
 512
 513 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 514    its validity.  */
 515
 516 Lisp_Object
 517 coding_safe_chars (coding_system)
 518      Lisp_Object coding_system;
 519 {
 520   Lisp_Object coding_spec, plist, safe_chars;
 521
 522   coding_spec = Fget (coding_system, Qcoding_system);
 523   plist = XVECTOR (coding_spec)->contents[3];
 524   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 525   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 526 }
 527
 528 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 529   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 530
 531 \f
 532 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 533
 534 /* Emacs' internal format for representation of multiple character
 535    sets is a kind of multi-byte encoding, i.e. characters are
 536    represented by variable-length sequences of one-byte codes.
 537
 538    ASCII characters and control characters (e.g. `tab', `newline') are
 539    represented by one-byte sequences which are their ASCII codes, in
 540    the range 0x00 through 0x7F.
 541
 542    8-bit characters of the range 0x80..0x9F are represented by
 543    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 544    code + 0x20).
 545
 546    8-bit characters of the range 0xA0..0xFF are represented by
 547    one-byte sequences which are their 8-bit code.
 548
 549    The other characters are represented by a sequence of `base
 550    leading-code', optional `extended leading-code', and one or two
 551    `position-code's.  The length of the sequence is determined by the
 552    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 553    whereas extended leading-code and position-code take the range 0xA0
 554    through 0xFF.  See `charset.h' for more details about leading-code
 555    and position-code.
 556
 557    --- CODE RANGE of Emacs' internal format ---
 558    character set        range
 559    -------------        -----
 560    ascii                0x00..0x7F
 561    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 562    eight-bit-graphic    0xA0..0xBF
 563    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 564    ---------------------------------------------
 565
 566    As this is the internal character representation, the format is
 567    usually not used externally (i.e. in a file or in a data sent to a
 568    process).  But, it is possible to have a text externally in this
 569    format (i.e. by encoding by the coding system `emacs-mule').
 570
 571    In that case, a sequence of one-byte codes has a slightly different
 572    form.
 573
 574    Firstly, all characters in eight-bit-control are represented by
 575    one-byte sequences which are their 8-bit code.
 576
 577    Next, character composition data are represented by the byte
 578    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 579    where,
 580         METHOD is 0xF0 plus one of composition method (enum
 581         composition_method),
 582
 583         BYTES is 0xA0 plus the byte length of these composition data,
 584
 585         CHARS is 0xA0 plus the number of characters composed by these
 586         data,
 587
 588         COMPONENTs are characters of multibyte form or composition
 589         rules encoded by two-byte of ASCII codes.
 590
 591    In addition, for backward compatibility, the following formats are
 592    also recognized as composition data on decoding.
 593
 594    0x80 MSEQ ...
 595    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 596
 597    Here,
 598         MSEQ is a multibyte form but in these special format:
 599           ASCII: 0xA0 ASCII_CODE+0x80,
 600           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 601         RULE is a one byte code of the range 0xA0..0xF0 that
 602         represents a composition rule.
 603   */
 604
 605 enum emacs_code_class_type emacs_code_class[256];
 606
 607 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 608    Check if a text is encoded in Emacs' internal format.  If it is,
 609    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 610
 611 static int
 612 detect_coding_emacs_mule (src, src_end, multibytep)
 613       unsigned char *src, *src_end;
 614       int multibytep;
 615 {
 616   unsigned char c;
 617   int composing = 0;
 618   /* Dummy for ONE_MORE_BYTE.  */
 619   struct coding_system dummy_coding;
 620   struct coding_system *coding = &dummy_coding;
 621
 622   while (1)
 623     {
 624       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 625
 626       if (composing)
 627         {
 628           if (c < 0xA0)
 629             composing = 0;
 630           else if (c == 0xA0)
 631             {
 632               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 633               c &= 0x7F;
 634             }
 635           else
 636             c -= 0x20;
 637         }
 638
 639       if (c < 0x20)
 640         {
 641           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 642             return 0;
 643         }
 644       else if (c >= 0x80 && c < 0xA0)
 645         {
 646           if (c == 0x80)
 647             /* Old leading code for a composite character.  */
 648             composing = 1;
 649           else
 650             {
 651               unsigned char *src_base = src - 1;
 652               int bytes;
 653
 654               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 655                                                bytes))
 656                 return 0;
 657               src = src_base + bytes;
 658             }
 659         }
 660     }
 661  label_end_of_loop:
 662   return CODING_CATEGORY_MASK_EMACS_MULE;
 663 }
 664
 665
 666 /* Record the starting position START and METHOD of one composition.  */
 667
 668 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 669   do {                                                          \
 670     struct composition_data *cmp_data = coding->cmp_data;       \
 671     int *data = cmp_data->data + cmp_data->used;                \
 672     coding->cmp_data_start = cmp_data->used;                    \
 673     data[0] = -1;                                               \
 674     data[1] = cmp_data->char_offset + start;                    \
 675     data[3] = (int) method;                                     \
 676     cmp_data->used += 4;                                        \
 677   } while (0)
 678
 679 /* Record the ending position END of the current composition.  */
 680
 681 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 682   do {                                                          \
 683     struct composition_data *cmp_data = coding->cmp_data;       \
 684     int *data = cmp_data->data + coding->cmp_data_start;        \
 685     data[0] = cmp_data->used - coding->cmp_data_start;          \
 686     data[2] = cmp_data->char_offset + end;                      \
 687   } while (0)
 688
 689 /* Record one COMPONENT (alternate character or composition rule).  */
 690
 691 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 692   do {                                                                  \
 693     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 694     if (coding->cmp_data->used - coding->cmp_data_start                 \
 695         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 696       {                                                                 \
 697         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 698         coding->composing = COMPOSITION_NO;                             \
 699       }                                                                 \
 700   } while (0)
 701
 702
 703 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 704    is not less than SRC_END, return -1 without incrementing Src.  */
 705
 706 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 707
 708
 709 /* Decode a character represented as a component of composition
 710    sequence of Emacs 20 style at SRC.  Set C to that character, store
 711    its multibyte form sequence at P, and set P to the end of that
 712    sequence.  If no valid character is found, set C to -1.  */
 713
 714 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 715   do {                                                          \
 716     int bytes;                                                  \
 717                                                                 \
 718     c = SAFE_ONE_MORE_BYTE ();                                  \
 719     if (c < 0)                                                  \
 720       break;                                                    \
 721     if (CHAR_HEAD_P (c))                                        \
 722       c = -1;                                                   \
 723     else if (c == 0xA0)                                         \
 724       {                                                         \
 725         c = SAFE_ONE_MORE_BYTE ();                              \
 726         if (c < 0xA0)                                           \
 727           c = -1;                                               \
 728         else                                                    \
 729           {                                                     \
 730             c -= 0xA0;                                          \
 731             *p++ = c;                                           \
 732           }                                                     \
 733       }                                                         \
 734     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 735       {                                                         \
 736         unsigned char *p0 = p;                                  \
 737                                                                 \
 738         c -= 0x20;                                              \
 739         *p++ = c;                                               \
 740         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 741         while (--bytes)                                         \
 742           {                                                     \
 743             c = SAFE_ONE_MORE_BYTE ();                          \
 744             if (c < 0)                                          \
 745               break;                                            \
 746             *p++ = c;                                           \
 747           }                                                     \
 748         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
 749           c = STRING_CHAR (p0, bytes);                          \
 750         else                                                    \
 751           c = -1;                                               \
 752       }                                                         \
 753     else                                                        \
 754       c = -1;                                                   \
 755   } while (0)
 756
 757
 758 /* Decode a composition rule represented as a component of composition
 759    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 760    valid rule is found, set C to -1.  */
 761
 762 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 763   do {                                                  \
 764     c = SAFE_ONE_MORE_BYTE ();                          \
 765     c -= 0xA0;                                          \
 766     if (c < 0 || c >= 81)                               \
 767       c = -1;                                           \
 768     else                                                \
 769       {                                                 \
 770         gref = c / 9, nref = c % 9;                     \
 771         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 772       }                                                 \
 773   } while (0)
 774
 775
 776 /* Decode composition sequence encoded by `emacs-mule' at the source
 777    pointed by SRC.  SRC_END is the end of source.  Store information
 778    of the composition in CODING->cmp_data.
 779
 780    For backward compatibility, decode also a composition sequence of
 781    Emacs 20 style.  In that case, the composition sequence contains
 782    characters that should be extracted into a buffer or string.  Store
 783    those characters at *DESTINATION in multibyte form.
 784
 785    If we encounter an invalid byte sequence, return 0.
 786    If we encounter an insufficient source or destination, or
 787    insufficient space in CODING->cmp_data, return 1.
 788    Otherwise, return consumed bytes in the source.
 789
 790 */
 791 static INLINE int
 792 decode_composition_emacs_mule (coding, src, src_end,
 793                                destination, dst_end, dst_bytes)
 794      struct coding_system *coding;
 795      unsigned char *src, *src_end, **destination, *dst_end;
 796      int dst_bytes;
 797 {
 798   unsigned char *dst = *destination;
 799   int method, data_len, nchars;
 800   unsigned char *src_base = src++;
 801   /* Store components of composition.  */
 802   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 803   int ncomponent;
 804   /* Store multibyte form of characters to be composed.  This is for
 805      Emacs 20 style composition sequence.  */
 806   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 807   unsigned char *bufp = buf;
 808   int c, i, gref, nref;
 809
 810   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 811       >= COMPOSITION_DATA_SIZE)
 812     {
 813       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 814       return -1;
 815     }
 816
 817   ONE_MORE_BYTE (c);
 818   if (c - 0xF0 >= COMPOSITION_RELATIVE
 819            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 820     {
 821       int with_rule;
 822
 823       method = c - 0xF0;
 824       with_rule = (method == COMPOSITION_WITH_RULE
 825                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 826       ONE_MORE_BYTE (c);
 827       data_len = c - 0xA0;
 828       if (data_len < 4
 829           || src_base + data_len > src_end)
 830         return 0;
 831       ONE_MORE_BYTE (c);
 832       nchars = c - 0xA0;
 833       if (c < 1)
 834         return 0;
 835       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 836         {
 837           /* If it is longer than this, it can't be valid.  */
 838           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 839             return 0;
 840
 841           if (ncomponent % 2 && with_rule)
 842             {
 843               ONE_MORE_BYTE (gref);
 844               gref -= 32;
 845               ONE_MORE_BYTE (nref);
 846               nref -= 32;
 847               c = COMPOSITION_ENCODE_RULE (gref, nref);
 848             }
 849           else
 850             {
 851               int bytes;
 852               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 853                 c = STRING_CHAR (src, bytes);
 854               else
 855                 c = *src, bytes = 1;
 856               src += bytes;
 857             }
 858           component[ncomponent] = c;
 859         }
 860     }
 861   else
 862     {
 863       /* This may be an old Emacs 20 style format.  See the comment at
 864          the section 2 of this file.  */
 865       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 866       if (src == src_end
 867           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 868         goto label_end_of_loop;
 869
 870       src_end = src;
 871       src = src_base + 1;
 872       if (c < 0xC0)
 873         {
 874           method = COMPOSITION_RELATIVE;
 875           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 876             {
 877               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 878               if (c < 0)
 879                 break;
 880               component[ncomponent++] = c;
 881             }
 882           if (ncomponent < 2)
 883             return 0;
 884           nchars = ncomponent;
 885         }
 886       else if (c == 0xFF)
 887         {
 888           method = COMPOSITION_WITH_RULE;
 889           src++;
 890           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 891           if (c < 0)
 892             return 0;
 893           component[0] = c;
 894           for (ncomponent = 1;
 895                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 896             {
 897               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 898               if (c < 0)
 899                 break;
 900               component[ncomponent++] = c;
 901               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 902               if (c < 0)
 903                 break;
 904               component[ncomponent++] = c;
 905             }
 906           if (ncomponent < 3)
 907             return 0;
 908           nchars = (ncomponent + 1) / 2;
 909         }
 910       else
 911         return 0;
 912     }
 913
 914   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 915     {
 916       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 917       for (i = 0; i < ncomponent; i++)
 918         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 919       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 920       if (buf < bufp)
 921         {
 922           unsigned char *p = buf;
 923           EMIT_BYTES (p, bufp);
 924           *destination += bufp - buf;
 925           coding->produced_char += nchars;
 926         }
 927       return (src - src_base);
 928     }
 929  label_end_of_loop:
 930   return -1;
 931 }
 932
 933 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 934
 935 static void
 936 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 937      struct coding_system *coding;
 938      unsigned char *source, *destination;
 939      int src_bytes, dst_bytes;
 940 {
 941   unsigned char *src = source;
 942   unsigned char *src_end = source + src_bytes;
 943   unsigned char *dst = destination;
 944   unsigned char *dst_end = destination + dst_bytes;
 945   /* SRC_BASE remembers the start position in source in each loop.
 946      The loop will be exited when there's not enough source code, or
 947      when there's not enough destination area to produce a
 948      character.  */
 949   unsigned char *src_base;
 950
 951   coding->produced_char = 0;
 952   while ((src_base = src) < src_end)
 953     {
 954       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 955       int bytes;
 956
 957       if (*src == '\r')
 958         {
 959           int c = *src++;
 960
 961           if (coding->eol_type == CODING_EOL_CR)
 962             c = '\n';
 963           else if (coding->eol_type == CODING_EOL_CRLF)
 964             {
 965               ONE_MORE_BYTE (c);
 966               if (c != '\n')
 967                 {
 968                   src--;
 969                   c = '\r';
 970                 }
 971             }
 972           *dst++ = c;
 973           coding->produced_char++;
 974           continue;
 975         }
 976       else if (*src == '\n')
 977         {
 978           if ((coding->eol_type == CODING_EOL_CR
 979                || coding->eol_type == CODING_EOL_CRLF)
 980               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 981             {
 982               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 983               goto label_end_of_loop;
 984             }
 985           *dst++ = *src++;
 986           coding->produced_char++;
 987           continue;
 988         }
 989       else if (*src == 0x80 && coding->cmp_data)
 990         {
 991           /* Start of composition data.  */
 992           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
 993                                                          &dst, dst_end,
 994                                                          dst_bytes);
 995           if (consumed < 0)
 996             goto label_end_of_loop;
 997           else if (consumed > 0)
 998             {
 999               src += consumed;
1000               continue;
1001             }
1002           bytes = CHAR_STRING (*src, tmp);
1003           p = tmp;
1004           src++;
1005         }
1006       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
1007         {
1008           p = src;
1009           src += bytes;
1010         }
1011       else
1012         {
1013           bytes = CHAR_STRING (*src, tmp);
1014           p = tmp;
1015           src++;
1016         }
1017       if (dst + bytes >= (dst_bytes ? dst_end : src))
1018         {
1019           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1020           break;
1021         }
1022       while (bytes--) *dst++ = *p++;
1023       coding->produced_char++;
1024     }
1025  label_end_of_loop:
1026   coding->consumed = coding->consumed_char = src_base - source;
1027   coding->produced = dst - destination;
1028 }
1029
1030
1031 /* Encode composition data stored at DATA into a special byte sequence
1032    starting by 0x80.  Update CODING->cmp_data_start and maybe
1033    CODING->cmp_data for the next call.  */
1034
1035 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1036   do {                                                                  \
1037     unsigned char buf[1024], *p0 = buf, *p;                             \
1038     int len = data[0];                                                  \
1039     int i;                                                              \
1040                                                                         \
1041     buf[0] = 0x80;                                                      \
1042     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1043     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1044     p = buf + 4;                                                        \
1045     if (data[3] == COMPOSITION_WITH_RULE                                \
1046         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1047       {                                                                 \
1048         p += CHAR_STRING (data[4], p);                                  \
1049         for (i = 5; i < len; i += 2)                                    \
1050           {                                                             \
1051             int gref, nref;                                             \
1052              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1053             *p++ = 0x20 + gref;                                         \
1054             *p++ = 0x20 + nref;                                         \
1055             p += CHAR_STRING (data[i + 1], p);                          \
1056           }                                                             \
1057       }                                                                 \
1058     else                                                                \
1059       {                                                                 \
1060         for (i = 4; i < len; i++)                                       \
1061           p += CHAR_STRING (data[i], p);                                \
1062       }                                                                 \
1063     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1064                                                                         \
1065     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1066       {                                                                 \
1067         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1068         goto label_end_of_loop;                                         \
1069       }                                                                 \
1070     while (p0 < p)                                                      \
1071       *dst++ = *p0++;                                                   \
1072     coding->cmp_data_start += data[0];                                  \
1073     if (coding->cmp_data_start == coding->cmp_data->used                \
1074         && coding->cmp_data->next)                                      \
1075       {                                                                 \
1076         coding->cmp_data = coding->cmp_data->next;                      \
1077         coding->cmp_data_start = 0;                                     \
1078       }                                                                 \
1079   } while (0)
1080
1081
1082 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1083                             unsigned char *, int, int));
1084
1085 static void
1086 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1087      struct coding_system *coding;
1088      unsigned char *source, *destination;
1089      int src_bytes, dst_bytes;
1090 {
1091   unsigned char *src = source;
1092   unsigned char *src_end = source + src_bytes;
1093   unsigned char *dst = destination;
1094   unsigned char *dst_end = destination + dst_bytes;
1095   unsigned char *src_base;
1096   int c;
1097   int char_offset;
1098   int *data;
1099
1100   Lisp_Object translation_table;
1101
1102   translation_table = Qnil;
1103
1104   /* Optimization for the case that there's no composition.  */
1105   if (!coding->cmp_data || coding->cmp_data->used == 0)
1106     {
1107       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1108       return;
1109     }
1110
1111   char_offset = coding->cmp_data->char_offset;
1112   data = coding->cmp_data->data + coding->cmp_data_start;
1113   while (1)
1114     {
1115       src_base = src;
1116
1117       /* If SRC starts a composition, encode the information about the
1118          composition in advance.  */
1119       if (coding->cmp_data_start < coding->cmp_data->used
1120           && char_offset + coding->consumed_char == data[1])
1121         {
1122           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1123           char_offset = coding->cmp_data->char_offset;
1124           data = coding->cmp_data->data + coding->cmp_data_start;
1125         }
1126
1127       ONE_MORE_CHAR (c);
1128       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1129                         || coding->eol_type == CODING_EOL_CR))
1130         {
1131           if (coding->eol_type == CODING_EOL_CRLF)
1132             EMIT_TWO_BYTES ('\r', c);
1133           else
1134             EMIT_ONE_BYTE ('\r');
1135         }
1136       else if (SINGLE_BYTE_CHAR_P (c))
1137         EMIT_ONE_BYTE (c);
1138       else
1139         EMIT_BYTES (src_base, src);
1140       coding->consumed_char++;
1141     }
1142  label_end_of_loop:
1143   coding->consumed = src_base - source;
1144   coding->produced = coding->produced_char = dst - destination;
1145   return;
1146 }
1147
1148 \f
1149 /*** 3. ISO2022 handlers ***/
1150
1151 /* The following note describes the coding system ISO2022 briefly.
1152    Since the intention of this note is to help understand the
1153    functions in this file, some parts are NOT ACCURATE or are OVERLY
1154    SIMPLIFIED.  For thorough understanding, please refer to the
1155    original document of ISO2022.  This is equivalent to the standard
1156    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1157
1158    ISO2022 provides many mechanisms to encode several character sets
1159    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1160    is encoded using bytes less than 128.  This may make the encoded
1161    text a little bit longer, but the text passes more easily through
1162    several types of gateway, some of which strip off the MSB (Most
1163    Significant Bit).
1164
1165    There are two kinds of character sets: control character sets and
1166    graphic character sets.  The former contain control characters such
1167    as `newline' and `escape' to provide control functions (control
1168    functions are also provided by escape sequences).  The latter
1169    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1170    two control character sets and many graphic character sets.
1171
1172    Graphic character sets are classified into one of the following
1173    four classes, according to the number of bytes (DIMENSION) and
1174    number of characters in one dimension (CHARS) of the set:
1175    - DIMENSION1_CHARS94
1176    - DIMENSION1_CHARS96
1177    - DIMENSION2_CHARS94
1178    - DIMENSION2_CHARS96
1179
1180    In addition, each character set is assigned an identification tag,
1181    unique for each set, called the "final character" (denoted as <F>
1182    hereafter).  The <F> of each character set is decided by ECMA(*)
1183    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1184    (0x30..0x3F are for private use only).
1185
1186    Note (*): ECMA = European Computer Manufacturers Association
1187
1188    Here are examples of graphic character sets [NAME(<F>)]:
1189         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1190         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1191         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1192         o DIMENSION2_CHARS96 -- none for the moment
1193
1194    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1195         C0 [0x00..0x1F] -- control character plane 0
1196         GL [0x20..0x7F] -- graphic character plane 0
1197         C1 [0x80..0x9F] -- control character plane 1
1198         GR [0xA0..0xFF] -- graphic character plane 1
1199
1200    A control character set is directly designated and invoked to C0 or
1201    C1 by an escape sequence.  The most common case is that:
1202    - ISO646's  control character set is designated/invoked to C0, and
1203    - ISO6429's control character set is designated/invoked to C1,
1204    and usually these designations/invocations are omitted in encoded
1205    text.  In a 7-bit environment, only C0 can be used, and a control
1206    character for C1 is encoded by an appropriate escape sequence to
1207    fit into the environment.  All control characters for C1 are
1208    defined to have corresponding escape sequences.
1209
1210    A graphic character set is at first designated to one of four
1211    graphic registers (G0 through G3), then these graphic registers are
1212    invoked to GL or GR.  These designations and invocations can be
1213    done independently.  The most common case is that G0 is invoked to
1214    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1215    these invocations and designations are omitted in encoded text.
1216    In a 7-bit environment, only GL can be used.
1217
1218    When a graphic character set of CHARS94 is invoked to GL, codes
1219    0x20 and 0x7F of the GL area work as control characters SPACE and
1220    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1221    be used.
1222
1223    There are two ways of invocation: locking-shift and single-shift.
1224    With locking-shift, the invocation lasts until the next different
1225    invocation, whereas with single-shift, the invocation affects the
1226    following character only and doesn't affect the locking-shift
1227    state.  Invocations are done by the following control characters or
1228    escape sequences:
1229
1230    ----------------------------------------------------------------------
1231    abbrev  function                  cntrl escape seq   description
1232    ----------------------------------------------------------------------
1233    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1234    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1235    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1236    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1237    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1238    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1239    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1240    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1241    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1242    ----------------------------------------------------------------------
1243    (*) These are not used by any known coding system.
1244
1245    Control characters for these functions are defined by macros
1246    ISO_CODE_XXX in `coding.h'.
1247
1248    Designations are done by the following escape sequences:
1249    ----------------------------------------------------------------------
1250    escape sequence      description
1251    ----------------------------------------------------------------------
1252    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1253    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1254    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1255    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1256    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1257    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1258    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1259    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1260    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1261    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1262    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1263    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1264    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1265    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1266    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1267    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1268    ----------------------------------------------------------------------
1269
1270    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1271    of dimension 1, chars 94, and final character <F>, etc...
1272
1273    Note (*): Although these designations are not allowed in ISO2022,
1274    Emacs accepts them on decoding, and produces them on encoding
1275    CHARS96 character sets in a coding system which is characterized as
1276    7-bit environment, non-locking-shift, and non-single-shift.
1277
1278    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1279    '(' can be omitted.  We refer to this as "short-form" hereafter.
1280
1281    Now you may notice that there are a lot of ways of encoding the
1282    same multilingual text in ISO2022.  Actually, there exist many
1283    coding systems such as Compound Text (used in X11's inter client
1284    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1285    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1286    localized platforms), and all of these are variants of ISO2022.
1287
1288    In addition to the above, Emacs handles two more kinds of escape
1289    sequences: ISO6429's direction specification and Emacs' private
1290    sequence for specifying character composition.
1291
1292    ISO6429's direction specification takes the following form:
1293         o CSI ']'      -- end of the current direction
1294         o CSI '0' ']'  -- end of the current direction
1295         o CSI '1' ']'  -- start of left-to-right text
1296         o CSI '2' ']'  -- start of right-to-left text
1297    The control character CSI (0x9B: control sequence introducer) is
1298    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1299
1300    Character composition specification takes the following form:
1301         o ESC '0' -- start relative composition
1302         o ESC '1' -- end composition
1303         o ESC '2' -- start rule-base composition (*)
1304         o ESC '3' -- start relative composition with alternate chars  (**)
1305         o ESC '4' -- start rule-base composition with alternate chars  (**)
1306   Since these are not standard escape sequences of any ISO standard,
1307   the use of them with these meanings is restricted to Emacs only.
1308
1309   (*) This form is used only in Emacs 20.5 and older versions,
1310   but the newer versions can safely decode it.
1311   (**) This form is used only in Emacs 21.1 and newer versions,
1312   and the older versions can't decode it.
1313
1314   Here's a list of example usages of these composition escape
1315   sequences (categorized by `enum composition_method').
1316
1317   COMPOSITION_RELATIVE:
1318         ESC 0 CHAR [ CHAR ] ESC 1
1319   COMPOSITION_WITH_RULE:
1320         ESC 2 CHAR [ RULE CHAR ] ESC 1
1321   COMPOSITION_WITH_ALTCHARS:
1322         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1323   COMPOSITION_WITH_RULE_ALTCHARS:
1324         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1325
1326 enum iso_code_class_type iso_code_class[256];
1327
1328 #define CHARSET_OK(idx, charset, c)                                     \
1329   (coding_system_table[idx]                                             \
1330    && (charset == CHARSET_ASCII                                         \
1331        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1332            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1333    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1334                                               charset)                  \
1335        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1336
1337 #define SHIFT_OUT_OK(idx) \
1338   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1339
1340 #define COMPOSITION_OK(idx)     \
1341   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1342
1343 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1344    Check if a text is encoded in ISO2022.  If it is, return an
1345    integer in which appropriate flag bits any of:
1346         CODING_CATEGORY_MASK_ISO_7
1347         CODING_CATEGORY_MASK_ISO_7_TIGHT
1348         CODING_CATEGORY_MASK_ISO_8_1
1349         CODING_CATEGORY_MASK_ISO_8_2
1350         CODING_CATEGORY_MASK_ISO_7_ELSE
1351         CODING_CATEGORY_MASK_ISO_8_ELSE
1352    are set.  If a code which should never appear in ISO2022 is found,
1353    returns 0.  */
1354
1355 static int
1356 detect_coding_iso2022 (src, src_end, multibytep)
1357      unsigned char *src, *src_end;
1358      int multibytep;
1359 {
1360   int mask = CODING_CATEGORY_MASK_ISO;
1361   int mask_found = 0;
1362   int reg[4], shift_out = 0, single_shifting = 0;
1363   int c, c1, charset;
1364   /* Dummy for ONE_MORE_BYTE.  */
1365   struct coding_system dummy_coding;
1366   struct coding_system *coding = &dummy_coding;
1367   Lisp_Object safe_chars;
1368
1369   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1370   while (mask && src < src_end)
1371     {
1372       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1373     retry:
1374       switch (c)
1375         {
1376         case ISO_CODE_ESC:
1377           if (inhibit_iso_escape_detection)
1378             break;
1379           single_shifting = 0;
1380           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1381           if (c >= '(' && c <= '/')
1382             {
1383               /* Designation sequence for a charset of dimension 1.  */
1384               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1385               if (c1 < ' ' || c1 >= 0x80
1386                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1387                 /* Invalid designation sequence.  Just ignore.  */
1388                 break;
1389               reg[(c - '(') % 4] = charset;
1390             }
1391           else if (c == '$')
1392             {
1393               /* Designation sequence for a charset of dimension 2.  */
1394               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1395               if (c >= '@' && c <= 'B')
1396                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1397                 reg[0] = charset = iso_charset_table[1][0][c];
1398               else if (c >= '(' && c <= '/')
1399                 {
1400                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1401                   if (c1 < ' ' || c1 >= 0x80
1402                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1403                     /* Invalid designation sequence.  Just ignore.  */
1404                     break;
1405                   reg[(c - '(') % 4] = charset;
1406                 }
1407               else
1408                 /* Invalid designation sequence.  Just ignore.  */
1409                 break;
1410             }
1411           else if (c == 'N' || c == 'O')
1412             {
1413               /* ESC <Fe> for SS2 or SS3.  */
1414               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1415               break;
1416             }
1417           else if (c >= '0' && c <= '4')
1418             {
1419               /* ESC <Fp> for start/end composition.  */
1420               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1421                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1422               else
1423                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1424               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1425                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1426               else
1427                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1428               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1429                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1430               else
1431                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1432               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1433                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1434               else
1435                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1436               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1437                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1438               else
1439                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1440               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1441                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1442               else
1443                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1444               break;
1445             }
1446           else
1447             /* Invalid escape sequence.  Just ignore.  */
1448             break;
1449
1450           /* We found a valid designation sequence for CHARSET.  */
1451           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1452           c = MAKE_CHAR (charset, 0, 0);
1453           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1454             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1455           else
1456             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1457           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1458             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1459           else
1460             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1461           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1462             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1463           else
1464             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1465           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1466             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1467           else
1468             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1469           break;
1470
1471         case ISO_CODE_SO:
1472           if (inhibit_iso_escape_detection)
1473             break;
1474           single_shifting = 0;
1475           if (shift_out == 0
1476               && (reg[1] >= 0
1477                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1478                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1479             {
1480               /* Locking shift out.  */
1481               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1482               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1483             }
1484           break;
1485
1486         case ISO_CODE_SI:
1487           if (inhibit_iso_escape_detection)
1488             break;
1489           single_shifting = 0;
1490           if (shift_out == 1)
1491             {
1492               /* Locking shift in.  */
1493               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1494               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1495             }
1496           break;
1497
1498         case ISO_CODE_CSI:
1499           single_shifting = 0;
1500         case ISO_CODE_SS2:
1501         case ISO_CODE_SS3:
1502           {
1503             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1504
1505             if (inhibit_iso_escape_detection)
1506               break;
1507             if (c != ISO_CODE_CSI)
1508               {
1509                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1510                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1511                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1512                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1513                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1514                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1515                 single_shifting = 1;
1516               }
1517             if (VECTORP (Vlatin_extra_code_table)
1518                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1519               {
1520                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1521                     & CODING_FLAG_ISO_LATIN_EXTRA)
1522                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1523                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1524                     & CODING_FLAG_ISO_LATIN_EXTRA)
1525                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1526               }
1527             mask &= newmask;
1528             mask_found |= newmask;
1529           }
1530           break;
1531
1532         default:
1533           if (c < 0x80)
1534             {
1535               single_shifting = 0;
1536               break;
1537             }
1538           else if (c < 0xA0)
1539             {
1540               single_shifting = 0;
1541               if (VECTORP (Vlatin_extra_code_table)
1542                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1543                 {
1544                   int newmask = 0;
1545
1546                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1547                       & CODING_FLAG_ISO_LATIN_EXTRA)
1548                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1549                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1550                       & CODING_FLAG_ISO_LATIN_EXTRA)
1551                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1552                   mask &= newmask;
1553                   mask_found |= newmask;
1554                 }
1555               else
1556                 return 0;
1557             }
1558           else
1559             {
1560               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1561                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1562               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1563               /* Check the length of succeeding codes of the range
1564                  0xA0..0FF.  If the byte length is odd, we exclude
1565                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1566                  when we are not single shifting.  */
1567               if (!single_shifting
1568                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1569                 {
1570                   int i = 1;
1571
1572                   c = -1;
1573                   while (src < src_end)
1574                     {
1575                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1576                       if (c < 0xA0)
1577                         break;
1578                       i++;
1579                     }
1580
1581                   if (i & 1 && src < src_end)
1582                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1583                   else
1584                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1585                   if (c >= 0)
1586                     /* This means that we have read one extra byte.  */
1587                     goto retry;
1588                 }
1589             }
1590           break;
1591         }
1592     }
1593  label_end_of_loop:
1594   return (mask & mask_found);
1595 }
1596
1597 /* Decode a character of which charset is CHARSET, the 1st position
1598    code is C1, the 2nd position code is C2, and return the decoded
1599    character code.  If the variable `translation_table' is non-nil,
1600    returned the translated code.  */
1601
1602 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1603   (NILP (translation_table)                     \
1604    ? MAKE_CHAR (charset, c1, c2)                \
1605    : translate_char (translation_table, -1, charset, c1, c2))
1606
1607 /* Set designation state into CODING.  */
1608 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1609   do {                                                                     \
1610     int charset, c;                                                        \
1611                                                                            \
1612     if (final_char < '0' || final_char >= 128)                             \
1613       goto label_invalid_code;                                             \
1614     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1615                                  make_number (chars),                      \
1616                                  make_number (final_char));                \
1617     c = MAKE_CHAR (charset, 0, 0);                                         \
1618     if (charset >= 0                                                       \
1619         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1620             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1621       {                                                                    \
1622         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1623             && reg == 0                                                    \
1624             && charset == CHARSET_ASCII)                                   \
1625           {                                                                \
1626             /* We should insert this designation sequence as is so         \
1627                that it is surely written back to a file.  */               \
1628             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1629             goto label_invalid_code;                                       \
1630           }                                                                \
1631         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1632         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1633             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1634           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1635         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1636       }                                                                    \
1637     else                                                                   \
1638       {                                                                    \
1639         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1640         goto label_invalid_code;                                           \
1641       }                                                                    \
1642   } while (0)
1643
1644 /* Allocate a memory block for storing information about compositions.
1645    The block is chained to the already allocated blocks.  */
1646
1647 void
1648 coding_allocate_composition_data (coding, char_offset)
1649      struct coding_system *coding;
1650      int char_offset;
1651 {
1652   struct composition_data *cmp_data
1653     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1654
1655   cmp_data->char_offset = char_offset;
1656   cmp_data->used = 0;
1657   cmp_data->prev = coding->cmp_data;
1658   cmp_data->next = NULL;
1659   if (coding->cmp_data)
1660     coding->cmp_data->next = cmp_data;
1661   coding->cmp_data = cmp_data;
1662   coding->cmp_data_start = 0;
1663 }
1664
1665 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1666    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1667    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1668    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1669    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
1670   */
1671
1672 #define DECODE_COMPOSITION_START(c1)                                       \
1673   do {                                                                     \
1674     if (coding->composing == COMPOSITION_DISABLED)                         \
1675       {                                                                    \
1676         *dst++ = ISO_CODE_ESC;                                             \
1677         *dst++ = c1 & 0x7f;                                                \
1678         coding->produced_char += 2;                                        \
1679       }                                                                    \
1680     else if (!COMPOSING_P (coding))                                        \
1681       {                                                                    \
1682         /* This is surely the start of a composition.  We must be sure     \
1683            that coding->cmp_data has enough space to store the             \
1684            information about the composition.  If not, terminate the       \
1685            current decoding loop, allocate one more memory block for       \
1686            coding->cmp_data in the caller, then start the decoding         \
1687            loop again.  We can't allocate memory here directly because     \
1688            it may cause buffer/string relocation.  */                      \
1689         if (!coding->cmp_data                                              \
1690             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1691                 >= COMPOSITION_DATA_SIZE))                                 \
1692           {                                                                \
1693             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1694             goto label_end_of_loop;                                        \
1695           }                                                                \
1696         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1697                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1698                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1699                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1700         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1701                                       coding->composing);                  \
1702         coding->composition_rule_follows = 0;                              \
1703       }                                                                    \
1704     else                                                                   \
1705       {                                                                    \
1706         /* We are already handling a composition.  If the method is        \
1707            the following two, the codes following the current escape       \
1708            sequence are actual characters stored in a buffer.  */          \
1709         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1710             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1711           {                                                                \
1712             coding->composing = COMPOSITION_RELATIVE;                      \
1713             coding->composition_rule_follows = 0;                          \
1714           }                                                                \
1715       }                                                                    \
1716   } while (0)
1717
1718 /* Handle composition end sequence ESC 1.  */
1719
1720 #define DECODE_COMPOSITION_END(c1)                                      \
1721   do {                                                                  \
1722     if (! COMPOSING_P (coding))                                         \
1723       {                                                                 \
1724         *dst++ = ISO_CODE_ESC;                                          \
1725         *dst++ = c1;                                                    \
1726         coding->produced_char += 2;                                     \
1727       }                                                                 \
1728     else                                                                \
1729       {                                                                 \
1730         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1731         coding->composing = COMPOSITION_NO;                             \
1732       }                                                                 \
1733   } while (0)
1734
1735 /* Decode a composition rule from the byte C1 (and maybe one more byte
1736    from SRC) and store one encoded composition rule in
1737    coding->cmp_data.  */
1738
1739 #define DECODE_COMPOSITION_RULE(c1)                                     \
1740   do {                                                                  \
1741     int rule = 0;                                                       \
1742     (c1) -= 32;                                                         \
1743     if (c1 < 81)                /* old format (before ver.21) */        \
1744       {                                                                 \
1745         int gref = (c1) / 9;                                            \
1746         int nref = (c1) % 9;                                            \
1747         if (gref == 4) gref = 10;                                       \
1748         if (nref == 4) nref = 10;                                       \
1749         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1750       }                                                                 \
1751     else if (c1 < 93)           /* new format (after ver.21) */         \
1752       {                                                                 \
1753         ONE_MORE_BYTE (c2);                                             \
1754         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1755       }                                                                 \
1756     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1757     coding->composition_rule_follows = 0;                               \
1758   } while (0)
1759
1760
1761 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1762
1763 static void
1764 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1765      struct coding_system *coding;
1766      unsigned char *source, *destination;
1767      int src_bytes, dst_bytes;
1768 {
1769   unsigned char *src = source;
1770   unsigned char *src_end = source + src_bytes;
1771   unsigned char *dst = destination;
1772   unsigned char *dst_end = destination + dst_bytes;
1773   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1774   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1775   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1776   /* SRC_BASE remembers the start position in source in each loop.
1777      The loop will be exited when there's not enough source code
1778      (within macro ONE_MORE_BYTE), or when there's not enough
1779      destination area to produce a character (within macro
1780      EMIT_CHAR).  */
1781   unsigned char *src_base;
1782   int c, charset;
1783   Lisp_Object translation_table;
1784   Lisp_Object safe_chars;
1785
1786   safe_chars = coding_safe_chars (coding->symbol);
1787
1788   if (NILP (Venable_character_translation))
1789     translation_table = Qnil;
1790   else
1791     {
1792       translation_table = coding->translation_table_for_decode;
1793       if (NILP (translation_table))
1794         translation_table = Vstandard_translation_table_for_decode;
1795     }
1796
1797   coding->result = CODING_FINISH_NORMAL;
1798
1799   while (1)
1800     {
1801       int c1, c2;
1802
1803       src_base = src;
1804       ONE_MORE_BYTE (c1);
1805
1806       /* We produce no character or one character.  */
1807       switch (iso_code_class [c1])
1808         {
1809         case ISO_0x20_or_0x7F:
1810           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1811             {
1812               DECODE_COMPOSITION_RULE (c1);
1813               continue;
1814             }
1815           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1816             {
1817               /* This is SPACE or DEL.  */
1818               charset = CHARSET_ASCII;
1819               break;
1820             }
1821           /* This is a graphic character, we fall down ...  */
1822
1823         case ISO_graphic_plane_0:
1824           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1825             {
1826               DECODE_COMPOSITION_RULE (c1);
1827               continue;
1828             }
1829           charset = charset0;
1830           break;
1831
1832         case ISO_0xA0_or_0xFF:
1833           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1834               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1835             goto label_invalid_code;
1836           /* This is a graphic character, we fall down ... */
1837
1838         case ISO_graphic_plane_1:
1839           if (charset1 < 0)
1840             goto label_invalid_code;
1841           charset = charset1;
1842           break;
1843
1844         case ISO_control_0:
1845           if (COMPOSING_P (coding))
1846             DECODE_COMPOSITION_END ('1');
1847
1848           /* All ISO2022 control characters in this class have the
1849              same representation in Emacs internal format.  */
1850           if (c1 == '\n'
1851               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1852               && (coding->eol_type == CODING_EOL_CR
1853                   || coding->eol_type == CODING_EOL_CRLF))
1854             {
1855               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1856               goto label_end_of_loop;
1857             }
1858           charset = CHARSET_ASCII;
1859           break;
1860
1861         case ISO_control_1:
1862           if (COMPOSING_P (coding))
1863             DECODE_COMPOSITION_END ('1');
1864           goto label_invalid_code;
1865
1866         case ISO_carriage_return:
1867           if (COMPOSING_P (coding))
1868             DECODE_COMPOSITION_END ('1');
1869
1870           if (coding->eol_type == CODING_EOL_CR)
1871             c1 = '\n';
1872           else if (coding->eol_type == CODING_EOL_CRLF)
1873             {
1874               ONE_MORE_BYTE (c1);
1875               if (c1 != ISO_CODE_LF)
1876                 {
1877                   src--;
1878                   c1 = '\r';
1879                 }
1880             }
1881           charset = CHARSET_ASCII;
1882           break;
1883
1884         case ISO_shift_out:
1885           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1886               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1887             goto label_invalid_code;
1888           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1889           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1890           continue;
1891
1892         case ISO_shift_in:
1893           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1894             goto label_invalid_code;
1895           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1896           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1897           continue;
1898
1899         case ISO_single_shift_2_7:
1900         case ISO_single_shift_2:
1901           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1902             goto label_invalid_code;
1903           /* SS2 is handled as an escape sequence of ESC 'N' */
1904           c1 = 'N';
1905           goto label_escape_sequence;
1906
1907         case ISO_single_shift_3:
1908           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1909             goto label_invalid_code;
1910           /* SS2 is handled as an escape sequence of ESC 'O' */
1911           c1 = 'O';
1912           goto label_escape_sequence;
1913
1914         case ISO_control_sequence_introducer:
1915           /* CSI is handled as an escape sequence of ESC '[' ...  */
1916           c1 = '[';
1917           goto label_escape_sequence;
1918
1919         case ISO_escape:
1920           ONE_MORE_BYTE (c1);
1921         label_escape_sequence:
1922           /* Escape sequences handled by Emacs are invocation,
1923              designation, direction specification, and character
1924              composition specification.  */
1925           switch (c1)
1926             {
1927             case '&':           /* revision of following character set */
1928               ONE_MORE_BYTE (c1);
1929               if (!(c1 >= '@' && c1 <= '~'))
1930                 goto label_invalid_code;
1931               ONE_MORE_BYTE (c1);
1932               if (c1 != ISO_CODE_ESC)
1933                 goto label_invalid_code;
1934               ONE_MORE_BYTE (c1);
1935               goto label_escape_sequence;
1936
1937             case '$':           /* designation of 2-byte character set */
1938               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1939                 goto label_invalid_code;
1940               ONE_MORE_BYTE (c1);
1941               if (c1 >= '@' && c1 <= 'B')
1942                 {       /* designation of JISX0208.1978, GB2312.1980,
1943                            or JISX0208.1980 */
1944                   DECODE_DESIGNATION (0, 2, 94, c1);
1945                 }
1946               else if (c1 >= 0x28 && c1 <= 0x2B)
1947                 {       /* designation of DIMENSION2_CHARS94 character set */
1948                   ONE_MORE_BYTE (c2);
1949                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1950                 }
1951               else if (c1 >= 0x2C && c1 <= 0x2F)
1952                 {       /* designation of DIMENSION2_CHARS96 character set */
1953                   ONE_MORE_BYTE (c2);
1954                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1955                 }
1956               else
1957                 goto label_invalid_code;
1958               /* We must update these variables now.  */
1959               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1960               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1961               continue;
1962
1963             case 'n':           /* invocation of locking-shift-2 */
1964               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1965                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1966                 goto label_invalid_code;
1967               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1968               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1969               continue;
1970
1971             case 'o':           /* invocation of locking-shift-3 */
1972               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1973                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1974                 goto label_invalid_code;
1975               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1976               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1977               continue;
1978
1979             case 'N':           /* invocation of single-shift-2 */
1980               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1981                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1982                 goto label_invalid_code;
1983               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1984               ONE_MORE_BYTE (c1);
1985               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1986                 goto label_invalid_code;
1987               break;
1988
1989             case 'O':           /* invocation of single-shift-3 */
1990               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1991                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1992                 goto label_invalid_code;
1993               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1994               ONE_MORE_BYTE (c1);
1995               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1996                 goto label_invalid_code;
1997               break;
1998
1999             case '0': case '2': case '3': case '4': /* start composition */
2000               DECODE_COMPOSITION_START (c1);
2001               continue;
2002
2003             case '1':           /* end composition */
2004               DECODE_COMPOSITION_END (c1);
2005               continue;
2006
2007             case '[':           /* specification of direction */
2008               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2009                 goto label_invalid_code;
2010               /* For the moment, nested direction is not supported.
2011                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2012                  left-to-right, and nonzero means right-to-left.  */
2013               ONE_MORE_BYTE (c1);
2014               switch (c1)
2015                 {
2016                 case ']':       /* end of the current direction */
2017                   coding->mode &= ~CODING_MODE_DIRECTION;
2018
2019                 case '0':       /* end of the current direction */
2020                 case '1':       /* start of left-to-right direction */
2021                   ONE_MORE_BYTE (c1);
2022                   if (c1 == ']')
2023                     coding->mode &= ~CODING_MODE_DIRECTION;
2024                   else
2025                     goto label_invalid_code;
2026                   break;
2027
2028                 case '2':       /* start of right-to-left direction */
2029                   ONE_MORE_BYTE (c1);
2030                   if (c1 == ']')
2031                     coding->mode |= CODING_MODE_DIRECTION;
2032                   else
2033                     goto label_invalid_code;
2034                   break;
2035
2036                 default:
2037                   goto label_invalid_code;
2038                 }
2039               continue;
2040
2041             default:
2042               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2043                 goto label_invalid_code;
2044               if (c1 >= 0x28 && c1 <= 0x2B)
2045                 {       /* designation of DIMENSION1_CHARS94 character set */
2046                   ONE_MORE_BYTE (c2);
2047                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2048                 }
2049               else if (c1 >= 0x2C && c1 <= 0x2F)
2050                 {       /* designation of DIMENSION1_CHARS96 character set */
2051                   ONE_MORE_BYTE (c2);
2052                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2053                 }
2054               else
2055                 goto label_invalid_code;
2056               /* We must update these variables now.  */
2057               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2058               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2059               continue;
2060             }
2061         }
2062
2063       /* Now we know CHARSET and 1st position code C1 of a character.
2064          Produce a multibyte sequence for that character while getting
2065          2nd position code C2 if necessary.  */
2066       if (CHARSET_DIMENSION (charset) == 2)
2067         {
2068           ONE_MORE_BYTE (c2);
2069           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2070             /* C2 is not in a valid range.  */
2071             goto label_invalid_code;
2072         }
2073       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2074       EMIT_CHAR (c);
2075       continue;
2076
2077     label_invalid_code:
2078       coding->errors++;
2079       if (COMPOSING_P (coding))
2080         DECODE_COMPOSITION_END ('1');
2081       src = src_base;
2082       c = *src++;
2083       EMIT_CHAR (c);
2084     }
2085
2086  label_end_of_loop:
2087   coding->consumed = coding->consumed_char = src_base - source;
2088   coding->produced = dst - destination;
2089   return;
2090 }
2091
2092
2093 /* ISO2022 encoding stuff.  */
2094
2095 /*
2096    It is not enough to say just "ISO2022" on encoding, we have to
2097    specify more details.  In Emacs, each ISO2022 coding system
2098    variant has the following specifications:
2099         1. Initial designation to G0 through G3.
2100         2. Allows short-form designation?
2101         3. ASCII should be designated to G0 before control characters?
2102         4. ASCII should be designated to G0 at end of line?
2103         5. 7-bit environment or 8-bit environment?
2104         6. Use locking-shift?
2105         7. Use Single-shift?
2106    And the following two are only for Japanese:
2107         8. Use ASCII in place of JIS0201-1976-Roman?
2108         9. Use JISX0208-1983 in place of JISX0208-1978?
2109    These specifications are encoded in `coding->flags' as flag bits
2110    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2111    details.
2112 */
2113
2114 /* Produce codes (escape sequence) for designating CHARSET to graphic
2115    register REG at DST, and increment DST.  If <final-char> of CHARSET is
2116    '@', 'A', or 'B' and the coding system CODING allows, produce
2117    designation sequence of short-form.  */
2118
2119 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2120   do {                                                                  \
2121     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2122     char *intermediate_char_94 = "()*+";                                \
2123     char *intermediate_char_96 = ",-./";                                \
2124     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2125                                                                         \
2126     if (revision < 255)                                                 \
2127       {                                                                 \
2128         *dst++ = ISO_CODE_ESC;                                          \
2129         *dst++ = '&';                                                   \
2130         *dst++ = '@' + revision;                                        \
2131       }                                                                 \
2132     *dst++ = ISO_CODE_ESC;                                              \
2133     if (CHARSET_DIMENSION (charset) == 1)                               \
2134       {                                                                 \
2135         if (CHARSET_CHARS (charset) == 94)                              \
2136           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2137         else                                                            \
2138           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2139       }                                                                 \
2140     else                                                                \
2141       {                                                                 \
2142         *dst++ = '$';                                                   \
2143         if (CHARSET_CHARS (charset) == 94)                              \
2144           {                                                             \
2145             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2146                 || reg != 0                                             \
2147                 || final_char < '@' || final_char > 'B')                \
2148               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2149           }                                                             \
2150         else                                                            \
2151           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2152       }                                                                 \
2153     *dst++ = final_char;                                                \
2154     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
2155   } while (0)
2156
2157 /* The following two macros produce codes (control character or escape
2158    sequence) for ISO2022 single-shift functions (single-shift-2 and
2159    single-shift-3).  */
2160
2161 #define ENCODE_SINGLE_SHIFT_2                           \
2162   do {                                                  \
2163     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2164       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
2165     else                                                \
2166       *dst++ = ISO_CODE_SS2;                            \
2167     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2168   } while (0)
2169
2170 #define ENCODE_SINGLE_SHIFT_3                           \
2171   do {                                                  \
2172     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2173       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
2174     else                                                \
2175       *dst++ = ISO_CODE_SS3;                            \
2176     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2177   } while (0)
2178
2179 /* The following four macros produce codes (control character or
2180    escape sequence) for ISO2022 locking-shift functions (shift-in,
2181    shift-out, locking-shift-2, and locking-shift-3).  */
2182
2183 #define ENCODE_SHIFT_IN                         \
2184   do {                                          \
2185     *dst++ = ISO_CODE_SI;                       \
2186     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2187   } while (0)
2188
2189 #define ENCODE_SHIFT_OUT                        \
2190   do {                                          \
2191     *dst++ = ISO_CODE_SO;                       \
2192     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2193   } while (0)
2194
2195 #define ENCODE_LOCKING_SHIFT_2                  \
2196   do {                                          \
2197     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
2198     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2199   } while (0)
2200
2201 #define ENCODE_LOCKING_SHIFT_3                  \
2202   do {                                          \
2203     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
2204     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2205   } while (0)
2206
2207 /* Produce codes for a DIMENSION1 character whose character set is
2208    CHARSET and whose position-code is C1.  Designation and invocation
2209    sequences are also produced in advance if necessary.  */
2210
2211 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2212   do {                                                                  \
2213     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2214       {                                                                 \
2215         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2216           *dst++ = c1 & 0x7F;                                           \
2217         else                                                            \
2218           *dst++ = c1 | 0x80;                                           \
2219         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2220         break;                                                          \
2221       }                                                                 \
2222     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2223       {                                                                 \
2224         *dst++ = c1 & 0x7F;                                             \
2225         break;                                                          \
2226       }                                                                 \
2227     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2228       {                                                                 \
2229         *dst++ = c1 | 0x80;                                             \
2230         break;                                                          \
2231       }                                                                 \
2232     else                                                                \
2233       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2234          must invoke it, or, at first, designate it to some graphic     \
2235          register.  Then repeat the loop to actually produce the        \
2236          character.  */                                                 \
2237       dst = encode_invocation_designation (charset, coding, dst);       \
2238   } while (1)
2239
2240 /* Produce codes for a DIMENSION2 character whose character set is
2241    CHARSET and whose position-codes are C1 and C2.  Designation and
2242    invocation codes are also produced in advance if necessary.  */
2243
2244 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2245   do {                                                                  \
2246     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2247       {                                                                 \
2248         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2249           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2250         else                                                            \
2251           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2252         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2253         break;                                                          \
2254       }                                                                 \
2255     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2256       {                                                                 \
2257         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
2258         break;                                                          \
2259       }                                                                 \
2260     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2261       {                                                                 \
2262         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
2263         break;                                                          \
2264       }                                                                 \
2265     else                                                                \
2266       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2267          must invoke it, or, at first, designate it to some graphic     \
2268          register.  Then repeat the loop to actually produce the        \
2269          character.  */                                                 \
2270       dst = encode_invocation_designation (charset, coding, dst);       \
2271   } while (1)
2272
2273 #define ENCODE_ISO_CHARACTER(c)                                 \
2274   do {                                                          \
2275     int charset, c1, c2;                                        \
2276                                                                 \
2277     SPLIT_CHAR (c, charset, c1, c2);                            \
2278     if (CHARSET_DEFINED_P (charset))                            \
2279       {                                                         \
2280         if (CHARSET_DIMENSION (charset) == 1)                   \
2281           {                                                     \
2282             if (charset == CHARSET_ASCII                        \
2283                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2284               charset = charset_latin_jisx0201;                 \
2285             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2286           }                                                     \
2287         else                                                    \
2288           {                                                     \
2289             if (charset == charset_jisx0208                     \
2290                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2291               charset = charset_jisx0208_1978;                  \
2292             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2293           }                                                     \
2294       }                                                         \
2295     else                                                        \
2296       {                                                         \
2297         *dst++ = c1;                                            \
2298         if (c2 >= 0)                                            \
2299           *dst++ = c2;                                          \
2300       }                                                         \
2301   } while (0)
2302
2303
2304 /* Instead of encoding character C, produce one or two `?'s.  */
2305
2306 #define ENCODE_UNSAFE_CHARACTER(c)                                      \
2307   do {                                                                  \
2308     ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);       \
2309     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                           \
2310       ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
2311   } while (0)
2312
2313
2314 /* Produce designation and invocation codes at a place pointed by DST
2315    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2316    Return new DST.  */
2317
2318 unsigned char *
2319 encode_invocation_designation (charset, coding, dst)
2320      int charset;
2321      struct coding_system *coding;
2322      unsigned char *dst;
2323 {
2324   int reg;                      /* graphic register number */
2325
2326   /* At first, check designations.  */
2327   for (reg = 0; reg < 4; reg++)
2328     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2329       break;
2330
2331   if (reg >= 4)
2332     {
2333       /* CHARSET is not yet designated to any graphic registers.  */
2334       /* At first check the requested designation.  */
2335       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2336       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2337         /* Since CHARSET requests no special designation, designate it
2338            to graphic register 0.  */
2339         reg = 0;
2340
2341       ENCODE_DESIGNATION (charset, reg, coding);
2342     }
2343
2344   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2345       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2346     {
2347       /* Since the graphic register REG is not invoked to any graphic
2348          planes, invoke it to graphic plane 0.  */
2349       switch (reg)
2350         {
2351         case 0:                 /* graphic register 0 */
2352           ENCODE_SHIFT_IN;
2353           break;
2354
2355         case 1:                 /* graphic register 1 */
2356           ENCODE_SHIFT_OUT;
2357           break;
2358
2359         case 2:                 /* graphic register 2 */
2360           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2361             ENCODE_SINGLE_SHIFT_2;
2362           else
2363             ENCODE_LOCKING_SHIFT_2;
2364           break;
2365
2366         case 3:                 /* graphic register 3 */
2367           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2368             ENCODE_SINGLE_SHIFT_3;
2369           else
2370             ENCODE_LOCKING_SHIFT_3;
2371           break;
2372         }
2373     }
2374
2375   return dst;
2376 }
2377
2378 /* Produce 2-byte codes for encoded composition rule RULE.  */
2379
2380 #define ENCODE_COMPOSITION_RULE(rule)           \
2381   do {                                          \
2382     int gref, nref;                             \
2383     COMPOSITION_DECODE_RULE (rule, gref, nref); \
2384     *dst++ = 32 + 81 + gref;                    \
2385     *dst++ = 32 + nref;                         \
2386   } while (0)
2387
2388 /* Produce codes for indicating the start of a composition sequence
2389    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
2390    which specify information about the composition.  See the comment
2391    in coding.h for the format of DATA.  */
2392
2393 #define ENCODE_COMPOSITION_START(coding, data)                          \
2394   do {                                                                  \
2395     coding->composing = data[3];                                        \
2396     *dst++ = ISO_CODE_ESC;                                              \
2397     if (coding->composing == COMPOSITION_RELATIVE)                      \
2398       *dst++ = '0';                                                     \
2399     else                                                                \
2400       {                                                                 \
2401         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
2402                   ? '3' : '4');                                         \
2403         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2404         coding->composition_rule_follows = 0;                           \
2405       }                                                                 \
2406   } while (0)
2407
2408 /* Produce codes for indicating the end of the current composition.  */
2409
2410 #define ENCODE_COMPOSITION_END(coding, data)                    \
2411   do {                                                          \
2412     *dst++ = ISO_CODE_ESC;                                      \
2413     *dst++ = '1';                                               \
2414     coding->cmp_data_start += data[0];                          \
2415     coding->composing = COMPOSITION_NO;                         \
2416     if (coding->cmp_data_start == coding->cmp_data->used        \
2417         && coding->cmp_data->next)                              \
2418       {                                                         \
2419         coding->cmp_data = coding->cmp_data->next;              \
2420         coding->cmp_data_start = 0;                             \
2421       }                                                         \
2422   } while (0)
2423
2424 /* Produce composition start sequence ESC 0.  Here, this sequence
2425    doesn't mean the start of a new composition but means that we have
2426    just produced components (alternate chars and composition rules) of
2427    the composition and the actual text follows in SRC.  */
2428
2429 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2430   do {                                          \
2431     *dst++ = ISO_CODE_ESC;                      \
2432     *dst++ = '0';                               \
2433     coding->composing = COMPOSITION_RELATIVE;   \
2434   } while (0)
2435
2436 /* The following three macros produce codes for indicating direction
2437    of text.  */
2438 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2439   do {                                                  \
2440     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
2441       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2442     else                                                \
2443       *dst++ = ISO_CODE_CSI;                            \
2444   } while (0)
2445
2446 #define ENCODE_DIRECTION_R2L    \
2447   ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '2', *dst++ = ']'
2448
2449 #define ENCODE_DIRECTION_L2R    \
2450   ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '0', *dst++ = ']'
2451
2452 /* Produce codes for designation and invocation to reset the graphic
2453    planes and registers to initial state.  */
2454 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2455   do {                                                                      \
2456     int reg;                                                                \
2457     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2458       ENCODE_SHIFT_IN;                                                      \
2459     for (reg = 0; reg < 4; reg++)                                           \
2460       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
2461           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
2462               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
2463         ENCODE_DESIGNATION                                                  \
2464           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2465   } while (0)
2466
2467 /* Produce designation sequences of charsets in the line started from
2468    SRC to a place pointed by DST, and return updated DST.
2469
2470    If the current block ends before any end-of-line, we may fail to
2471    find all the necessary designations.  */
2472
2473 static unsigned char *
2474 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2475      struct coding_system *coding;
2476      Lisp_Object translation_table;
2477      unsigned char *src, *src_end, *dst;
2478 {
2479   int charset, c, found = 0, reg;
2480   /* Table of charsets to be designated to each graphic register.  */
2481   int r[4];
2482
2483   for (reg = 0; reg < 4; reg++)
2484     r[reg] = -1;
2485
2486   while (found < 4)
2487     {
2488       ONE_MORE_CHAR (c);
2489       if (c == '\n')
2490         break;
2491
2492       charset = CHAR_CHARSET (c);
2493       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2494       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2495         {
2496           found++;
2497           r[reg] = charset;
2498         }
2499     }
2500
2501  label_end_of_loop:
2502   if (found)
2503     {
2504       for (reg = 0; reg < 4; reg++)
2505         if (r[reg] >= 0
2506             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2507           ENCODE_DESIGNATION (r[reg], reg, coding);
2508     }
2509
2510   return dst;
2511 }
2512
2513 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2514
2515 static void
2516 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2517      struct coding_system *coding;
2518      unsigned char *source, *destination;
2519      int src_bytes, dst_bytes;
2520 {
2521   unsigned char *src = source;
2522   unsigned char *src_end = source + src_bytes;
2523   unsigned char *dst = destination;
2524   unsigned char *dst_end = destination + dst_bytes;
2525   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2526      from DST_END to assure overflow checking is necessary only at the
2527      head of loop.  */
2528   unsigned char *adjusted_dst_end = dst_end - 19;
2529   /* SRC_BASE remembers the start position in source in each loop.
2530      The loop will be exited when there's not enough source text to
2531      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2532      there's not enough destination area to produce encoded codes
2533      (within macro EMIT_BYTES).  */
2534   unsigned char *src_base;
2535   int c;
2536   Lisp_Object translation_table;
2537   Lisp_Object safe_chars;
2538
2539   safe_chars = coding_safe_chars (coding->symbol);
2540
2541   if (NILP (Venable_character_translation))
2542     translation_table = Qnil;
2543   else
2544     {
2545       translation_table = coding->translation_table_for_encode;
2546       if (NILP (translation_table))
2547         translation_table = Vstandard_translation_table_for_encode;
2548     }
2549
2550   coding->consumed_char = 0;
2551   coding->errors = 0;
2552   while (1)
2553     {
2554       src_base = src;
2555
2556       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2557         {
2558           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2559           break;
2560         }
2561
2562       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2563           && CODING_SPEC_ISO_BOL (coding))
2564         {
2565           /* We have to produce designation sequences if any now.  */
2566           dst = encode_designation_at_bol (coding, translation_table,
2567                                            src, src_end, dst);
2568           CODING_SPEC_ISO_BOL (coding) = 0;
2569         }
2570
2571       /* Check composition start and end.  */
2572       if (coding->composing != COMPOSITION_DISABLED
2573           && coding->cmp_data_start < coding->cmp_data->used)
2574         {
2575           struct composition_data *cmp_data = coding->cmp_data;
2576           int *data = cmp_data->data + coding->cmp_data_start;
2577           int this_pos = cmp_data->char_offset + coding->consumed_char;
2578
2579           if (coding->composing == COMPOSITION_RELATIVE)
2580             {
2581               if (this_pos == data[2])
2582                 {
2583                   ENCODE_COMPOSITION_END (coding, data);
2584                   cmp_data = coding->cmp_data;
2585                   data = cmp_data->data + coding->cmp_data_start;
2586                 }
2587             }
2588           else if (COMPOSING_P (coding))
2589             {
2590               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2591               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2592                 /* We have consumed components of the composition.
2593                    What follows in SRC is the composition's base
2594                    text.  */
2595                 ENCODE_COMPOSITION_FAKE_START (coding);
2596               else
2597                 {
2598                   int c = cmp_data->data[coding->cmp_data_index++];
2599                   if (coding->composition_rule_follows)
2600                     {
2601                       ENCODE_COMPOSITION_RULE (c);
2602                       coding->composition_rule_follows = 0;
2603                     }
2604                   else
2605                     {
2606                       if (coding->flags & CODING_FLAG_ISO_SAFE
2607                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2608                         ENCODE_UNSAFE_CHARACTER (c);
2609                       else
2610                         ENCODE_ISO_CHARACTER (c);
2611                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2612                         coding->composition_rule_follows = 1;
2613                     }
2614                   continue;
2615                 }
2616             }
2617           if (!COMPOSING_P (coding))
2618             {
2619               if (this_pos == data[1])
2620                 {
2621                   ENCODE_COMPOSITION_START (coding, data);
2622                   continue;
2623                 }
2624             }
2625         }
2626
2627       ONE_MORE_CHAR (c);
2628
2629       /* Now encode the character C.  */
2630       if (c < 0x20 || c == 0x7F)
2631         {
2632           if (c == '\r')
2633             {
2634               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2635                 {
2636                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2637                     ENCODE_RESET_PLANE_AND_REGISTER;
2638                   *dst++ = c;
2639                   continue;
2640                 }
2641               /* fall down to treat '\r' as '\n' ...  */
2642               c = '\n';
2643             }
2644           if (c == '\n')
2645             {
2646               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2647                 ENCODE_RESET_PLANE_AND_REGISTER;
2648               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2649                 bcopy (coding->spec.iso2022.initial_designation,
2650                        coding->spec.iso2022.current_designation,
2651                        sizeof coding->spec.iso2022.initial_designation);
2652               if (coding->eol_type == CODING_EOL_LF
2653                   || coding->eol_type == CODING_EOL_UNDECIDED)
2654                 *dst++ = ISO_CODE_LF;
2655               else if (coding->eol_type == CODING_EOL_CRLF)
2656                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2657               else
2658                 *dst++ = ISO_CODE_CR;
2659               CODING_SPEC_ISO_BOL (coding) = 1;
2660             }
2661           else
2662             {
2663               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2664                 ENCODE_RESET_PLANE_AND_REGISTER;
2665               *dst++ = c;
2666             }
2667         }
2668       else if (ASCII_BYTE_P (c))
2669         ENCODE_ISO_CHARACTER (c);
2670       else if (SINGLE_BYTE_CHAR_P (c))
2671         {
2672           *dst++ = c;
2673           coding->errors++;
2674         }
2675       else if (coding->flags & CODING_FLAG_ISO_SAFE
2676                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2677         ENCODE_UNSAFE_CHARACTER (c);
2678       else
2679         ENCODE_ISO_CHARACTER (c);
2680
2681       coding->consumed_char++;
2682     }
2683
2684  label_end_of_loop:
2685   coding->consumed = src_base - source;
2686   coding->produced = coding->produced_char = dst - destination;
2687 }
2688
2689 \f
2690 /*** 4. SJIS and BIG5 handlers ***/
2691
2692 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2693    quite widely.  So, for the moment, Emacs supports them in the bare
2694    C code.  But, in the future, they may be supported only by CCL.  */
2695
2696 /* SJIS is a coding system encoding three character sets: ASCII, right
2697    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2698    as is.  A character of charset katakana-jisx0201 is encoded by
2699    "position-code + 0x80".  A character of charset japanese-jisx0208
2700    is encoded in 2-byte but two position-codes are divided and shifted
2701    so that it fits in the range below.
2702
2703    --- CODE RANGE of SJIS ---
2704    (character set)      (range)
2705    ASCII                0x00 .. 0x7F
2706    KATAKANA-JISX0201    0xA1 .. 0xDF
2707    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2708             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2709    -------------------------------
2710
2711 */
2712
2713 /* BIG5 is a coding system encoding two character sets: ASCII and
2714    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2715    character set and is encoded in two bytes.
2716
2717    --- CODE RANGE of BIG5 ---
2718    (character set)      (range)
2719    ASCII                0x00 .. 0x7F
2720    Big5 (1st byte)      0xA1 .. 0xFE
2721         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2722    --------------------------
2723
2724    Since the number of characters in Big5 is larger than maximum
2725    characters in Emacs' charset (96x96), it can't be handled as one
2726    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2727    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2728    contains frequently used characters and the latter contains less
2729    frequently used characters.  */
2730
2731 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2732    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2733    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2734    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2735
2736 /* Number of Big5 characters which have the same code in 1st byte.  */
2737 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2738
2739 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2740   do {                                                                  \
2741     unsigned int temp                                                   \
2742       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
2743     if (b1 < 0xC9)                                                      \
2744       charset = charset_big5_1;                                         \
2745     else                                                                \
2746       {                                                                 \
2747         charset = charset_big5_2;                                       \
2748         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2749       }                                                                 \
2750     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
2751     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
2752   } while (0)
2753
2754 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2755   do {                                                                  \
2756     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
2757     if (charset == charset_big5_2)                                      \
2758       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2759     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
2760     b2 = temp % BIG5_SAME_ROW;                                          \
2761     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
2762   } while (0)
2763
2764 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2765    Check if a text is encoded in SJIS.  If it is, return
2766    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2767
2768 static int
2769 detect_coding_sjis (src, src_end, multibytep)
2770      unsigned char *src, *src_end;
2771      int multibytep;
2772 {
2773   int c;
2774   /* Dummy for ONE_MORE_BYTE.  */
2775   struct coding_system dummy_coding;
2776   struct coding_system *coding = &dummy_coding;
2777
2778   while (1)
2779     {
2780       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2781       if (c < 0x80)
2782         continue;
2783       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2784         return 0;
2785       if (c <= 0x9F || c >= 0xE0)
2786         {
2787           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2788           if (c < 0x40 || c == 0x7F || c > 0xFC)
2789             return 0;
2790         }
2791     }
2792  label_end_of_loop:
2793   return CODING_CATEGORY_MASK_SJIS;
2794 }
2795
2796 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2797    Check if a text is encoded in BIG5.  If it is, return
2798    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2799
2800 static int
2801 detect_coding_big5 (src, src_end, multibytep)
2802      unsigned char *src, *src_end;
2803      int multibytep;
2804 {
2805   int c;
2806   /* Dummy for ONE_MORE_BYTE.  */
2807   struct coding_system dummy_coding;
2808   struct coding_system *coding = &dummy_coding;
2809
2810   while (1)
2811     {
2812       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2813       if (c < 0x80)
2814         continue;
2815       if (c < 0xA1 || c > 0xFE)
2816         return 0;
2817       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2818       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2819         return 0;
2820     }
2821  label_end_of_loop:
2822   return CODING_CATEGORY_MASK_BIG5;
2823 }
2824
2825 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2826    Check if a text is encoded in UTF-8.  If it is, return
2827    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2828
2829 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2830 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2831 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2832 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2833 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2834 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2835 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2836
2837 static int
2838 detect_coding_utf_8 (src, src_end, multibytep)
2839      unsigned char *src, *src_end;
2840      int multibytep;
2841 {
2842   unsigned char c;
2843   int seq_maybe_bytes;
2844   /* Dummy for ONE_MORE_BYTE.  */
2845   struct coding_system dummy_coding;
2846   struct coding_system *coding = &dummy_coding;
2847
2848   while (1)
2849     {
2850       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2851       if (UTF_8_1_OCTET_P (c))
2852         continue;
2853       else if (UTF_8_2_OCTET_LEADING_P (c))
2854         seq_maybe_bytes = 1;
2855       else if (UTF_8_3_OCTET_LEADING_P (c))
2856         seq_maybe_bytes = 2;
2857       else if (UTF_8_4_OCTET_LEADING_P (c))
2858         seq_maybe_bytes = 3;
2859       else if (UTF_8_5_OCTET_LEADING_P (c))
2860         seq_maybe_bytes = 4;
2861       else if (UTF_8_6_OCTET_LEADING_P (c))
2862         seq_maybe_bytes = 5;
2863       else
2864         return 0;
2865
2866       do
2867         {
2868           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2869           if (!UTF_8_EXTRA_OCTET_P (c))
2870             return 0;
2871           seq_maybe_bytes--;
2872         }
2873       while (seq_maybe_bytes > 0);
2874     }
2875
2876  label_end_of_loop:
2877   return CODING_CATEGORY_MASK_UTF_8;
2878 }
2879
2880 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2881    Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2882    Little Endian (otherwise).  If it is, return
2883    CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
2884    else return 0.  */
2885
2886 #define UTF_16_INVALID_P(val)   \
2887   (((val) == 0xFFFE)            \
2888    || ((val) == 0xFFFF))
2889
2890 #define UTF_16_HIGH_SURROGATE_P(val) \
2891   (((val) & 0xD800) == 0xD800)
2892
2893 #define UTF_16_LOW_SURROGATE_P(val) \
2894   (((val) & 0xDC00) == 0xDC00)
2895
2896 static int
2897 detect_coding_utf_16 (src, src_end, multibytep)
2898      unsigned char *src, *src_end;
2899      int multibytep;
2900 {
2901   unsigned char c1, c2;
2902   /* Dummy for TWO_MORE_BYTES.  */
2903   struct coding_system dummy_coding;
2904   struct coding_system *coding = &dummy_coding;
2905
2906   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2907   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2908
2909   if ((c1 == 0xFF) && (c2 == 0xFE))
2910     return CODING_CATEGORY_MASK_UTF_16_LE;
2911   else if ((c1 == 0xFE) && (c2 == 0xFF))
2912     return CODING_CATEGORY_MASK_UTF_16_BE;
2913
2914  label_end_of_loop:
2915   return 0;
2916 }
2917
2918 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2919    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2920
2921 static void
2922 decode_coding_sjis_big5 (coding, source, destination,
2923                          src_bytes, dst_bytes, sjis_p)
2924      struct coding_system *coding;
2925      unsigned char *source, *destination;
2926      int src_bytes, dst_bytes;
2927      int sjis_p;
2928 {
2929   unsigned char *src = source;
2930   unsigned char *src_end = source + src_bytes;
2931   unsigned char *dst = destination;
2932   unsigned char *dst_end = destination + dst_bytes;
2933   /* SRC_BASE remembers the start position in source in each loop.
2934      The loop will be exited when there's not enough source code
2935      (within macro ONE_MORE_BYTE), or when there's not enough
2936      destination area to produce a character (within macro
2937      EMIT_CHAR).  */
2938   unsigned char *src_base;
2939   Lisp_Object translation_table;
2940
2941   if (NILP (Venable_character_translation))
2942     translation_table = Qnil;
2943   else
2944     {
2945       translation_table = coding->translation_table_for_decode;
2946       if (NILP (translation_table))
2947         translation_table = Vstandard_translation_table_for_decode;
2948     }
2949
2950   coding->produced_char = 0;
2951   while (1)
2952     {
2953       int c, charset, c1, c2;
2954
2955       src_base = src;
2956       ONE_MORE_BYTE (c1);
2957
2958       if (c1 < 0x80)
2959         {
2960           charset = CHARSET_ASCII;
2961           if (c1 < 0x20)
2962             {
2963               if (c1 == '\r')
2964                 {
2965                   if (coding->eol_type == CODING_EOL_CRLF)
2966                     {
2967                       ONE_MORE_BYTE (c2);
2968                       if (c2 == '\n')
2969                         c1 = c2;
2970                       else
2971                         /* To process C2 again, SRC is subtracted by 1.  */
2972                         src--;
2973                     }
2974                   else if (coding->eol_type == CODING_EOL_CR)
2975                     c1 = '\n';
2976                 }
2977               else if (c1 == '\n'
2978                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2979                        && (coding->eol_type == CODING_EOL_CR
2980                            || coding->eol_type == CODING_EOL_CRLF))
2981                 {
2982                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2983                   goto label_end_of_loop;
2984                 }
2985             }
2986         }
2987       else
2988         {
2989           if (sjis_p)
2990             {
2991               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
2992                 goto label_invalid_code;
2993               if (c1 <= 0x9F || c1 >= 0xE0)
2994                 {
2995                   /* SJIS -> JISX0208 */
2996                   ONE_MORE_BYTE (c2);
2997                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2998                     goto label_invalid_code;
2999                   DECODE_SJIS (c1, c2, c1, c2);
3000                   charset = charset_jisx0208;
3001                 }
3002               else
3003                 /* SJIS -> JISX0201-Kana */
3004                 charset = charset_katakana_jisx0201;
3005             }
3006           else
3007             {
3008               /* BIG5 -> Big5 */
3009               if (c1 < 0xA0 || c1 > 0xFE)
3010                 goto label_invalid_code;
3011               ONE_MORE_BYTE (c2);
3012               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3013                 goto label_invalid_code;
3014               DECODE_BIG5 (c1, c2, charset, c1, c2);
3015             }
3016         }
3017
3018       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3019       EMIT_CHAR (c);
3020       continue;
3021
3022     label_invalid_code:
3023       coding->errors++;
3024       src = src_base;
3025       c = *src++;
3026       EMIT_CHAR (c);
3027     }
3028
3029  label_end_of_loop:
3030   coding->consumed = coding->consumed_char = src_base - source;
3031   coding->produced = dst - destination;
3032   return;
3033 }
3034
3035 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3036    This function can encode charsets `ascii', `katakana-jisx0201',
3037    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3038    are sure that all these charsets are registered as official charset
3039    (i.e. do not have extended leading-codes).  Characters of other
3040    charsets are produced without any encoding.  If SJIS_P is 1, encode
3041    SJIS text, else encode BIG5 text.  */
3042
3043 static void
3044 encode_coding_sjis_big5 (coding, source, destination,
3045                          src_bytes, dst_bytes, sjis_p)
3046      struct coding_system *coding;
3047      unsigned char *source, *destination;
3048      int src_bytes, dst_bytes;
3049      int sjis_p;
3050 {
3051   unsigned char *src = source;
3052   unsigned char *src_end = source + src_bytes;
3053   unsigned char *dst = destination;
3054   unsigned char *dst_end = destination + dst_bytes;
3055   /* SRC_BASE remembers the start position in source in each loop.
3056      The loop will be exited when there's not enough source text to
3057      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3058      there's not enough destination area to produce encoded codes
3059      (within macro EMIT_BYTES).  */
3060   unsigned char *src_base;
3061   Lisp_Object translation_table;
3062
3063   if (NILP (Venable_character_translation))
3064     translation_table = Qnil;
3065   else
3066     {
3067       translation_table = coding->translation_table_for_encode;
3068       if (NILP (translation_table))
3069         translation_table = Vstandard_translation_table_for_encode;
3070     }
3071
3072   while (1)
3073     {
3074       int c, charset, c1, c2;
3075
3076       src_base = src;
3077       ONE_MORE_CHAR (c);
3078
3079       /* Now encode the character C.  */
3080       if (SINGLE_BYTE_CHAR_P (c))
3081         {
3082           switch (c)
3083             {
3084             case '\r':
3085               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3086                 {
3087                   EMIT_ONE_BYTE (c);
3088                   break;
3089                 }
3090               c = '\n';
3091             case '\n':
3092               if (coding->eol_type == CODING_EOL_CRLF)
3093                 {
3094                   EMIT_TWO_BYTES ('\r', c);
3095                   break;
3096                 }
3097               else if (coding->eol_type == CODING_EOL_CR)
3098                 c = '\r';
3099             default:
3100               EMIT_ONE_BYTE (c);
3101             }
3102         }
3103       else
3104         {
3105           SPLIT_CHAR (c, charset, c1, c2);
3106           if (sjis_p)
3107             {
3108               if (charset == charset_jisx0208
3109                   || charset == charset_jisx0208_1978)
3110                 {
3111                   ENCODE_SJIS (c1, c2, c1, c2);
3112                   EMIT_TWO_BYTES (c1, c2);
3113                 }
3114               else if (charset == charset_katakana_jisx0201)
3115                 EMIT_ONE_BYTE (c1 | 0x80);
3116               else if (charset == charset_latin_jisx0201)
3117                 EMIT_ONE_BYTE (c1);
3118               else
3119                 /* There's no way other than producing the internal
3120                    codes as is.  */
3121                 EMIT_BYTES (src_base, src);
3122             }
3123           else
3124             {
3125               if (charset == charset_big5_1 || charset == charset_big5_2)
3126                 {
3127                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3128                   EMIT_TWO_BYTES (c1, c2);
3129                 }
3130               else
3131                 /* There's no way other than producing the internal
3132                    codes as is.  */
3133                 EMIT_BYTES (src_base, src);
3134             }
3135         }
3136       coding->consumed_char++;
3137     }
3138
3139  label_end_of_loop:
3140   coding->consumed = src_base - source;
3141   coding->produced = coding->produced_char = dst - destination;
3142 }
3143
3144 \f
3145 /*** 5. CCL handlers ***/
3146
3147 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3148    Check if a text is encoded in a coding system of which
3149    encoder/decoder are written in CCL program.  If it is, return
3150    CODING_CATEGORY_MASK_CCL, else return 0.  */
3151
3152 static int
3153 detect_coding_ccl (src, src_end, multibytep)
3154      unsigned char *src, *src_end;
3155      int multibytep;
3156 {
3157   unsigned char *valid;
3158   int c;
3159   /* Dummy for ONE_MORE_BYTE.  */
3160   struct coding_system dummy_coding;
3161   struct coding_system *coding = &dummy_coding;
3162
3163   /* No coding system is assigned to coding-category-ccl.  */
3164   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3165     return 0;
3166
3167   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3168   while (1)
3169     {
3170       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3171       if (! valid[c])
3172         return 0;
3173     }
3174  label_end_of_loop:
3175   return CODING_CATEGORY_MASK_CCL;
3176 }
3177
3178 \f
3179 /*** 6. End-of-line handlers ***/
3180
3181 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3182
3183 static void
3184 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3185      struct coding_system *coding;
3186      unsigned char *source, *destination;
3187      int src_bytes, dst_bytes;
3188 {
3189   unsigned char *src = source;
3190   unsigned char *dst = destination;
3191   unsigned char *src_end = src + src_bytes;
3192   unsigned char *dst_end = dst + dst_bytes;
3193   Lisp_Object translation_table;
3194   /* SRC_BASE remembers the start position in source in each loop.
3195      The loop will be exited when there's not enough source code
3196      (within macro ONE_MORE_BYTE), or when there's not enough
3197      destination area to produce a character (within macro
3198      EMIT_CHAR).  */
3199   unsigned char *src_base;
3200   int c;
3201
3202   translation_table = Qnil;
3203   switch (coding->eol_type)
3204     {
3205     case CODING_EOL_CRLF:
3206       while (1)
3207         {
3208           src_base = src;
3209           ONE_MORE_BYTE (c);
3210           if (c == '\r')
3211             {
3212               ONE_MORE_BYTE (c);
3213               if (c != '\n')
3214                 {
3215                   src--;
3216                   c = '\r';
3217                 }
3218             }
3219           else if (c == '\n'
3220                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3221             {
3222               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3223               goto label_end_of_loop;
3224             }
3225           EMIT_CHAR (c);
3226         }
3227       break;
3228
3229     case CODING_EOL_CR:
3230       while (1)
3231         {
3232           src_base = src;
3233           ONE_MORE_BYTE (c);
3234           if (c == '\n')
3235             {
3236               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3237                 {
3238                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3239                   goto label_end_of_loop;
3240                 }
3241             }
3242           else if (c == '\r')
3243             c = '\n';
3244           EMIT_CHAR (c);
3245         }
3246       break;
3247
3248     default:                    /* no need for EOL handling */
3249       while (1)
3250         {
3251           src_base = src;
3252           ONE_MORE_BYTE (c);
3253           EMIT_CHAR (c);
3254         }
3255     }
3256
3257  label_end_of_loop:
3258   coding->consumed = coding->consumed_char = src_base - source;
3259   coding->produced = dst - destination;
3260   return;
3261 }
3262
3263 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3264    format of end-of-line according to `coding->eol_type'.  It also
3265    convert multibyte form 8-bit characters to unibyte if
3266    CODING->src_multibyte is nonzero.  If `coding->mode &
3267    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3268    also means end-of-line.  */
3269
3270 static void
3271 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3272      struct coding_system *coding;
3273      const unsigned char *source;
3274      unsigned char *destination;
3275      int src_bytes, dst_bytes;
3276 {
3277   const unsigned char *src = source;
3278   unsigned char *dst = destination;
3279   const unsigned char *src_end = src + src_bytes;
3280   unsigned char *dst_end = dst + dst_bytes;
3281   Lisp_Object translation_table;
3282   /* SRC_BASE remembers the start position in source in each loop.
3283      The loop will be exited when there's not enough source text to
3284      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3285      there's not enough destination area to produce encoded codes
3286      (within macro EMIT_BYTES).  */
3287   const unsigned char *src_base;
3288   unsigned char *tmp;
3289   int c;
3290   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3291
3292   translation_table = Qnil;
3293   if (coding->src_multibyte
3294       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3295     {
3296       src_end--;
3297       src_bytes--;
3298       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3299     }
3300
3301   if (coding->eol_type == CODING_EOL_CRLF)
3302     {
3303       while (src < src_end)
3304         {
3305           src_base = src;
3306           c = *src++;
3307           if (c >= 0x20)
3308             EMIT_ONE_BYTE (c);
3309           else if (c == '\n' || (c == '\r' && selective_display))
3310             EMIT_TWO_BYTES ('\r', '\n');
3311           else
3312             EMIT_ONE_BYTE (c);
3313         }
3314       src_base = src;
3315     label_end_of_loop:
3316       ;
3317     }
3318   else
3319     {
3320       if (!dst_bytes || src_bytes <= dst_bytes)
3321         {
3322           safe_bcopy (src, dst, src_bytes);
3323           src_base = src_end;
3324           dst += src_bytes;
3325         }
3326       else
3327         {
3328           if (coding->src_multibyte
3329               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3330             dst_bytes--;
3331           safe_bcopy (src, dst, dst_bytes);
3332           src_base = src + dst_bytes;
3333           dst = destination + dst_bytes;
3334           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3335         }
3336       if (coding->eol_type == CODING_EOL_CR)
3337         {
3338           for (tmp = destination; tmp < dst; tmp++)
3339             if (*tmp == '\n') *tmp = '\r';
3340         }
3341       else if (selective_display)
3342         {
3343           for (tmp = destination; tmp < dst; tmp++)
3344             if (*tmp == '\r') *tmp = '\n';
3345         }
3346     }
3347   if (coding->src_multibyte)
3348     dst = destination + str_as_unibyte (destination, dst - destination);
3349
3350   coding->consumed = src_base - source;
3351   coding->produced = dst - destination;
3352   coding->produced_char = coding->produced;
3353 }
3354
3355 \f
3356 /*** 7. C library functions ***/
3357
3358 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3359    has a property `coding-system'.  The value of this property is a
3360    vector of length 5 (called the coding-vector).  Among elements of
3361    this vector, the first (element[0]) and the fifth (element[4])
3362    carry important information for decoding/encoding.  Before
3363    decoding/encoding, this information should be set in fields of a
3364    structure of type `coding_system'.
3365
3366    The value of the property `coding-system' can be a symbol of another
3367    subsidiary coding-system.  In that case, Emacs gets coding-vector
3368    from that symbol.
3369
3370    `element[0]' contains information to be set in `coding->type'.  The
3371    value and its meaning is as follows:
3372
3373    0 -- coding_type_emacs_mule
3374    1 -- coding_type_sjis
3375    2 -- coding_type_iso2022
3376    3 -- coding_type_big5
3377    4 -- coding_type_ccl encoder/decoder written in CCL
3378    nil -- coding_type_no_conversion
3379    t -- coding_type_undecided (automatic conversion on decoding,
3380                                no-conversion on encoding)
3381
3382    `element[4]' contains information to be set in `coding->flags' and
3383    `coding->spec'.  The meaning varies by `coding->type'.
3384
3385    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3386    of length 32 (of which the first 13 sub-elements are used now).
3387    Meanings of these sub-elements are:
3388
3389    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3390         If the value is an integer of valid charset, the charset is
3391         assumed to be designated to graphic register N initially.
3392
3393         If the value is minus, it is a minus value of charset which
3394         reserves graphic register N, which means that the charset is
3395         not designated initially but should be designated to graphic
3396         register N just before encoding a character in that charset.
3397
3398         If the value is nil, graphic register N is never used on
3399         encoding.
3400
3401    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3402         Each value takes t or nil.  See the section ISO2022 of
3403         `coding.h' for more information.
3404
3405    If `coding->type' is `coding_type_big5', element[4] is t to denote
3406    BIG5-ETen or nil to denote BIG5-HKU.
3407
3408    If `coding->type' takes the other value, element[4] is ignored.
3409
3410    Emacs Lisp's coding systems also carry information about format of
3411    end-of-line in a value of property `eol-type'.  If the value is
3412    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3413    means CODING_EOL_CR.  If it is not integer, it should be a vector
3414    of subsidiary coding systems of which property `eol-type' has one
3415    of the above values.
3416
3417 */
3418
3419 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3420    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3421    is setup so that no conversion is necessary and return -1, else
3422    return 0.  */
3423
3424 int
3425 setup_coding_system (coding_system, coding)
3426      Lisp_Object coding_system;
3427      struct coding_system *coding;
3428 {
3429   Lisp_Object coding_spec, coding_type, eol_type, plist;
3430   Lisp_Object val;
3431
3432   /* At first, zero clear all members.  */
3433   bzero (coding, sizeof (struct coding_system));
3434
3435   /* Initialize some fields required for all kinds of coding systems.  */
3436   coding->symbol = coding_system;
3437   coding->heading_ascii = -1;
3438   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3439   coding->composing = COMPOSITION_DISABLED;
3440   coding->cmp_data = NULL;
3441
3442   if (NILP (coding_system))
3443     goto label_invalid_coding_system;
3444
3445   coding_spec = Fget (coding_system, Qcoding_system);
3446
3447   if (!VECTORP (coding_spec)
3448       || XVECTOR (coding_spec)->size != 5
3449       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3450     goto label_invalid_coding_system;
3451
3452   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3453   if (VECTORP (eol_type))
3454     {
3455       coding->eol_type = CODING_EOL_UNDECIDED;
3456       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3457     }
3458   else if (XFASTINT (eol_type) == 1)
3459     {
3460       coding->eol_type = CODING_EOL_CRLF;
3461       coding->common_flags
3462         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3463     }
3464   else if (XFASTINT (eol_type) == 2)
3465     {
3466       coding->eol_type = CODING_EOL_CR;
3467       coding->common_flags
3468         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3469     }
3470   else
3471     coding->eol_type = CODING_EOL_LF;
3472
3473   coding_type = XVECTOR (coding_spec)->contents[0];
3474   /* Try short cut.  */
3475   if (SYMBOLP (coding_type))
3476     {
3477       if (EQ (coding_type, Qt))
3478         {
3479           coding->type = coding_type_undecided;
3480           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3481         }
3482       else
3483         coding->type = coding_type_no_conversion;
3484       /* Initialize this member.  Any thing other than
3485          CODING_CATEGORY_IDX_UTF_16_BE and
3486          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3487          special treatment in detect_eol.  */
3488       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3489
3490       return 0;
3491     }
3492
3493   /* Get values of coding system properties:
3494      `post-read-conversion', `pre-write-conversion',
3495      `translation-table-for-decode', `translation-table-for-encode'.  */
3496   plist = XVECTOR (coding_spec)->contents[3];
3497   /* Pre & post conversion functions should be disabled if
3498      inhibit_eol_conversion is nonzero.  This is the case that a code
3499      conversion function is called while those functions are running.  */
3500   if (! inhibit_pre_post_conversion)
3501     {
3502       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3503       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3504     }
3505   val = Fplist_get (plist, Qtranslation_table_for_decode);
3506   if (SYMBOLP (val))
3507     val = Fget (val, Qtranslation_table_for_decode);
3508   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3509   val = Fplist_get (plist, Qtranslation_table_for_encode);
3510   if (SYMBOLP (val))
3511     val = Fget (val, Qtranslation_table_for_encode);
3512   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3513   val = Fplist_get (plist, Qcoding_category);
3514   if (!NILP (val))
3515     {
3516       val = Fget (val, Qcoding_category_index);
3517       if (INTEGERP (val))
3518         coding->category_idx = XINT (val);
3519       else
3520         goto label_invalid_coding_system;
3521     }
3522   else
3523     goto label_invalid_coding_system;
3524
3525   /* If the coding system has non-nil `composition' property, enable
3526      composition handling.  */
3527   val = Fplist_get (plist, Qcomposition);
3528   if (!NILP (val))
3529     coding->composing = COMPOSITION_NO;
3530
3531   switch (XFASTINT (coding_type))
3532     {
3533     case 0:
3534       coding->type = coding_type_emacs_mule;
3535       coding->common_flags
3536         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3537       if (!NILP (coding->post_read_conversion))
3538         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3539       if (!NILP (coding->pre_write_conversion))
3540         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3541       break;
3542
3543     case 1:
3544       coding->type = coding_type_sjis;
3545       coding->common_flags
3546         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3547       break;
3548
3549     case 2:
3550       coding->type = coding_type_iso2022;
3551       coding->common_flags
3552         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3553       {
3554         Lisp_Object val, temp;
3555         Lisp_Object *flags;
3556         int i, charset, reg_bits = 0;
3557
3558         val = XVECTOR (coding_spec)->contents[4];
3559
3560         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3561           goto label_invalid_coding_system;
3562
3563         flags = XVECTOR (val)->contents;
3564         coding->flags
3565           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3566              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3567              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3568              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3569              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3570              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3571              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3572              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3573              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3574              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3575              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3576              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3577              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3578              );
3579
3580         /* Invoke graphic register 0 to plane 0.  */
3581         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3582         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3583         CODING_SPEC_ISO_INVOCATION (coding, 1)
3584           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3585         /* Not single shifting at first.  */
3586         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3587         /* Beginning of buffer should also be regarded as bol. */
3588         CODING_SPEC_ISO_BOL (coding) = 1;
3589
3590         for (charset = 0; charset <= MAX_CHARSET; charset++)
3591           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3592         val = Vcharset_revision_alist;
3593         while (CONSP (val))
3594           {
3595             charset = get_charset_id (Fcar_safe (XCAR (val)));
3596             if (charset >= 0
3597                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3598                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3599               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3600             val = XCDR (val);
3601           }
3602
3603         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3604            FLAGS[REG] can be one of below:
3605                 integer CHARSET: CHARSET occupies register I,
3606                 t: designate nothing to REG initially, but can be used
3607                   by any charsets,
3608                 list of integer, nil, or t: designate the first
3609                   element (if integer) to REG initially, the remaining
3610                   elements (if integer) is designated to REG on request,
3611                   if an element is t, REG can be used by any charsets,
3612                 nil: REG is never used.  */
3613         for (charset = 0; charset <= MAX_CHARSET; charset++)
3614           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3615             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3616         for (i = 0; i < 4; i++)
3617           {
3618             if ((INTEGERP (flags[i])
3619                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3620                 || (charset = get_charset_id (flags[i])) >= 0)
3621               {
3622                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3623                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3624               }
3625             else if (EQ (flags[i], Qt))
3626               {
3627                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3628                 reg_bits |= 1 << i;
3629                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3630               }
3631             else if (CONSP (flags[i]))
3632               {
3633                 Lisp_Object tail;
3634                 tail = flags[i];
3635
3636                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3637                 if ((INTEGERP (XCAR (tail))
3638                      && (charset = XINT (XCAR (tail)),
3639                          CHARSET_VALID_P (charset)))
3640                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3641                   {
3642                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3643                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3644                   }
3645                 else
3646                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3647                 tail = XCDR (tail);
3648                 while (CONSP (tail))
3649                   {
3650                     if ((INTEGERP (XCAR (tail))
3651                          && (charset = XINT (XCAR (tail)),
3652                              CHARSET_VALID_P (charset)))
3653                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3654                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3655                         = i;
3656                     else if (EQ (XCAR (tail), Qt))
3657                       reg_bits |= 1 << i;
3658                     tail = XCDR (tail);
3659                   }
3660               }
3661             else
3662               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3663
3664             CODING_SPEC_ISO_DESIGNATION (coding, i)
3665               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3666           }
3667
3668         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3669           {
3670             /* REG 1 can be used only by locking shift in 7-bit env.  */
3671             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3672               reg_bits &= ~2;
3673             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3674               /* Without any shifting, only REG 0 and 1 can be used.  */
3675               reg_bits &= 3;
3676           }
3677
3678         if (reg_bits)
3679           for (charset = 0; charset <= MAX_CHARSET; charset++)
3680             {
3681               if (CHARSET_DEFINED_P (charset)
3682                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3683                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3684                 {
3685                   /* There exist some default graphic registers to be
3686                      used by CHARSET.  */
3687
3688                   /* We had better avoid designating a charset of
3689                      CHARS96 to REG 0 as far as possible.  */
3690                   if (CHARSET_CHARS (charset) == 96)
3691                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3692                       = (reg_bits & 2
3693                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3694                   else
3695                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3696                       = (reg_bits & 1
3697                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3698                 }
3699             }
3700       }
3701       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3702       coding->spec.iso2022.last_invalid_designation_register = -1;
3703       break;
3704
3705     case 3:
3706       coding->type = coding_type_big5;
3707       coding->common_flags
3708         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3709       coding->flags
3710         = (NILP (XVECTOR (coding_spec)->contents[4])
3711            ? CODING_FLAG_BIG5_HKU
3712            : CODING_FLAG_BIG5_ETEN);
3713       break;
3714
3715     case 4:
3716       coding->type = coding_type_ccl;
3717       coding->common_flags
3718         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3719       {
3720         val = XVECTOR (coding_spec)->contents[4];
3721         if (! CONSP (val)
3722             || setup_ccl_program (&(coding->spec.ccl.decoder),
3723                                   XCAR (val)) < 0
3724             || setup_ccl_program (&(coding->spec.ccl.encoder),
3725                                   XCDR (val)) < 0)
3726           goto label_invalid_coding_system;
3727
3728         bzero (coding->spec.ccl.valid_codes, 256);
3729         val = Fplist_get (plist, Qvalid_codes);
3730         if (CONSP (val))
3731           {
3732             Lisp_Object this;
3733
3734             for (; CONSP (val); val = XCDR (val))
3735               {
3736                 this = XCAR (val);
3737                 if (INTEGERP (this)
3738                     && XINT (this) >= 0 && XINT (this) < 256)
3739                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3740                 else if (CONSP (this)
3741                          && INTEGERP (XCAR (this))
3742                          && INTEGERP (XCDR (this)))
3743                   {
3744                     int start = XINT (XCAR (this));
3745                     int end = XINT (XCDR (this));
3746
3747                     if (start >= 0 && start <= end && end < 256)
3748                       while (start <= end)
3749                         coding->spec.ccl.valid_codes[start++] = 1;
3750                   }
3751               }
3752           }
3753       }
3754       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3755       coding->spec.ccl.cr_carryover = 0;
3756       coding->spec.ccl.eight_bit_carryover[0] = 0;
3757       break;
3758
3759     case 5:
3760       coding->type = coding_type_raw_text;
3761       break;
3762
3763     default:
3764       goto label_invalid_coding_system;
3765     }
3766   return 0;
3767
3768  label_invalid_coding_system:
3769   coding->type = coding_type_no_conversion;
3770   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3771   coding->common_flags = 0;
3772   coding->eol_type = CODING_EOL_LF;
3773   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3774   return -1;
3775 }
3776
3777 /* Free memory blocks allocated for storing composition information.  */
3778
3779 void
3780 coding_free_composition_data (coding)
3781      struct coding_system *coding;
3782 {
3783   struct composition_data *cmp_data = coding->cmp_data, *next;
3784
3785   if (!cmp_data)
3786     return;
3787   /* Memory blocks are chained.  At first, rewind to the first, then,
3788      free blocks one by one.  */
3789   while (cmp_data->prev)
3790     cmp_data = cmp_data->prev;
3791   while (cmp_data)
3792     {
3793       next = cmp_data->next;
3794       xfree (cmp_data);
3795       cmp_data = next;
3796     }
3797   coding->cmp_data = NULL;
3798 }
3799
3800 /* Set `char_offset' member of all memory blocks pointed by
3801    coding->cmp_data to POS.  */
3802
3803 void
3804 coding_adjust_composition_offset (coding, pos)
3805      struct coding_system *coding;
3806      int pos;
3807 {
3808   struct composition_data *cmp_data;
3809
3810   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3811     cmp_data->char_offset = pos;
3812 }
3813
3814 /* Setup raw-text or one of its subsidiaries in the structure
3815    coding_system CODING according to the already setup value eol_type
3816    in CODING.  CODING should be setup for some coding system in
3817    advance.  */
3818
3819 void
3820 setup_raw_text_coding_system (coding)
3821      struct coding_system *coding;
3822 {
3823   if (coding->type != coding_type_raw_text)
3824     {
3825       coding->symbol = Qraw_text;
3826       coding->type = coding_type_raw_text;
3827       if (coding->eol_type != CODING_EOL_UNDECIDED)
3828         {
3829           Lisp_Object subsidiaries;
3830           subsidiaries = Fget (Qraw_text, Qeol_type);
3831
3832           if (VECTORP (subsidiaries)
3833               && XVECTOR (subsidiaries)->size == 3)
3834             coding->symbol
3835               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3836         }
3837       setup_coding_system (coding->symbol, coding);
3838     }
3839   return;
3840 }
3841
3842 /* Emacs has a mechanism to automatically detect a coding system if it
3843    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3844    it's impossible to distinguish some coding systems accurately
3845    because they use the same range of codes.  So, at first, coding
3846    systems are categorized into 7, those are:
3847
3848    o coding-category-emacs-mule
3849
3850         The category for a coding system which has the same code range
3851         as Emacs' internal format.  Assigned the coding-system (Lisp
3852         symbol) `emacs-mule' by default.
3853
3854    o coding-category-sjis
3855
3856         The category for a coding system which has the same code range
3857         as SJIS.  Assigned the coding-system (Lisp
3858         symbol) `japanese-shift-jis' by default.
3859
3860    o coding-category-iso-7
3861
3862         The category for a coding system which has the same code range
3863         as ISO2022 of 7-bit environment.  This doesn't use any locking
3864         shift and single shift functions.  This can encode/decode all
3865         charsets.  Assigned the coding-system (Lisp symbol)
3866         `iso-2022-7bit' by default.
3867
3868    o coding-category-iso-7-tight
3869
3870         Same as coding-category-iso-7 except that this can
3871         encode/decode only the specified charsets.
3872
3873    o coding-category-iso-8-1
3874
3875         The category for a coding system which has the same code range
3876         as ISO2022 of 8-bit environment and graphic plane 1 used only
3877         for DIMENSION1 charset.  This doesn't use any locking shift
3878         and single shift functions.  Assigned the coding-system (Lisp
3879         symbol) `iso-latin-1' by default.
3880
3881    o coding-category-iso-8-2
3882
3883         The category for a coding system which has the same code range
3884         as ISO2022 of 8-bit environment and graphic plane 1 used only
3885         for DIMENSION2 charset.  This doesn't use any locking shift
3886         and single shift functions.  Assigned the coding-system (Lisp
3887         symbol) `japanese-iso-8bit' by default.
3888
3889    o coding-category-iso-7-else
3890
3891         The category for a coding system which has the same code range
3892         as ISO2022 of 7-bit environment but uses locking shift or
3893         single shift functions.  Assigned the coding-system (Lisp
3894         symbol) `iso-2022-7bit-lock' by default.
3895
3896    o coding-category-iso-8-else
3897
3898         The category for a coding system which has the same code range
3899         as ISO2022 of 8-bit environment but uses locking shift or
3900         single shift functions.  Assigned the coding-system (Lisp
3901         symbol) `iso-2022-8bit-ss2' by default.
3902
3903    o coding-category-big5
3904
3905         The category for a coding system which has the same code range
3906         as BIG5.  Assigned the coding-system (Lisp symbol)
3907         `cn-big5' by default.
3908
3909    o coding-category-utf-8
3910
3911         The category for a coding system which has the same code range
3912         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3913         symbol) `utf-8' by default.
3914
3915    o coding-category-utf-16-be
3916
3917         The category for a coding system in which a text has an
3918         Unicode signature (cf. Unicode Standard) in the order of BIG
3919         endian at the head.  Assigned the coding-system (Lisp symbol)
3920         `utf-16-be' by default.
3921
3922    o coding-category-utf-16-le
3923
3924         The category for a coding system in which a text has an
3925         Unicode signature (cf. Unicode Standard) in the order of
3926         LITTLE endian at the head.  Assigned the coding-system (Lisp
3927         symbol) `utf-16-le' by default.
3928
3929    o coding-category-ccl
3930
3931         The category for a coding system of which encoder/decoder is
3932         written in CCL programs.  The default value is nil, i.e., no
3933         coding system is assigned.
3934
3935    o coding-category-binary
3936
3937         The category for a coding system not categorized in any of the
3938         above.  Assigned the coding-system (Lisp symbol)
3939         `no-conversion' by default.
3940
3941    Each of them is a Lisp symbol and the value is an actual
3942    `coding-system' (this is also a Lisp symbol) assigned by a user.
3943    What Emacs does actually is to detect a category of coding system.
3944    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3945    decide a single possible category, it selects a category of the
3946    highest priority.  Priorities of categories are also specified by a
3947    user in a Lisp variable `coding-category-list'.
3948
3949 */
3950
3951 static
3952 int ascii_skip_code[256];
3953
3954 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3955    If it detects possible coding systems, return an integer in which
3956    appropriate flag bits are set.  Flag bits are defined by macros
3957    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3958    it should point the table `coding_priorities'.  In that case, only
3959    the flag bit for a coding system of the highest priority is set in
3960    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
3961    range 0x80..0x9F are in multibyte form.
3962
3963    How many ASCII characters are at the head is returned as *SKIP.  */
3964
3965 static int
3966 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3967      unsigned char *source;
3968      int src_bytes, *priorities, *skip;
3969      int multibytep;
3970 {
3971   register unsigned char c;
3972   unsigned char *src = source, *src_end = source + src_bytes;
3973   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3974   int i;
3975
3976   /* At first, skip all ASCII characters and control characters except
3977      for three ISO2022 specific control characters.  */
3978   ascii_skip_code[ISO_CODE_SO] = 0;
3979   ascii_skip_code[ISO_CODE_SI] = 0;
3980   ascii_skip_code[ISO_CODE_ESC] = 0;
3981
3982  label_loop_detect_coding:
3983   while (src < src_end && ascii_skip_code[*src]) src++;
3984   *skip = src - source;
3985
3986   if (src >= src_end)
3987     /* We found nothing other than ASCII.  There's nothing to do.  */
3988     return 0;
3989
3990   c = *src;
3991   /* The text seems to be encoded in some multilingual coding system.
3992      Now, try to find in which coding system the text is encoded.  */
3993   if (c < 0x80)
3994     {
3995       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3996       /* C is an ISO2022 specific control code of C0.  */
3997       mask = detect_coding_iso2022 (src, src_end, multibytep);
3998       if (mask == 0)
3999         {
4000           /* No valid ISO2022 code follows C.  Try again.  */
4001           src++;
4002           if (c == ISO_CODE_ESC)
4003             ascii_skip_code[ISO_CODE_ESC] = 1;
4004           else
4005             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4006           goto label_loop_detect_coding;
4007         }
4008       if (priorities)
4009         {
4010           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4011             {
4012               if (mask & priorities[i])
4013                 return priorities[i];
4014             }
4015           return CODING_CATEGORY_MASK_RAW_TEXT;
4016         }
4017     }
4018   else
4019     {
4020       int try;
4021
4022       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4023         c = src[1] - 0x20;
4024
4025       if (c < 0xA0)
4026         {
4027           /* C is the first byte of SJIS character code,
4028              or a leading-code of Emacs' internal format (emacs-mule),
4029              or the first byte of UTF-16.  */
4030           try = (CODING_CATEGORY_MASK_SJIS
4031                   | CODING_CATEGORY_MASK_EMACS_MULE
4032                   | CODING_CATEGORY_MASK_UTF_16_BE
4033                   | CODING_CATEGORY_MASK_UTF_16_LE);
4034
4035           /* Or, if C is a special latin extra code,
4036              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4037              or is an ISO2022 control-sequence-introducer (CSI),
4038              we should also consider the possibility of ISO2022 codings.  */
4039           if ((VECTORP (Vlatin_extra_code_table)
4040                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4041               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4042               || (c == ISO_CODE_CSI
4043                   && (src < src_end
4044                       && (*src == ']'
4045                           || ((*src == '0' || *src == '1' || *src == '2')
4046                               && src + 1 < src_end
4047                               && src[1] == ']')))))
4048             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4049                      | CODING_CATEGORY_MASK_ISO_8BIT);
4050         }
4051       else
4052         /* C is a character of ISO2022 in graphic plane right,
4053            or a SJIS's 1-byte character code (i.e. JISX0201),
4054            or the first byte of BIG5's 2-byte code,
4055            or the first byte of UTF-8/16.  */
4056         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4057                 | CODING_CATEGORY_MASK_ISO_8BIT
4058                 | CODING_CATEGORY_MASK_SJIS
4059                 | CODING_CATEGORY_MASK_BIG5
4060                 | CODING_CATEGORY_MASK_UTF_8
4061                 | CODING_CATEGORY_MASK_UTF_16_BE
4062                 | CODING_CATEGORY_MASK_UTF_16_LE);
4063
4064       /* Or, we may have to consider the possibility of CCL.  */
4065       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4066           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4067               ->spec.ccl.valid_codes)[c])
4068         try |= CODING_CATEGORY_MASK_CCL;
4069
4070       mask = 0;
4071       utf16_examined_p = iso2022_examined_p = 0;
4072       if (priorities)
4073         {
4074           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4075             {
4076               if (!iso2022_examined_p
4077                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4078                 {
4079                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4080                   iso2022_examined_p = 1;
4081                 }
4082               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4083                 mask |= detect_coding_sjis (src, src_end, multibytep);
4084               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4085                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4086               else if (!utf16_examined_p
4087                        && (priorities[i] & try &
4088                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4089                 {
4090                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4091                   utf16_examined_p = 1;
4092                 }
4093               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4094                 mask |= detect_coding_big5 (src, src_end, multibytep);
4095               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4096                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4097               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4098                 mask |= detect_coding_ccl (src, src_end, multibytep);
4099               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4100                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4101               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4102                 mask |= CODING_CATEGORY_MASK_BINARY;
4103               if (mask & priorities[i])
4104                 return priorities[i];
4105             }
4106           return CODING_CATEGORY_MASK_RAW_TEXT;
4107         }
4108       if (try & CODING_CATEGORY_MASK_ISO)
4109         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4110       if (try & CODING_CATEGORY_MASK_SJIS)
4111         mask |= detect_coding_sjis (src, src_end, multibytep);
4112       if (try & CODING_CATEGORY_MASK_BIG5)
4113         mask |= detect_coding_big5 (src, src_end, multibytep);
4114       if (try & CODING_CATEGORY_MASK_UTF_8)
4115         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4116       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4117         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4118       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4119         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4120       if (try & CODING_CATEGORY_MASK_CCL)
4121         mask |= detect_coding_ccl (src, src_end, multibytep);
4122     }
4123   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4124 }
4125
4126 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4127    The information of the detected coding system is set in CODING.  */
4128
4129 void
4130 detect_coding (coding, src, src_bytes)
4131      struct coding_system *coding;
4132      const unsigned char *src;
4133      int src_bytes;
4134 {
4135   unsigned int idx;
4136   int skip, mask;
4137   Lisp_Object val;
4138
4139   val = Vcoding_category_list;
4140   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4141                              coding->src_multibyte);
4142   coding->heading_ascii = skip;
4143
4144   if (!mask) return;
4145
4146   /* We found a single coding system of the highest priority in MASK.  */
4147   idx = 0;
4148   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4149   if (! mask)
4150     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4151
4152   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4153
4154   if (coding->eol_type != CODING_EOL_UNDECIDED)
4155     {
4156       Lisp_Object tmp;
4157
4158       tmp = Fget (val, Qeol_type);
4159       if (VECTORP (tmp))
4160         val = XVECTOR (tmp)->contents[coding->eol_type];
4161     }
4162
4163   /* Setup this new coding system while preserving some slots.  */
4164   {
4165     int src_multibyte = coding->src_multibyte;
4166     int dst_multibyte = coding->dst_multibyte;
4167
4168     setup_coding_system (val, coding);
4169     coding->src_multibyte = src_multibyte;
4170     coding->dst_multibyte = dst_multibyte;
4171     coding->heading_ascii = skip;
4172   }
4173 }
4174
4175 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4176    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4177    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4178
4179    How many non-eol characters are at the head is returned as *SKIP.  */
4180
4181 #define MAX_EOL_CHECK_COUNT 3
4182
4183 static int
4184 detect_eol_type (source, src_bytes, skip)
4185      unsigned char *source;
4186      int src_bytes, *skip;
4187 {
4188   unsigned char *src = source, *src_end = src + src_bytes;
4189   unsigned char c;
4190   int total = 0;                /* How many end-of-lines are found so far.  */
4191   int eol_type = CODING_EOL_UNDECIDED;
4192   int this_eol_type;
4193
4194   *skip = 0;
4195
4196   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4197     {
4198       c = *src++;
4199       if (c == '\n' || c == '\r')
4200         {
4201           if (*skip == 0)
4202             *skip = src - 1 - source;
4203           total++;
4204           if (c == '\n')
4205             this_eol_type = CODING_EOL_LF;
4206           else if (src >= src_end || *src != '\n')
4207             this_eol_type = CODING_EOL_CR;
4208           else
4209             this_eol_type = CODING_EOL_CRLF, src++;
4210
4211           if (eol_type == CODING_EOL_UNDECIDED)
4212             /* This is the first end-of-line.  */
4213             eol_type = this_eol_type;
4214           else if (eol_type != this_eol_type)
4215             {
4216               /* The found type is different from what found before.  */
4217               eol_type = CODING_EOL_INCONSISTENT;
4218               break;
4219             }
4220         }
4221     }
4222
4223   if (*skip == 0)
4224     *skip = src_end - source;
4225   return eol_type;
4226 }
4227
4228 /* Like detect_eol_type, but detect EOL type in 2-octet
4229    big-endian/little-endian format for coding systems utf-16-be and
4230    utf-16-le.  */
4231
4232 static int
4233 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4234      unsigned char *source;
4235      int src_bytes, *skip, big_endian_p;
4236 {
4237   unsigned char *src = source, *src_end = src + src_bytes;
4238   unsigned int c1, c2;
4239   int total = 0;                /* How many end-of-lines are found so far.  */
4240   int eol_type = CODING_EOL_UNDECIDED;
4241   int this_eol_type;
4242   int msb, lsb;
4243
4244   if (big_endian_p)
4245     msb = 0, lsb = 1;
4246   else
4247     msb = 1, lsb = 0;
4248
4249   *skip = 0;
4250
4251   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4252     {
4253       c1 = (src[msb] << 8) | (src[lsb]);
4254       src += 2;
4255
4256       if (c1 == '\n' || c1 == '\r')
4257         {
4258           if (*skip == 0)
4259             *skip = src - 2 - source;
4260           total++;
4261           if (c1 == '\n')
4262             {
4263               this_eol_type = CODING_EOL_LF;
4264             }
4265           else
4266             {
4267               if ((src + 1) >= src_end)
4268                 {
4269                   this_eol_type = CODING_EOL_CR;
4270                 }
4271               else
4272                 {
4273                   c2 = (src[msb] << 8) | (src[lsb]);
4274                   if (c2 == '\n')
4275                     this_eol_type = CODING_EOL_CRLF, src += 2;
4276                   else
4277                     this_eol_type = CODING_EOL_CR;
4278                 }
4279             }
4280
4281           if (eol_type == CODING_EOL_UNDECIDED)
4282             /* This is the first end-of-line.  */
4283             eol_type = this_eol_type;
4284           else if (eol_type != this_eol_type)
4285             {
4286               /* The found type is different from what found before.  */
4287               eol_type = CODING_EOL_INCONSISTENT;
4288               break;
4289             }
4290         }
4291     }
4292
4293   if (*skip == 0)
4294     *skip = src_end - source;
4295   return eol_type;
4296 }
4297
4298 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4299    is encoded.  If it detects an appropriate format of end-of-line, it
4300    sets the information in *CODING.  */
4301
4302 void
4303 detect_eol (coding, src, src_bytes)
4304      struct coding_system *coding;
4305      const unsigned char *src;
4306      int src_bytes;
4307 {
4308   Lisp_Object val;
4309   int skip;
4310   int eol_type;
4311
4312   switch (coding->category_idx)
4313     {
4314     case CODING_CATEGORY_IDX_UTF_16_BE:
4315       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4316       break;
4317     case CODING_CATEGORY_IDX_UTF_16_LE:
4318       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4319       break;
4320     default:
4321       eol_type = detect_eol_type (src, src_bytes, &skip);
4322       break;
4323     }
4324
4325   if (coding->heading_ascii > skip)
4326     coding->heading_ascii = skip;
4327   else
4328     skip = coding->heading_ascii;
4329
4330   if (eol_type == CODING_EOL_UNDECIDED)
4331     return;
4332   if (eol_type == CODING_EOL_INCONSISTENT)
4333     {
4334 #if 0
4335       /* This code is suppressed until we find a better way to
4336          distinguish raw text file and binary file.  */
4337
4338       /* If we have already detected that the coding is raw-text, the
4339          coding should actually be no-conversion.  */
4340       if (coding->type == coding_type_raw_text)
4341         {
4342           setup_coding_system (Qno_conversion, coding);
4343           return;
4344         }
4345       /* Else, let's decode only text code anyway.  */
4346 #endif /* 0 */
4347       eol_type = CODING_EOL_LF;
4348     }
4349
4350   val = Fget (coding->symbol, Qeol_type);
4351   if (VECTORP (val) && XVECTOR (val)->size == 3)
4352     {
4353       int src_multibyte = coding->src_multibyte;
4354       int dst_multibyte = coding->dst_multibyte;
4355       struct composition_data *cmp_data = coding->cmp_data;
4356
4357       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4358       coding->src_multibyte = src_multibyte;
4359       coding->dst_multibyte = dst_multibyte;
4360       coding->heading_ascii = skip;
4361       coding->cmp_data = cmp_data;
4362     }
4363 }
4364
4365 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4366
4367 #define DECODING_BUFFER_MAG(coding)                     \
4368   (coding->type == coding_type_iso2022                  \
4369    ? 3                                                  \
4370    : (coding->type == coding_type_ccl                   \
4371       ? coding->spec.ccl.decoder.buf_magnification      \
4372       : 2))
4373
4374 /* Return maximum size (bytes) of a buffer enough for decoding
4375    SRC_BYTES of text encoded in CODING.  */
4376
4377 int
4378 decoding_buffer_size (coding, src_bytes)
4379      struct coding_system *coding;
4380      int src_bytes;
4381 {
4382   return (src_bytes * DECODING_BUFFER_MAG (coding)
4383           + CONVERSION_BUFFER_EXTRA_ROOM);
4384 }
4385
4386 /* Return maximum size (bytes) of a buffer enough for encoding
4387    SRC_BYTES of text to CODING.  */
4388
4389 int
4390 encoding_buffer_size (coding, src_bytes)
4391      struct coding_system *coding;
4392      int src_bytes;
4393 {
4394   int magnification;
4395
4396   if (coding->type == coding_type_ccl)
4397     magnification = coding->spec.ccl.encoder.buf_magnification;
4398   else if (CODING_REQUIRE_ENCODING (coding))
4399     magnification = 3;
4400   else
4401     magnification = 1;
4402
4403   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4404 }
4405
4406 /* Working buffer for code conversion.  */
4407 struct conversion_buffer
4408 {
4409   int size;                     /* size of data.  */
4410   int on_stack;                 /* 1 if allocated by alloca.  */
4411   unsigned char *data;
4412 };
4413
4414 /* Don't use alloca for allocating memory space larger than this, lest
4415    we overflow their stack.  */
4416 #define MAX_ALLOCA 16*1024
4417
4418 /* Allocate LEN bytes of memory for BUF (struct conversion_buffer).  */
4419 #define allocate_conversion_buffer(buf, len)            \
4420   do {                                                  \
4421     if (len < MAX_ALLOCA)                               \
4422       {                                                 \
4423         buf.data = (unsigned char *) alloca (len);      \
4424         buf.on_stack = 1;                               \
4425       }                                                 \
4426     else                                                \
4427       {                                                 \
4428         buf.data = (unsigned char *) xmalloc (len);     \
4429         buf.on_stack = 0;                               \
4430       }                                                 \
4431     buf.size = len;                                     \
4432   } while (0)
4433
4434 /* Double the allocated memory for *BUF.  */
4435 static void
4436 extend_conversion_buffer (buf)
4437      struct conversion_buffer *buf;
4438 {
4439   if (buf->on_stack)
4440     {
4441       unsigned char *save = buf->data;
4442       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4443       bcopy (save, buf->data, buf->size);
4444       buf->on_stack = 0;
4445     }
4446   else
4447     {
4448       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4449     }
4450   buf->size *= 2;
4451 }
4452
4453 /* Free the allocated memory for BUF if it is not on stack.  */
4454 static void
4455 free_conversion_buffer (buf)
4456      struct conversion_buffer *buf;
4457 {
4458   if (!buf->on_stack)
4459     xfree (buf->data);
4460 }
4461
4462 int
4463 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4464      struct coding_system *coding;
4465      unsigned char *source, *destination;
4466      int src_bytes, dst_bytes, encodep;
4467 {
4468   struct ccl_program *ccl
4469     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4470   unsigned char *dst = destination;
4471
4472   ccl->suppress_error = coding->suppress_error;
4473   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4474   if (encodep)
4475     {
4476       /* On encoding, EOL format is converted within ccl_driver.  For
4477          that, setup proper information in the structure CCL.  */
4478       ccl->eol_type = coding->eol_type;
4479       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4480         ccl->eol_type = CODING_EOL_LF;
4481       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4482     }
4483   ccl->multibyte = coding->src_multibyte;
4484   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4485     {
4486       /* Move carryover bytes to DESTINATION.  */
4487       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4488       while (*p)
4489         *dst++ = *p++;
4490       coding->spec.ccl.eight_bit_carryover[0] = 0;
4491       if (dst_bytes)
4492         dst_bytes -= dst - destination;
4493     }
4494
4495   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4496                                   &(coding->consumed))
4497                       + dst - destination);
4498
4499   if (encodep)
4500     {
4501       coding->produced_char = coding->produced;
4502       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4503     }
4504   else if (!ccl->eight_bit_control)
4505     {
4506       /* The produced bytes forms a valid multibyte sequence. */
4507       coding->produced_char
4508         = multibyte_chars_in_text (destination, coding->produced);
4509       coding->spec.ccl.eight_bit_carryover[0] = 0;
4510     }
4511   else
4512     {
4513       /* On decoding, the destination should always multibyte.  But,
4514          CCL program might have been generated an invalid multibyte
4515          sequence.  Here we make such a sequence valid as
4516          multibyte.  */
4517       int bytes
4518         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4519
4520       if ((coding->consumed < src_bytes
4521            || !ccl->last_block)
4522           && coding->produced >= 1
4523           && destination[coding->produced - 1] >= 0x80)
4524         {
4525           /* We should not convert the tailing 8-bit codes to
4526              multibyte form even if they doesn't form a valid
4527              multibyte sequence.  They may form a valid sequence in
4528              the next call.  */
4529           int carryover = 0;
4530
4531           if (destination[coding->produced - 1] < 0xA0)
4532             carryover = 1;
4533           else if (coding->produced >= 2)
4534             {
4535               if (destination[coding->produced - 2] >= 0x80)
4536                 {
4537                   if (destination[coding->produced - 2] < 0xA0)
4538                     carryover = 2;
4539                   else if (coding->produced >= 3
4540                            && destination[coding->produced - 3] >= 0x80
4541                            && destination[coding->produced - 3] < 0xA0)
4542                     carryover = 3;
4543                 }
4544             }
4545           if (carryover > 0)
4546             {
4547               BCOPY_SHORT (destination + coding->produced - carryover,
4548                            coding->spec.ccl.eight_bit_carryover,
4549                            carryover);
4550               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4551               coding->produced -= carryover;
4552             }
4553         }
4554       coding->produced = str_as_multibyte (destination, bytes,
4555                                            coding->produced,
4556                                            &(coding->produced_char));
4557     }
4558
4559   switch (ccl->status)
4560     {
4561     case CCL_STAT_SUSPEND_BY_SRC:
4562       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4563       break;
4564     case CCL_STAT_SUSPEND_BY_DST:
4565       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4566       break;
4567     case CCL_STAT_QUIT:
4568     case CCL_STAT_INVALID_CMD:
4569       coding->result = CODING_FINISH_INTERRUPT;
4570       break;
4571     default:
4572       coding->result = CODING_FINISH_NORMAL;
4573       break;
4574     }
4575   return coding->result;
4576 }
4577
4578 /* Decode EOL format of the text at PTR of BYTES length destructively
4579    according to CODING->eol_type.  This is called after the CCL
4580    program produced a decoded text at PTR.  If we do CRLF->LF
4581    conversion, update CODING->produced and CODING->produced_char.  */
4582
4583 static void
4584 decode_eol_post_ccl (coding, ptr, bytes)
4585      struct coding_system *coding;
4586      unsigned char *ptr;
4587      int bytes;
4588 {
4589   Lisp_Object val, saved_coding_symbol;
4590   unsigned char *pend = ptr + bytes;
4591   int dummy;
4592
4593   /* Remember the current coding system symbol.  We set it back when
4594      an inconsistent EOL is found so that `last-coding-system-used' is
4595      set to the coding system that doesn't specify EOL conversion.  */
4596   saved_coding_symbol = coding->symbol;
4597
4598   coding->spec.ccl.cr_carryover = 0;
4599   if (coding->eol_type == CODING_EOL_UNDECIDED)
4600     {
4601       /* Here, to avoid the call of setup_coding_system, we directly
4602          call detect_eol_type.  */
4603       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4604       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4605         coding->eol_type = CODING_EOL_LF;
4606       if (coding->eol_type != CODING_EOL_UNDECIDED)
4607         {
4608           val = Fget (coding->symbol, Qeol_type);
4609           if (VECTORP (val) && XVECTOR (val)->size == 3)
4610             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4611         }
4612       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4613     }
4614
4615   if (coding->eol_type == CODING_EOL_LF
4616       || coding->eol_type == CODING_EOL_UNDECIDED)
4617     {
4618       /* We have nothing to do.  */
4619       ptr = pend;
4620     }
4621   else if (coding->eol_type == CODING_EOL_CRLF)
4622     {
4623       unsigned char *pstart = ptr, *p = ptr;
4624
4625       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4626           && *(pend - 1) == '\r')
4627         {
4628           /* If the last character is CR, we can't handle it here
4629              because LF will be in the not-yet-decoded source text.
4630              Record that the CR is not yet processed.  */
4631           coding->spec.ccl.cr_carryover = 1;
4632           coding->produced--;
4633           coding->produced_char--;
4634           pend--;
4635         }
4636       while (ptr < pend)
4637         {
4638           if (*ptr == '\r')
4639             {
4640               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4641                 {
4642                   *p++ = '\n';
4643                   ptr += 2;
4644                 }
4645               else
4646                 {
4647                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4648                     goto undo_eol_conversion;
4649                   *p++ = *ptr++;
4650                 }
4651             }
4652           else if (*ptr == '\n'
4653                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4654             goto undo_eol_conversion;
4655           else
4656             *p++ = *ptr++;
4657           continue;
4658
4659         undo_eol_conversion:
4660           /* We have faced with inconsistent EOL format at PTR.
4661              Convert all LFs before PTR back to CRLFs.  */
4662           for (p--, ptr--; p >= pstart; p--)
4663             {
4664               if (*p == '\n')
4665                 *ptr-- = '\n', *ptr-- = '\r';
4666               else
4667                 *ptr-- = *p;
4668             }
4669           /*  If carryover is recorded, cancel it because we don't
4670               convert CRLF anymore.  */
4671           if (coding->spec.ccl.cr_carryover)
4672             {
4673               coding->spec.ccl.cr_carryover = 0;
4674               coding->produced++;
4675               coding->produced_char++;
4676               pend++;
4677             }
4678           p = ptr = pend;
4679           coding->eol_type = CODING_EOL_LF;
4680           coding->symbol = saved_coding_symbol;
4681         }
4682       if (p < pend)
4683         {
4684           /* As each two-byte sequence CRLF was converted to LF, (PEND
4685              - P) is the number of deleted characters.  */
4686           coding->produced -= pend - p;
4687           coding->produced_char -= pend - p;
4688         }
4689     }
4690   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4691     {
4692       unsigned char *p = ptr;
4693
4694       for (; ptr < pend; ptr++)
4695         {
4696           if (*ptr == '\r')
4697             *ptr = '\n';
4698           else if (*ptr == '\n'
4699                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4700             {
4701               for (; p < ptr; p++)
4702                 {
4703                   if (*p == '\n')
4704                     *p = '\r';
4705                 }
4706               ptr = pend;
4707               coding->eol_type = CODING_EOL_LF;
4708               coding->symbol = saved_coding_symbol;
4709             }
4710         }
4711     }
4712 }
4713
4714 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4715    decoding, it may detect coding system and format of end-of-line if
4716    those are not yet decided.  The source should be unibyte, the
4717    result is multibyte if CODING->dst_multibyte is nonzero, else
4718    unibyte.  */
4719
4720 int
4721 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4722      struct coding_system *coding;
4723      const unsigned char *source;
4724      unsigned char *destination;
4725      int src_bytes, dst_bytes;
4726 {
4727   int extra = 0;
4728
4729   if (coding->type == coding_type_undecided)
4730     detect_coding (coding, source, src_bytes);
4731
4732   if (coding->eol_type == CODING_EOL_UNDECIDED
4733       && coding->type != coding_type_ccl)
4734     {
4735       detect_eol (coding, source, src_bytes);
4736       /* We had better recover the original eol format if we
4737          encounter an inconsistent eol format while decoding.  */
4738       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4739     }
4740
4741   coding->produced = coding->produced_char = 0;
4742   coding->consumed = coding->consumed_char = 0;
4743   coding->errors = 0;
4744   coding->result = CODING_FINISH_NORMAL;
4745
4746   switch (coding->type)
4747     {
4748     case coding_type_sjis:
4749       decode_coding_sjis_big5 (coding, source, destination,
4750                                src_bytes, dst_bytes, 1);
4751       break;
4752
4753     case coding_type_iso2022:
4754       decode_coding_iso2022 (coding, source, destination,
4755                              src_bytes, dst_bytes);
4756       break;
4757
4758     case coding_type_big5:
4759       decode_coding_sjis_big5 (coding, source, destination,
4760                                src_bytes, dst_bytes, 0);
4761       break;
4762
4763     case coding_type_emacs_mule:
4764       decode_coding_emacs_mule (coding, source, destination,
4765                                 src_bytes, dst_bytes);
4766       break;
4767
4768     case coding_type_ccl:
4769       if (coding->spec.ccl.cr_carryover)
4770         {
4771           /* Put the CR which was not processed by the previous call
4772              of decode_eol_post_ccl in DESTINATION.  It will be
4773              decoded together with the following LF by the call to
4774              decode_eol_post_ccl below.  */
4775           *destination = '\r';
4776           coding->produced++;
4777           coding->produced_char++;
4778           dst_bytes--;
4779           extra = coding->spec.ccl.cr_carryover;
4780         }
4781       ccl_coding_driver (coding, source, destination + extra,
4782                          src_bytes, dst_bytes, 0);
4783       if (coding->eol_type != CODING_EOL_LF)
4784         {
4785           coding->produced += extra;
4786           coding->produced_char += extra;
4787           decode_eol_post_ccl (coding, destination, coding->produced);
4788         }
4789       break;
4790
4791     default:
4792       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4793     }
4794
4795   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4796       && coding->mode & CODING_MODE_LAST_BLOCK
4797       && coding->consumed == src_bytes)
4798     coding->result = CODING_FINISH_NORMAL;
4799
4800   if (coding->mode & CODING_MODE_LAST_BLOCK
4801       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4802     {
4803       const unsigned char *src = source + coding->consumed;
4804       unsigned char *dst = destination + coding->produced;
4805
4806       src_bytes -= coding->consumed;
4807       coding->errors++;
4808       if (COMPOSING_P (coding))
4809         DECODE_COMPOSITION_END ('1');
4810       while (src_bytes--)
4811         {
4812           int c = *src++;
4813           dst += CHAR_STRING (c, dst);
4814           coding->produced_char++;
4815         }
4816       coding->consumed = coding->consumed_char = src - source;
4817       coding->produced = dst - destination;
4818       coding->result = CODING_FINISH_NORMAL;
4819     }
4820
4821   if (!coding->dst_multibyte)
4822     {
4823       coding->produced = str_as_unibyte (destination, coding->produced);
4824       coding->produced_char = coding->produced;
4825     }
4826
4827   return coding->result;
4828 }
4829
4830 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4831    multibyteness of the source is CODING->src_multibyte, the
4832    multibyteness of the result is always unibyte.  */
4833
4834 int
4835 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4836      struct coding_system *coding;
4837      const unsigned char *source;
4838      unsigned char *destination;
4839      int src_bytes, dst_bytes;
4840 {
4841   coding->produced = coding->produced_char = 0;
4842   coding->consumed = coding->consumed_char = 0;
4843   coding->errors = 0;
4844   coding->result = CODING_FINISH_NORMAL;
4845
4846   switch (coding->type)
4847     {
4848     case coding_type_sjis:
4849       encode_coding_sjis_big5 (coding, source, destination,
4850                                src_bytes, dst_bytes, 1);
4851       break;
4852
4853     case coding_type_iso2022:
4854       encode_coding_iso2022 (coding, source, destination,
4855                              src_bytes, dst_bytes);
4856       break;
4857
4858     case coding_type_big5:
4859       encode_coding_sjis_big5 (coding, source, destination,
4860                                src_bytes, dst_bytes, 0);
4861       break;
4862
4863     case coding_type_emacs_mule:
4864       encode_coding_emacs_mule (coding, source, destination,
4865                                 src_bytes, dst_bytes);
4866       break;
4867
4868     case coding_type_ccl:
4869       ccl_coding_driver (coding, source, destination,
4870                          src_bytes, dst_bytes, 1);
4871       break;
4872
4873     default:
4874       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4875     }
4876
4877   if (coding->mode & CODING_MODE_LAST_BLOCK
4878       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4879     {
4880       const unsigned char *src = source + coding->consumed;
4881       unsigned char *dst = destination + coding->produced;
4882
4883       if (coding->type == coding_type_iso2022)
4884         ENCODE_RESET_PLANE_AND_REGISTER;
4885       if (COMPOSING_P (coding))
4886         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4887       if (coding->consumed < src_bytes)
4888         {
4889           int len = src_bytes - coding->consumed;
4890
4891           BCOPY_SHORT (src, dst, len);
4892           if (coding->src_multibyte)
4893             len = str_as_unibyte (dst, len);
4894           dst += len;
4895           coding->consumed = src_bytes;
4896         }
4897       coding->produced = coding->produced_char = dst - destination;
4898       coding->result = CODING_FINISH_NORMAL;
4899     }
4900
4901   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4902       && coding->consumed == src_bytes)
4903     coding->result = CODING_FINISH_NORMAL;
4904
4905   return coding->result;
4906 }
4907
4908 /* Scan text in the region between *BEG and *END (byte positions),
4909    skip characters which we don't have to decode by coding system
4910    CODING at the head and tail, then set *BEG and *END to the region
4911    of the text we actually have to convert.  The caller should move
4912    the gap out of the region in advance if the region is from a
4913    buffer.
4914
4915    If STR is not NULL, *BEG and *END are indices into STR.  */
4916
4917 static void
4918 shrink_decoding_region (beg, end, coding, str)
4919      int *beg, *end;
4920      struct coding_system *coding;
4921      unsigned char *str;
4922 {
4923   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4924   int eol_conversion;
4925   Lisp_Object translation_table;
4926
4927   if (coding->type == coding_type_ccl
4928       || coding->type == coding_type_undecided
4929       || coding->eol_type != CODING_EOL_LF
4930       || !NILP (coding->post_read_conversion)
4931       || coding->composing != COMPOSITION_DISABLED)
4932     {
4933       /* We can't skip any data.  */
4934       return;
4935     }
4936   if (coding->type == coding_type_no_conversion
4937       || coding->type == coding_type_raw_text
4938       || coding->type == coding_type_emacs_mule)
4939     {
4940       /* We need no conversion, but don't have to skip any data here.
4941          Decoding routine handles them effectively anyway.  */
4942       return;
4943     }
4944
4945   translation_table = coding->translation_table_for_decode;
4946   if (NILP (translation_table) && !NILP (Venable_character_translation))
4947     translation_table = Vstandard_translation_table_for_decode;
4948   if (CHAR_TABLE_P (translation_table))
4949     {
4950       int i;
4951       for (i = 0; i < 128; i++)
4952         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4953           break;
4954       if (i < 128)
4955         /* Some ASCII character should be translated.  We give up
4956            shrinking.  */
4957         return;
4958     }
4959
4960   if (coding->heading_ascii >= 0)
4961     /* Detection routine has already found how much we can skip at the
4962        head.  */
4963     *beg += coding->heading_ascii;
4964
4965   if (str)
4966     {
4967       begp_orig = begp = str + *beg;
4968       endp_orig = endp = str + *end;
4969     }
4970   else
4971     {
4972       begp_orig = begp = BYTE_POS_ADDR (*beg);
4973       endp_orig = endp = begp + *end - *beg;
4974     }
4975
4976   eol_conversion = (coding->eol_type == CODING_EOL_CR
4977                     || coding->eol_type == CODING_EOL_CRLF);
4978
4979   switch (coding->type)
4980     {
4981     case coding_type_sjis:
4982     case coding_type_big5:
4983       /* We can skip all ASCII characters at the head.  */
4984       if (coding->heading_ascii < 0)
4985         {
4986           if (eol_conversion)
4987             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4988           else
4989             while (begp < endp && *begp < 0x80) begp++;
4990         }
4991       /* We can skip all ASCII characters at the tail except for the
4992          second byte of SJIS or BIG5 code.  */
4993       if (eol_conversion)
4994         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4995       else
4996         while (begp < endp && endp[-1] < 0x80) endp--;
4997       /* Do not consider LF as ascii if preceded by CR, since that
4998          confuses eol decoding. */
4999       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5000         endp++;
5001       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5002         endp++;
5003       break;
5004
5005     case coding_type_iso2022:
5006       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5007         /* We can't skip any data.  */
5008         break;
5009       if (coding->heading_ascii < 0)
5010         {
5011           /* We can skip all ASCII characters at the head except for a
5012              few control codes.  */
5013           while (begp < endp && (c = *begp) < 0x80
5014                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5015                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5016                  && (!eol_conversion || c != ISO_CODE_LF))
5017             begp++;
5018         }
5019       switch (coding->category_idx)
5020         {
5021         case CODING_CATEGORY_IDX_ISO_8_1:
5022         case CODING_CATEGORY_IDX_ISO_8_2:
5023           /* We can skip all ASCII characters at the tail.  */
5024           if (eol_conversion)
5025             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5026           else
5027             while (begp < endp && endp[-1] < 0x80) endp--;
5028           /* Do not consider LF as ascii if preceded by CR, since that
5029              confuses eol decoding. */
5030           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5031             endp++;
5032           break;
5033
5034         case CODING_CATEGORY_IDX_ISO_7:
5035         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5036           {
5037             /* We can skip all characters at the tail except for 8-bit
5038                codes and ESC and the following 2-byte at the tail.  */
5039             unsigned char *eight_bit = NULL;
5040
5041             if (eol_conversion)
5042               while (begp < endp
5043                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5044                 {
5045                   if (!eight_bit && c & 0x80) eight_bit = endp;
5046                   endp--;
5047                 }
5048             else
5049               while (begp < endp
5050                      && (c = endp[-1]) != ISO_CODE_ESC)
5051                 {
5052                   if (!eight_bit && c & 0x80) eight_bit = endp;
5053                   endp--;
5054                 }
5055             /* Do not consider LF as ascii if preceded by CR, since that
5056                confuses eol decoding. */
5057             if (begp < endp && endp < endp_orig
5058                 && endp[-1] == '\r' && endp[0] == '\n')
5059               endp++;
5060             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5061               {
5062                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5063                   /* This is an ASCII designation sequence.  We can
5064                      surely skip the tail.  But, if we have
5065                      encountered an 8-bit code, skip only the codes
5066                      after that.  */
5067                   endp = eight_bit ? eight_bit : endp + 2;
5068                 else
5069                   /* Hmmm, we can't skip the tail.  */
5070                   endp = endp_orig;
5071               }
5072             else if (eight_bit)
5073               endp = eight_bit;
5074           }
5075         }
5076       break;
5077
5078     default:
5079       abort ();
5080     }
5081   *beg += begp - begp_orig;
5082   *end += endp - endp_orig;
5083   return;
5084 }
5085
5086 /* Like shrink_decoding_region but for encoding.  */
5087
5088 static void
5089 shrink_encoding_region (beg, end, coding, str)
5090      int *beg, *end;
5091      struct coding_system *coding;
5092      unsigned char *str;
5093 {
5094   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5095   int eol_conversion;
5096   Lisp_Object translation_table;
5097
5098   if (coding->type == coding_type_ccl
5099       || coding->eol_type == CODING_EOL_CRLF
5100       || coding->eol_type == CODING_EOL_CR
5101       || (coding->cmp_data && coding->cmp_data->used > 0))
5102     {
5103       /* We can't skip any data.  */
5104       return;
5105     }
5106   if (coding->type == coding_type_no_conversion
5107       || coding->type == coding_type_raw_text
5108       || coding->type == coding_type_emacs_mule
5109       || coding->type == coding_type_undecided)
5110     {
5111       /* We need no conversion, but don't have to skip any data here.
5112          Encoding routine handles them effectively anyway.  */
5113       return;
5114     }
5115
5116   translation_table = coding->translation_table_for_encode;
5117   if (NILP (translation_table) && !NILP (Venable_character_translation))
5118     translation_table = Vstandard_translation_table_for_encode;
5119   if (CHAR_TABLE_P (translation_table))
5120     {
5121       int i;
5122       for (i = 0; i < 128; i++)
5123         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5124           break;
5125       if (i < 128)
5126         /* Some ASCII character should be translated.  We give up
5127            shrinking.  */
5128         return;
5129     }
5130
5131   if (str)
5132     {
5133       begp_orig = begp = str + *beg;
5134       endp_orig = endp = str + *end;
5135     }
5136   else
5137     {
5138       begp_orig = begp = BYTE_POS_ADDR (*beg);
5139       endp_orig = endp = begp + *end - *beg;
5140     }
5141
5142   eol_conversion = (coding->eol_type == CODING_EOL_CR
5143                     || coding->eol_type == CODING_EOL_CRLF);
5144
5145   /* Here, we don't have to check coding->pre_write_conversion because
5146      the caller is expected to have handled it already.  */
5147   switch (coding->type)
5148     {
5149     case coding_type_iso2022:
5150       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5151         /* We can't skip any data.  */
5152         break;
5153       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5154         {
5155           unsigned char *bol = begp;
5156           while (begp < endp && *begp < 0x80)
5157             {
5158               begp++;
5159               if (begp[-1] == '\n')
5160                 bol = begp;
5161             }
5162           begp = bol;
5163           goto label_skip_tail;
5164         }
5165       /* fall down ... */
5166
5167     case coding_type_sjis:
5168     case coding_type_big5:
5169       /* We can skip all ASCII characters at the head and tail.  */
5170       if (eol_conversion)
5171         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5172       else
5173         while (begp < endp && *begp < 0x80) begp++;
5174     label_skip_tail:
5175       if (eol_conversion)
5176         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5177       else
5178         while (begp < endp && *(endp - 1) < 0x80) endp--;
5179       break;
5180
5181     default:
5182       abort ();
5183     }
5184
5185   *beg += begp - begp_orig;
5186   *end += endp - endp_orig;
5187   return;
5188 }
5189
5190 /* As shrinking conversion region requires some overhead, we don't try
5191    shrinking if the length of conversion region is less than this
5192    value.  */
5193 static int shrink_conversion_region_threshhold = 1024;
5194
5195 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
5196   do {                                                                  \
5197     if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
5198       {                                                                 \
5199         if (encodep) shrink_encoding_region (beg, end, coding, str);    \
5200         else shrink_decoding_region (beg, end, coding, str);            \
5201       }                                                                 \
5202   } while (0)
5203
5204 static Lisp_Object
5205 code_convert_region_unwind (dummy)
5206      Lisp_Object dummy;
5207 {
5208   inhibit_pre_post_conversion = 0;
5209   return Qnil;
5210 }
5211
5212 /* Store information about all compositions in the range FROM and TO
5213    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5214    buffer or a string, defaults to the current buffer.  */
5215
5216 void
5217 coding_save_composition (coding, from, to, obj)
5218      struct coding_system *coding;
5219      int from, to;
5220      Lisp_Object obj;
5221 {
5222   Lisp_Object prop;
5223   int start, end;
5224
5225   if (coding->composing == COMPOSITION_DISABLED)
5226     return;
5227   if (!coding->cmp_data)
5228     coding_allocate_composition_data (coding, from);
5229   if (!find_composition (from, to, &start, &end, &prop, obj)
5230       || end > to)
5231     return;
5232   if (start < from
5233       && (!find_composition (end, to, &start, &end, &prop, obj)
5234           || end > to))
5235     return;
5236   coding->composing = COMPOSITION_NO;
5237   do
5238     {
5239       if (COMPOSITION_VALID_P (start, end, prop))
5240         {
5241           enum composition_method method = COMPOSITION_METHOD (prop);
5242           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5243               >= COMPOSITION_DATA_SIZE)
5244             coding_allocate_composition_data (coding, from);
5245           /* For relative composition, we remember start and end
5246              positions, for the other compositions, we also remember
5247              components.  */
5248           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5249           if (method != COMPOSITION_RELATIVE)
5250             {
5251               /* We must store a*/
5252               Lisp_Object val, ch;
5253
5254               val = COMPOSITION_COMPONENTS (prop);
5255               if (CONSP (val))
5256                 while (CONSP (val))
5257                   {
5258                     ch = XCAR (val), val = XCDR (val);
5259                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5260                   }
5261               else if (VECTORP (val) || STRINGP (val))
5262                 {
5263                   int len = (VECTORP (val)
5264                              ? XVECTOR (val)->size : SCHARS (val));
5265                   int i;
5266                   for (i = 0; i < len; i++)
5267                     {
5268                       ch = (STRINGP (val)
5269                             ? Faref (val, make_number (i))
5270                             : XVECTOR (val)->contents[i]);
5271                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5272                     }
5273                 }
5274               else              /* INTEGERP (val) */
5275                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5276             }
5277           CODING_ADD_COMPOSITION_END (coding, end - from);
5278         }
5279       start = end;
5280     }
5281   while (start < to
5282          && find_composition (start, to, &start, &end, &prop, obj)
5283          && end <= to);
5284
5285   /* Make coding->cmp_data point to the first memory block.  */
5286   while (coding->cmp_data->prev)
5287     coding->cmp_data = coding->cmp_data->prev;
5288   coding->cmp_data_start = 0;
5289 }
5290
5291 /* Reflect the saved information about compositions to OBJ.
5292    CODING->cmp_data points to a memory block for the information.  OBJ
5293    is a buffer or a string, defaults to the current buffer.  */
5294
5295 void
5296 coding_restore_composition (coding, obj)
5297      struct coding_system *coding;
5298      Lisp_Object obj;
5299 {
5300   struct composition_data *cmp_data = coding->cmp_data;
5301
5302   if (!cmp_data)
5303     return;
5304
5305   while (cmp_data->prev)
5306     cmp_data = cmp_data->prev;
5307
5308   while (cmp_data)
5309     {
5310       int i;
5311
5312       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5313            i += cmp_data->data[i])
5314         {
5315           int *data = cmp_data->data + i;
5316           enum composition_method method = (enum composition_method) data[3];
5317           Lisp_Object components;
5318
5319           if (method == COMPOSITION_RELATIVE)
5320             components = Qnil;
5321           else
5322             {
5323               int len = data[0] - 4, j;
5324               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5325
5326               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5327                   && len % 2 == 0)
5328                 len --;
5329               for (j = 0; j < len; j++)
5330                 args[j] = make_number (data[4 + j]);
5331               components = (method == COMPOSITION_WITH_ALTCHARS
5332                             ? Fstring (len, args) : Fvector (len, args));
5333             }
5334           compose_text (data[1], data[2], components, Qnil, obj);
5335         }
5336       cmp_data = cmp_data->next;
5337     }
5338 }
5339
5340 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5341    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5342    coding system CODING, and return the status code of code conversion
5343    (currently, this value has no meaning).
5344
5345    How many characters (and bytes) are converted to how many
5346    characters (and bytes) are recorded in members of the structure
5347    CODING.
5348
5349    If REPLACE is nonzero, we do various things as if the original text
5350    is deleted and a new text is inserted.  See the comments in
5351    replace_range (insdel.c) to know what we are doing.
5352
5353    If REPLACE is zero, it is assumed that the source text is unibyte.
5354    Otherwise, it is assumed that the source text is multibyte.  */
5355
5356 int
5357 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5358      int from, from_byte, to, to_byte, encodep, replace;
5359      struct coding_system *coding;
5360 {
5361   int len = to - from, len_byte = to_byte - from_byte;
5362   int nchars_del = 0, nbytes_del = 0;
5363   int require, inserted, inserted_byte;
5364   int head_skip, tail_skip, total_skip = 0;
5365   Lisp_Object saved_coding_symbol;
5366   int first = 1;
5367   unsigned char *src, *dst;
5368   Lisp_Object deletion;
5369   int orig_point = PT, orig_len = len;
5370   int prev_Z;
5371   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5372
5373   deletion = Qnil;
5374   saved_coding_symbol = coding->symbol;
5375
5376   if (from < PT && PT < to)
5377     {
5378       TEMP_SET_PT_BOTH (from, from_byte);
5379       orig_point = from;
5380     }
5381
5382   if (replace)
5383     {
5384       int saved_from = from;
5385       int saved_inhibit_modification_hooks;
5386
5387       prepare_to_modify_buffer (from, to, &from);
5388       if (saved_from != from)
5389         {
5390           to = from + len;
5391           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5392           len_byte = to_byte - from_byte;
5393         }
5394
5395       /* The code conversion routine can not preserve text properties
5396          for now.  So, we must remove all text properties in the
5397          region.  Here, we must suppress all modification hooks.  */
5398       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5399       inhibit_modification_hooks = 1;
5400       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5401       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5402     }
5403
5404   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5405     {
5406       /* We must detect encoding of text and eol format.  */
5407
5408       if (from < GPT && to > GPT)
5409         move_gap_both (from, from_byte);
5410       if (coding->type == coding_type_undecided)
5411         {
5412           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5413           if (coding->type == coding_type_undecided)
5414             {
5415               /* It seems that the text contains only ASCII, but we
5416                  should not leave it undecided because the deeper
5417                  decoding routine (decode_coding) tries to detect the
5418                  encodings again in vain.  */
5419               coding->type = coding_type_emacs_mule;
5420               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5421               /* As emacs-mule decoder will handle composition, we
5422                  need this setting to allocate coding->cmp_data
5423                  later.  */
5424               coding->composing = COMPOSITION_NO;
5425             }
5426         }
5427       if (coding->eol_type == CODING_EOL_UNDECIDED
5428           && coding->type != coding_type_ccl)
5429         {
5430           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5431           if (coding->eol_type == CODING_EOL_UNDECIDED)
5432             coding->eol_type = CODING_EOL_LF;
5433           /* We had better recover the original eol format if we
5434              encounter an inconsistent eol format while decoding.  */
5435           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5436         }
5437     }
5438
5439   /* Now we convert the text.  */
5440
5441   /* For encoding, we must process pre-write-conversion in advance.  */
5442   if (! inhibit_pre_post_conversion
5443       && encodep
5444       && SYMBOLP (coding->pre_write_conversion)
5445       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5446     {
5447       /* The function in pre-write-conversion may put a new text in a
5448          new buffer.  */
5449       struct buffer *prev = current_buffer;
5450       Lisp_Object new;
5451
5452       record_unwind_protect (code_convert_region_unwind, Qnil);
5453       /* We should not call any more pre-write/post-read-conversion
5454          functions while this pre-write-conversion is running.  */
5455       inhibit_pre_post_conversion = 1;
5456       call2 (coding->pre_write_conversion,
5457              make_number (from), make_number (to));
5458       inhibit_pre_post_conversion = 0;
5459       /* Discard the unwind protect.  */
5460       specpdl_ptr--;
5461
5462       if (current_buffer != prev)
5463         {
5464           len = ZV - BEGV;
5465           new = Fcurrent_buffer ();
5466           set_buffer_internal_1 (prev);
5467           del_range_2 (from, from_byte, to, to_byte, 0);
5468           TEMP_SET_PT_BOTH (from, from_byte);
5469           insert_from_buffer (XBUFFER (new), 1, len, 0);
5470           Fkill_buffer (new);
5471           if (orig_point >= to)
5472             orig_point += len - orig_len;
5473           else if (orig_point > from)
5474             orig_point = from;
5475           orig_len = len;
5476           to = from + len;
5477           from_byte = CHAR_TO_BYTE (from);
5478           to_byte = CHAR_TO_BYTE (to);
5479           len_byte = to_byte - from_byte;
5480           TEMP_SET_PT_BOTH (from, from_byte);
5481         }
5482     }
5483
5484   if (replace)
5485     {
5486       if (! EQ (current_buffer->undo_list, Qt))
5487         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5488       else
5489         {
5490           nchars_del = to - from;
5491           nbytes_del = to_byte - from_byte;
5492         }
5493     }
5494
5495   if (coding->composing != COMPOSITION_DISABLED)
5496     {
5497       if (encodep)
5498         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5499       else
5500         coding_allocate_composition_data (coding, from);
5501     }
5502
5503   /* Try to skip the heading and tailing ASCIIs.  */
5504   if (coding->type != coding_type_ccl)
5505     {
5506       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5507
5508       if (from < GPT && GPT < to)
5509         move_gap_both (from, from_byte);
5510       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5511       if (from_byte == to_byte
5512           && (encodep || NILP (coding->post_read_conversion))
5513           && ! CODING_REQUIRE_FLUSHING (coding))
5514         {
5515           coding->produced = len_byte;
5516           coding->produced_char = len;
5517           if (!replace)
5518             /* We must record and adjust for this new text now.  */
5519             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5520           return 0;
5521         }
5522
5523       head_skip = from_byte - from_byte_orig;
5524       tail_skip = to_byte_orig - to_byte;
5525       total_skip = head_skip + tail_skip;
5526       from += head_skip;
5527       to -= tail_skip;
5528       len -= total_skip; len_byte -= total_skip;
5529     }
5530
5531   /* For conversion, we must put the gap before the text in addition to
5532      making the gap larger for efficient decoding.  The required gap
5533      size starts from 2000 which is the magic number used in make_gap.
5534      But, after one batch of conversion, it will be incremented if we
5535      find that it is not enough .  */
5536   require = 2000;
5537
5538   if (GAP_SIZE  < require)
5539     make_gap (require - GAP_SIZE);
5540   move_gap_both (from, from_byte);
5541
5542   inserted = inserted_byte = 0;
5543
5544   GAP_SIZE += len_byte;
5545   ZV -= len;
5546   Z -= len;
5547   ZV_BYTE -= len_byte;
5548   Z_BYTE -= len_byte;
5549
5550   if (GPT - BEG < BEG_UNCHANGED)
5551     BEG_UNCHANGED = GPT - BEG;
5552   if (Z - GPT < END_UNCHANGED)
5553     END_UNCHANGED = Z - GPT;
5554
5555   if (!encodep && coding->src_multibyte)
5556     {
5557       /* Decoding routines expects that the source text is unibyte.
5558          We must convert 8-bit characters of multibyte form to
5559          unibyte.  */
5560       int len_byte_orig = len_byte;
5561       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5562       if (len_byte < len_byte_orig)
5563         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5564                     len_byte);
5565       coding->src_multibyte = 0;
5566     }
5567
5568   for (;;)
5569     {
5570       int result;
5571
5572       /* The buffer memory is now:
5573          +--------+converted-text+---------+-------original-text-------+---+
5574          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5575                   |<---------------------- GAP ----------------------->|  */
5576       src = GAP_END_ADDR - len_byte;
5577       dst = GPT_ADDR + inserted_byte;
5578
5579       if (encodep)
5580         result = encode_coding (coding, src, dst, len_byte, 0);
5581       else
5582         {
5583           if (coding->composing != COMPOSITION_DISABLED)
5584             coding->cmp_data->char_offset = from + inserted;
5585           result = decode_coding (coding, src, dst, len_byte, 0);
5586         }
5587
5588       /* The buffer memory is now:
5589          +--------+-------converted-text----+--+------original-text----+---+
5590          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5591                   |<---------------------- GAP ----------------------->|  */
5592
5593       inserted += coding->produced_char;
5594       inserted_byte += coding->produced;
5595       len_byte -= coding->consumed;
5596
5597       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5598         {
5599           coding_allocate_composition_data (coding, from + inserted);
5600           continue;
5601         }
5602
5603       src += coding->consumed;
5604       dst += coding->produced;
5605
5606       if (result == CODING_FINISH_NORMAL)
5607         {
5608           src += len_byte;
5609           break;
5610         }
5611       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5612         {
5613           unsigned char *pend = dst, *p = pend - inserted_byte;
5614           Lisp_Object eol_type;
5615
5616           /* Encode LFs back to the original eol format (CR or CRLF).  */
5617           if (coding->eol_type == CODING_EOL_CR)
5618             {
5619               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5620             }
5621           else
5622             {
5623               int count = 0;
5624
5625               while (p < pend) if (*p++ == '\n') count++;
5626               if (src - dst < count)
5627                 {
5628                   /* We don't have sufficient room for encoding LFs
5629                      back to CRLF.  We must record converted and
5630                      not-yet-converted text back to the buffer
5631                      content, enlarge the gap, then record them out of
5632                      the buffer contents again.  */
5633                   int add = len_byte + inserted_byte;
5634
5635                   GAP_SIZE -= add;
5636                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5637                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5638                   make_gap (count - GAP_SIZE);
5639                   GAP_SIZE += add;
5640                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5641                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5642                   /* Don't forget to update SRC, DST, and PEND.  */
5643                   src = GAP_END_ADDR - len_byte;
5644                   dst = GPT_ADDR + inserted_byte;
5645                   pend = dst;
5646                 }
5647               inserted += count;
5648               inserted_byte += count;
5649               coding->produced += count;
5650               p = dst = pend + count;
5651               while (count)
5652                 {
5653                   *--p = *--pend;
5654                   if (*p == '\n') count--, *--p = '\r';
5655                 }
5656             }
5657
5658           /* Suppress eol-format conversion in the further conversion.  */
5659           coding->eol_type = CODING_EOL_LF;
5660
5661           /* Set the coding system symbol to that for Unix-like EOL.  */
5662           eol_type = Fget (saved_coding_symbol, Qeol_type);
5663           if (VECTORP (eol_type)
5664               && XVECTOR (eol_type)->size == 3
5665               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5666             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5667           else
5668             coding->symbol = saved_coding_symbol;
5669
5670           continue;
5671         }
5672       if (len_byte <= 0)
5673         {
5674           if (coding->type != coding_type_ccl
5675               || coding->mode & CODING_MODE_LAST_BLOCK)
5676             break;
5677           coding->mode |= CODING_MODE_LAST_BLOCK;
5678           continue;
5679         }
5680       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5681         {
5682           /* The source text ends in invalid codes.  Let's just
5683              make them valid buffer contents, and finish conversion.  */
5684           if (multibyte_p)
5685             {
5686               unsigned char *start = dst;
5687
5688               inserted += len_byte;
5689               while (len_byte--)
5690                 {
5691                   int c = *src++;
5692                   dst += CHAR_STRING (c, dst);
5693                 }
5694
5695               inserted_byte += dst - start;
5696             }
5697           else
5698             {
5699               inserted += len_byte;
5700               inserted_byte += len_byte;
5701               while (len_byte--)
5702                 *dst++ = *src++;
5703             }
5704           break;
5705         }
5706       if (result == CODING_FINISH_INTERRUPT)
5707         {
5708           /* The conversion procedure was interrupted by a user.  */
5709           break;
5710         }
5711       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5712       if (coding->consumed < 1)
5713         {
5714           /* It's quite strange to require more memory without
5715              consuming any bytes.  Perhaps CCL program bug.  */
5716           break;
5717         }
5718       if (first)
5719         {
5720           /* We have just done the first batch of conversion which was
5721              stopped because of insufficient gap.  Let's reconsider the
5722              required gap size (i.e. SRT - DST) now.
5723
5724              We have converted ORIG bytes (== coding->consumed) into
5725              NEW bytes (coding->produced).  To convert the remaining
5726              LEN bytes, we may need REQUIRE bytes of gap, where:
5727                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5728                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5729              Here, we are sure that NEW >= ORIG.  */
5730           float ratio;
5731
5732           if (coding->produced <= coding->consumed)
5733             {
5734               /* This happens because of CCL-based coding system with
5735                  eol-type CRLF.  */
5736               require = 0;
5737             }
5738           else
5739             {
5740               ratio = (coding->produced - coding->consumed) / coding->consumed;
5741               require = len_byte * ratio;
5742             }
5743           first = 0;
5744         }
5745       if ((src - dst) < (require + 2000))
5746         {
5747           /* See the comment above the previous call of make_gap.  */
5748           int add = len_byte + inserted_byte;
5749
5750           GAP_SIZE -= add;
5751           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5752           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5753           make_gap (require + 2000);
5754           GAP_SIZE += add;
5755           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5756           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5757         }
5758     }
5759   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5760
5761   if (encodep && coding->dst_multibyte)
5762     {
5763       /* The output is unibyte.  We must convert 8-bit characters to
5764          multibyte form.  */
5765       if (inserted_byte * 2 > GAP_SIZE)
5766         {
5767           GAP_SIZE -= inserted_byte;
5768           ZV += inserted_byte; Z += inserted_byte;
5769           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5770           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5771           make_gap (inserted_byte - GAP_SIZE);
5772           GAP_SIZE += inserted_byte;
5773           ZV -= inserted_byte; Z -= inserted_byte;
5774           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5775           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5776         }
5777       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5778     }
5779
5780   /* If we shrank the conversion area, adjust it now.  */
5781   if (total_skip > 0)
5782     {
5783       if (tail_skip > 0)
5784         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5785       inserted += total_skip; inserted_byte += total_skip;
5786       GAP_SIZE += total_skip;
5787       GPT -= head_skip; GPT_BYTE -= head_skip;
5788       ZV -= total_skip; ZV_BYTE -= total_skip;
5789       Z -= total_skip; Z_BYTE -= total_skip;
5790       from -= head_skip; from_byte -= head_skip;
5791       to += tail_skip; to_byte += tail_skip;
5792     }
5793
5794   prev_Z = Z;
5795   if (! EQ (current_buffer->undo_list, Qt))
5796     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5797   else
5798     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5799                                  inserted, inserted_byte);
5800   inserted = Z - prev_Z;
5801
5802   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5803     coding_restore_composition (coding, Fcurrent_buffer ());
5804   coding_free_composition_data (coding);
5805
5806   if (! inhibit_pre_post_conversion
5807       && ! encodep && ! NILP (coding->post_read_conversion))
5808     {
5809       Lisp_Object val;
5810
5811       if (from != PT)
5812         TEMP_SET_PT_BOTH (from, from_byte);
5813       prev_Z = Z;
5814       record_unwind_protect (code_convert_region_unwind, Qnil);
5815       /* We should not call any more pre-write/post-read-conversion
5816          functions while this post-read-conversion is running.  */
5817       inhibit_pre_post_conversion = 1;
5818       val = call1 (coding->post_read_conversion, make_number (inserted));
5819       inhibit_pre_post_conversion = 0;
5820       /* Discard the unwind protect.  */
5821       specpdl_ptr--;
5822       CHECK_NUMBER (val);
5823       inserted += Z - prev_Z;
5824     }
5825
5826   if (orig_point >= from)
5827     {
5828       if (orig_point >= from + orig_len)
5829         orig_point += inserted - orig_len;
5830       else
5831         orig_point = from;
5832       TEMP_SET_PT (orig_point);
5833     }
5834
5835   if (replace)
5836     {
5837       signal_after_change (from, to - from, inserted);
5838       update_compositions (from, from + inserted, CHECK_BORDER);
5839     }
5840
5841   {
5842     coding->consumed = to_byte - from_byte;
5843     coding->consumed_char = to - from;
5844     coding->produced = inserted_byte;
5845     coding->produced_char = inserted;
5846   }
5847
5848   return 0;
5849 }
5850
5851 Lisp_Object
5852 run_pre_post_conversion_on_str (str, coding, encodep)
5853      Lisp_Object str;
5854      struct coding_system *coding;
5855      int encodep;
5856 {
5857   int count = SPECPDL_INDEX ();
5858   struct gcpro gcpro1, gcpro2;
5859   int multibyte = STRING_MULTIBYTE (str);
5860   Lisp_Object buffer;
5861   struct buffer *buf;
5862   Lisp_Object old_deactivate_mark;
5863
5864   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5865   record_unwind_protect (code_convert_region_unwind, Qnil);
5866   /* It is not crucial to specbind this.  */
5867   old_deactivate_mark = Vdeactivate_mark;
5868   GCPRO2 (str, old_deactivate_mark);
5869
5870   buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5871   buf = XBUFFER (buffer);
5872
5873   buf->directory = current_buffer->directory;
5874   buf->read_only = Qnil;
5875   buf->filename = Qnil;
5876   buf->undo_list = Qt;
5877   buf->overlays_before = Qnil;
5878   buf->overlays_after = Qnil;
5879
5880   set_buffer_internal (buf);
5881   /* We must insert the contents of STR as is without
5882      unibyte<->multibyte conversion.  For that, we adjust the
5883      multibyteness of the working buffer to that of STR.  */
5884   Ferase_buffer ();
5885   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
5886
5887   insert_from_string (str, 0, 0,
5888                       SCHARS (str), SBYTES (str), 0);
5889   UNGCPRO;
5890   inhibit_pre_post_conversion = 1;
5891   if (encodep)
5892     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5893   else
5894     {
5895       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5896       call1 (coding->post_read_conversion, make_number (Z - BEG));
5897     }
5898   inhibit_pre_post_conversion = 0;
5899   Vdeactivate_mark = old_deactivate_mark;
5900   str = make_buffer_string (BEG, Z, 1);
5901   return unbind_to (count, str);
5902 }
5903
5904 Lisp_Object
5905 decode_coding_string (str, coding, nocopy)
5906      Lisp_Object str;
5907      struct coding_system *coding;
5908      int nocopy;
5909 {
5910   int len;
5911   struct conversion_buffer buf;
5912   int from, to_byte;
5913   Lisp_Object saved_coding_symbol;
5914   int result;
5915   int require_decoding;
5916   int shrinked_bytes = 0;
5917   Lisp_Object newstr;
5918   int consumed, consumed_char, produced, produced_char;
5919
5920   from = 0;
5921   to_byte = SBYTES (str);
5922
5923   saved_coding_symbol = coding->symbol;
5924   coding->src_multibyte = STRING_MULTIBYTE (str);
5925   coding->dst_multibyte = 1;
5926   if (CODING_REQUIRE_DETECTION (coding))
5927     {
5928       /* See the comments in code_convert_region.  */
5929       if (coding->type == coding_type_undecided)
5930         {
5931           detect_coding (coding, SDATA (str), to_byte);
5932           if (coding->type == coding_type_undecided)
5933             {
5934               coding->type = coding_type_emacs_mule;
5935               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5936               /* As emacs-mule decoder will handle composition, we
5937                  need this setting to allocate coding->cmp_data
5938                  later.  */
5939               coding->composing = COMPOSITION_NO;
5940             }
5941         }
5942       if (coding->eol_type == CODING_EOL_UNDECIDED
5943           && coding->type != coding_type_ccl)
5944         {
5945           saved_coding_symbol = coding->symbol;
5946           detect_eol (coding, SDATA (str), to_byte);
5947           if (coding->eol_type == CODING_EOL_UNDECIDED)
5948             coding->eol_type = CODING_EOL_LF;
5949           /* We had better recover the original eol format if we
5950              encounter an inconsistent eol format while decoding.  */
5951           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5952         }
5953     }
5954
5955   if (coding->type == coding_type_no_conversion
5956       || coding->type == coding_type_raw_text)
5957     coding->dst_multibyte = 0;
5958
5959   require_decoding = CODING_REQUIRE_DECODING (coding);
5960
5961   if (STRING_MULTIBYTE (str))
5962     {
5963       /* Decoding routines expect the source text to be unibyte.  */
5964       str = Fstring_as_unibyte (str);
5965       to_byte = SBYTES (str);
5966       nocopy = 1;
5967       coding->src_multibyte = 0;
5968     }
5969
5970   /* Try to skip the heading and tailing ASCIIs.  */
5971   if (require_decoding && coding->type != coding_type_ccl)
5972     {
5973       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
5974                                 0);
5975       if (from == to_byte)
5976         require_decoding = 0;
5977       shrinked_bytes = from + (SBYTES (str) - to_byte);
5978     }
5979
5980   if (!require_decoding)
5981     {
5982       coding->consumed = SBYTES (str);
5983       coding->consumed_char = SCHARS (str);
5984       if (coding->dst_multibyte)
5985         {
5986           str = Fstring_as_multibyte (str);
5987           nocopy = 1;
5988         }
5989       coding->produced = SBYTES (str);
5990       coding->produced_char = SCHARS (str);
5991       return (nocopy ? str : Fcopy_sequence (str));
5992     }
5993
5994   if (coding->composing != COMPOSITION_DISABLED)
5995     coding_allocate_composition_data (coding, from);
5996   len = decoding_buffer_size (coding, to_byte - from);
5997   allocate_conversion_buffer (buf, len);
5998
5999   consumed = consumed_char = produced = produced_char = 0;
6000   while (1)
6001     {
6002       result = decode_coding (coding, SDATA (str) + from + consumed,
6003                               buf.data + produced, to_byte - from - consumed,
6004                               buf.size - produced);
6005       consumed += coding->consumed;
6006       consumed_char += coding->consumed_char;
6007       produced += coding->produced;
6008       produced_char += coding->produced_char;
6009       if (result == CODING_FINISH_NORMAL
6010           || (result == CODING_FINISH_INSUFFICIENT_SRC
6011               && coding->consumed == 0))
6012         break;
6013       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6014         coding_allocate_composition_data (coding, from + produced_char);
6015       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6016         extend_conversion_buffer (&buf);
6017       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6018         {
6019           Lisp_Object eol_type;
6020
6021           /* Recover the original EOL format.  */
6022           if (coding->eol_type == CODING_EOL_CR)
6023             {
6024               unsigned char *p;
6025               for (p = buf.data; p < buf.data + produced; p++)
6026                 if (*p == '\n') *p = '\r';
6027             }
6028           else if (coding->eol_type == CODING_EOL_CRLF)
6029             {
6030               int num_eol = 0;
6031               unsigned char *p0, *p1;
6032               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6033                 if (*p0 == '\n') num_eol++;
6034               if (produced + num_eol >= buf.size)
6035                 extend_conversion_buffer (&buf);
6036               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6037                 {
6038                   *--p1 = *--p0;
6039                   if (*p0 == '\n') *--p1 = '\r';
6040                 }
6041               produced += num_eol;
6042               produced_char += num_eol;
6043             }
6044           /* Suppress eol-format conversion in the further conversion.  */
6045           coding->eol_type = CODING_EOL_LF;
6046
6047           /* Set the coding system symbol to that for Unix-like EOL.  */
6048           eol_type = Fget (saved_coding_symbol, Qeol_type);
6049           if (VECTORP (eol_type)
6050               && XVECTOR (eol_type)->size == 3
6051               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6052             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6053           else
6054             coding->symbol = saved_coding_symbol;
6055
6056
6057         }
6058     }
6059
6060   coding->consumed = consumed;
6061   coding->consumed_char = consumed_char;
6062   coding->produced = produced;
6063   coding->produced_char = produced_char;
6064
6065   if (coding->dst_multibyte)
6066     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6067                                            produced + shrinked_bytes);
6068   else
6069     newstr = make_uninit_string (produced + shrinked_bytes);
6070   if (from > 0)
6071     STRING_COPYIN (newstr, 0, SDATA (str), from);
6072   STRING_COPYIN (newstr, from, buf.data, produced);
6073   if (shrinked_bytes > from)
6074     STRING_COPYIN (newstr, from + produced,
6075                    SDATA (str) + to_byte,
6076                    shrinked_bytes - from);
6077   free_conversion_buffer (&buf);
6078
6079   if (coding->cmp_data && coding->cmp_data->used)
6080     coding_restore_composition (coding, newstr);
6081   coding_free_composition_data (coding);
6082
6083   if (SYMBOLP (coding->post_read_conversion)
6084       && !NILP (Ffboundp (coding->post_read_conversion)))
6085     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6086
6087   return newstr;
6088 }
6089
6090 Lisp_Object
6091 encode_coding_string (str, coding, nocopy)
6092      Lisp_Object str;
6093      struct coding_system *coding;
6094      int nocopy;
6095 {
6096   int len;
6097   struct conversion_buffer buf;
6098   int from, to, to_byte;
6099   int result;
6100   int shrinked_bytes = 0;
6101   Lisp_Object newstr;
6102   int consumed, consumed_char, produced, produced_char;
6103
6104   if (SYMBOLP (coding->pre_write_conversion)
6105       && !NILP (Ffboundp (coding->pre_write_conversion)))
6106     str = run_pre_post_conversion_on_str (str, coding, 1);
6107
6108   from = 0;
6109   to = SCHARS (str);
6110   to_byte = SBYTES (str);
6111
6112   /* Encoding routines determine the multibyteness of the source text
6113      by coding->src_multibyte.  */
6114   coding->src_multibyte = STRING_MULTIBYTE (str);
6115   coding->dst_multibyte = 0;
6116   if (! CODING_REQUIRE_ENCODING (coding))
6117     {
6118       coding->consumed = SBYTES (str);
6119       coding->consumed_char = SCHARS (str);
6120       if (STRING_MULTIBYTE (str))
6121         {
6122           str = Fstring_as_unibyte (str);
6123           nocopy = 1;
6124         }
6125       coding->produced = SBYTES (str);
6126       coding->produced_char = SCHARS (str);
6127       return (nocopy ? str : Fcopy_sequence (str));
6128     }
6129
6130   if (coding->composing != COMPOSITION_DISABLED)
6131     coding_save_composition (coding, from, to, str);
6132
6133   /* Try to skip the heading and tailing ASCIIs.  */
6134   if (coding->type != coding_type_ccl)
6135     {
6136       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6137                                 1);
6138       if (from == to_byte)
6139         return (nocopy ? str : Fcopy_sequence (str));
6140       shrinked_bytes = from + (SBYTES (str) - to_byte);
6141     }
6142
6143   len = encoding_buffer_size (coding, to_byte - from);
6144   allocate_conversion_buffer (buf, len);
6145
6146   consumed = consumed_char = produced = produced_char = 0;
6147   while (1)
6148     {
6149       result = encode_coding (coding, SDATA (str) + from + consumed,
6150                               buf.data + produced, to_byte - from - consumed,
6151                               buf.size - produced);
6152       consumed += coding->consumed;
6153       consumed_char += coding->consumed_char;
6154       produced += coding->produced;
6155       produced_char += coding->produced_char;
6156       if (result == CODING_FINISH_NORMAL
6157           || (result == CODING_FINISH_INSUFFICIENT_SRC
6158               && coding->consumed == 0))
6159         break;
6160       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6161       extend_conversion_buffer (&buf);
6162     }
6163
6164   coding->consumed = consumed;
6165   coding->consumed_char = consumed_char;
6166   coding->produced = produced;
6167   coding->produced_char = produced_char;
6168
6169   newstr = make_uninit_string (produced + shrinked_bytes);
6170   if (from > 0)
6171     STRING_COPYIN (newstr, 0, SDATA (str), from);
6172   STRING_COPYIN (newstr, from, buf.data, produced);
6173   if (shrinked_bytes > from)
6174     STRING_COPYIN (newstr, from + produced,
6175                    SDATA (str) + to_byte,
6176                    shrinked_bytes - from);
6177
6178   free_conversion_buffer (&buf);
6179   coding_free_composition_data (coding);
6180
6181   return newstr;
6182 }
6183
6184 \f
6185 #ifdef emacs
6186 /*** 8. Emacs Lisp library functions ***/
6187
6188 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6189        doc: /* Return t if OBJECT is nil or a coding-system.
6190 See the documentation of `make-coding-system' for information
6191 about coding-system objects.  */)
6192      (obj)
6193      Lisp_Object obj;
6194 {
6195   if (NILP (obj))
6196     return Qt;
6197   if (!SYMBOLP (obj))
6198     return Qnil;
6199   /* Get coding-spec vector for OBJ.  */
6200   obj = Fget (obj, Qcoding_system);
6201   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6202           ? Qt : Qnil);
6203 }
6204
6205 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6206        Sread_non_nil_coding_system, 1, 1, 0,
6207        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6208      (prompt)
6209      Lisp_Object prompt;
6210 {
6211   Lisp_Object val;
6212   do
6213     {
6214       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6215                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6216     }
6217   while (SCHARS (val) == 0);
6218   return (Fintern (val, Qnil));
6219 }
6220
6221 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6222        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6223 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6224      (prompt, default_coding_system)
6225      Lisp_Object prompt, default_coding_system;
6226 {
6227   Lisp_Object val;
6228   if (SYMBOLP (default_coding_system))
6229     default_coding_system = SYMBOL_NAME (default_coding_system);
6230   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6231                           Qt, Qnil, Qcoding_system_history,
6232                           default_coding_system, Qnil);
6233   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6234 }
6235
6236 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6237        1, 1, 0,
6238        doc: /* Check validity of CODING-SYSTEM.
6239 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6240 It is valid if it is a symbol with a non-nil `coding-system' property.
6241 The value of property should be a vector of length 5.  */)
6242      (coding_system)
6243      Lisp_Object coding_system;
6244 {
6245   CHECK_SYMBOL (coding_system);
6246   if (!NILP (Fcoding_system_p (coding_system)))
6247     return coding_system;
6248   while (1)
6249     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6250 }
6251 \f
6252 Lisp_Object
6253 detect_coding_system (src, src_bytes, highest, multibytep)
6254      const unsigned char *src;
6255      int src_bytes, highest;
6256      int multibytep;
6257 {
6258   int coding_mask, eol_type;
6259   Lisp_Object val, tmp;
6260   int dummy;
6261
6262   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6263   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6264   if (eol_type == CODING_EOL_INCONSISTENT)
6265     eol_type = CODING_EOL_UNDECIDED;
6266
6267   if (!coding_mask)
6268     {
6269       val = Qundecided;
6270       if (eol_type != CODING_EOL_UNDECIDED)
6271         {
6272           Lisp_Object val2;
6273           val2 = Fget (Qundecided, Qeol_type);
6274           if (VECTORP (val2))
6275             val = XVECTOR (val2)->contents[eol_type];
6276         }
6277       return (highest ? val : Fcons (val, Qnil));
6278     }
6279
6280   /* At first, gather possible coding systems in VAL.  */
6281   val = Qnil;
6282   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6283     {
6284       Lisp_Object category_val, category_index;
6285
6286       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6287       category_val = Fsymbol_value (XCAR (tmp));
6288       if (!NILP (category_val)
6289           && NATNUMP (category_index)
6290           && (coding_mask & (1 << XFASTINT (category_index))))
6291         {
6292           val = Fcons (category_val, val);
6293           if (highest)
6294             break;
6295         }
6296     }
6297   if (!highest)
6298     val = Fnreverse (val);
6299
6300   /* Then, replace the elements with subsidiary coding systems.  */
6301   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6302     {
6303       if (eol_type != CODING_EOL_UNDECIDED
6304           && eol_type != CODING_EOL_INCONSISTENT)
6305         {
6306           Lisp_Object eol;
6307           eol = Fget (XCAR (tmp), Qeol_type);
6308           if (VECTORP (eol))
6309             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6310         }
6311     }
6312   return (highest ? XCAR (val) : val);
6313 }
6314
6315 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6316        2, 3, 0,
6317        doc: /* Detect how the byte sequence in the region is encoded.
6318 Return a list of possible coding systems used on decoding a byte
6319 sequence containing the bytes in the region between START and END when
6320 the coding system `undecided' is specified.  The list is ordered by
6321 priority decided in the current language environment.
6322
6323 If only ASCII characters are found, it returns a list of single element
6324 `undecided' or its subsidiary coding system according to a detected
6325 end-of-line format.
6326
6327 If optional argument HIGHEST is non-nil, return the coding system of
6328 highest priority.  */)
6329      (start, end, highest)
6330      Lisp_Object start, end, highest;
6331 {
6332   int from, to;
6333   int from_byte, to_byte;
6334   int include_anchor_byte = 0;
6335
6336   CHECK_NUMBER_COERCE_MARKER (start);
6337   CHECK_NUMBER_COERCE_MARKER (end);
6338
6339   validate_region (&start, &end);
6340   from = XINT (start), to = XINT (end);
6341   from_byte = CHAR_TO_BYTE (from);
6342   to_byte = CHAR_TO_BYTE (to);
6343
6344   if (from < GPT && to >= GPT)
6345     move_gap_both (to, to_byte);
6346   /* If we an anchor byte `\0' follows the region, we include it in
6347      the detecting source.  Then code detectors can handle the tailing
6348      byte sequence more accurately.
6349
6350      Fix me: This is not a perfect solution.  It is better that we
6351      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6352   */
6353   if (to == Z || (to == GPT && GAP_SIZE > 0))
6354     include_anchor_byte = 1;
6355   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6356                                to_byte - from_byte + include_anchor_byte,
6357                                !NILP (highest),
6358                                !NILP (current_buffer
6359                                       ->enable_multibyte_characters));
6360 }
6361
6362 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6363        1, 2, 0,
6364        doc: /* Detect how the byte sequence in STRING is encoded.
6365 Return a list of possible coding systems used on decoding a byte
6366 sequence containing the bytes in STRING when the coding system
6367 `undecided' is specified.  The list is ordered by priority decided in
6368 the current language environment.
6369
6370 If only ASCII characters are found, it returns a list of single element
6371 `undecided' or its subsidiary coding system according to a detected
6372 end-of-line format.
6373
6374 If optional argument HIGHEST is non-nil, return the coding system of
6375 highest priority.  */)
6376      (string, highest)
6377      Lisp_Object string, highest;
6378 {
6379   CHECK_STRING (string);
6380
6381   return detect_coding_system (SDATA (string),
6382                                /* "+ 1" is to include the anchor byte
6383                                   `\0'.  With this, code detectors can
6384                                   handle the tailing bytes more
6385                                   accurately.  */
6386                                SBYTES (string) + 1,
6387                                !NILP (highest),
6388                                STRING_MULTIBYTE (string));
6389 }
6390
6391 /* Return an intersection of lists L1 and L2.  */
6392
6393 static Lisp_Object
6394 intersection (l1, l2)
6395      Lisp_Object l1, l2;
6396 {
6397   Lisp_Object val = Fcons (Qnil, Qnil), tail;
6398
6399   for (tail = val; CONSP (l1); l1 = XCDR (l1))
6400     {
6401       if (!NILP (Fmemq (XCAR (l1), l2)))
6402         {
6403           XSETCDR (tail, Fcons (XCAR (l1), Qnil));
6404           tail = XCDR (tail);
6405         }
6406     }
6407   return XCDR (val);
6408 }
6409
6410
6411 /*  Subroutine for Fsafe_coding_systems_region_internal.
6412
6413     Return a list of coding systems that safely encode the multibyte
6414     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
6415     possible coding systems.  If it is nil, it means that we have not
6416     yet found any coding systems.
6417
6418     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
6419     element of WORK_TABLE is set to t once the element is looked up.
6420
6421     If a non-ASCII single byte char is found, set
6422     *single_byte_char_found to 1.  */
6423
6424 static Lisp_Object
6425 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6426      unsigned char *p, *pend;
6427      Lisp_Object safe_codings, work_table;
6428      int *single_byte_char_found;
6429 {
6430   int c, len, idx;
6431   Lisp_Object val;
6432
6433   while (p < pend)
6434     {
6435       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6436       p += len;
6437       if (ASCII_BYTE_P (c))
6438         /* We can ignore ASCII characters here.  */
6439         continue;
6440       if (SINGLE_BYTE_CHAR_P (c))
6441         *single_byte_char_found = 1;
6442       if (NILP (safe_codings))
6443         continue;
6444       /* Check the safe coding systems for C.  */
6445       val = char_table_ref_and_index (work_table, c, &idx);
6446       if (EQ (val, Qt))
6447         /* This element was already checked.  Ignore it.  */
6448         continue;
6449       /* Remember that we checked this element.  */
6450       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
6451
6452       /* If there are some safe coding systems for C and we have
6453          already found the other set of coding systems for the
6454          different characters, get the intersection of them.  */
6455       if (!EQ (safe_codings, Qt) && !NILP (val))
6456         val = intersection (safe_codings, val);
6457       safe_codings = val;
6458     }
6459   return safe_codings;
6460 }
6461
6462
6463 /* Return a list of coding systems that safely encode the text between
6464    START and END.  If the text contains only ASCII or is unibyte,
6465    return t.  */
6466
6467 DEFUN ("find-coding-systems-region-internal",
6468        Ffind_coding_systems_region_internal,
6469        Sfind_coding_systems_region_internal, 2, 2, 0,
6470        doc: /* Internal use only.  */)
6471      (start, end)
6472      Lisp_Object start, end;
6473 {
6474   Lisp_Object work_table, safe_codings;
6475   int non_ascii_p = 0;
6476   int single_byte_char_found = 0;
6477   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6478
6479   if (STRINGP (start))
6480     {
6481       if (!STRING_MULTIBYTE (start))
6482         return Qt;
6483       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6484       p2 = p2end = p1end;
6485       if (SCHARS (start) != SBYTES (start))
6486         non_ascii_p = 1;
6487     }
6488   else
6489     {
6490       int from, to, stop;
6491
6492       CHECK_NUMBER_COERCE_MARKER (start);
6493       CHECK_NUMBER_COERCE_MARKER (end);
6494       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6495         args_out_of_range (start, end);
6496       if (NILP (current_buffer->enable_multibyte_characters))
6497         return Qt;
6498       from = CHAR_TO_BYTE (XINT (start));
6499       to = CHAR_TO_BYTE (XINT (end));
6500       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6501       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6502       if (stop == to)
6503         p2 = p2end = p1end;
6504       else
6505         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6506       if (XINT (end) - XINT (start) != to - from)
6507         non_ascii_p = 1;
6508     }
6509
6510   if (!non_ascii_p)
6511     {
6512       /* We are sure that the text contains no multibyte character.
6513          Check if it contains eight-bit-graphic.  */
6514       p = p1;
6515       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6516       if (p == p1end)
6517         {
6518           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6519           if (p == p2end)
6520             return Qt;
6521         }
6522     }
6523
6524   /* The text contains non-ASCII characters.  */
6525   work_table = Fcopy_sequence (Vchar_coding_system_table);
6526   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
6527                                     &single_byte_char_found);
6528   if (p2 < p2end)
6529     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6530                                       &single_byte_char_found);
6531
6532   if (EQ (safe_codings, Qt))
6533     ; /* Nothing to be done.  */
6534   else if (!single_byte_char_found)
6535     {
6536       /* Append generic coding systems.  */
6537       Lisp_Object args[2];
6538       args[0] = safe_codings;
6539       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
6540                                         make_number (0));
6541       safe_codings = Fappend (2, args);
6542     }
6543   else
6544     safe_codings = Fcons (Qraw_text,
6545                           Fcons (Qemacs_mule,
6546                                  Fcons (Qno_conversion, safe_codings)));
6547   return safe_codings;
6548 }
6549
6550
6551 static Lisp_Object
6552 find_safe_codings_2 (p, pend, safe_codings, work_table, single_byte_char_found)
6553      unsigned char *p, *pend;
6554      Lisp_Object safe_codings, work_table;
6555      int *single_byte_char_found;
6556 {
6557   int c, len, i;
6558   Lisp_Object val, ch;
6559   Lisp_Object prev, tail;
6560
6561   while (p < pend)
6562     {
6563       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6564       p += len;
6565       if (ASCII_BYTE_P (c))
6566         /* We can ignore ASCII characters here.  */
6567         continue;
6568       if (SINGLE_BYTE_CHAR_P (c))
6569         *single_byte_char_found = 1;
6570       if (NILP (safe_codings))
6571         /* Already all coding systems are excluded.  */
6572         continue;
6573       /* Check the safe coding systems for C.  */
6574       ch = make_number (c);
6575       val = Faref (work_table, ch);
6576       if (EQ (val, Qt))
6577         /* This element was already checked.  Ignore it.  */
6578         continue;
6579       /* Remember that we checked this element.  */
6580       Faset (work_table, ch, Qt);
6581
6582       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6583         {
6584           val = XCAR (tail);
6585           if (NILP (Faref (XCDR (val), ch)))
6586             {
6587               /* Exclued this coding system from SAFE_CODINGS.  */
6588               if (EQ (tail, safe_codings))
6589                 safe_codings = XCDR (safe_codings);
6590               else
6591                 XSETCDR (prev, XCDR (tail));
6592             }
6593           else
6594             prev = tail;
6595         }
6596     }
6597   return safe_codings;
6598 }
6599
6600 DEFUN ("find-coding-systems-region-internal-2",
6601        Ffind_coding_systems_region_internal_2,
6602        Sfind_coding_systems_region_internal_2, 2, 2, 0,
6603        doc: /* Internal use only.  */)
6604      (start, end)
6605      Lisp_Object start, end;
6606 {
6607   Lisp_Object work_table, safe_codings;
6608   int non_ascii_p = 0;
6609   int single_byte_char_found = 0;
6610   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6611
6612   if (STRINGP (start))
6613     {
6614       if (!STRING_MULTIBYTE (start))
6615         return Qt;
6616       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6617       p2 = p2end = p1end;
6618       if (SCHARS (start) != SBYTES (start))
6619         non_ascii_p = 1;
6620     }
6621   else
6622     {
6623       int from, to, stop;
6624
6625       CHECK_NUMBER_COERCE_MARKER (start);
6626       CHECK_NUMBER_COERCE_MARKER (end);
6627       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6628         args_out_of_range (start, end);
6629       if (NILP (current_buffer->enable_multibyte_characters))
6630         return Qt;
6631       from = CHAR_TO_BYTE (XINT (start));
6632       to = CHAR_TO_BYTE (XINT (end));
6633       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6634       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6635       if (stop == to)
6636         p2 = p2end = p1end;
6637       else
6638         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6639       if (XINT (end) - XINT (start) != to - from)
6640         non_ascii_p = 1;
6641     }
6642
6643   if (!non_ascii_p)
6644     {
6645       /* We are sure that the text contains no multibyte character.
6646          Check if it contains eight-bit-graphic.  */
6647       p = p1;
6648       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6649       if (p == p1end)
6650         {
6651           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6652           if (p == p2end)
6653             return Qt;
6654         }
6655     }
6656
6657   /* The text contains non-ASCII characters.  */
6658
6659   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6660   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6661
6662   safe_codings = find_safe_codings_2 (p1, p1end, safe_codings, work_table,
6663                                       &single_byte_char_found);
6664   if (p2 < p2end)
6665     safe_codings = find_safe_codings_2 (p2, p2end, safe_codings, work_table,
6666                                         &single_byte_char_found);
6667   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6668     safe_codings = Qt;
6669   else
6670     {
6671       /* Turn safe_codings to a list of coding systems... */
6672       Lisp_Object val;
6673
6674       if (single_byte_char_found)
6675         /* ... and append these for eight-bit chars.  */
6676         val = Fcons (Qraw_text,
6677                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6678       else
6679         /* ... and append generic coding systems.  */
6680         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6681
6682       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6683         val = Fcons (XCAR (XCAR (safe_codings)), val);
6684       safe_codings = val;
6685     }
6686
6687   return safe_codings;
6688 }
6689
6690
6691 /* Search from position POS for such characters that are unencodable
6692    accoding to SAFE_CHARS, and return a list of their positions.  P
6693    points where in the memory the character at POS exists.  Limit the
6694    search at PEND or when Nth unencodable characters are found.
6695
6696    If SAFE_CHARS is a char table, an element for an unencodable
6697    character is nil.
6698
6699    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6700
6701    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6702    eight-bit-graphic characters are unencodable.  */
6703
6704 static Lisp_Object
6705 unencodable_char_position (safe_chars, pos, p, pend, n)
6706      Lisp_Object safe_chars;
6707      int pos;
6708      unsigned char *p, *pend;
6709      int n;
6710 {
6711   Lisp_Object pos_list;
6712
6713   pos_list = Qnil;
6714   while (p < pend)
6715     {
6716       int len;
6717       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6718
6719       if (c >= 128
6720           && (CHAR_TABLE_P (safe_chars)
6721               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6722               : (NILP (safe_chars) || c < 256)))
6723         {
6724           pos_list = Fcons (make_number (pos), pos_list);
6725           if (--n <= 0)
6726             break;
6727         }
6728       pos++;
6729       p += len;
6730     }
6731   return Fnreverse (pos_list);
6732 }
6733
6734
6735 DEFUN ("unencodable-char-position", Funencodable_char_position,
6736        Sunencodable_char_position, 3, 5, 0,
6737        doc: /*
6738 Return position of first un-encodable character in a region.
6739 START and END specfiy the region and CODING-SYSTEM specifies the
6740 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6741
6742 If optional 4th argument COUNT is non-nil, it specifies at most how
6743 many un-encodable characters to search.  In this case, the value is a
6744 list of positions.
6745
6746 If optional 5th argument STRING is non-nil, it is a string to search
6747 for un-encodable characters.  In that case, START and END are indexes
6748 to the string.  */)
6749      (start, end, coding_system, count, string)
6750      Lisp_Object start, end, coding_system, count, string;
6751 {
6752   int n;
6753   Lisp_Object safe_chars;
6754   struct coding_system coding;
6755   Lisp_Object positions;
6756   int from, to;
6757   unsigned char *p, *pend;
6758
6759   if (NILP (string))
6760     {
6761       validate_region (&start, &end);
6762       from = XINT (start);
6763       to = XINT (end);
6764       if (NILP (current_buffer->enable_multibyte_characters))
6765         return Qnil;
6766       p = CHAR_POS_ADDR (from);
6767       if (to == GPT)
6768         pend = GPT_ADDR;
6769       else
6770         pend = CHAR_POS_ADDR (to);
6771     }
6772   else
6773     {
6774       CHECK_STRING (string);
6775       CHECK_NATNUM (start);
6776       CHECK_NATNUM (end);
6777       from = XINT (start);
6778       to = XINT (end);
6779       if (from > to
6780           || to > SCHARS (string))
6781         args_out_of_range_3 (string, start, end);
6782       if (! STRING_MULTIBYTE (string))
6783         return Qnil;
6784       p = SDATA (string) + string_char_to_byte (string, from);
6785       pend = SDATA (string) + string_char_to_byte (string, to);
6786     }
6787
6788   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6789
6790   if (NILP (count))
6791     n = 1;
6792   else
6793     {
6794       CHECK_NATNUM (count);
6795       n = XINT (count);
6796     }
6797
6798   if (coding.type == coding_type_no_conversion
6799       || coding.type == coding_type_raw_text)
6800     return Qnil;
6801
6802   if (coding.type == coding_type_undecided)
6803     safe_chars = Qnil;
6804   else
6805     safe_chars = coding_safe_chars (coding_system);
6806
6807   if (STRINGP (string)
6808       || from >= GPT || to <= GPT)
6809     positions = unencodable_char_position (safe_chars, from, p, pend, n);
6810   else
6811     {
6812       Lisp_Object args[2];
6813
6814       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6815       n -= XINT (Flength (args[0]));
6816       if (n <= 0)
6817         positions = args[0];
6818       else
6819         {
6820           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6821                                                pend, n);
6822           positions = Fappend (2, args);
6823         }
6824     }
6825
6826   return  (NILP (count) ? Fcar (positions) : positions);
6827 }
6828
6829
6830 Lisp_Object
6831 code_convert_region1 (start, end, coding_system, encodep)
6832      Lisp_Object start, end, coding_system;
6833      int encodep;
6834 {
6835   struct coding_system coding;
6836   int from, to;
6837
6838   CHECK_NUMBER_COERCE_MARKER (start);
6839   CHECK_NUMBER_COERCE_MARKER (end);
6840   CHECK_SYMBOL (coding_system);
6841
6842   validate_region (&start, &end);
6843   from = XFASTINT (start);
6844   to = XFASTINT (end);
6845
6846   if (NILP (coding_system))
6847     return make_number (to - from);
6848
6849   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6850     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6851
6852   coding.mode |= CODING_MODE_LAST_BLOCK;
6853   coding.src_multibyte = coding.dst_multibyte
6854     = !NILP (current_buffer->enable_multibyte_characters);
6855   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6856                        &coding, encodep, 1);
6857   Vlast_coding_system_used = coding.symbol;
6858   return make_number (coding.produced_char);
6859 }
6860
6861 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6862        3, 3, "r\nzCoding system: ",
6863        doc: /* Decode the current region from the specified coding system.
6864 When called from a program, takes three arguments:
6865 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6866 This function sets `last-coding-system-used' to the precise coding system
6867 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6868 not fully specified.)
6869 It returns the length of the decoded text.  */)
6870      (start, end, coding_system)
6871      Lisp_Object start, end, coding_system;
6872 {
6873   return code_convert_region1 (start, end, coding_system, 0);
6874 }
6875
6876 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6877        3, 3, "r\nzCoding system: ",
6878        doc: /* Encode the current region into the specified coding system.
6879 When called from a program, takes three arguments:
6880 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6881 This function sets `last-coding-system-used' to the precise coding system
6882 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6883 not fully specified.)
6884 It returns the length of the encoded text.  */)
6885      (start, end, coding_system)
6886      Lisp_Object start, end, coding_system;
6887 {
6888   return code_convert_region1 (start, end, coding_system, 1);
6889 }
6890
6891 Lisp_Object
6892 code_convert_string1 (string, coding_system, nocopy, encodep)
6893      Lisp_Object string, coding_system, nocopy;
6894      int encodep;
6895 {
6896   struct coding_system coding;
6897
6898   CHECK_STRING (string);
6899   CHECK_SYMBOL (coding_system);
6900
6901   if (NILP (coding_system))
6902     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6903
6904   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6905     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6906
6907   coding.mode |= CODING_MODE_LAST_BLOCK;
6908   string = (encodep
6909             ? encode_coding_string (string, &coding, !NILP (nocopy))
6910             : decode_coding_string (string, &coding, !NILP (nocopy)));
6911   Vlast_coding_system_used = coding.symbol;
6912
6913   return string;
6914 }
6915
6916 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6917        2, 3, 0,
6918        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6919 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6920 if the decoding operation is trivial.
6921 This function sets `last-coding-system-used' to the precise coding system
6922 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6923 not fully specified.)  */)
6924      (string, coding_system, nocopy)
6925      Lisp_Object string, coding_system, nocopy;
6926 {
6927   return code_convert_string1 (string, coding_system, nocopy, 0);
6928 }
6929
6930 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6931        2, 3, 0,
6932        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6933 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6934 if the encoding operation is trivial.
6935 This function sets `last-coding-system-used' to the precise coding system
6936 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6937 not fully specified.)  */)
6938      (string, coding_system, nocopy)
6939      Lisp_Object string, coding_system, nocopy;
6940 {
6941   return code_convert_string1 (string, coding_system, nocopy, 1);
6942 }
6943
6944 /* Encode or decode STRING according to CODING_SYSTEM.
6945    Do not set Vlast_coding_system_used.
6946
6947    This function is called only from macros DECODE_FILE and
6948    ENCODE_FILE, thus we ignore character composition.  */
6949
6950 Lisp_Object
6951 code_convert_string_norecord (string, coding_system, encodep)
6952      Lisp_Object string, coding_system;
6953      int encodep;
6954 {
6955   struct coding_system coding;
6956
6957   CHECK_STRING (string);
6958   CHECK_SYMBOL (coding_system);
6959
6960   if (NILP (coding_system))
6961     return string;
6962
6963   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6964     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6965
6966   coding.composing = COMPOSITION_DISABLED;
6967   coding.mode |= CODING_MODE_LAST_BLOCK;
6968   return (encodep
6969           ? encode_coding_string (string, &coding, 1)
6970           : decode_coding_string (string, &coding, 1));
6971 }
6972 \f
6973 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6974        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6975 Return the corresponding character.  */)
6976      (code)
6977      Lisp_Object code;
6978 {
6979   unsigned char c1, c2, s1, s2;
6980   Lisp_Object val;
6981
6982   CHECK_NUMBER (code);
6983   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6984   if (s1 == 0)
6985     {
6986       if (s2 < 0x80)
6987         XSETFASTINT (val, s2);
6988       else if (s2 >= 0xA0 || s2 <= 0xDF)
6989         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6990       else
6991         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6992     }
6993   else
6994     {
6995       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
6996           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6997         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6998       DECODE_SJIS (s1, s2, c1, c2);
6999       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7000     }
7001   return val;
7002 }
7003
7004 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7005        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7006 Return the corresponding code in SJIS.  */)
7007      (ch)
7008      Lisp_Object ch;
7009 {
7010   int charset, c1, c2, s1, s2;
7011   Lisp_Object val;
7012
7013   CHECK_NUMBER (ch);
7014   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7015   if (charset == CHARSET_ASCII)
7016     {
7017       val = ch;
7018     }
7019   else if (charset == charset_jisx0208
7020            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7021     {
7022       ENCODE_SJIS (c1, c2, s1, s2);
7023       XSETFASTINT (val, (s1 << 8) | s2);
7024     }
7025   else if (charset == charset_katakana_jisx0201
7026            && c1 > 0x20 && c2 < 0xE0)
7027     {
7028       XSETFASTINT (val, c1 | 0x80);
7029     }
7030   else
7031     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7032   return val;
7033 }
7034
7035 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7036        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7037 Return the corresponding character.  */)
7038      (code)
7039      Lisp_Object code;
7040 {
7041   int charset;
7042   unsigned char b1, b2, c1, c2;
7043   Lisp_Object val;
7044
7045   CHECK_NUMBER (code);
7046   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7047   if (b1 == 0)
7048     {
7049       if (b2 >= 0x80)
7050         error ("Invalid BIG5 code: %x", XFASTINT (code));
7051       val = code;
7052     }
7053   else
7054     {
7055       if ((b1 < 0xA1 || b1 > 0xFE)
7056           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7057         error ("Invalid BIG5 code: %x", XFASTINT (code));
7058       DECODE_BIG5 (b1, b2, charset, c1, c2);
7059       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7060     }
7061   return val;
7062 }
7063
7064 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7065        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7066 Return the corresponding character code in Big5.  */)
7067      (ch)
7068      Lisp_Object ch;
7069 {
7070   int charset, c1, c2, b1, b2;
7071   Lisp_Object val;
7072
7073   CHECK_NUMBER (ch);
7074   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7075   if (charset == CHARSET_ASCII)
7076     {
7077       val = ch;
7078     }
7079   else if ((charset == charset_big5_1
7080             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7081            || (charset == charset_big5_2
7082                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7083     {
7084       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7085       XSETFASTINT (val, (b1 << 8) | b2);
7086     }
7087   else
7088     error ("Can't encode to Big5: %d", XFASTINT (ch));
7089   return val;
7090 }
7091 \f
7092 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7093        Sset_terminal_coding_system_internal, 1, 1, 0,
7094        doc: /* Internal use only.  */)
7095      (coding_system)
7096      Lisp_Object coding_system;
7097 {
7098   CHECK_SYMBOL (coding_system);
7099   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7100   /* We had better not send unsafe characters to terminal.  */
7101   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
7102   /* Character composition should be disabled.  */
7103   terminal_coding.composing = COMPOSITION_DISABLED;
7104   /* Error notification should be suppressed.  */
7105   terminal_coding.suppress_error = 1;
7106   terminal_coding.src_multibyte = 1;
7107   terminal_coding.dst_multibyte = 0;
7108   return Qnil;
7109 }
7110
7111 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7112        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7113        doc: /* Internal use only.  */)
7114      (coding_system)
7115      Lisp_Object coding_system;
7116 {
7117   CHECK_SYMBOL (coding_system);
7118   setup_coding_system (Fcheck_coding_system (coding_system),
7119                        &safe_terminal_coding);
7120   /* Character composition should be disabled.  */
7121   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7122   /* Error notification should be suppressed.  */
7123   terminal_coding.suppress_error = 1;
7124   safe_terminal_coding.src_multibyte = 1;
7125   safe_terminal_coding.dst_multibyte = 0;
7126   return Qnil;
7127 }
7128
7129 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7130        Sterminal_coding_system, 0, 0, 0,
7131        doc: /* Return coding system specified for terminal output.  */)
7132      ()
7133 {
7134   return terminal_coding.symbol;
7135 }
7136
7137 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7138        Sset_keyboard_coding_system_internal, 1, 1, 0,
7139        doc: /* Internal use only.  */)
7140      (coding_system)
7141      Lisp_Object coding_system;
7142 {
7143   CHECK_SYMBOL (coding_system);
7144   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7145   /* Character composition should be disabled.  */
7146   keyboard_coding.composing = COMPOSITION_DISABLED;
7147   return Qnil;
7148 }
7149
7150 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7151        Skeyboard_coding_system, 0, 0, 0,
7152        doc: /* Return coding system specified for decoding keyboard input.  */)
7153      ()
7154 {
7155   return keyboard_coding.symbol;
7156 }
7157
7158 \f
7159 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7160        Sfind_operation_coding_system,  1, MANY, 0,
7161        doc: /* Choose a coding system for an operation based on the target name.
7162 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7163 DECODING-SYSTEM is the coding system to use for decoding
7164 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7165 for encoding (in case OPERATION does encoding).
7166
7167 The first argument OPERATION specifies an I/O primitive:
7168   For file I/O, `insert-file-contents' or `write-region'.
7169   For process I/O, `call-process', `call-process-region', or `start-process'.
7170   For network I/O, `open-network-stream'.
7171
7172 The remaining arguments should be the same arguments that were passed
7173 to the primitive.  Depending on which primitive, one of those arguments
7174 is selected as the TARGET.  For example, if OPERATION does file I/O,
7175 whichever argument specifies the file name is TARGET.
7176
7177 TARGET has a meaning which depends on OPERATION:
7178   For file I/O, TARGET is a file name.
7179   For process I/O, TARGET is a process name.
7180   For network I/O, TARGET is a service name or a port number
7181
7182 This function looks up what specified for TARGET in,
7183 `file-coding-system-alist', `process-coding-system-alist',
7184 or `network-coding-system-alist' depending on OPERATION.
7185 They may specify a coding system, a cons of coding systems,
7186 or a function symbol to call.
7187 In the last case, we call the function with one argument,
7188 which is a list of all the arguments given to this function.
7189
7190 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7191      (nargs, args)
7192      int nargs;
7193      Lisp_Object *args;
7194 {
7195   Lisp_Object operation, target_idx, target, val;
7196   register Lisp_Object chain;
7197
7198   if (nargs < 2)
7199     error ("Too few arguments");
7200   operation = args[0];
7201   if (!SYMBOLP (operation)
7202       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7203     error ("Invalid first argument");
7204   if (nargs < 1 + XINT (target_idx))
7205     error ("Too few arguments for operation: %s",
7206            SDATA (SYMBOL_NAME (operation)));
7207   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7208      argument to write-region) is string, it must be treated as a
7209      target file name.  */
7210   if (EQ (operation, Qwrite_region)
7211       && nargs > 5
7212       && STRINGP (args[5]))
7213     target_idx = make_number (4);
7214   target = args[XINT (target_idx) + 1];
7215   if (!(STRINGP (target)
7216         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7217     error ("Invalid argument %d", XINT (target_idx) + 1);
7218
7219   chain = ((EQ (operation, Qinsert_file_contents)
7220             || EQ (operation, Qwrite_region))
7221            ? Vfile_coding_system_alist
7222            : (EQ (operation, Qopen_network_stream)
7223               ? Vnetwork_coding_system_alist
7224               : Vprocess_coding_system_alist));
7225   if (NILP (chain))
7226     return Qnil;
7227
7228   for (; CONSP (chain); chain = XCDR (chain))
7229     {
7230       Lisp_Object elt;
7231       elt = XCAR (chain);
7232
7233       if (CONSP (elt)
7234           && ((STRINGP (target)
7235                && STRINGP (XCAR (elt))
7236                && fast_string_match (XCAR (elt), target) >= 0)
7237               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7238         {
7239           val = XCDR (elt);
7240           /* Here, if VAL is both a valid coding system and a valid
7241              function symbol, we return VAL as a coding system.  */
7242           if (CONSP (val))
7243             return val;
7244           if (! SYMBOLP (val))
7245             return Qnil;
7246           if (! NILP (Fcoding_system_p (val)))
7247             return Fcons (val, val);
7248           if (! NILP (Ffboundp (val)))
7249             {
7250               val = call1 (val, Flist (nargs, args));
7251               if (CONSP (val))
7252                 return val;
7253               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7254                 return Fcons (val, val);
7255             }
7256           return Qnil;
7257         }
7258     }
7259   return Qnil;
7260 }
7261
7262 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7263        Supdate_coding_systems_internal, 0, 0, 0,
7264        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7265 When values of any coding categories are changed, you must
7266 call this function.  */)
7267      ()
7268 {
7269   int i;
7270
7271   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7272     {
7273       Lisp_Object val;
7274
7275       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7276       if (!NILP (val))
7277         {
7278           if (! coding_system_table[i])
7279             coding_system_table[i] = ((struct coding_system *)
7280                                       xmalloc (sizeof (struct coding_system)));
7281           setup_coding_system (val, coding_system_table[i]);
7282         }
7283       else if (coding_system_table[i])
7284         {
7285           xfree (coding_system_table[i]);
7286           coding_system_table[i] = NULL;
7287         }
7288     }
7289
7290   return Qnil;
7291 }
7292
7293 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7294        Sset_coding_priority_internal, 0, 0, 0,
7295        doc: /* Update internal database for the current value of `coding-category-list'.
7296 This function is internal use only.  */)
7297      ()
7298 {
7299   int i = 0, idx;
7300   Lisp_Object val;
7301
7302   val = Vcoding_category_list;
7303
7304   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7305     {
7306       if (! SYMBOLP (XCAR (val)))
7307         break;
7308       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7309       if (idx >= CODING_CATEGORY_IDX_MAX)
7310         break;
7311       coding_priorities[i++] = (1 << idx);
7312       val = XCDR (val);
7313     }
7314   /* If coding-category-list is valid and contains all coding
7315      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7316      the following code saves Emacs from crashing.  */
7317   while (i < CODING_CATEGORY_IDX_MAX)
7318     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7319
7320   return Qnil;
7321 }
7322
7323 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7324        Sdefine_coding_system_internal, 1, 1, 0,
7325        doc: /* Register CODING-SYSTEM as a base coding system.
7326 This function is internal use only.  */)
7327      (coding_system)
7328      Lisp_Object coding_system;
7329 {
7330   Lisp_Object safe_chars, slot;
7331
7332   if (NILP (Fcheck_coding_system (coding_system)))
7333     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7334   safe_chars = coding_safe_chars (coding_system);
7335   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7336     error ("No valid safe-chars property for %s",
7337            SDATA (SYMBOL_NAME (coding_system)));
7338   if (EQ (safe_chars, Qt))
7339     {
7340       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7341         XSETCAR (Vcoding_system_safe_chars,
7342                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7343     }
7344   else
7345     {
7346       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7347       if (NILP (slot))
7348         XSETCDR (Vcoding_system_safe_chars,
7349                  nconc2 (XCDR (Vcoding_system_safe_chars),
7350                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7351       else
7352         XSETCDR (slot, safe_chars);
7353     }
7354   return Qnil;
7355 }
7356
7357 #endif /* emacs */
7358
7359 \f
7360 /*** 9. Post-amble ***/
7361
7362 void
7363 init_coding_once ()
7364 {
7365   int i;
7366
7367   /* Emacs' internal format specific initialize routine.  */
7368   for (i = 0; i <= 0x20; i++)
7369     emacs_code_class[i] = EMACS_control_code;
7370   emacs_code_class[0x0A] = EMACS_linefeed_code;
7371   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7372   for (i = 0x21 ; i < 0x7F; i++)
7373     emacs_code_class[i] = EMACS_ascii_code;
7374   emacs_code_class[0x7F] = EMACS_control_code;
7375   for (i = 0x80; i < 0xFF; i++)
7376     emacs_code_class[i] = EMACS_invalid_code;
7377   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7378   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7379   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7380   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7381
7382   /* ISO2022 specific initialize routine.  */
7383   for (i = 0; i < 0x20; i++)
7384     iso_code_class[i] = ISO_control_0;
7385   for (i = 0x21; i < 0x7F; i++)
7386     iso_code_class[i] = ISO_graphic_plane_0;
7387   for (i = 0x80; i < 0xA0; i++)
7388     iso_code_class[i] = ISO_control_1;
7389   for (i = 0xA1; i < 0xFF; i++)
7390     iso_code_class[i] = ISO_graphic_plane_1;
7391   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7392   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7393   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7394   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7395   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7396   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7397   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7398   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7399   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7400   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7401
7402   setup_coding_system (Qnil, &keyboard_coding);
7403   setup_coding_system (Qnil, &terminal_coding);
7404   setup_coding_system (Qnil, &safe_terminal_coding);
7405   setup_coding_system (Qnil, &default_buffer_file_coding);
7406
7407   bzero (coding_system_table, sizeof coding_system_table);
7408
7409   bzero (ascii_skip_code, sizeof ascii_skip_code);
7410   for (i = 0; i < 128; i++)
7411     ascii_skip_code[i] = 1;
7412
7413 #if defined (MSDOS) || defined (WINDOWSNT)
7414   system_eol_type = CODING_EOL_CRLF;
7415 #else
7416   system_eol_type = CODING_EOL_LF;
7417 #endif
7418
7419   inhibit_pre_post_conversion = 0;
7420 }
7421
7422 #ifdef emacs
7423
7424 void
7425 syms_of_coding ()
7426 {
7427   Qtarget_idx = intern ("target-idx");
7428   staticpro (&Qtarget_idx);
7429
7430   Qcoding_system_history = intern ("coding-system-history");
7431   staticpro (&Qcoding_system_history);
7432   Fset (Qcoding_system_history, Qnil);
7433
7434   /* Target FILENAME is the first argument.  */
7435   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7436   /* Target FILENAME is the third argument.  */
7437   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7438
7439   Qcall_process = intern ("call-process");
7440   staticpro (&Qcall_process);
7441   /* Target PROGRAM is the first argument.  */
7442   Fput (Qcall_process, Qtarget_idx, make_number (0));
7443
7444   Qcall_process_region = intern ("call-process-region");
7445   staticpro (&Qcall_process_region);
7446   /* Target PROGRAM is the third argument.  */
7447   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7448
7449   Qstart_process = intern ("start-process");
7450   staticpro (&Qstart_process);
7451   /* Target PROGRAM is the third argument.  */
7452   Fput (Qstart_process, Qtarget_idx, make_number (2));
7453
7454   Qopen_network_stream = intern ("open-network-stream");
7455   staticpro (&Qopen_network_stream);
7456   /* Target SERVICE is the fourth argument.  */
7457   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7458
7459   Qcoding_system = intern ("coding-system");
7460   staticpro (&Qcoding_system);
7461
7462   Qeol_type = intern ("eol-type");
7463   staticpro (&Qeol_type);
7464
7465   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7466   staticpro (&Qbuffer_file_coding_system);
7467
7468   Qpost_read_conversion = intern ("post-read-conversion");
7469   staticpro (&Qpost_read_conversion);
7470
7471   Qpre_write_conversion = intern ("pre-write-conversion");
7472   staticpro (&Qpre_write_conversion);
7473
7474   Qno_conversion = intern ("no-conversion");
7475   staticpro (&Qno_conversion);
7476
7477   Qundecided = intern ("undecided");
7478   staticpro (&Qundecided);
7479
7480   Qcoding_system_p = intern ("coding-system-p");
7481   staticpro (&Qcoding_system_p);
7482
7483   Qcoding_system_error = intern ("coding-system-error");
7484   staticpro (&Qcoding_system_error);
7485
7486   Fput (Qcoding_system_error, Qerror_conditions,
7487         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7488   Fput (Qcoding_system_error, Qerror_message,
7489         build_string ("Invalid coding system"));
7490
7491   Qcoding_category = intern ("coding-category");
7492   staticpro (&Qcoding_category);
7493   Qcoding_category_index = intern ("coding-category-index");
7494   staticpro (&Qcoding_category_index);
7495
7496   Vcoding_category_table
7497     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7498   staticpro (&Vcoding_category_table);
7499   {
7500     int i;
7501     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7502       {
7503         XVECTOR (Vcoding_category_table)->contents[i]
7504           = intern (coding_category_name[i]);
7505         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7506               Qcoding_category_index, make_number (i));
7507       }
7508   }
7509
7510   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7511   staticpro (&Vcoding_system_safe_chars);
7512
7513   Qtranslation_table = intern ("translation-table");
7514   staticpro (&Qtranslation_table);
7515   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7516
7517   Qtranslation_table_id = intern ("translation-table-id");
7518   staticpro (&Qtranslation_table_id);
7519
7520   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7521   staticpro (&Qtranslation_table_for_decode);
7522
7523   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7524   staticpro (&Qtranslation_table_for_encode);
7525
7526   Qsafe_chars = intern ("safe-chars");
7527   staticpro (&Qsafe_chars);
7528
7529   Qchar_coding_system = intern ("char-coding-system");
7530   staticpro (&Qchar_coding_system);
7531
7532   /* Intern this now in case it isn't already done.
7533      Setting this variable twice is harmless.
7534      But don't staticpro it here--that is done in alloc.c.  */
7535   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7536   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7537   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7538
7539   Qvalid_codes = intern ("valid-codes");
7540   staticpro (&Qvalid_codes);
7541
7542   Qemacs_mule = intern ("emacs-mule");
7543   staticpro (&Qemacs_mule);
7544
7545   Qraw_text = intern ("raw-text");
7546   staticpro (&Qraw_text);
7547
7548   defsubr (&Scoding_system_p);
7549   defsubr (&Sread_coding_system);
7550   defsubr (&Sread_non_nil_coding_system);
7551   defsubr (&Scheck_coding_system);
7552   defsubr (&Sdetect_coding_region);
7553   defsubr (&Sdetect_coding_string);
7554   defsubr (&Sfind_coding_systems_region_internal);
7555   defsubr (&Sfind_coding_systems_region_internal_2);
7556   defsubr (&Sunencodable_char_position);
7557   defsubr (&Sdecode_coding_region);
7558   defsubr (&Sencode_coding_region);
7559   defsubr (&Sdecode_coding_string);
7560   defsubr (&Sencode_coding_string);
7561   defsubr (&Sdecode_sjis_char);
7562   defsubr (&Sencode_sjis_char);
7563   defsubr (&Sdecode_big5_char);
7564   defsubr (&Sencode_big5_char);
7565   defsubr (&Sset_terminal_coding_system_internal);
7566   defsubr (&Sset_safe_terminal_coding_system_internal);
7567   defsubr (&Sterminal_coding_system);
7568   defsubr (&Sset_keyboard_coding_system_internal);
7569   defsubr (&Skeyboard_coding_system);
7570   defsubr (&Sfind_operation_coding_system);
7571   defsubr (&Supdate_coding_systems_internal);
7572   defsubr (&Sset_coding_priority_internal);
7573   defsubr (&Sdefine_coding_system_internal);
7574
7575   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7576                doc: /* List of coding systems.
7577
7578 Do not alter the value of this variable manually.  This variable should be
7579 updated by the functions `make-coding-system' and
7580 `define-coding-system-alias'.  */);
7581   Vcoding_system_list = Qnil;
7582
7583   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7584                doc: /* Alist of coding system names.
7585 Each element is one element list of coding system name.
7586 This variable is given to `completing-read' as TABLE argument.
7587
7588 Do not alter the value of this variable manually.  This variable should be
7589 updated by the functions `make-coding-system' and
7590 `define-coding-system-alias'.  */);
7591   Vcoding_system_alist = Qnil;
7592
7593   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7594                doc: /* List of coding-categories (symbols) ordered by priority.
7595
7596 On detecting a coding system, Emacs tries code detection algorithms
7597 associated with each coding-category one by one in this order.  When
7598 one algorithm agrees with a byte sequence of source text, the coding
7599 system bound to the corresponding coding-category is selected.  */);
7600   {
7601     int i;
7602
7603     Vcoding_category_list = Qnil;
7604     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7605       Vcoding_category_list
7606         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7607                  Vcoding_category_list);
7608   }
7609
7610   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7611                doc: /* Specify the coding system for read operations.
7612 It is useful to bind this variable with `let', but do not set it globally.
7613 If the value is a coding system, it is used for decoding on read operation.
7614 If not, an appropriate element is used from one of the coding system alists:
7615 There are three such tables, `file-coding-system-alist',
7616 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7617   Vcoding_system_for_read = Qnil;
7618
7619   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7620                doc: /* Specify the coding system for write operations.
7621 Programs bind this variable with `let', but you should not set it globally.
7622 If the value is a coding system, it is used for encoding of output,
7623 when writing it to a file and when sending it to a file or subprocess.
7624
7625 If this does not specify a coding system, an appropriate element
7626 is used from one of the coding system alists:
7627 There are three such tables, `file-coding-system-alist',
7628 `process-coding-system-alist', and `network-coding-system-alist'.
7629 For output to files, if the above procedure does not specify a coding system,
7630 the value of `buffer-file-coding-system' is used.  */);
7631   Vcoding_system_for_write = Qnil;
7632
7633   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7634                doc: /* Coding system used in the latest file or process I/O.  */);
7635   Vlast_coding_system_used = Qnil;
7636
7637   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7638                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7639 See info node `Coding Systems' and info node `Text and Binary' concerning
7640 such conversion.  */);
7641   inhibit_eol_conversion = 0;
7642
7643   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7644                doc: /* Non-nil means process buffer inherits coding system of process output.
7645 Bind it to t if the process output is to be treated as if it were a file
7646 read from some filesystem.  */);
7647   inherit_process_coding_system = 0;
7648
7649   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7650                doc: /* Alist to decide a coding system to use for a file I/O operation.
7651 The format is ((PATTERN . VAL) ...),
7652 where PATTERN is a regular expression matching a file name,
7653 VAL is a coding system, a cons of coding systems, or a function symbol.
7654 If VAL is a coding system, it is used for both decoding and encoding
7655 the file contents.
7656 If VAL is a cons of coding systems, the car part is used for decoding,
7657 and the cdr part is used for encoding.
7658 If VAL is a function symbol, the function must return a coding system
7659 or a cons of coding systems which are used as above.  The function gets
7660 the arguments with which `find-operation-coding-system' was called.
7661
7662 See also the function `find-operation-coding-system'
7663 and the variable `auto-coding-alist'.  */);
7664   Vfile_coding_system_alist = Qnil;
7665
7666   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7667     doc: /* Alist to decide a coding system to use for a process I/O operation.
7668 The format is ((PATTERN . VAL) ...),
7669 where PATTERN is a regular expression matching a program name,
7670 VAL is a coding system, a cons of coding systems, or a function symbol.
7671 If VAL is a coding system, it is used for both decoding what received
7672 from the program and encoding what sent to the program.
7673 If VAL is a cons of coding systems, the car part is used for decoding,
7674 and the cdr part is used for encoding.
7675 If VAL is a function symbol, the function must return a coding system
7676 or a cons of coding systems which are used as above.
7677
7678 See also the function `find-operation-coding-system'.  */);
7679   Vprocess_coding_system_alist = Qnil;
7680
7681   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7682     doc: /* Alist to decide a coding system to use for a network I/O operation.
7683 The format is ((PATTERN . VAL) ...),
7684 where PATTERN is a regular expression matching a network service name
7685 or is a port number to connect to,
7686 VAL is a coding system, a cons of coding systems, or a function symbol.
7687 If VAL is a coding system, it is used for both decoding what received
7688 from the network stream and encoding what sent to the network stream.
7689 If VAL is a cons of coding systems, the car part is used for decoding,
7690 and the cdr part is used for encoding.
7691 If VAL is a function symbol, the function must return a coding system
7692 or a cons of coding systems which are used as above.
7693
7694 See also the function `find-operation-coding-system'.  */);
7695   Vnetwork_coding_system_alist = Qnil;
7696
7697   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7698                doc: /* Coding system to use with system messages.
7699 Also used for decoding keyboard input on X Window system.  */);
7700   Vlocale_coding_system = Qnil;
7701
7702   /* The eol mnemonics are reset in startup.el system-dependently.  */
7703   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7704                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7705   eol_mnemonic_unix = build_string (":");
7706
7707   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7708                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7709   eol_mnemonic_dos = build_string ("\\");
7710
7711   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7712                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7713   eol_mnemonic_mac = build_string ("/");
7714
7715   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7716                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7717   eol_mnemonic_undecided = build_string (":");
7718
7719   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7720                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7721   Venable_character_translation = Qt;
7722
7723   DEFVAR_LISP ("standard-translation-table-for-decode",
7724                &Vstandard_translation_table_for_decode,
7725                doc: /* Table for translating characters while decoding.  */);
7726   Vstandard_translation_table_for_decode = Qnil;
7727
7728   DEFVAR_LISP ("standard-translation-table-for-encode",
7729                &Vstandard_translation_table_for_encode,
7730                doc: /* Table for translating characters while encoding.  */);
7731   Vstandard_translation_table_for_encode = Qnil;
7732
7733   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7734                doc: /* Alist of charsets vs revision numbers.
7735 While encoding, if a charset (car part of an element) is found,
7736 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7737   Vcharset_revision_alist = Qnil;
7738
7739   DEFVAR_LISP ("default-process-coding-system",
7740                &Vdefault_process_coding_system,
7741                doc: /* Cons of coding systems used for process I/O by default.
7742 The car part is used for decoding a process output,
7743 the cdr part is used for encoding a text to be sent to a process.  */);
7744   Vdefault_process_coding_system = Qnil;
7745
7746   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7747                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7748 This is a vector of length 256.
7749 If Nth element is non-nil, the existence of code N in a file
7750 \(or output of subprocess) doesn't prevent it to be detected as
7751 a coding system of ISO 2022 variant which has a flag
7752 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7753 or reading output of a subprocess.
7754 Only 128th through 159th elements has a meaning.  */);
7755   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7756
7757   DEFVAR_LISP ("select-safe-coding-system-function",
7758                &Vselect_safe_coding_system_function,
7759                doc: /* Function to call to select safe coding system for encoding a text.
7760
7761 If set, this function is called to force a user to select a proper
7762 coding system which can encode the text in the case that a default
7763 coding system used in each operation can't encode the text.
7764
7765 The default value is `select-safe-coding-system' (which see).  */);
7766   Vselect_safe_coding_system_function = Qnil;
7767
7768   DEFVAR_BOOL ("coding-system-require-warning",
7769                &coding_system_require_warning,
7770                doc: /* Internal use only.
7771 If non-nil, on writing a file, `select-safe-coding-system-function' is
7772 called even if `coding-system-for-write' is non-nil.  The command
7773 `universal-coding-system-argument' binds this variable to t temporarily.  */);
7774   coding_system_require_warning = 0;
7775
7776
7777   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
7778                doc: /* Char-table containing safe coding systems of each characters.
7779 Each element doesn't include such generic coding systems that can
7780 encode any characters.  They are in the first extra slot.  */);
7781   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
7782
7783   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7784                &inhibit_iso_escape_detection,
7785                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7786
7787 By default, on reading a file, Emacs tries to detect how the text is
7788 encoded.  This code detection is sensitive to escape sequences.  If
7789 the sequence is valid as ISO2022, the code is determined as one of
7790 the ISO2022 encodings, and the file is decoded by the corresponding
7791 coding system (e.g. `iso-2022-7bit').
7792
7793 However, there may be a case that you want to read escape sequences in
7794 a file as is.  In such a case, you can set this variable to non-nil.
7795 Then, as the code detection ignores any escape sequences, no file is
7796 detected as encoded in some ISO2022 encoding.  The result is that all
7797 escape sequences become visible in a buffer.
7798
7799 The default value is nil, and it is strongly recommended not to change
7800 it.  That is because many Emacs Lisp source files that contain
7801 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7802 in Emacs's distribution, and they won't be decoded correctly on
7803 reading if you suppress escape sequence detection.
7804
7805 The other way to read escape sequences in a file without decoding is
7806 to explicitly specify some coding system that doesn't use ISO2022's
7807 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
7808   inhibit_iso_escape_detection = 0;
7809
7810   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7811                doc: /* Char table for translating self-inserting characters.
7812 This is applied to the result of input methods, not their input.  See also
7813 `keyboard-translate-table'.  */);
7814     Vtranslation_table_for_input = Qnil;
7815 }
7816
7817 char *
7818 emacs_strerror (error_number)
7819      int error_number;
7820 {
7821   char *str;
7822
7823   synchronize_system_messages_locale ();
7824   str = strerror (error_number);
7825
7826   if (! NILP (Vlocale_coding_system))
7827     {
7828       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7829                                                       Vlocale_coding_system,
7830                                                       0);
7831       str = (char *) SDATA (dec);
7832     }
7833
7834   return str;
7835 }
7836
7837 #endif /* emacs */
7838