code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
   40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Other
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more details.
  81
  82 */
  83
  84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  85
  86   How end-of-line of a text is encoded depends on a system.  For
  87   instance, Unix's format is just one byte of `line-feed' code,
  88   whereas DOS's format is two-byte sequence of `carriage-return' and
  89   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  90
  91   Since text characters encoding and end-of-line encoding are
  92   independent, any coding system described above can take
  93   any format of end-of-line.  So, Emacs has information of format of
  94   end-of-line in each coding-system.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
  102   which appropriate flag bits for the category XXX are set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_emacs_mule (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format (emacs-mule).  The resulting text
 118   goes to a place pointed to by DESTINATION, the length of which should
 119   not exceed DST_BYTES.  The number of bytes actually processed is
 120   returned as *CONSUMED.  The return value is the length of the decoded
 121   text.  Below is a template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 136   internal format (emacs-mule) to CODING.  The resulting text goes to
 137   a place pointed to by DESTINATION, the length of which should not
 138   exceed DST_BYTES.  The number of bytes actually processed is
 139   returned as *CONSUMED.  The return value is the length of the
 140   encoded text.  Below is a template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
  154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
  155    THREE_MORE_BYTES safely get one, two, and three bytes from the
  156    source text respectively, advancing `src' past them.  If too few
  157    bytes remain, they jump to `label_end_of_loop' and leave `src' as is.
  158    The caller should set `src' and `src_end' appropriately in advance.  */
  159
  160 #define ONE_MORE_BYTE(c1)       \
  161   do {                          \
  162     if (src < src_end)          \
  163       c1 = *src++;              \
  164     else                        \
  165       goto label_end_of_loop;   \
  166   } while (0)
  167
  168 #define TWO_MORE_BYTES(c1, c2)  \
  169   do {                          \
  170     if (src + 1 < src_end)      \
  171       c1 = *src++, c2 = *src++; \
  172     else                        \
  173       goto label_end_of_loop;   \
  174   } while (0)
  175
  176 #define THREE_MORE_BYTES(c1, c2, c3)            \
  177   do {                                          \
  178     if (src + 2 < src_end)                      \
  179       c1 = *src++, c2 = *src++, c3 = *src++;    \
  180     else                                        \
  181       goto label_end_of_loop;                   \
  182   } while (0)
 183
  184 /* The following three macros DECODE_CHARACTER_ASCII,
  185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
  186    the multi-byte form of a character of each class at the place
  187    pointed to by `dst' and advance `dst'.  The caller should set `dst' to
  188    point to an appropriate area and the variable `coding' to point to
  189    the coding-system of the text currently being decoded in advance.  */
  190
  191 /* Decode one ASCII character C (0xA0, C | 0x80 while composing).  */
  192
  193 #define DECODE_CHARACTER_ASCII(c)                               \
  194   do {                                                          \
  195     if (COMPOSING_P (coding->composing))                        \
  196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
  197     else                                                        \
  198       *dst++ = (c);                                             \
  199   } while (0)
  200
  201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
  202    position-code is C.  The `=' in the second `if' is a deliberate assignment.  */
  203
  204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  205   do {                                                                  \
  206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
  207     if (COMPOSING_P (coding->composing))                                \
  208       *dst++ = leading_code + 0x20;                                     \
  209     else                                                                \
  210       *dst++ = leading_code;                                            \
  211     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
  212       *dst++ = leading_code;                                            \
  213     *dst++ = (c) | 0x80;                                                \
  214   } while (0)
  215
  216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
  217    position-codes are C1 and C2 (C2 | 0x80 follows the DIMENSION1 form).  */
  218
  219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  220   do {                                                  \
  221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
  222     *dst++ = (c2) | 0x80;                               \
  223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
 246 Lisp_Object Qcoding_system, Qeol_type;
 247 Lisp_Object Qbuffer_file_coding_system;
 248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 249
 250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 252 Lisp_Object Qstart_process, Qopen_network_stream;
 253 Lisp_Object Qtarget_idx;
 254
 255 /* Mnemonic character of each format of end-of-line.  */
 256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 257 /* Mnemonic character to indicate format of end-of-line is not yet
 258    decided.  */
 259 int eol_mnemonic_undecided;
 260
 261 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 262    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 263 int system_eol_type;
 264
 265 #ifdef emacs
 266
 267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 268
 269 /* Coding system emacs-mule is for converting only end-of-line format.  */
 270 Lisp_Object Qemacs_mule;
 271
 272 /* Coding-systems are handed between Emacs Lisp programs and C internal
 273    routines by the following three variables.  */
 274 /* Coding-system for reading files and receiving data from process.  */
 275 Lisp_Object Vcoding_system_for_read;
 276 /* Coding-system for writing files and sending data to process.  */
 277 Lisp_Object Vcoding_system_for_write;
 278 /* Coding-system actually used in the latest I/O.  */
 279 Lisp_Object Vlast_coding_system_used;
 280
 281 /* A vector of length 256 which contains information about special
 282    Microsoft codes.  */
 283 Lisp_Object Vmicrosoft_code_table;
 284
 285 /* Flag to inhibit code conversion of end-of-line format.  */
 286 int inhibit_eol_conversion;
 287
 288 /* Coding system to be used to encode text for terminal display.  */
 289 struct coding_system terminal_coding;
 290
 291 /* Coding system to be used to encode text for terminal display when
 292    terminal coding system is nil.  */
 293 struct coding_system safe_terminal_coding;
 294
 295 /* Coding system of what is sent from terminal keyboard.  */
 296 struct coding_system keyboard_coding;
 297
 298 Lisp_Object Vfile_coding_system_alist;
 299 Lisp_Object Vprocess_coding_system_alist;
 300 Lisp_Object Vnetwork_coding_system_alist;
 301
 302 #endif /* emacs */
 303
 304 Lisp_Object Qcoding_category_index;
 305
 306 /* List of symbols `coding-category-xxx' ordered by priority.  */
 307 Lisp_Object Vcoding_category_list;
 308
 309 /* Table of coding-systems currently assigned to each coding-category.  */
 310 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 311
  312 /* Table of the symbol name of each coding-category.  */
 313 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 314   "coding-category-emacs-mule",
 315   "coding-category-sjis",
 316   "coding-category-iso-7",
 317   "coding-category-iso-8-1",
 318   "coding-category-iso-8-2",
 319   "coding-category-iso-7-else",
 320   "coding-category-iso-8-else",
 321   "coding-category-big5",
 322   "coding-category-binary"
 323 };
 324
 325 /* Flag to tell if we look up unification table on character code
 326    conversion.  */
 327 Lisp_Object Venable_character_unification;
 328 /* Standard unification table to look up on decoding (reading).  */
 329 Lisp_Object Vstandard_character_unification_table_for_decode;
 330 /* Standard unification table to look up on encoding (writing).  */
 331 Lisp_Object Vstandard_character_unification_table_for_encode;
 332
 333 Lisp_Object Qcharacter_unification_table;
 334 Lisp_Object Qcharacter_unification_table_for_decode;
 335 Lisp_Object Qcharacter_unification_table_for_encode;
 336
 337 /* Alist of charsets vs revision number.  */
 338 Lisp_Object Vcharset_revision_alist;
 339
 340 /* Default coding systems used for process I/O.  */
 341 Lisp_Object Vdefault_process_coding_system;
 342
 343 \f
 344 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 345
 346 /* Emacs' internal format for encoding multiple character sets is a
 347    kind of multi-byte encoding, i.e. characters are encoded by
 348    variable-length sequences of one-byte codes.  ASCII characters
 349    and control characters (e.g. `tab', `newline') are represented by
 350    one-byte sequences which are their ASCII codes, in the range 0x00
 351    through 0x7F.  The other characters are represented by a sequence
 352    of `base leading-code', optional `extended leading-code', and one
 353    or two `position-code's.  The length of the sequence is determined
 354    by the base leading-code.  Leading-code takes the range 0x80
 355    through 0x9F, whereas extended leading-code and position-code take
 356    the range 0xA0 through 0xFF.  See `charset.h' for more details
 357    about leading-code and position-code.
 358
 359    There's one exception to this rule.  Special leading-code
 360    `leading-code-composition' denotes that the following several
 361    characters should be composed into one character.  Leading-codes of
 362    components (except for ASCII) are added 0x20.  An ASCII character
 363    component is represented by a 2-byte sequence of `0xA0' and
 364    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 365    details of composite character.  Hence, we can summarize the code
 366    range as follows:
 367
 368    --- CODE RANGE of Emacs' internal format ---
 369    (character set)      (range)
 370    ASCII                0x00 .. 0x7F
 371    ELSE (1st byte)      0x80 .. 0x9F
 372         (rest bytes)    0xA0 .. 0xFF
 373    ---------------------------------------------
 374
 375   */
 376
 377 enum emacs_code_class_type emacs_code_class[256];
 378
  379 /* Go to the next statement only if *SRC is accessible and the code is
  380    0xA0 or greater; SRC is advanced past the byte that was checked.  */
  381 #define CHECK_CODE_RANGE_A0_FF  \
  382   do {                          \
  383     if (src >= src_end)         \
  384       goto label_end_of_switch; \
  385     else if (*src++ < 0xA0)     \
  386       return 0;                 \
  387   } while (0)
 388
  389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  390    Check if a text is encoded in Emacs' internal format.  If it is,
  391    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
  392
  393 int
  394 detect_coding_emacs_mule (src, src_end)
  395      unsigned char *src, *src_end;
  396 {
  397   unsigned char c;
  398   int composing = 0;            /* Set by a composition leading-code; cleared by a byte below 0xA0.  */
  399
  400   while (src < src_end)
  401     {
  402       c = *src++;
  403
  404       if (composing)
  405         {
  406           if (c < 0xA0)
  407             composing = 0;
  408           else
  409             c -= 0x20;          /* Component leading-codes carry an extra 0x20.  */
  410         }
  411
  412       switch (emacs_code_class[c])
  413         {
  414         case EMACS_ascii_code:
  415         case EMACS_linefeed_code:
  416           break;
  417
  418         case EMACS_control_code:
  419           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  420             return 0;           /* ESC, SI, or SO: reject.  */
  421           break;
  422
  423         case EMACS_invalid_code:
  424           return 0;
  425
  426         case EMACS_leading_code_composition: /* c == 0x80 */
  427           if (composing)
  428             CHECK_CODE_RANGE_A0_FF; /* Byte was 0xA0, the prefix of an ASCII component.  */
  429           else
  430             composing = 1;
  431           break;
  432
  433         case EMACS_leading_code_4:
  434           CHECK_CODE_RANGE_A0_FF;
  435           /* fall down to check it two more times ...  */
  436
  437         case EMACS_leading_code_3:
  438           CHECK_CODE_RANGE_A0_FF;
  439           /* fall down to check it one more time ...  */
  440
  441         case EMACS_leading_code_2:
  442           CHECK_CODE_RANGE_A0_FF;
  443           break;
  444
  445         default:
  446         label_end_of_switch:    /* Also reached when the source ends inside a sequence.  */
  447           break;
  448         }
  449     }
  450   return CODING_CATEGORY_MASK_EMACS_MULE;
  451 }
 452
 453 \f
 454 /*** 3. ISO2022 handlers ***/
 455
 456 /* The following note describes the coding system ISO2022 briefly.
 457    Since the intention of this note is to help in understanding of
 458    the programs in this file, some parts are NOT ACCURATE or OVERLY
 459    SIMPLIFIED.  For the thorough understanding, please refer to the
 460    original document of ISO2022.
 461
 462    ISO2022 provides many mechanisms to encode several character sets
  463    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 464    all text is encoded by codes of less than 128.  This may make the
 465    encoded text a little bit longer, but the text gets more stability
 466    to pass through several gateways (some of them strip off the MSB).
 467
 468    There are two kinds of character set: control character set and
 469    graphic character set.  The former contains control characters such
 470    as `newline' and `escape' to provide control functions (control
 471    functions are provided also by escape sequences).  The latter
  472    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 473    two control character sets and many graphic character sets.
 474
 475    Graphic character sets are classified into one of the following
 476    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 477    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 478    bytes (DIMENSION) and the number of characters in one dimension
 479    (CHARS) of the set.  In addition, each character set is assigned an
 480    identification tag (called "final character" and denoted as <F>
 481    here after) which is unique in each class.  <F> of each character
 482    set is decided by ECMA(*) when it is registered in ISO.  Code range
 483    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 484
 485    Note (*): ECMA = European Computer Manufacturers Association
 486
 487    Here are examples of graphic character set [NAME(<F>)]:
 488         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 489         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 490         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 491         o DIMENSION2_CHARS96 -- none for the moment
 492
 493    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 494         C0 [0x00..0x1F] -- control character plane 0
 495         GL [0x20..0x7F] -- graphic character plane 0
 496         C1 [0x80..0x9F] -- control character plane 1
 497         GR [0xA0..0xFF] -- graphic character plane 1
 498
 499    A control character set is directly designated and invoked to C0 or
 500    C1 by an escape sequence.  The most common case is that ISO646's
 501    control character set is designated/invoked to C0 and ISO6429's
 502    control character set is designated/invoked to C1, and usually
 503    these designations/invocations are omitted in a coded text.  With
 504    7-bit environment, only C0 can be used, and a control character for
 505    C1 is encoded by an appropriate escape sequence to fit in the
  506    environment.  For every control character of C1, a
  507    corresponding escape sequence is defined.
 508
 509    A graphic character set is at first designated to one of four
 510    graphic registers (G0 through G3), then these graphic registers are
 511    invoked to GL or GR.  These designations and invocations can be
 512    done independently.  The most common case is that G0 is invoked to
 513    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 514    these invocations and designations are omitted in a coded text.
 515    With 7-bit environment, only GL can be used.
 516
 517    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 518    and 0x7F of GL area work as control characters SPACE and DEL
 519    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 520
 521    There are two ways of invocation: locking-shift and single-shift.
 522    With locking-shift, the invocation lasts until the next different
 523    invocation, whereas with single-shift, the invocation works only
 524    for the following character and doesn't affect locking-shift.
 525    Invocations are done by the following control characters or escape
 526    sequences.
 527
 528    ----------------------------------------------------------------------
 529    function             control char    escape sequence description
 530    ----------------------------------------------------------------------
 531    SI  (shift-in)               0x0F    none            invoke G0 to GL
 532    SO  (shift-out)              0x0E    none            invoke G1 to GL
 533    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 534    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 535    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 536    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 537    ----------------------------------------------------------------------
 538    The first four are for locking-shift.  Control characters for these
 539    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 540
 541    Designations are done by the following escape sequences.
 542    ----------------------------------------------------------------------
 543    escape sequence      description
 544    ----------------------------------------------------------------------
 545    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 546    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 547    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 548    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 549    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 550    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 551    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 552    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 553    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 554    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 555    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 556    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 557    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 558    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 559    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 560    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 561    ----------------------------------------------------------------------
 562
 563    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 564    of dimension 1, chars 94, and final character <F>, and etc.
 565
 566    Note (*): Although these designations are not allowed in ISO2022,
 567    Emacs accepts them on decoding, and produces them on encoding
 568    CHARS96 character set in a coding system which is characterized as
 569    7-bit environment, non-locking-shift, and non-single-shift.
 570
 571    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 572    '(' can be omitted.  We call this as "short-form" here after.
 573
 574    Now you may notice that there are a lot of ways for encoding the
 575    same multilingual text in ISO2022.  Actually, there exists many
 576    coding systems such as Compound Text (used in X's inter client
  577    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 578    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 579    localized platforms), and all of these are variants of ISO2022.
 580
 581    In addition to the above, Emacs handles two more kinds of escape
 582    sequences: ISO6429's direction specification and Emacs' private
 583    sequence for specifying character composition.
 584
 585    ISO6429's direction specification takes the following format:
 586         o CSI ']'      -- end of the current direction
 587         o CSI '0' ']'  -- end of the current direction
 588         o CSI '1' ']'  -- start of left-to-right text
 589         o CSI '2' ']'  -- start of right-to-left text
 590    The control character CSI (0x9B: control sequence introducer) is
 591    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 592
 593    Character composition specification takes the following format:
 594         o ESC '0' -- start character composition
 595         o ESC '1' -- end character composition
 596    Since these are not standard escape sequences of any ISO, the use
  597    of them for these meanings is restricted to Emacs only.  */
 598
 599 enum iso_code_class_type iso_code_class[256];
 600
  601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  602    Check if a text is encoded in ISO2022.  If it is, returns an
  603    integer in which the appropriate flag bits, any of:
  604         CODING_CATEGORY_MASK_ISO_7
  605         CODING_CATEGORY_MASK_ISO_8_1
  606         CODING_CATEGORY_MASK_ISO_8_2
  607         CODING_CATEGORY_MASK_ISO_7_ELSE
  608         CODING_CATEGORY_MASK_ISO_8_ELSE
  609    are set.  If a code which should never appear in ISO2022 is found,
  610    returns 0.  */
  611
  612 int
  613 detect_coding_iso2022 (src, src_end)
  614      unsigned char *src, *src_end;
  615 {
  616   int mask = (CODING_CATEGORY_MASK_ISO_7
  617               | CODING_CATEGORY_MASK_ISO_8_1
  618               | CODING_CATEGORY_MASK_ISO_8_2
  619               | CODING_CATEGORY_MASK_ISO_7_ELSE
  620               | CODING_CATEGORY_MASK_ISO_8_ELSE
  621               );
  622   int g1 = 0;                   /* 1 iff designating to G1.  */
  623   int c, i;                     /* NOTE(review): `i' and `g1' are not used in this function.  */
  624
  625   while (src < src_end)
  626     {
  627       c = *src++;
  628       switch (c)
  629         {
  630         case ISO_CODE_ESC:
  631           if (src >= src_end)
  632             break;              /* Truncated escape sequence: leave MASK as is.  */
  633           c = *src++;
  634           if ((c >= '(' && c <= '/'))
  635             {
  636               /* Designation sequence for a charset of dimension 1.  */
  637               if (src >= src_end)
  638                 break;
  639               c = *src++;
  640               if (c < ' ' || c >= 0x80)
  641                 /* Invalid designation sequence.  */
  642                 return 0;
  643             }
  644           else if (c == '$')
  645             {
  646               /* Designation sequence for a charset of dimension 2.  */
  647               if (src >= src_end)
  648                 break;
  649               c = *src++;
  650               if (c >= '@' && c <= 'B')
  651                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
  652                 ;
  653               else if (c >= '(' && c <= '/')
  654                 {
  655                   if (src >= src_end)
  656                     break;
  657                   c = *src++;
  658                   if (c < ' ' || c >= 0x80)
  659                     /* Invalid designation sequence.  */
  660                     return 0;
  661                 }
  662               else
  663                 /* Invalid designation sequence.  */
  664                 return 0;
  665             }
  666           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
  667             /* Locking shift (ESC n, ESC o) or single shift (ESC N, ESC O).  */
  668             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
  669                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
  670           else if (c == '0' || c == '1' || c == '2')
  671             /* Start/end composition.  */
  672             ;
  673           else
  674             /* Invalid escape sequence.  */
  675             return 0;
  676           break;
  677
  678         case ISO_CODE_SO:
  679           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
  680                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
  681           break;
  682
  683         case ISO_CODE_CSI:
  684         case ISO_CODE_SS2:
  685         case ISO_CODE_SS3:
  686           return CODING_CATEGORY_MASK_ISO_8_ELSE; /* Decided at once; the rest is not scanned.  */
  687
  688         default:
  689           if (c < 0x80)
  690             break;
  691           else if (c < 0xA0)
  692             {
  693               if (VECTORP (Vmicrosoft_code_table)
  694                   && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
  695                 {
  696                   mask &= ~(CODING_CATEGORY_MASK_ISO_7
  697                             | CODING_CATEGORY_MASK_ISO_7_ELSE);
  698                   break;
  699                 }
  700               return 0;         /* 0x80..0x9F code with no entry in Vmicrosoft_code_table.  */
  701             }
  702           else
  703             {
  704               unsigned char *src_begin = src;
  705
  706               mask &= ~(CODING_CATEGORY_MASK_ISO_7
  707                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
  708               while (src < src_end && *src >= 0xA0)
  709                 src++;
  710               if ((src - src_begin - 1) & 1 && src < src_end)
  711                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2; /* Odd-length run of bytes >= 0xA0 (counting C) with text after it.  */
  712             }
  713           break;
  714         }
  715     }
  716
  717   return mask;
  718 }
 719
  720 /* Decode a character whose charset is CHARSET and whose 1st position
  721    code is C1.  If the dimension of CHARSET is 2, the 2nd position code
  722    is fetched from SRC and set to C2.  If CHARSET is negative, we are
  723    decoding ill-formed text, and all we can do is to read C1 as is.  The
  724    caller must have `c2' and `unification_table' in scope.  */
  725
  726 #define DECODE_ISO_CHARACTER(charset, c1)                               \
  727   do {                                                                  \
  728     int c_alt, charset_alt = (charset);                                 \
  729     if (COMPOSING_HEAD_P (coding->composing))                           \
  730       {                                                                 \
  731         *dst++ = LEADING_CODE_COMPOSITION;                              \
  732         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
  733           /* To tell composition rules are embedded.  */                \
  734           *dst++ = 0xFF;                                                \
  735         coding->composing += 2;                                         \
  736       }                                                                 \
  737     if ((charset) >= 0)                                                 \
  738       {                                                                 \
  739         if (CHARSET_DIMENSION (charset) == 2)                           \
  740           ONE_MORE_BYTE (c2);                                           \
  741         if (!NILP (unification_table)                                   \
  742             && ((c_alt = unify_char (unification_table,                 \
  743                                      -1, (charset), c1, c2)) >= 0))     \
  744           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
  745       }                                                                 \
  746     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
  747       DECODE_CHARACTER_ASCII (c1);                                      \
  748     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
  749       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
  750     else                                                                \
  751       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
  752     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
  753       /* To tell a composition rule follows.  */                        \
  754       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  755   } while (0)
 756
 757 /* Set designation state into CODING.  */
 758 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 759   do {                                                                  \
 760     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 761                                      make_number (chars),               \
 762                                      make_number (final_char));         \
 763     if (charset >= 0)                                                   \
 764       {                                                                 \
 765         if (coding->direction == 1                                      \
 766             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 767           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 768         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 769       }                                                                 \
 770   } while (0)
 771
 772 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 773
 774 int
 775 decode_coding_iso2022 (coding, source, destination,
 776                        src_bytes, dst_bytes, consumed)
 777      struct coding_system *coding;
 778      unsigned char *source, *destination;
 779      int src_bytes, dst_bytes;
 780      int *consumed;
 781 {
 782   unsigned char *src = source;
 783   unsigned char *src_end = source + src_bytes;
 784   unsigned char *dst = destination;
 785   unsigned char *dst_end = destination + dst_bytes;
 786   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 787      from DST_END to assure that overflow checking is necessary only
 788      at the head of loop.  */
 789   unsigned char *adjusted_dst_end = dst_end - 6;
 790   int charset;
 791   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 792   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 793   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 794   Lisp_Object unification_table
 795       = coding->character_unification_table_for_decode;
 796
 797   if (!NILP (Venable_character_unification) && NILP (unification_table))
 798     unification_table = Vstandard_character_unification_table_for_decode;
 799
 800   while (src < src_end && dst < adjusted_dst_end)
 801     {
 802       /* SRC_BASE remembers the start position in source in each loop.
 803          The loop will be exited when there's not enough source text
 804          to analyze long escape sequence or 2-byte code (within macros
 805          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 806          to SRC_BASE before exiting.  */
 807       unsigned char *src_base = src;
 808       int c1 = *src++, c2;
 809
 810       switch (iso_code_class [c1])
 811         {
 812         case ISO_0x20_or_0x7F:
 813           if (!coding->composing
 814               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 815             {
 816               /* This is SPACE or DEL.  */
 817               *dst++ = c1;
 818               break;
 819             }
 820           /* This is a graphic character, we fall down ...  */
 821
 822         case ISO_graphic_plane_0:
 823           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 824             {
 825               /* This is a composition rule.  */
 826               *dst++ = c1 | 0x80;
 827               coding->composing = COMPOSING_WITH_RULE_TAIL;
 828             }
 829           else
 830             DECODE_ISO_CHARACTER (charset0, c1);
 831           break;
 832
 833         case ISO_0xA0_or_0xFF:
 834           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 835             {
 836               /* Invalid code.  */
 837               *dst++ = c1;
 838               break;
 839             }
 840           /* This is a graphic character, we fall down ... */
 841
 842         case ISO_graphic_plane_1:
 843           DECODE_ISO_CHARACTER (charset1, c1);
 844           break;
 845
 846         case ISO_control_code:
 847           /* All ISO2022 control characters in this class have the
 848              same representation in Emacs internal format.  */
 849           *dst++ = c1;
 850           break;
 851
 852         case ISO_carriage_return:
 853           if (coding->eol_type == CODING_EOL_CR)
 854             {
 855               *dst++ = '\n';
 856             }
 857           else if (coding->eol_type == CODING_EOL_CRLF)
 858             {
 859               ONE_MORE_BYTE (c1);
 860               if (c1 == ISO_CODE_LF)
 861                 *dst++ = '\n';
 862               else
 863                 {
 864                   src--;
 865                   *dst++ = c1;
 866                 }
 867             }
 868           else
 869             {
 870               *dst++ = c1;
 871             }
 872           break;
 873
 874         case ISO_shift_out:
 875           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 876             goto label_invalid_escape_sequence;
 877           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 878           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 879           break;
 880
 881         case ISO_shift_in:
 882           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 883           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 884           break;
 885
 886         case ISO_single_shift_2_7:
 887         case ISO_single_shift_2:
 888           /* SS2 is handled as an escape sequence of ESC 'N' */
 889           c1 = 'N';
 890           goto label_escape_sequence;
 891
 892         case ISO_single_shift_3:
 893           /* SS2 is handled as an escape sequence of ESC 'O' */
 894           c1 = 'O';
 895           goto label_escape_sequence;
 896
 897         case ISO_control_sequence_introducer:
 898           /* CSI is handled as an escape sequence of ESC '[' ...  */
 899           c1 = '[';
 900           goto label_escape_sequence;
 901
 902         case ISO_escape:
 903           ONE_MORE_BYTE (c1);
 904         label_escape_sequence:
 905           /* Escape sequences handled by Emacs are invocation,
 906              designation, direction specification, and character
 907              composition specification.  */
 908           switch (c1)
 909             {
 910             case '&':           /* revision of following character set */
 911               ONE_MORE_BYTE (c1);
 912               if (!(c1 >= '@' && c1 <= '~'))
 913                 goto label_invalid_escape_sequence;
 914               ONE_MORE_BYTE (c1);
 915               if (c1 != ISO_CODE_ESC)
 916                 goto label_invalid_escape_sequence;
 917               ONE_MORE_BYTE (c1);
 918               goto label_escape_sequence;
 919
 920             case '$':           /* designation of 2-byte character set */
 921               ONE_MORE_BYTE (c1);
 922               if (c1 >= '@' && c1 <= 'B')
 923                 {       /* designation of JISX0208.1978, GB2312.1980,
 924                                    or JISX0208.1980 */
 925                   DECODE_DESIGNATION (0, 2, 94, c1);
 926                 }
 927               else if (c1 >= 0x28 && c1 <= 0x2B)
 928                 {       /* designation of DIMENSION2_CHARS94 character set */
 929                   ONE_MORE_BYTE (c2);
 930                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 931                 }
 932               else if (c1 >= 0x2C && c1 <= 0x2F)
 933                 {       /* designation of DIMENSION2_CHARS96 character set */
 934                   ONE_MORE_BYTE (c2);
 935                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 936                 }
 937               else
 938                 goto label_invalid_escape_sequence;
 939               break;
 940
 941             case 'n':           /* invocation of locking-shift-2 */
 942               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 943                 goto label_invalid_escape_sequence;
 944               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 945               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 946               break;
 947
 948             case 'o':           /* invocation of locking-shift-3 */
 949               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 950                 goto label_invalid_escape_sequence;
 951               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 952               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 953               break;
 954
 955             case 'N':           /* invocation of single-shift-2 */
 956               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 957                 goto label_invalid_escape_sequence;
 958               ONE_MORE_BYTE (c1);
 959               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 960               DECODE_ISO_CHARACTER (charset, c1);
 961               break;
 962
 963             case 'O':           /* invocation of single-shift-3 */
 964               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 965                 goto label_invalid_escape_sequence;
 966               ONE_MORE_BYTE (c1);
 967               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 968               DECODE_ISO_CHARACTER (charset, c1);
 969               break;
 970
 971             case '0':           /* start composing without embeded rules */
 972               coding->composing = COMPOSING_NO_RULE_HEAD;
 973               break;
 974
 975             case '1':           /* end composing */
 976               coding->composing = COMPOSING_NO;
 977               break;
 978
 979             case '2':           /* start composing with embeded rules */
 980               coding->composing = COMPOSING_WITH_RULE_HEAD;
 981               break;
 982
 983             case '[':           /* specification of direction */
 984               /* For the moment, nested direction is not supported.
 985                  So, the value of `coding->direction' is 0 or 1: 0
 986                  means left-to-right, 1 means right-to-left.  */
 987               ONE_MORE_BYTE (c1);
 988               switch (c1)
 989                 {
 990                 case ']':       /* end of the current direction */
 991                   coding->direction = 0;
 992
 993                 case '0':       /* end of the current direction */
 994                 case '1':       /* start of left-to-right direction */
 995                   ONE_MORE_BYTE (c1);
 996                   if (c1 == ']')
 997                     coding->direction = 0;
 998                   else
 999                     goto label_invalid_escape_sequence;
1000                   break;
1001
1002                 case '2':       /* start of right-to-left direction */
1003                   ONE_MORE_BYTE (c1);
1004                   if (c1 == ']')
1005                     coding->direction= 1;
1006                   else
1007                     goto label_invalid_escape_sequence;
1008                   break;
1009
1010                 default:
1011                   goto label_invalid_escape_sequence;
1012                 }
1013               break;
1014
1015             default:
1016               if (c1 >= 0x28 && c1 <= 0x2B)
1017                 {       /* designation of DIMENSION1_CHARS94 character set */
1018                   ONE_MORE_BYTE (c2);
1019                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1020                 }
1021               else if (c1 >= 0x2C && c1 <= 0x2F)
1022                 {       /* designation of DIMENSION1_CHARS96 character set */
1023                   ONE_MORE_BYTE (c2);
1024                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1025                 }
1026               else
1027                 {
1028                   goto label_invalid_escape_sequence;
1029                 }
1030             }
1031           /* We must update these variables now.  */
1032           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1033           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1034           break;
1035
1036         label_invalid_escape_sequence:
1037           {
1038             int length = src - src_base;
1039
1040             bcopy (src_base, dst, length);
1041             dst += length;
1042           }
1043         }
1044       continue;
1045
1046     label_end_of_loop:
1047       coding->carryover_size = src - src_base;
1048       bcopy (src_base, coding->carryover, coding->carryover_size);
1049       src = src_base;
1050       break;
1051     }
1052
1053   /* If this is the last block of the text to be decoded, we had
1054      better just flush out all remaining codes in the text although
1055      they are not valid characters.  */
1056   if (coding->last_block)
1057     {
1058       bcopy (src, dst, src_end - src);
1059       dst += (src_end - src);
1060       src = src_end;
1061     }
1062   *consumed = src - source;
1063   return dst - destination;
1064 }
1065
1066 /* ISO2022 encoding stuff.  */
1067
1068 /*
1069    It is not enough to say just "ISO2022" on encoding, we have to
1070    specify more details.  In Emacs, each coding-system of ISO2022
1071    variant has the following specifications:
1072         1. Initial designation to G0 thru G3.
1073         2. Allows short-form designation?
1074         3. ASCII should be designated to G0 before control characters?
1075         4. ASCII should be designated to G0 at end of line?
1076         5. 7-bit environment or 8-bit environment?
1077         6. Use locking-shift?
1078         7. Use Single-shift?
1079    And the following two are only for Japanese:
1080         8. Use ASCII in place of JIS0201-1976-Roman?
1081         9. Use JISX0208-1983 in place of JISX0208-1978?
1082    These specifications are encoded in `coding->flags' as flag bits
1083    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1084    details.
1085 */
1086
/* Produce at DST the escape sequence which designates CHARSET to
   graphic register REG, and record the designation in CODING.  If
   CHARSET has an entry in `Vcharset_revision_alist', the sequence is
   preceded by ESC '&' and a revision byte.  For a 2-byte CHARS94
   charset whose <final-char> is '@', 'A', or 'B', designated to
   register 0, the short form (no intermediate character) is produced
   when CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    Lisp_Object revision_entry                                          \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int one_byte_p = CHARSET_DIMENSION (charset) == 1;                  \
    int chars94_p = CHARSET_CHARS (charset) == 94;                      \
    /* Intermediate characters for registers 0 thru 3.  */              \
    const char *intermediates = chars94_p ? "()*+" : ",-./";            \
    if (! NILP (revision_entry))                                        \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision_entry)->cdr) + '@';              \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (! one_byte_p)                                                   \
      *dst++ = '$';                                                     \
    /* Only the short form omits the intermediate character.  */        \
    if (one_byte_p                                                      \
        || ! chars94_p                                                  \
        || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)               \
        || reg != 0                                                     \
        || final_byte < '@' || final_byte > 'B')                        \
      *dst++ = (unsigned char) (intermediates[reg]);                    \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1129
/* The following two macros produce at DST the code for an ISO2022
   single-shift function (single-shift-2 and single-shift-3): the
   escape sequence when CODING has CODING_FLAG_ISO_SEVEN_BITS set, the
   one-byte control character otherwise.  CODING is then marked as
   being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1151
/* The following four macros produce at DST the code (control
   character or escape sequence) for an ISO2022 locking-shift function
   (shift-in, shift-out, locking-shift-2, and locking-shift-3), and
   record in CODING which graphic register (0 thru 3 respectively) is
   now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    *dst = ISO_CODE_SI;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    *dst = ISO_CODE_SO;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
1179
/* Produce codes at DST for a DIMENSION1 character whose character
   set is CHARSET and whose position-code is C1.  If CHARSET is not
   yet invoked to a graphic plane, designation and invocation
   sequences are produced in advance.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                     \
  do {                                                                   \
    for (;;)                                                             \
      {                                                                  \
        if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                    \
          {                                                              \
            /* In single-shifting state; this character ends it.  */     \
            *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)       \
                      ? (c1 & 0x7F) : (c1 | 0x80));                      \
            CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                \
            break;                                                       \
          }                                                              \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))        \
          {                                                              \
            /* CHARSET is invoked to graphic plane 0.  */                \
            *dst++ = c1 & 0x7F;                                          \
            break;                                                       \
          }                                                              \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))        \
          {                                                              \
            /* CHARSET is invoked to graphic plane 1.  */                \
            *dst++ = c1 | 0x80;                                          \
            break;                                                       \
          }                                                              \
        if ((coding->flags & CODING_FLAG_ISO_SAFE)                       \
            && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)  \
                == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))            \
          {                                                              \
            /* This character should not be encoded.  Produce one        \
               substitution character per column of its width.  */       \
            *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;              \
            if (CHARSET_WIDTH (charset) == 2)                            \
              *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;            \
            break;                                                       \
          }                                                              \
        /* CHARSET is not yet invoked to any graphic plane.  Invoke      \
           it (designating it to some graphic register at first if       \
           necessary), then go around once more to actually produce      \
           the character.  */                                            \
        dst = encode_invocation_designation (charset, coding, dst);      \
      }                                                                  \
  } while (0)
1224
/* Produce codes at DST for a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2.  If CHARSET
   is not yet invoked to a graphic plane, designation and invocation
   codes are produced in advance.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                 \
  do {                                                                   \
    for (;;)                                                             \
      {                                                                  \
        if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                    \
          {                                                              \
            /* In single-shifting state; this character ends it.  */     \
            if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)              \
              {                                                          \
                *dst++ = c1 & 0x7F;                                      \
                *dst++ = c2 & 0x7F;                                      \
              }                                                          \
            else                                                         \
              {                                                          \
                *dst++ = c1 | 0x80;                                      \
                *dst++ = c2 | 0x80;                                      \
              }                                                          \
            CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                \
            break;                                                       \
          }                                                              \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))        \
          {                                                              \
            /* CHARSET is invoked to graphic plane 0.  */                \
            *dst++ = c1 & 0x7F;                                          \
            *dst++ = c2 & 0x7F;                                          \
            break;                                                       \
          }                                                              \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))        \
          {                                                              \
            /* CHARSET is invoked to graphic plane 1.  */                \
            *dst++ = c1 | 0x80;                                          \
            *dst++ = c2 | 0x80;                                          \
            break;                                                       \
          }                                                              \
        if ((coding->flags & CODING_FLAG_ISO_SAFE)                       \
            && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)  \
                == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))            \
          {                                                              \
            /* This character should not be encoded.  Produce one        \
               substitution character per column of its width.  */       \
            *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;              \
            if (CHARSET_WIDTH (charset) == 2)                            \
              *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;            \
            break;                                                       \
          }                                                              \
        /* CHARSET is not yet invoked to any graphic plane.  Invoke      \
           it (designating it to some graphic register at first if       \
           necessary), then go around once more to actually produce      \
           the character.  */                                            \
        dst = encode_invocation_designation (charset, coding, dst);      \
      }                                                                  \
  } while (0)
1268
/* Produce codes at DST for the character of charset CHARSET whose
   position-codes are C1 and C2.  If `unification_table' is non-nil
   and unify_char yields a character for this one, that character is
   encoded instead, and C1 and C2 are overwritten by SPLIT_CHAR.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int unified_char = -1, target_charset = charset;                      \
    if (!NILP (unification_table))                                        \
      unified_char = unify_char (unification_table, -1, charset, c1, c2); \
    if (unified_char >= 0)                                                \
      SPLIT_CHAR (unified_char, target_charset, c1, c2);                  \
    if (CHARSET_DIMENSION (target_charset) != 1)                          \
      {                                                                   \
        ENCODE_ISO_CHARACTER_DIMENSION2 (target_charset, c1, c2);         \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        ENCODE_ISO_CHARACTER_DIMENSION1 (target_charset, c1);             \
      }                                                                   \
  } while (0)
1283
1284 /* Produce designation and invocation codes at a place pointed by DST
1285    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1286    Return new DST.  */
1287
1288 unsigned char *
1289 encode_invocation_designation (charset, coding, dst)
1290      int charset;
1291      struct coding_system *coding;
1292      unsigned char *dst;
1293 {
1294   int reg;                      /* graphic register number */
1295
1296   /* At first, check designations.  */
1297   for (reg = 0; reg < 4; reg++)
1298     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1299       break;
1300
1301   if (reg >= 4)
1302     {
1303       /* CHARSET is not yet designated to any graphic registers.  */
1304       /* At first check the requested designation.  */
1305       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1306       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1307         /* Since CHARSET requests no special designation, designate it
1308            to graphic register 0.  */
1309         reg = 0;
1310
1311       ENCODE_DESIGNATION (charset, reg, coding);
1312     }
1313
1314   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1315       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1316     {
1317       /* Since the graphic register REG is not invoked to any graphic
1318          planes, invoke it to graphic plane 0.  */
1319       switch (reg)
1320         {
1321         case 0:                 /* graphic register 0 */
1322           ENCODE_SHIFT_IN;
1323           break;
1324
1325         case 1:                 /* graphic register 1 */
1326           ENCODE_SHIFT_OUT;
1327           break;
1328
1329         case 2:                 /* graphic register 2 */
1330           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1331             ENCODE_SINGLE_SHIFT_2;
1332           else
1333             ENCODE_LOCKING_SHIFT_2;
1334           break;
1335
1336         case 3:                 /* graphic register 3 */
1337           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1338             ENCODE_SINGLE_SHIFT_3;
1339           else
1340             ENCODE_LOCKING_SHIFT_3;
1341           break;
1342         }
1343     }
1344   return dst;
1345 }
1346
/* The following three macros produce codes at DST for indicating
   composition: ESC '0' starts a composition without embedded rules,
   ESC '2' starts one with embedded rules, and ESC '1' ends a
   composition.  */
#define ENCODE_COMPOSITION_NO_RULE_START        \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START      \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END                  \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1351
/* The following three macros produce codes at DST for indicating
   direction of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces CSI: the escape
   sequence ESC '[' when CODING has CODING_FLAG_ISO_SEVEN_BITS set,
   the one-byte control character otherwise.  ENCODE_DIRECTION_R2L and
   ENCODE_DIRECTION_L2R produce CSI followed by "2]" and "0]"
   respectively.  Each macro expands to a single statement.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    /* Test the bit, as the single-shift macros do;     \
       other flag bits may be set at the same time.  */ \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1367
/* Produce codes at DST for invocation and designation to reset the
   graphic planes and registers to initial state: graphic register 0
   is invoked to graphic plane 0 again, and each graphic register
   which has an initial designation differing from its current one
   gets that initial charset designated again.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg = 0;                                                      \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reset_reg < 4)                                                   \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg) >= 0    \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)             \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding,             \
                                                        reset_reg)))        \
          ENCODE_DESIGNATION                                                \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg),       \
             reset_reg, coding);                                            \
        reset_reg++;                                                        \
      }                                                                     \
  } while (0)
1382
1383 /* Produce designation sequences of charsets in the line started from
1384    *SRC to a place pointed by DSTP.
1385
1386    If the current block ends before any end-of-line, we may fail to
1387    find all the necessary *designations.  */
1388 encode_designation_at_bol (coding, table, src, src_end, dstp)
1389      struct coding_system *coding;
1390      Lisp_Object table;
1391      unsigned char *src, *src_end, **dstp;
1392 {
1393   int charset, c, found = 0, reg;
1394   /* Table of charsets to be designated to each graphic register.  */
1395   int r[4];
1396   unsigned char *dst = *dstp;
1397
1398   for (reg = 0; reg < 4; reg++)
1399     r[reg] = -1;
1400
1401   while (src < src_end && *src != '\n' && found < 4)
1402     {
1403       int bytes = BYTES_BY_CHAR_HEAD (*src);
1404
1405       if (NILP (table))
1406         charset = CHARSET_AT (src);
1407       else
1408         {
1409           int c_alt, c1, c2;
1410
1411           SPLIT_STRING(src, bytes, charset, c1, c2);
1412           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1413             charset = CHAR_CHARSET (c_alt);
1414         }
1415
1416       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1417       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1418         {
1419           found++;
1420           r[reg] = charset;
1421         }
1422
1423       src += bytes;
1424     }
1425
1426   if (found)
1427     {
1428       for (reg = 0; reg < 4; reg++)
1429         if (r[reg] >= 0
1430             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1431           ENCODE_DESIGNATION (r[reg], reg, coding);
1432       *dstp = dst;
1433     }
1434 }
1435
1436 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1437
1438 int
1439 encode_coding_iso2022 (coding, source, destination,
1440                        src_bytes, dst_bytes, consumed)
1441      struct coding_system *coding;
1442      unsigned char *source, *destination;
1443      int src_bytes, dst_bytes;
1444      int *consumed;
1445 {
1446   unsigned char *src = source;
1447   unsigned char *src_end = source + src_bytes;
1448   unsigned char *dst = destination;
1449   unsigned char *dst_end = destination + dst_bytes;
1450   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1451      from DST_END to assure overflow checking is necessary only at the
1452      head of loop.  */
1453   unsigned char *adjusted_dst_end = dst_end - 19;
1454   Lisp_Object unification_table
1455       = coding->character_unification_table_for_encode;
1456
1457   if (!NILP (Venable_character_unification) && NILP (unification_table))
1458     unification_table = Vstandard_character_unification_table_for_encode;
1459
1460   while (src < src_end && dst < adjusted_dst_end)
1461     {
1462       /* SRC_BASE remembers the start position in source in each loop.
1463          The loop will be exited when there's not enough source text
1464          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1465          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1466          reset to SRC_BASE before exiting.  */
1467       unsigned char *src_base = src;
1468       int charset, c1, c2, c3, c4;
1469
1470       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1471           && CODING_SPEC_ISO_BOL (coding))
1472         {
1473           /* We have to produce designation sequences if any now.  */
1474           encode_designation_at_bol (coding, unification_table,
1475                                      src, src_end, &dst);
1476           CODING_SPEC_ISO_BOL (coding) = 0;
1477         }
1478
1479       c1 = *src++;
1480       /* If we are seeing a component of a composite character, we are
1481          seeing a leading-code specially encoded for composition, or a
1482          composition rule if composing with rule.  We must set C1
1483          to a normal leading-code or an ASCII code.  If we are not at
1484          a composed character, we must reset the composition state.  */
1485       if (COMPOSING_P (coding->composing))
1486         {
1487           if (c1 < 0xA0)
1488             {
1489               /* We are not in a composite character any longer.  */
1490               coding->composing = COMPOSING_NO;
1491               ENCODE_COMPOSITION_END;
1492             }
1493           else
1494             {
1495               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1496                 {
1497                   *dst++ = c1 & 0x7F;
1498                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1499                   continue;
1500                 }
1501               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1502                 coding->composing = COMPOSING_WITH_RULE_RULE;
1503               if (c1 == 0xA0)
1504                 {
1505                   /* This is an ASCII component.  */
1506                   ONE_MORE_BYTE (c1);
1507                   c1 &= 0x7F;
1508                 }
1509               else
1510                 /* This is a leading-code of non ASCII component.  */
1511                 c1 -= 0x20;
1512             }
1513         }
1514
1515       /* Now encode one character.  C1 is a control character, an
1516          ASCII character, or a leading-code of multi-byte character.  */
1517       switch (emacs_code_class[c1])
1518         {
1519         case EMACS_ascii_code:
1520           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1521           break;
1522
1523         case EMACS_control_code:
1524           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1525             ENCODE_RESET_PLANE_AND_REGISTER;
1526           *dst++ = c1;
1527           break;
1528
1529         case EMACS_carriage_return_code:
1530           if (!coding->selective)
1531             {
1532               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1533                 ENCODE_RESET_PLANE_AND_REGISTER;
1534               *dst++ = c1;
1535               break;
1536             }
1537           /* fall down to treat '\r' as '\n' ...  */
1538
1539         case EMACS_linefeed_code:
1540           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1541             ENCODE_RESET_PLANE_AND_REGISTER;
1542           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1543             bcopy (coding->spec.iso2022.initial_designation,
1544                    coding->spec.iso2022.current_designation,
1545                    sizeof coding->spec.iso2022.initial_designation);
1546           if (coding->eol_type == CODING_EOL_LF
1547               || coding->eol_type == CODING_EOL_UNDECIDED)
1548             *dst++ = ISO_CODE_LF;
1549           else if (coding->eol_type == CODING_EOL_CRLF)
1550             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1551           else
1552             *dst++ = ISO_CODE_CR;
1553           CODING_SPEC_ISO_BOL (coding) = 1;
1554           break;
1555
1556         case EMACS_leading_code_2:
1557           ONE_MORE_BYTE (c2);
1558           if (c2 < 0xA0)
1559             {
1560               /* invalid sequence */
1561               *dst++ = c1;
1562               *dst++ = c2;
1563             }
1564           else
1565             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1566           break;
1567
1568         case EMACS_leading_code_3:
1569           TWO_MORE_BYTES (c2, c3);
1570           if (c2 < 0xA0 || c3 < 0xA0)
1571             {
1572               /* invalid sequence */
1573               *dst++ = c1;
1574               *dst++ = c2;
1575               *dst++ = c3;
1576             }
1577           else if (c1 < LEADING_CODE_PRIVATE_11)
1578             ENCODE_ISO_CHARACTER (c1, c2, c3);
1579           else
1580             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1581           break;
1582
1583         case EMACS_leading_code_4:
1584           THREE_MORE_BYTES (c2, c3, c4);
1585           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1586             {
1587               /* invalid sequence */
1588               *dst++ = c1;
1589               *dst++ = c2;
1590               *dst++ = c3;
1591               *dst++ = c4;
1592             }
1593           else
1594             ENCODE_ISO_CHARACTER (c2, c3, c4);
1595           break;
1596
1597         case EMACS_leading_code_composition:
1598           ONE_MORE_BYTE (c2);
1599           if (c2 < 0xA0)
1600             {
1601               /* invalid sequence */
1602               *dst++ = c1;
1603               *dst++ = c2;
1604             }
1605           else if (c2 == 0xFF)
1606             {
1607               coding->composing = COMPOSING_WITH_RULE_HEAD;
1608               ENCODE_COMPOSITION_WITH_RULE_START;
1609             }
1610           else
1611             {
1612               /* Rewind one byte because it is a character code of
1613                  composition elements.  */
1614               src--;
1615               coding->composing = COMPOSING_NO_RULE_HEAD;
1616               ENCODE_COMPOSITION_NO_RULE_START;
1617             }
1618           break;
1619
1620         case EMACS_invalid_code:
1621           *dst++ = c1;
1622           break;
1623         }
1624       continue;
1625     label_end_of_loop:
1626       /* We reach here because the source date ends not at character
1627          boundary.  */
1628       coding->carryover_size = src_end - src_base;
1629       bcopy (src_base, coding->carryover, coding->carryover_size);
1630       src = src_end;
1631       break;
1632     }
1633
1634   /* If this is the last block of the text to be encoded, we must
1635      reset graphic planes and registers to the initial state.  */
1636   if (src >= src_end && coding->last_block)
1637     {
1638       ENCODE_RESET_PLANE_AND_REGISTER;
1639       if (coding->carryover_size > 0
1640           && coding->carryover_size < (dst_end - dst))
1641         {
1642           bcopy (coding->carryover, dst, coding->carryover_size);
1643           dst += coding->carryover_size;
1644           coding->carryover_size = 0;
1645         }
1646     }
1647   *consumed = src - source;
1648   return dst - destination;
1649 }
1650
1651 \f
1652 /*** 4. SJIS and BIG5 handlers ***/
1653
1654 /* Although SJIS and BIG5 are not ISO coding systems, they are used
1655    quite widely.  So, for the moment, Emacs supports them in the bare
1656    C code.  But, in the future, they may be supported only by CCL.  */
1657
1658 /* SJIS is a coding system encoding three character sets: ASCII, right
1659    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1660    as is.  A character of charset katakana-jisx0201 is encoded by
1661    "position-code + 0x80".  A character of charset japanese-jisx0208
1662    is encoded in 2 bytes, but the two position-codes are divided and
1663    shifted so that they fit in the ranges below.
1664
1665    --- CODE RANGE of SJIS ---
1666    (character set)      (range)
1667    ASCII                0x00 .. 0x7F
1668    KATAKANA-JISX0201    0xA0 .. 0xDF
1669    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1670             (2nd byte)  0x40 .. 0xFF
1671    -------------------------------
1672
1673 */
1674
1675 /* BIG5 is a coding system encoding two character sets: ASCII and
1676    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1677    character set and is encoded in two bytes.
1678
1679    --- CODE RANGE of BIG5 ---
1680    (character set)      (range)
1681    ASCII                0x00 .. 0x7F
1682    Big5 (1st byte)      0xA1 .. 0xFE
1683         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1684    --------------------------
1685
1686    Since the number of characters in Big5 is larger than maximum
1687    characters in Emacs' charset (96x96), it can't be handled as one
1688    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1689    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1690    contains frequently used characters and the latter contains less
1691    frequently used characters.  */
1692
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every parameter is parenthesized in the expansions, so an argument
   may be any expression (output arguments must be lvalues).  The
   input arguments are evaluated more than once; do not pass
   expressions with side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 bytes B1 and B2.  A 1st byte
   below 0xC9 selects `charset_big5_1', any other `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set the BIG5 bytes B1 and B2 from CHARSET, C1 and C2; the inverse
   of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1725
/* Decode one character of CHARSET whose position-codes are C1 and C2.
   When `unification_table' is not nil and unify_char finds an
   alternative (a result >= 0), the charset and the position-codes
   are replaced by those of the alternative; C1 and C2 are then
   overwritten.  The character is finally handed to the
   DECODE_CHARACTER_* macro that fits it: the ASCII one for
   CHARSET_ASCII or a negative charset, otherwise the one for the
   dimension of the charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          {                                                             \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                             \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
  } while (0)
1740
/* Encode one character of CHARSET whose position-codes are C1 and C2,
   and store the result through `dst'.  `unification_table' and
   `sjis_p' must be in scope at the point of use.

   The character is first replaced by its alternative when
   `unification_table' is not nil and unify_char finds one.  Then:
   an ASCII character is stored as C1; a dimension-1 character is
   stored as C1 if it is katakana-jisx0201 and SJIS_P is nonzero, and
   as charset + C1 otherwise; a dimension-2 character is stored in
   SJIS form (jisx0208 with SJIS_P nonzero), in BIG5 form (big5-1 or
   big5-2 with SJIS_P zero), or as charset + C1 + C2 otherwise, the
   position-codes having been masked with 0x7F.  C1 and C2 are
   modified.  At most three bytes are stored.

   The expansion has no trailing semicolon, so the macro followed by
   `;' is exactly one statement.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1782
1783 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1784    Check if a text is encoded in SJIS.  If it is, return
1785    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1786
1787 int
1788 detect_coding_sjis (src, src_end)
1789      unsigned char *src, *src_end;
1790 {
1791   unsigned char c;
1792
1793   while (src < src_end)
1794     {
1795       c = *src++;
1796       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1797         return 0;
1798       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1799         {
1800           if (src < src_end && *src++ < 0x40)
1801             return 0;
1802         }
1803     }
1804   return CODING_CATEGORY_MASK_SJIS;
1805 }
1806
1807 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1808    Check if a text is encoded in BIG5.  If it is, return
1809    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1810
1811 int
1812 detect_coding_big5 (src, src_end)
1813      unsigned char *src, *src_end;
1814 {
1815   unsigned char c;
1816
1817   while (src < src_end)
1818     {
1819       c = *src++;
1820       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1821         return 0;
1822       if (c >= 0xA1)
1823         {
1824           if (src >= src_end)
1825             break;
1826           c = *src++;
1827           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1828             return 0;
1829         }
1830     }
1831   return CODING_CATEGORY_MASK_BIG5;
1832 }
1833
1834 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1835    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1836
1837 int
1838 decode_coding_sjis_big5 (coding, source, destination,
1839                          src_bytes, dst_bytes, consumed, sjis_p)
1840      struct coding_system *coding;
1841      unsigned char *source, *destination;
1842      int src_bytes, dst_bytes;
1843      int *consumed;
1844      int sjis_p;
1845 {
1846   unsigned char *src = source;
1847   unsigned char *src_end = source + src_bytes;
1848   unsigned char *dst = destination;
1849   unsigned char *dst_end = destination + dst_bytes;
1850   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1851      from DST_END to assure overflow checking is necessary only at the
1852      head of loop.  */
1853   unsigned char *adjusted_dst_end = dst_end - 3;
1854   Lisp_Object unification_table
1855       = coding->character_unification_table_for_decode;
1856
1857   if (!NILP (Venable_character_unification) && NILP (unification_table))
1858     unification_table = Vstandard_character_unification_table_for_decode;
1859
1860   while (src < src_end && dst < adjusted_dst_end)
1861     {
1862       /* SRC_BASE remembers the start position in source in each loop.
1863          The loop will be exited when there's not enough source text
1864          to analyze two-byte character (within macro ONE_MORE_BYTE).
1865          In that case, SRC is reset to SRC_BASE before exiting.  */
1866       unsigned char *src_base = src;
1867       unsigned char c1 = *src++, c2, c3, c4;
1868
1869       if (c1 == '\r')
1870         {
1871           if (coding->eol_type == CODING_EOL_CRLF)
1872             {
1873               ONE_MORE_BYTE (c2);
1874               if (c2 == '\n')
1875                 *dst++ = c2;
1876               else
1877                 /* To process C2 again, SRC is subtracted by 1.  */
1878                 *dst++ = c1, src--;
1879             }
1880           else
1881             *dst++ = c1;
1882         }
1883       else if (c1 < 0x20)
1884         *dst++ = c1;
1885       else if (c1 < 0x80)
1886         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1887       else if (c1 < 0xA0 || c1 >= 0xE0)
1888         {
1889           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1890           if (sjis_p)
1891             {
1892               ONE_MORE_BYTE (c2);
1893               DECODE_SJIS (c1, c2, c3, c4);
1894               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1895             }
1896           else if (c1 >= 0xE0 && c1 < 0xFF)
1897             {
1898               int charset;
1899
1900               ONE_MORE_BYTE (c2);
1901               DECODE_BIG5 (c1, c2, charset, c3, c4);
1902               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1903             }
1904           else                  /* Invalid code */
1905             *dst++ = c1;
1906         }
1907       else
1908         {
1909           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1910           if (sjis_p)
1911             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1912           else
1913             {
1914               int charset;
1915
1916               ONE_MORE_BYTE (c2);
1917               DECODE_BIG5 (c1, c2, charset, c3, c4);
1918               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1919             }
1920         }
1921       continue;
1922
1923     label_end_of_loop:
1924       coding->carryover_size = src - src_base;
1925       bcopy (src_base, coding->carryover, coding->carryover_size);
1926       src = src_base;
1927       break;
1928     }
1929
1930   *consumed = src - source;
1931   return dst - destination;
1932 }
1933
1934 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1935    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1936    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1937    sure that all these charsets are registered as official charset
1938    (i.e. do not have extended leading-codes).  Characters of other
1939    charsets are produced without any encoding.  If SJIS_P is 1, encode
1940    SJIS text, else encode BIG5 text.  */
1941
1942 int
1943 encode_coding_sjis_big5 (coding, source, destination,
1944                          src_bytes, dst_bytes, consumed, sjis_p)
1945      struct coding_system *coding;
1946      unsigned char *source, *destination;
1947      int src_bytes, dst_bytes;
1948      int *consumed;
1949      int sjis_p;
1950 {
1951   unsigned char *src = source;
1952   unsigned char *src_end = source + src_bytes;
1953   unsigned char *dst = destination;
1954   unsigned char *dst_end = destination + dst_bytes;
1955   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1956      from DST_END to assure overflow checking is necessary only at the
1957      head of loop.  */
1958   unsigned char *adjusted_dst_end = dst_end - 1;
1959   Lisp_Object unification_table
1960       = coding->character_unification_table_for_encode;
1961
1962   if (!NILP (Venable_character_unification) && NILP (unification_table))
1963     unification_table = Vstandard_character_unification_table_for_encode;
1964
1965   while (src < src_end && dst < adjusted_dst_end)
1966     {
1967       /* SRC_BASE remembers the start position in source in each loop.
1968          The loop will be exited when there's not enough source text
1969          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1970          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1971          before exiting.  */
1972       unsigned char *src_base = src;
1973       unsigned char c1 = *src++, c2, c3, c4;
1974
1975       if (coding->composing)
1976         {
1977           if (c1 == 0xA0)
1978             {
1979               ONE_MORE_BYTE (c1);
1980               c1 &= 0x7F;
1981             }
1982           else if (c1 >= 0xA0)
1983             c1 -= 0x20;
1984           else
1985             coding->composing = 0;
1986         }
1987
1988       switch (emacs_code_class[c1])
1989         {
1990         case EMACS_ascii_code:
1991           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1992           break;
1993
1994         case EMACS_control_code:
1995           *dst++ = c1;
1996           break;
1997
1998         case EMACS_carriage_return_code:
1999           if (!coding->selective)
2000             {
2001               *dst++ = c1;
2002               break;
2003             }
2004           /* fall down to treat '\r' as '\n' ...  */
2005
2006         case EMACS_linefeed_code:
2007           if (coding->eol_type == CODING_EOL_LF
2008               || coding->eol_type == CODING_EOL_UNDECIDED)
2009             *dst++ = '\n';
2010           else if (coding->eol_type == CODING_EOL_CRLF)
2011             *dst++ = '\r', *dst++ = '\n';
2012           else
2013             *dst++ = '\r';
2014           break;
2015
2016         case EMACS_leading_code_2:
2017           ONE_MORE_BYTE (c2);
2018           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2019           break;
2020
2021         case EMACS_leading_code_3:
2022           TWO_MORE_BYTES (c2, c3);
2023           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2024           break;
2025
2026         case EMACS_leading_code_4:
2027           THREE_MORE_BYTES (c2, c3, c4);
2028           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2029           break;
2030
2031         case EMACS_leading_code_composition:
2032           coding->composing = 1;
2033           break;
2034
2035         default:                /* i.e. case EMACS_invalid_code: */
2036           *dst++ = c1;
2037         }
2038       continue;
2039
2040     label_end_of_loop:
2041       coding->carryover_size = src_end - src_base;
2042       bcopy (src_base, coding->carryover, coding->carryover_size);
2043       src = src_end;
2044       break;
2045     }
2046
2047   *consumed = src - source;
2048   return dst - destination;
2049 }
2050
2051 \f
2052 /*** 5. End-of-line handlers ***/
2053
2054 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2055    This function is called only when `coding->eol_type' is
2056    CODING_EOL_CRLF or CODING_EOL_CR.  */
2057
2058 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2059      struct coding_system *coding;
2060      unsigned char *source, *destination;
2061      int src_bytes, dst_bytes;
2062      int *consumed;
2063 {
2064   unsigned char *src = source;
2065   unsigned char *src_end = source + src_bytes;
2066   unsigned char *dst = destination;
2067   unsigned char *dst_end = destination + dst_bytes;
2068   int produced;
2069
2070   switch (coding->eol_type)
2071     {
2072     case CODING_EOL_CRLF:
2073       {
2074         /* Since the maximum bytes produced by each loop is 2, we
2075            subtract 1 from DST_END to assure overflow checking is
2076            necessary only at the head of loop.  */
2077         unsigned char *adjusted_dst_end = dst_end - 1;
2078
2079         while (src < src_end && dst < adjusted_dst_end)
2080           {
2081             unsigned char *src_base = src;
2082             unsigned char c = *src++;
2083             if (c == '\r')
2084               {
2085                 ONE_MORE_BYTE (c);
2086                 if (c != '\n')
2087                   *dst++ = '\r';
2088                 *dst++ = c;
2089               }
2090             else
2091               *dst++ = c;
2092             continue;
2093
2094           label_end_of_loop:
2095             coding->carryover_size = src - src_base;
2096             bcopy (src_base, coding->carryover, coding->carryover_size);
2097             src = src_base;
2098             break;
2099           }
2100         *consumed = src - source;
2101         produced = dst - destination;
2102         break;
2103       }
2104
2105     case CODING_EOL_CR:
2106       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2107       bcopy (source, destination, produced);
2108       dst_end = destination + produced;
2109       while (dst < dst_end)
2110         if (*dst++ == '\r') dst[-1] = '\n';
2111       *consumed = produced;
2112       break;
2113
2114     default:                    /* i.e. case: CODING_EOL_LF */
2115       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2116       bcopy (source, destination, produced);
2117       *consumed = produced;
2118       break;
2119     }
2120
2121   return produced;
2122 }
2123
2124 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2125    format of end-of-line according to `coding->eol_type'.  If
2126    `coding->selective' is 1, code '\r' in source text also means
2127    end-of-line.  */
2128
2129 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2130      struct coding_system *coding;
2131      unsigned char *source, *destination;
2132      int src_bytes, dst_bytes;
2133      int *consumed;
2134 {
2135   unsigned char *src = source;
2136   unsigned char *dst = destination;
2137   int produced;
2138
2139   if (src_bytes <= 0)
2140     return 0;
2141
2142   switch (coding->eol_type)
2143     {
2144     case CODING_EOL_LF:
2145     case CODING_EOL_UNDECIDED:
2146       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2147       bcopy (source, destination, produced);
2148       if (coding->selective)
2149         {
2150           int i = produced;
2151           while (i--)
2152             if (*dst++ == '\r') dst[-1] = '\n';
2153         }
2154       *consumed = produced;
2155
2156     case CODING_EOL_CRLF:
2157       {
2158         unsigned char c;
2159         unsigned char *src_end = source + src_bytes;
2160         unsigned char *dst_end = destination + dst_bytes;
2161         /* Since the maximum bytes produced by each loop is 2, we
2162            subtract 1 from DST_END to assure overflow checking is
2163            necessary only at the head of loop.  */
2164         unsigned char *adjusted_dst_end = dst_end - 1;
2165
2166         while (src < src_end && dst < adjusted_dst_end)
2167           {
2168             c = *src++;
2169             if (c == '\n' || (c == '\r' && coding->selective))
2170               *dst++ = '\r', *dst++ = '\n';
2171             else
2172               *dst++ = c;
2173           }
2174         produced = dst - destination;
2175         *consumed = src - source;
2176         break;
2177       }
2178
2179     default:                    /* i.e. case CODING_EOL_CR: */
2180       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2181       bcopy (source, destination, produced);
2182       {
2183         int i = produced;
2184         while (i--)
2185           if (*dst++ == '\n') dst[-1] = '\r';
2186       }
2187       *consumed = produced;
2188     }
2189
2190   return produced;
2191 }
2192
2193 \f
2194 /*** 6. C library functions ***/
2195
2196 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2197    has a property `coding-system'.  The value of this property is a
2198    vector of length 5 (called as coding-vector).  Among elements of
2199    this vector, the first (element[0]) and the fifth (element[4])
2200    carry important information for decoding/encoding.  Before
2201    decoding/encoding, this information should be set in fields of a
2202    structure of type `coding_system'.
2203
2204    A value of property `coding-system' can be a symbol of another
2205    subsidiary coding-system.  In that case, Emacs gets coding-vector
2206    from that symbol.
2207
2208    `element[0]' contains information to be set in `coding->type'.  The
2209    value and its meaning is as follows:
2210
2211    0 -- coding_type_emacs_mule
2212    1 -- coding_type_sjis
2213    2 -- coding_type_iso2022
2214    3 -- coding_type_big5
2215    4 -- coding_type_ccl encoder/decoder written in CCL
2216    nil -- coding_type_no_conversion
2217    t -- coding_type_undecided (automatic conversion on decoding,
2218                                no-conversion on encoding)
2219
2220    `element[4]' contains information to be set in `coding->flags' and
2221    `coding->spec'.  The meaning varies by `coding->type'.
2222
2223    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2224    of length 32 (of which the first 16 sub-elements are used now).
2225    Meanings of these sub-elements are:
2226
2227    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2228         If the value is an integer of valid charset, the charset is
2229         assumed to be designated to graphic register N initially.
2230
2231         If the value is minus, it is a minus value of charset which
2232         reserves graphic register N, which means that the charset is
2233         not designated initially but should be designated to graphic
2234         register N just before encoding a character in that charset.
2235
2236         If the value is nil, graphic register N is never used on
2237         encoding.
2238
2239    sub-element[N] where N is 4 through 15: to be set in `coding->flags'
2240         Each value takes t or nil.  See the section ISO2022 of
2241         `coding.h' for more information.
2242
2243    If `coding->type' is `coding_type_big5', element[4] is t to denote
2244    BIG5-ETen or nil to denote BIG5-HKU.
2245
2246    If `coding->type' takes the other value, element[4] is ignored.
2247
2248    Emacs Lisp's coding system also carries information about format of
2249    end-of-line in a value of property `eol-type'.  If the value is
2250    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2251    means CODING_EOL_CR.  If it is not integer, it should be a vector
2252    of subsidiary coding systems of which property `eol-type' has one
2253    of above values.
2254
2255 */
2256
2257 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2258    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2259    is setup so that no conversion is necessary and return -1, else
2260    return 0.  */
2261
2262 int
2263 setup_coding_system (coding_system, coding)
2264      Lisp_Object coding_system;
2265      struct coding_system *coding;
2266 {
2267   Lisp_Object type, eol_type;
2268
2269   /* At first, set several fields to default values.  */
2270   coding->require_flushing = 0;
2271   coding->last_block = 0;
2272   coding->selective = 0;
2273   coding->composing = 0;
2274   coding->direction = 0;
2275   coding->carryover_size = 0;
2276   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2277   coding->character_unification_table_for_decode = Qnil;
2278   coding->character_unification_table_for_encode = Qnil;
2279
2280   Vlast_coding_system_used = coding->symbol = coding_system;
2281   eol_type = Qnil;
2282   /* Get value of property `coding-system' until we get a vector.
2283      While doing that, also get values of properties
2284      `post-read-conversion', `pre-write-conversion',
2285      `character-unification-table-for-decode',
2286      `character-unification-table-for-encode' and `eol-type'.  */
2287   while (!NILP (coding_system) && SYMBOLP (coding_system))
2288     {
2289       if (NILP (coding->post_read_conversion))
2290         coding->post_read_conversion = Fget (coding_system,
2291                                              Qpost_read_conversion);
2292       if (NILP (coding->pre_write_conversion))
2293         coding->pre_write_conversion = Fget (coding_system,
2294                                              Qpre_write_conversion);
2295       if (!inhibit_eol_conversion && NILP (eol_type))
2296         eol_type = Fget (coding_system, Qeol_type);
2297
2298       if (NILP (coding->character_unification_table_for_decode))
2299         coding->character_unification_table_for_decode
2300           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2301
2302       if (NILP (coding->character_unification_table_for_encode))
2303         coding->character_unification_table_for_encode
2304           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2305
2306       coding_system = Fget (coding_system, Qcoding_system);
2307     }
2308
2309   while (!NILP (coding->character_unification_table_for_decode)
2310          && SYMBOLP (coding->character_unification_table_for_decode))
2311         coding->character_unification_table_for_decode
2312           = Fget (coding->character_unification_table_for_decode,
2313                   Qcharacter_unification_table_for_decode);
2314   if (!NILP (coding->character_unification_table_for_decode)
2315       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2316       coding->character_unification_table_for_decode = Qnil;
2317
2318   while (!NILP (coding->character_unification_table_for_encode)
2319          && SYMBOLP (coding->character_unification_table_for_encode))
2320         coding->character_unification_table_for_encode
2321           = Fget (coding->character_unification_table_for_encode,
2322                   Qcharacter_unification_table_for_encode);
2323   if (!NILP (coding->character_unification_table_for_encode)
2324       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2325       coding->character_unification_table_for_encode = Qnil;
2326
2327   if (!VECTORP (coding_system)
2328       || XVECTOR (coding_system)->size != 5)
2329     goto label_invalid_coding_system;
2330
2331   if (VECTORP (eol_type))
2332     coding->eol_type = CODING_EOL_UNDECIDED;
2333   else if (XFASTINT (eol_type) == 1)
2334     coding->eol_type = CODING_EOL_CRLF;
2335   else if (XFASTINT (eol_type) == 2)
2336     coding->eol_type = CODING_EOL_CR;
2337   else
2338     coding->eol_type = CODING_EOL_LF;
2339
2340   type = XVECTOR (coding_system)->contents[0];
2341   switch (XFASTINT (type))
2342     {
2343     case 0:
2344       coding->type = coding_type_emacs_mule;
2345       break;
2346
2347     case 1:
2348       coding->type = coding_type_sjis;
2349       break;
2350
2351     case 2:
2352       coding->type = coding_type_iso2022;
2353       {
2354         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2355         Lisp_Object *flags;
2356         int i, charset, default_reg_bits = 0;
2357
2358         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2359           goto label_invalid_coding_system;
2360
2361         flags = XVECTOR (val)->contents;
2362         coding->flags
2363           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2364              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2365              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2366              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2367              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2368              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2369              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2370              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2371              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2372              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2373              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2374              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2375              );
2376
2377         /* Invoke graphic register 0 to plane 0.  */
2378         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2379         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2380         CODING_SPEC_ISO_INVOCATION (coding, 1)
2381           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2382         /* Not single shifting at first.  */
2383         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2384         /* Beginning of buffer should also be regarded as bol. */
2385         CODING_SPEC_ISO_BOL(coding) = 1;
2386
2387         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2388            FLAGS[REG] can be one of below:
2389                 integer CHARSET: CHARSET occupies register I,
2390                 t: designate nothing to REG initially, but can be used
2391                   by any charsets,
2392                 list of integer, nil, or t: designate the first
2393                   element (if integer) to REG initially, the remaining
2394                   elements (if integer) is designated to REG on request,
2395                   if an element is t, REG can be used by any charset,
2396                 nil: REG is never used.  */
2397         for (charset = 0; charset <= MAX_CHARSET; charset++)
2398           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2399             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2400         for (i = 0; i < 4; i++)
2401           {
2402             if (INTEGERP (flags[i])
2403                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2404                 || (charset = get_charset_id (flags[i])) >= 0)
2405               {
2406                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2407                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2408               }
2409             else if (EQ (flags[i], Qt))
2410               {
2411                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2412                 default_reg_bits |= 1 << i;
2413               }
2414             else if (CONSP (flags[i]))
2415               {
2416                 Lisp_Object tail = flags[i];
2417
2418                 if (INTEGERP (XCONS (tail)->car)
2419                     && (charset = XINT (XCONS (tail)->car),
2420                         CHARSET_VALID_P (charset))
2421                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2422                   {
2423                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2424                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2425                   }
2426                 else
2427                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2428                 tail = XCONS (tail)->cdr;
2429                 while (CONSP (tail))
2430                   {
2431                     if (INTEGERP (XCONS (tail)->car)
2432                         && (charset = XINT (XCONS (tail)->car),
2433                             CHARSET_VALID_P (charset))
2434                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2435                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2436                         = i;
2437                     else if (EQ (XCONS (tail)->car, Qt))
2438                       default_reg_bits |= 1 << i;
2439                     tail = XCONS (tail)->cdr;
2440                   }
2441               }
2442             else
2443               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2444
2445             CODING_SPEC_ISO_DESIGNATION (coding, i)
2446               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2447           }
2448
2449         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2450           {
2451             /* REG 1 can be used only by locking shift in 7-bit env.  */
2452             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2453               default_reg_bits &= ~2;
2454             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2455               /* Without any shifting, only REG 0 and 1 can be used.  */
2456               default_reg_bits &= 3;
2457           }
2458
2459         if (! (coding->flags & CODING_FLAG_ISO_SAFE))
2460           for (charset = 0; charset <= MAX_CHARSET; charset++)
2461             if (CHARSET_VALID_P (charset)
2462                 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2463                     == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2464               {
2465                 /* We have not yet decided where to designate CHARSET.  */
2466                 int reg_bits = default_reg_bits;
2467
2468                 if (CHARSET_CHARS (charset) == 96)
2469                   /* A charset of CHARS96 can't be designated to REG 0.  */
2470                   reg_bits &= ~1;
2471
2472                 if (reg_bits)
2473                   /* There exist some default graphic register.  */
2474                   CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2475                     = (reg_bits & 1
2476                        ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2477                 else
2478                   /* We anyway have to designate CHARSET to somewhere.  */
2479                   CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2480                     = (CHARSET_CHARS (charset) == 94
2481                        ? 0
2482                        : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2483                            || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2484                           ? 1
2485                           : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2486                              ? 2 : 0)));
2487               }
2488       }
2489       coding->require_flushing = 1;
2490       break;
2491
2492     case 3:
2493       coding->type = coding_type_big5;
2494       coding->flags
2495         = (NILP (XVECTOR (coding_system)->contents[4])
2496            ? CODING_FLAG_BIG5_HKU
2497            : CODING_FLAG_BIG5_ETEN);
2498       break;
2499
2500     case 4:
2501       coding->type = coding_type_ccl;
2502       {
2503         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2504         if (CONSP  (val)
2505             && VECTORP (XCONS (val)->car)
2506             && VECTORP (XCONS (val)->cdr))
2507           {
2508             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2509             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2510           }
2511         else
2512           goto label_invalid_coding_system;
2513       }
2514       coding->require_flushing = 1;
2515       break;
2516
2517     default:
2518       if (EQ (type, Qt))
2519         coding->type = coding_type_undecided;
2520       else
2521         coding->type = coding_type_no_conversion;
2522       break;
2523     }
2524   return 0;
2525
2526  label_invalid_coding_system:
2527   coding->type = coding_type_no_conversion;
2528   coding->eol_type = CODING_EOL_LF;
2529   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2530     = Qnil;
2531   return -1;
2532 }
2533
2534 /* Emacs has a mechanism to automatically detect a coding system if it
2535    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2536    it's impossible to distinguish some coding systems accurately
2537    because they use the same range of codes.  So, at first, coding
2538    systems are categorized into the following 9 categories:
2539
2540    o coding-category-emacs-mule
2541
2542         The category for a coding system which has the same code range
2543         as Emacs' internal format.  Assigned the coding-system (Lisp
2544         symbol) `emacs-mule' by default.
2545
2546    o coding-category-sjis
2547
2548         The category for a coding system which has the same code range
2549         as SJIS.  Assigned the coding-system (Lisp
2550         symbol) `japanese-shift-jis' by default.
2551
2552    o coding-category-iso-7
2553
2554         The category for a coding system which has the same code range
2555         as ISO2022 of 7-bit environment.  This doesn't use any locking
2556         shift and single shift functions.  Assigned the coding-system
2557         (Lisp symbol) `iso-2022-7bit' by default.
2558
2559    o coding-category-iso-8-1
2560
2561         The category for a coding system which has the same code range
2562         as ISO2022 of 8-bit environment and graphic plane 1 used only
2563         for DIMENSION1 charset.  This doesn't use any locking shift
2564         and single shift functions.  Assigned the coding-system (Lisp
2565         symbol) `iso-latin-1' by default.
2566
2567    o coding-category-iso-8-2
2568
2569         The category for a coding system which has the same code range
2570         as ISO2022 of 8-bit environment and graphic plane 1 used only
2571         for DIMENSION2 charset.  This doesn't use any locking shift
2572         and single shift functions.  Assigned the coding-system (Lisp
2573         symbol) `japanese-iso-8bit' by default.
2574
2575    o coding-category-iso-7-else
2576
2577         The category for a coding system which has the same code range
2578         as ISO2022 of 7-bit environment but uses locking shift or
2579         single shift functions.  Assigned the coding-system (Lisp
2580         symbol) `iso-2022-7bit-lock' by default.
2581
2582    o coding-category-iso-8-else
2583
2584         The category for a coding system which has the same code range
2585         as ISO2022 of 8-bit environment but uses locking shift or
2586         single shift functions.  Assigned the coding-system (Lisp
2587         symbol) `iso-2022-8bit-ss2' by default.
2588
2589    o coding-category-big5
2590
2591         The category for a coding system which has the same code range
2592         as BIG5.  Assigned the coding-system (Lisp symbol)
2593         `cn-big5' by default.
2594
2595    o coding-category-binary
2596
2597         The category for a coding system not categorized in any of the
2598         above.  Assigned the coding-system (Lisp symbol)
2599         `no-conversion' by default.
2600
2601    Each of them is a Lisp symbol and the value is an actual
2602    `coding-system's (this is also a Lisp symbol) assigned by a user.
2603    What Emacs does actually is to detect a category of coding system.
2604    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2605    decide only one possible category, it selects a category of the
2606    highest priority.  Priorities of categories are also specified by a
2607    user in a Lisp variable `coding-category-list'.
2608
2609 */
2610
2611 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2612    If it detects possible coding systems, return an integer in which
2613    appropriate flag bits are set.  Flag bits are defined by macros
2614    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2615
2616 int
2617 detect_coding_mask (src, src_bytes)
2618      unsigned char *src;
2619      int src_bytes;
2620 {
2621   register unsigned char c;
2622   unsigned char *src_end = src + src_bytes;
2623   int mask;
2624
2625   /* At first, skip all ASCII characters and control characters except
2626      for three ISO2022 specific control characters.  */
2627  label_loop_detect_coding:
2628   while (src < src_end)
2629     {
2630       c = *src;
2631       if (c >= 0x80
2632           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2633         break;
2634       src++;
2635     }
2636
2637   if (src >= src_end)
2638     /* We found nothing other than ASCII.  There's nothing to do.  */
2639     return CODING_CATEGORY_MASK_ANY;
2640
2641   /* The text seems to be encoded in some multilingual coding system.
2642      Now, try to find in which coding system the text is encoded.  */
2643   if (c < 0x80)
2644     {
2645       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2646       /* C is an ISO2022 specific control code of C0.  */
2647       mask = detect_coding_iso2022 (src, src_end);
2648       src++;
2649       if (mask == CODING_CATEGORY_MASK_ANY)
2650         /* No valid ISO2022 code follows C.  Try again.  */
2651         goto label_loop_detect_coding;
2652     }
2653   else if (c < 0xA0)
2654     {
2655       /* If C is a special Microsoft code,
2656          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2657          or is an ISO2022 control-sequence-introducer (CSI),
2658          we should also consider the possibility of someof ISO2022 codings.  */
2659       if ((VECTORP (Vmicrosoft_code_table)
2660            && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
2661           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2662           || (c == ISO_CODE_CSI
2663               && (src < src_end
2664                   && (*src == ']'
2665                       || (src + 1 < src_end
2666                           && src[1] == ']'
2667                           && (*src == '0' || *src == '1' || *src == '2'))))))
2668         mask = (detect_coding_iso2022 (src, src_end)
2669                 | detect_coding_sjis (src, src_end)
2670                 | detect_coding_emacs_mule (src, src_end)
2671                 | CODING_CATEGORY_MASK_BINARY);
2672
2673       else
2674         /* C is the first byte of SJIS character code, or a
2675            leading-code of Emacs.  */
2676         mask = (detect_coding_sjis (src, src_end)
2677                 | detect_coding_emacs_mule (src, src_end)
2678                 | CODING_CATEGORY_MASK_BINARY);
2679     }
2680   else
2681     /* C is a character of ISO2022 in graphic plane right,
2682        or a SJIS's 1-byte character code (i.e. JISX0201),
2683        or the first byte of BIG5's 2-byte code.  */
2684     mask = (detect_coding_iso2022 (src, src_end)
2685             | detect_coding_sjis (src, src_end)
2686             | detect_coding_big5 (src, src_end)
2687             | CODING_CATEGORY_MASK_BINARY);
2688
2689   return mask;
2690 }
2691
2692 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2693    The information of the detected coding system is set in CODING.  */
2694
2695 void
2696 detect_coding (coding, src, src_bytes)
2697      struct coding_system *coding;
2698      unsigned char *src;
2699      int src_bytes;
2700 {
2701   int mask = detect_coding_mask (src, src_bytes);
2702   int idx;
2703
2704   if (mask == CODING_CATEGORY_MASK_ANY)
2705     /* We found nothing other than ASCII.  There's nothing to do.  */
2706     return;
2707
2708   if (!mask)
2709     /* The source text seems to be encoded in unknown coding system.
2710        Emacs regards the category of such a kind of coding system as
2711        `coding-category-binary'.  We assume that a user has assigned
2712        an appropriate coding system for a `coding-category-binary'.  */
2713     idx = CODING_CATEGORY_IDX_BINARY;
2714   else
2715     {
2716       /* We found some plausible coding systems.  Let's use a coding
2717          system of the highest priority.  */
2718       Lisp_Object val = Vcoding_category_list;
2719
2720       if (CONSP (val))
2721         while (!NILP (val))
2722           {
2723             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2724             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2725               break;
2726             val = XCONS (val)->cdr;
2727           }
2728       else
2729         val = Qnil;
2730
2731       if (NILP (val))
2732         {
2733           /* For unknown reason, `Vcoding_category_list' contains none
2734              of found categories.  Let's use any of them.  */
2735           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2736             if (mask & (1 << idx))
2737               break;
2738         }
2739     }
2740   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2741 }
2742
2743 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2744    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2745    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2746
2747 #define MAX_EOL_CHECK_COUNT 3
2748
2749 int
2750 detect_eol_type (src, src_bytes)
2751      unsigned char *src;
2752      int src_bytes;
2753 {
2754   unsigned char *src_end = src + src_bytes;
2755   unsigned char c;
2756   int total = 0;                /* How many end-of-lines are found so far.  */
2757   int eol_type = CODING_EOL_UNDECIDED;
2758   int this_eol_type;
2759
2760   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2761     {
2762       c = *src++;
2763       if (c == '\n' || c == '\r')
2764         {
2765           total++;
2766           if (c == '\n')
2767             this_eol_type = CODING_EOL_LF;
2768           else if (src >= src_end || *src != '\n')
2769             this_eol_type = CODING_EOL_CR;
2770           else
2771             this_eol_type = CODING_EOL_CRLF, src++;
2772
2773           if (eol_type == CODING_EOL_UNDECIDED)
2774             /* This is the first end-of-line.  */
2775             eol_type = this_eol_type;
2776           else if (eol_type != this_eol_type)
2777             /* The found type is different from what found before.
2778                We had better not decode end-of-line.  */
2779             return CODING_EOL_LF;
2780         }
2781     }
2782
2783   return eol_type;
2784 }
2785
2786 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2787    is encoded.  If it detects an appropriate format of end-of-line, it
2788    sets the information in *CODING.  */
2789
2790 void
2791 detect_eol (coding, src, src_bytes)
2792      struct coding_system *coding;
2793      unsigned char *src;
2794      int src_bytes;
2795 {
2796   Lisp_Object val;
2797   int eol_type = detect_eol_type (src, src_bytes);
2798
2799   if (eol_type == CODING_EOL_UNDECIDED)
2800     /*  We found no end-of-line in the source text.  */
2801     return;
2802
2803   val = Fget (coding->symbol, Qeol_type);
2804   if (VECTORP (val) && XVECTOR (val)->size == 3)
2805     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2806 }
2807
2808 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2809    decoding, it may detect coding system and format of end-of-line if
2810    those are not yet decided.  */
2811
2812 int
2813 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2814      struct coding_system *coding;
2815      unsigned char *source, *destination;
2816      int src_bytes, dst_bytes;
2817      int *consumed;
2818 {
2819   int produced;
2820
2821   if (src_bytes <= 0)
2822     {
2823       *consumed = 0;
2824       return 0;
2825     }
2826
2827   if (coding->type == coding_type_undecided)
2828     detect_coding (coding, source, src_bytes);
2829
2830   if (coding->eol_type == CODING_EOL_UNDECIDED)
2831     detect_eol (coding, source, src_bytes);
2832
2833   coding->carryover_size = 0;
2834   switch (coding->type)
2835     {
2836     case coding_type_no_conversion:
2837     label_no_conversion:
2838       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2839       bcopy (source, destination, produced);
2840       *consumed = produced;
2841       break;
2842
2843     case coding_type_emacs_mule:
2844     case coding_type_undecided:
2845       if (coding->eol_type == CODING_EOL_LF
2846           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2847         goto label_no_conversion;
2848       produced = decode_eol (coding, source, destination,
2849                              src_bytes, dst_bytes, consumed);
2850       break;
2851
2852     case coding_type_sjis:
2853       produced = decode_coding_sjis_big5 (coding, source, destination,
2854                                           src_bytes, dst_bytes, consumed,
2855                                           1);
2856       break;
2857
2858     case coding_type_iso2022:
2859       produced = decode_coding_iso2022 (coding, source, destination,
2860                                         src_bytes, dst_bytes, consumed);
2861       break;
2862
2863     case coding_type_big5:
2864       produced = decode_coding_sjis_big5 (coding, source, destination,
2865                                           src_bytes, dst_bytes, consumed,
2866                                           0);
2867       break;
2868
2869     case coding_type_ccl:
2870       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2871                              src_bytes, dst_bytes, consumed);
2872       break;
2873     }
2874
2875   return produced;
2876 }
2877
2878 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2879
2880 int
2881 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2882      struct coding_system *coding;
2883      unsigned char *source, *destination;
2884      int src_bytes, dst_bytes;
2885      int *consumed;
2886 {
2887   int produced;
2888
2889   switch (coding->type)
2890     {
2891     case coding_type_no_conversion:
2892     label_no_conversion:
2893       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2894       if (produced > 0)
2895         {
2896           bcopy (source, destination, produced);
2897           if (coding->selective)
2898             {
2899               unsigned char *p = destination, *pend = destination + produced;
2900               while (p < pend)
2901                 if (*p++ == '\015') p[-1] = '\n';
2902             }
2903         }
2904       *consumed = produced;
2905       break;
2906
2907     case coding_type_emacs_mule:
2908     case coding_type_undecided:
2909       if (coding->eol_type == CODING_EOL_LF
2910           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2911         goto label_no_conversion;
2912       produced = encode_eol (coding, source, destination,
2913                              src_bytes, dst_bytes, consumed);
2914       break;
2915
2916     case coding_type_sjis:
2917       produced = encode_coding_sjis_big5 (coding, source, destination,
2918                                           src_bytes, dst_bytes, consumed,
2919                                           1);
2920       break;
2921
2922     case coding_type_iso2022:
2923       produced = encode_coding_iso2022 (coding, source, destination,
2924                                         src_bytes, dst_bytes, consumed);
2925       break;
2926
2927     case coding_type_big5:
2928       produced = encode_coding_sjis_big5 (coding, source, destination,
2929                                           src_bytes, dst_bytes, consumed,
2930                                           0);
2931       break;
2932
2933     case coding_type_ccl:
2934       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2935                              src_bytes, dst_bytes, consumed);
2936       break;
2937     }
2938
2939   return produced;
2940 }
2941
2942 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2943
2944 /* Return maximum size (bytes) of a buffer enough for decoding
2945    SRC_BYTES of text encoded in CODING.  */
2946
2947 int
2948 decoding_buffer_size (coding, src_bytes)
2949      struct coding_system *coding;
2950      int src_bytes;
2951 {
2952   int magnification;
2953
2954   if (coding->type == coding_type_iso2022)
2955     magnification = 3;
2956   else if (coding->type == coding_type_ccl)
2957     magnification = coding->spec.ccl.decoder.buf_magnification;
2958   else
2959     magnification = 2;
2960
2961   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2962 }
2963
2964 /* Return maximum size (bytes) of a buffer enough for encoding
2965    SRC_BYTES of text to CODING.  */
2966
2967 int
2968 encoding_buffer_size (coding, src_bytes)
2969      struct coding_system *coding;
2970      int src_bytes;
2971 {
2972   int magnification;
2973
2974   if (coding->type == coding_type_ccl)
2975     magnification = coding->spec.ccl.encoder.buf_magnification;
2976   else
2977     magnification = 3;
2978
2979   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2980 }
2981
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Scratch buffer handed out by get_conversion_buffer, and the number
   of bytes currently allocated for it.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared by all callers: when it
   has to grow, the old one is freed and its contents are NOT carried
   over, so a pointer obtained from an earlier call must not be used
   after a later call.

   Growth is geometric (the current size is doubled until it covers
   SIZE).  This also works when the buffer has not been set up yet
   (conversion_buffer_size == 0), and a doubling that would overflow
   `int' is replaced by an allocation of exactly SIZE bytes.

   NOTE(review): the original comment said NULL is returned when
   memory runs out; that depends on how xmalloc reports failure, which
   is not visible from here.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Largest positive `int', computed locally so that no extra
         header is needed.  */
      int max_size = (int) (((unsigned int) ~0) >> 1);
      int real_size = conversion_buffer_size;

      /* Doubling zero (or a negative value) never reaches SIZE, so
         start from 1 in that case.  */
      if (real_size < 1)
        real_size = 1;

      do
        {
          if (real_size > max_size / 2)
            {
              /* Doubling again would overflow; SIZE itself is still
                 larger than the current buffer, so settle for it.  */
              real_size = size;
              break;
            }
          real_size *= 2;
        }
      while (real_size < size);

      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3010
3011 \f
3012 #ifdef emacs
3013 /*** 7. Emacs Lisp library functions ***/
3014
3015 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3016        1, 1, 0,
3017   "Return coding-spec of CODING-SYSTEM.\n\
3018 If CODING-SYSTEM is not a valid coding-system, return nil.")
3019   (obj)
3020      Lisp_Object obj;
3021 {
3022   while (SYMBOLP (obj) && !NILP (obj))
3023     obj = Fget (obj, Qcoding_system);
3024   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3025           ? Qnil : obj);
3026 }
3027
3028 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3029   "Return t if OBJECT is nil or a coding-system.\n\
3030 See document of make-coding-system for coding-system object.")
3031   (obj)
3032      Lisp_Object obj;
3033 {
3034   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3035 }
3036
3037 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3038        Sread_non_nil_coding_system, 1, 1, 0,
3039   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3040   (prompt)
3041      Lisp_Object prompt;
3042 {
3043   Lisp_Object val;
3044   do
3045     {
3046       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3047                               Qt, Qnil, Qnil, Qnil);
3048     }
3049   while (XSTRING (val)->size == 0);
3050   return (Fintern (val, Qnil));
3051 }
3052
3053 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3054   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3055   (prompt)
3056      Lisp_Object prompt;
3057 {
3058   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3059                                       Qt, Qnil, Qnil, Qnil);
3060   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3061 }
3062
3063 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3064        1, 1, 0,
3065   "Check validity of CODING-SYSTEM.\n\
3066 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3067 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3068 The value of property should be a vector of length 5.")
3069   (coding_system)
3070      Lisp_Object coding_system;
3071 {
3072   CHECK_SYMBOL (coding_system, 0);
3073   if (!NILP (Fcoding_system_p (coding_system)))
3074     return coding_system;
3075   while (1)
3076     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3077 }
3078
3079 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3080        2, 2, 0,
3081   "Detect coding system of the text in the region between START and END.\n\
3082 Return a list of possible coding systems ordered by priority.\n\
3083 If only ASCII characters are found, it returns `undecided'\n\
3084  or its subsidiary coding system according to a detected end-of-line format.")
3085   (b, e)
3086      Lisp_Object b, e;
3087 {
3088   int coding_mask, eol_type;
3089   Lisp_Object val;
3090   int beg, end;
3091
3092   validate_region (&b, &e);
3093   beg = XINT (b), end = XINT (e);
3094   if (beg < GPT && end >= GPT) move_gap (end);
3095
3096   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3097   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3098
3099   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3100     {
3101       val = intern ("undecided");
3102       if (eol_type != CODING_EOL_UNDECIDED)
3103         {
3104           Lisp_Object val2 = Fget (val, Qeol_type);
3105           if (VECTORP (val2))
3106             val = XVECTOR (val2)->contents[eol_type];
3107         }
3108     }
3109   else
3110     {
3111       Lisp_Object val2;
3112
3113       /* At first, gather possible coding-systems in VAL in a reverse
3114          order.  */
3115       val = Qnil;
3116       for (val2 = Vcoding_category_list;
3117            !NILP (val2);
3118            val2 = XCONS (val2)->cdr)
3119         {
3120           int idx
3121             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3122           if (coding_mask & (1 << idx))
3123             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3124         }
3125
3126       /* Then, change the order of the list, while getting subsidiary
3127          coding-systems.  */
3128       val2 = val;
3129       val = Qnil;
3130       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3131         {
3132           if (eol_type == CODING_EOL_UNDECIDED)
3133             val = Fcons (XCONS (val2)->car, val);
3134           else
3135             {
3136               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3137               if (VECTORP (val3))
3138                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3139               else
3140                 val = Fcons (XCONS (val2)->car, val);
3141             }
3142         }
3143     }
3144
3145   return val;
3146 }
3147
3148 /* Scan text in the region between *BEGP and *ENDP, skip characters
3149    which we never have to encode to (iff ENCODEP is 1) or decode from
3150    coding system CODING at the head and tail, then set BEGP and ENDP
3151    to the addresses of start and end of the text we actually convert.  */
3152
3153 void
3154 shrink_conversion_area (begp, endp, coding, encodep)
3155      unsigned char **begp, **endp;
3156      struct coding_system *coding;
3157      int encodep;
3158 {
3159   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3160
3161   if (coding->eol_type != CODING_EOL_LF
3162       && coding->eol_type != CODING_EOL_UNDECIDED)
3163     /* Since we anyway have to convert end-of-line format, it is not
3164        worth skipping at most 100 bytes or so.  */
3165     return;
3166
3167   if (encodep)                  /* for encoding */
3168     {
3169       switch (coding->type)
3170         {
3171         case coding_type_no_conversion:
3172         case coding_type_emacs_mule:
3173         case coding_type_undecided:
3174           /* We need no conversion.  */
3175           *begp = *endp;
3176           return;
3177         case coding_type_ccl:
3178           /* We can't skip any data.  */
3179           return;
3180         case coding_type_iso2022:
3181           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3182             {
3183               unsigned char *bol = beg_addr;
3184               while (beg_addr < end_addr && *beg_addr < 0x80)
3185                 {
3186                   beg_addr++;
3187                   if (*(beg_addr - 1) == '\n')
3188                     bol = beg_addr;
3189                 }
3190               beg_addr = bol;
3191               goto label_skip_tail;
3192             }
3193           /* fall down ... */
3194         default:
3195           /* We can skip all ASCII characters at the head and tail.  */
3196           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3197         label_skip_tail:
3198           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3199           break;
3200         }
3201     }
3202   else                          /* for decoding */
3203     {
3204       switch (coding->type)
3205         {
3206         case coding_type_no_conversion:
3207           /* We need no conversion.  */
3208           *begp = *endp;
3209           return;
3210         case coding_type_emacs_mule:
3211           if (coding->eol_type == CODING_EOL_LF)
3212             {
3213               /* We need no conversion.  */
3214               *begp = *endp;
3215               return;
3216             }
3217           /* We can skip all but carriage-return.  */
3218           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3219           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3220           break;
3221         case coding_type_sjis:
3222         case coding_type_big5:
3223           /* We can skip all ASCII characters at the head.  */
3224           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3225           /* We can skip all ASCII characters at the tail except for
3226              the second byte of SJIS or BIG5 code.  */
3227           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3228           if (end_addr != *endp)
3229             end_addr++;
3230           break;
3231         case coding_type_ccl:
3232           /* We can't skip any data.  */
3233           return;
3234         default:                /* i.e. case coding_type_iso2022: */
3235           {
3236             unsigned char c;
3237
3238             /* We can skip all ASCII characters except for a few
3239                control codes at the head.  */
3240             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3241                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3242                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3243               beg_addr++;
3244           }
3245           break;
3246         }
3247     }
3248   *begp = beg_addr;
3249   *endp = end_addr;
3250   return;
3251 }
3252
3253 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3254    text between B and E.  B and E are buffer position.  */
3255
3256 Lisp_Object
3257 code_convert_region (b, e, coding, encodep)
3258      Lisp_Object b, e;
3259      struct coding_system *coding;
3260      int encodep;
3261 {
3262   int beg, end, len, consumed, produced;
3263   char *buf;
3264   unsigned char *begp, *endp;
3265   int pos = PT;
3266
3267   validate_region (&b, &e);
3268   beg = XINT (b), end = XINT (e);
3269   if (beg < GPT && end >= GPT)
3270     move_gap (end);
3271
3272   if (encodep && !NILP (coding->pre_write_conversion))
3273     {
3274       /* We must call a pre-conversion function which may put a new
3275          text to be converted in a new buffer.  */
3276       struct buffer *old = current_buffer, *new;
3277
3278       TEMP_SET_PT (beg);
3279       call2 (coding->pre_write_conversion, b, e);
3280       if (old != current_buffer)
3281         {
3282           /* Replace the original text by the text just generated.  */
3283           len = ZV - BEGV;
3284           new = current_buffer;
3285           set_buffer_internal (old);
3286           del_range (beg, end);
3287           insert_from_buffer (new, 1, len, 0);
3288           end = beg + len;
3289         }
3290     }
3291
3292   /* We may be able to shrink the conversion region.  */
3293   begp = POS_ADDR (beg); endp = begp + (end - beg);
3294   shrink_conversion_area (&begp, &endp, coding, encodep);
3295
3296   if (begp == endp)
3297     /* We need no conversion.  */
3298     len = end - beg;
3299   else
3300     {
3301       beg += begp - POS_ADDR (beg);
3302       end =  beg + (endp - begp);
3303
3304       if (encodep)
3305         len = encoding_buffer_size (coding, end - beg);
3306       else
3307         len = decoding_buffer_size (coding, end - beg);
3308       buf = get_conversion_buffer (len);
3309
3310       coding->last_block = 1;
3311       produced = (encodep
3312                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3313                                    &consumed)
3314                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3315                                    &consumed));
3316
3317       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3318
3319       TEMP_SET_PT (beg);
3320       insert (buf, produced);
3321       del_range (PT, PT + end - beg);
3322       if (pos >= end)
3323         pos = PT + (pos - end);
3324       else if (pos > beg)
3325         pos = beg;
3326       TEMP_SET_PT (pos);
3327   }
3328
3329   if (!encodep && !NILP (coding->post_read_conversion))
3330     {
3331       /* We must call a post-conversion function which may alter
3332          the text just converted.  */
3333       Lisp_Object insval;
3334
3335       beg = XINT (b);
3336       TEMP_SET_PT (beg);
3337       insval = call1 (coding->post_read_conversion, make_number (len));
3338       CHECK_NUMBER (insval, 0);
3339       len = XINT (insval);
3340     }
3341
3342   return make_number (len);
3343 }
3344
3345 Lisp_Object
3346 code_convert_string (str, coding, encodep, nocopy)
3347      Lisp_Object str, nocopy;
3348      struct coding_system *coding;
3349      int encodep;
3350 {
3351   int len, consumed, produced;
3352   char *buf;
3353   unsigned char *begp, *endp;
3354   int head_skip, tail_skip;
3355   struct gcpro gcpro1;
3356
3357   if (encodep && !NILP (coding->pre_write_conversion)
3358       || !encodep && !NILP (coding->post_read_conversion))
3359     {
3360       /* Since we have to call Lisp functions which assume target text
3361          is in a buffer, after setting a temporary buffer, call
3362          code_convert_region.  */
3363       int count = specpdl_ptr - specpdl;
3364       int len = XSTRING (str)->size;
3365       Lisp_Object result;
3366       struct buffer *old = current_buffer;
3367
3368       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3369       temp_output_buffer_setup (" *code-converting-work*");
3370       set_buffer_internal (XBUFFER (Vstandard_output));
3371       insert_from_string (str, 0, len, 0);
3372       code_convert_region (make_number (BEGV), make_number (ZV),
3373                            coding, encodep);
3374       result = make_buffer_string (BEGV, ZV, 0);
3375       set_buffer_internal (old);
3376       return unbind_to (count, result);
3377     }
3378
3379   /* We may be able to shrink the conversion region.  */
3380   begp = XSTRING (str)->data;
3381   endp = begp + XSTRING (str)->size;
3382   shrink_conversion_area (&begp, &endp, coding, encodep);
3383
3384   if (begp == endp)
3385     /* We need no conversion.  */
3386     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3387
3388   head_skip = begp - XSTRING (str)->data;
3389   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3390
3391   GCPRO1 (str);
3392
3393   if (encodep)
3394     len = encoding_buffer_size (coding, endp - begp);
3395   else
3396     len = decoding_buffer_size (coding, endp - begp);
3397   buf = get_conversion_buffer (len + head_skip + tail_skip);
3398
3399   bcopy (XSTRING (str)->data, buf, head_skip);
3400   coding->last_block = 1;
3401   produced = (encodep
3402               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3403                                buf + head_skip, endp - begp, len, &consumed)
3404               : decode_coding (coding, XSTRING (str)->data + head_skip,
3405                                buf + head_skip, endp - begp, len, &consumed));
3406   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3407          buf + head_skip + produced,
3408          tail_skip);
3409
3410   UNGCPRO;
3411
3412   return make_string (buf, head_skip + produced + tail_skip);
3413 }
3414
3415 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3416        3, 3, "r\nzCoding system: ",
3417   "Decode current region by specified coding system.\n\
3418 When called from a program, takes three arguments:\n\
3419 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3420 Return length of decoded text.")
3421   (b, e, coding_system)
3422      Lisp_Object b, e, coding_system;
3423 {
3424   struct coding_system coding;
3425
3426   CHECK_NUMBER_COERCE_MARKER (b, 0);
3427   CHECK_NUMBER_COERCE_MARKER (e, 1);
3428   CHECK_SYMBOL (coding_system, 2);
3429
3430   if (NILP (coding_system))
3431     return make_number (XFASTINT (e) - XFASTINT (b));
3432   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3433     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3434
3435   return code_convert_region (b, e, &coding, 0);
3436 }
3437
3438 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3439        3, 3, "r\nzCoding system: ",
3440   "Encode current region by specified coding system.\n\
3441 When called from a program, takes three arguments:\n\
3442 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3443 Return length of encoded text.")
3444   (b, e, coding_system)
3445      Lisp_Object b, e, coding_system;
3446 {
3447   struct coding_system coding;
3448
3449   CHECK_NUMBER_COERCE_MARKER (b, 0);
3450   CHECK_NUMBER_COERCE_MARKER (e, 1);
3451   CHECK_SYMBOL (coding_system, 2);
3452
3453   if (NILP (coding_system))
3454     return make_number (XFASTINT (e) - XFASTINT (b));
3455   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3456     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3457
3458   return code_convert_region (b, e, &coding, 1);
3459 }
3460
3461 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3462        2, 3, 0,
3463   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3464 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3465 of decoding.")
3466   (string, coding_system, nocopy)
3467      Lisp_Object string, coding_system, nocopy;
3468 {
3469   struct coding_system coding;
3470
3471   CHECK_STRING (string, 0);
3472   CHECK_SYMBOL (coding_system, 1);
3473
3474   if (NILP (coding_system))
3475     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3476   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3477     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3478
3479   return code_convert_string (string, &coding, 0, nocopy);
3480 }
3481
3482 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3483        2, 3, 0,
3484   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3485 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3486 of encoding.")
3487   (string, coding_system, nocopy)
3488      Lisp_Object string, coding_system, nocopy;
3489 {
3490   struct coding_system coding;
3491
3492   CHECK_STRING (string, 0);
3493   CHECK_SYMBOL (coding_system, 1);
3494
3495   if (NILP (coding_system))
3496     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3497   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3498     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3499
3500   return code_convert_string (string, &coding, 1, nocopy);
3501 }
3502
3503 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3504   "Decode a JISX0208 character of shift-jis encoding.\n\
3505 CODE is the character code in SJIS.\n\
3506 Return the corresponding character.")
3507   (code)
3508      Lisp_Object code;
3509 {
3510   unsigned char c1, c2, s1, s2;
3511   Lisp_Object val;
3512
3513   CHECK_NUMBER (code, 0);
3514   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3515   DECODE_SJIS (s1, s2, c1, c2);
3516   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3517   return val;
3518 }
3519
3520 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3521   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3522 Return the corresponding character code in SJIS.")
3523   (ch)
3524      Lisp_Object ch;
3525 {
3526   int charset, c1, c2, s1, s2;
3527   Lisp_Object val;
3528
3529   CHECK_NUMBER (ch, 0);
3530   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3531   if (charset == charset_jisx0208)
3532     {
3533       ENCODE_SJIS (c1, c2, s1, s2);
3534       XSETFASTINT (val, (s1 << 8) | s2);
3535     }
3536   else
3537     XSETFASTINT (val, 0);
3538   return val;
3539 }
3540
3541 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3542   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3543 CODE is the character code in BIG5.\n\
3544 Return the corresponding character.")
3545   (code)
3546      Lisp_Object code;
3547 {
3548   int charset;
3549   unsigned char b1, b2, c1, c2;
3550   Lisp_Object val;
3551
3552   CHECK_NUMBER (code, 0);
3553   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3554   DECODE_BIG5 (b1, b2, charset, c1, c2);
3555   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3556   return val;
3557 }
3558
3559 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3560   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3561 Return the corresponding character code in Big5.")
3562   (ch)
3563      Lisp_Object ch;
3564 {
3565   int charset, c1, c2, b1, b2;
3566   Lisp_Object val;
3567
3568   CHECK_NUMBER (ch, 0);
3569   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3570   if (charset == charset_big5_1 || charset == charset_big5_2)
3571     {
3572       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3573       XSETFASTINT (val, (b1 << 8) | b2);
3574     }
3575   else
3576     XSETFASTINT (val, 0);
3577   return val;
3578 }
3579
3580 DEFUN ("set-terminal-coding-system-internal",
3581        Fset_terminal_coding_system_internal,
3582        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3583   (coding_system)
3584      Lisp_Object coding_system;
3585 {
3586   CHECK_SYMBOL (coding_system, 0);
3587   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3588   return Qnil;
3589 }
3590
3591 DEFUN ("set-safe-terminal-coding-system-internal",
3592        Fset_safe_terminal_coding_system_internal,
3593        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3594   (coding_system)
3595      Lisp_Object coding_system;
3596 {
3597   CHECK_SYMBOL (coding_system, 0);
3598   setup_coding_system (Fcheck_coding_system (coding_system),
3599                        &safe_terminal_coding);
3600   return Qnil;
3601 }
3602
3603 DEFUN ("terminal-coding-system",
3604        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3605   "Return coding-system of your terminal.")
3606   ()
3607 {
3608   return terminal_coding.symbol;
3609 }
3610
3611 DEFUN ("set-keyboard-coding-system-internal",
3612        Fset_keyboard_coding_system_internal,
3613        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3614   (coding_system)
3615      Lisp_Object coding_system;
3616 {
3617   CHECK_SYMBOL (coding_system, 0);
3618   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3619   return Qnil;
3620 }
3621
3622 DEFUN ("keyboard-coding-system",
3623        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3624   "Return coding-system of what is sent from terminal keyboard.")
3625   ()
3626 {
3627   return keyboard_coding.symbol;
3628 }
3629
3630 \f
3631 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3632        Sfind_operation_coding_system,  1, MANY, 0,
3633   "Choose a coding system for an operation based on the target name.\n\
3634 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3635 DECODING-SYSTEM is the coding system to use for decoding\n\
3636 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3637 for encoding (in case OPERATION does encoding).\n\
3638 \n\
3639 The first argument OPERATION specifies an I/O primitive:\n\
3640   For file I/O, `insert-file-contents' or `write-region'.\n\
3641   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3642   For network I/O, `open-network-stream'.\n\
3643 \n\
3644 The remaining arguments should be the same arguments that were passed\n\
3645 to the primitive.  Depending on which primitive, one of those arguments\n\
3646 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3647 whichever argument specifies the file name is TARGET.\n\
3648 \n\
3649 TARGET has a meaning which depends on OPERATION:\n\
3650   For file I/O, TARGET is a file name.\n\
3651   For process I/O, TARGET is a process name.\n\
3652   For network I/O, TARGET is a service name or a port number\n\
3653 \n\
3654 This function looks up what specified for TARGET in,\n\
3655 `file-coding-system-alist', `process-coding-system-alist',\n\
3656 or `network-coding-system-alist' depending on OPERATION.\n\
3657 They may specify a coding system, a cons of coding systems,\n\
3658 or a function symbol to call.\n\
3659 In the last case, we call the function with one argument,\n\
3660 which is a list of all the arguments given to this function.")
3661   (nargs, args)
3662      int nargs;
3663      Lisp_Object *args;
3664 {
3665   Lisp_Object operation, target_idx, target, val;
3666   register Lisp_Object chain;
3667
3668   if (nargs < 2)
3669     error ("Too few arguments");
3670   operation = args[0];
3671   if (!SYMBOLP (operation)
3672       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3673     error ("Invalid first arguement");
3674   if (nargs < 1 + XINT (target_idx))
3675     error ("Too few arguments for operation: %s",
3676            XSYMBOL (operation)->name->data);
3677   target = args[XINT (target_idx) + 1];
3678   if (!(STRINGP (target)
3679         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3680     error ("Invalid %dth argument", XINT (target_idx) + 1);
3681
3682   chain = ((EQ (operation, Qinsert_file_contents)
3683             || EQ (operation, Qwrite_region))
3684            ? Vfile_coding_system_alist
3685            : (EQ (operation, Qopen_network_stream)
3686               ? Vnetwork_coding_system_alist
3687               : Vprocess_coding_system_alist));
3688   if (NILP (chain))
3689     return Qnil;
3690
3691   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3692     {
3693       Lisp_Object elt = XCONS (chain)->car;
3694
3695       if (CONSP (elt)
3696           && ((STRINGP (target)
3697                && STRINGP (XCONS (elt)->car)
3698                && fast_string_match (XCONS (elt)->car, target) >= 0)
3699               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3700         {
3701           val = XCONS (elt)->cdr;
3702           if (CONSP (val))
3703             return val;
3704           if (! SYMBOLP (val))
3705             return Qnil;
3706           if (! NILP (Fcoding_system_p (val)))
3707             return Fcons (val, val);
3708           if (!NILP (Ffboundp (val)))
3709             return call1 (val, Flist (nargs, args));
3710           return Qnil;
3711         }
3712     }
3713   return Qnil;
3714 }
3715
3716 #endif /* emacs */
3717
3718 \f
3719 /*** 8. Post-amble ***/
3720
3721 init_coding_once ()
3722 {
3723   int i;
3724
3725   /* Emacs' internal format specific initialize routine.  */
3726   for (i = 0; i <= 0x20; i++)
3727     emacs_code_class[i] = EMACS_control_code;
3728   emacs_code_class[0x0A] = EMACS_linefeed_code;
3729   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3730   for (i = 0x21 ; i < 0x7F; i++)
3731     emacs_code_class[i] = EMACS_ascii_code;
3732   emacs_code_class[0x7F] = EMACS_control_code;
3733   emacs_code_class[0x80] = EMACS_leading_code_composition;
3734   for (i = 0x81; i < 0xFF; i++)
3735     emacs_code_class[i] = EMACS_invalid_code;
3736   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3737   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3738   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3739   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3740
3741   /* ISO2022 specific initialize routine.  */
3742   for (i = 0; i < 0x20; i++)
3743     iso_code_class[i] = ISO_control_code;
3744   for (i = 0x21; i < 0x7F; i++)
3745     iso_code_class[i] = ISO_graphic_plane_0;
3746   for (i = 0x80; i < 0xA0; i++)
3747     iso_code_class[i] = ISO_control_code;
3748   for (i = 0xA1; i < 0xFF; i++)
3749     iso_code_class[i] = ISO_graphic_plane_1;
3750   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3751   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3752   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3753   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3754   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3755   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3756   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3757   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3758   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3759   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3760
3761   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3762   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3763
3764   setup_coding_system (Qnil, &keyboard_coding);
3765   setup_coding_system (Qnil, &terminal_coding);
3766   setup_coding_system (Qnil, &safe_terminal_coding);
3767
3768 #if defined (MSDOS) || defined (WINDOWSNT)
3769   system_eol_type = CODING_EOL_CRLF;
3770 #else
3771   system_eol_type = CODING_EOL_LF;
3772 #endif
3773 }
3774
3775 #ifdef emacs
3776
/* Define the Lisp symbols, subrs and variables used by this file.
   In order, the body:
     - interns and staticpros the Q* symbols;
     - puts a `target-idx' property on several function symbols;
     - sets up the `coding-system-error' condition;
     - fills in coding_category_table;
     - calls defsubr on each S* subr object;
     - declares every DEFVAR_LISP / DEFVAR_INT / DEFVAR_BOOL variable
       and assigns its initial value.
   Takes no arguments.  No return type is written and the body
   contains no return statement.  */
3777 syms_of_coding ()
3778 {
3779   Qtarget_idx = intern ("target-idx");
3780   staticpro (&Qtarget_idx);
3781
       /* The `target-idx' property stored below is the zero-based
          position of the "target" argument named in each comment
          (first argument -> 0, third -> 2, fourth -> 3).  */
3782   /* Target FILENAME is the first argument.  */
3783   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3784   /* Target FILENAME is the third argument.  */
3785   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3786
3787   Qcall_process = intern ("call-process");
3788   staticpro (&Qcall_process);
3789   /* Target PROGRAM is the first argument.  */
3790   Fput (Qcall_process, Qtarget_idx, make_number (0));
3791
3792   Qcall_process_region = intern ("call-process-region");
3793   staticpro (&Qcall_process_region);
3794   /* Target PROGRAM is the third argument.  */
3795   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3796
3797   Qstart_process = intern ("start-process");
3798   staticpro (&Qstart_process);
3799   /* Target PROGRAM is the third argument.  */
3800   Fput (Qstart_process, Qtarget_idx, make_number (2));
3801
3802   Qopen_network_stream = intern ("open-network-stream");
3803   staticpro (&Qopen_network_stream);
3804   /* Target SERVICE is the fourth argument.  */
3805   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3806
3807   Qcoding_system = intern ("coding-system");
3808   staticpro (&Qcoding_system);
3809
3810   Qeol_type = intern ("eol-type");
3811   staticpro (&Qeol_type);
3812
3813   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3814   staticpro (&Qbuffer_file_coding_system);
3815
3816   Qpost_read_conversion = intern ("post-read-conversion");
3817   staticpro (&Qpost_read_conversion);
3818
3819   Qpre_write_conversion = intern ("pre-write-conversion");
3820   staticpro (&Qpre_write_conversion);
3821
3822   Qcoding_system_spec = intern ("coding-system-spec");
3823   staticpro (&Qcoding_system_spec);
3824
3825   Qcoding_system_p = intern ("coding-system-p");
3826   staticpro (&Qcoding_system_p);
3827
3828   Qcoding_system_error = intern ("coding-system-error");
3829   staticpro (&Qcoding_system_error);
3830
       /* Give `coding-system-error' the two error properties: its
          condition list is (coding-system-error error), and its
          message is the string built below.  */
3831   Fput (Qcoding_system_error, Qerror_conditions,
3832         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3833   Fput (Qcoding_system_error, Qerror_message,
3834         build_string ("Invalid coding system"));
3835
3836   Qcoding_category_index = intern ("coding-category-index");
3837   staticpro (&Qcoding_category_index);
3838
       /* For every entry of coding_category_name, intern a symbol of
          that name, keep it in coding_category_table (staticpro'ing the
          slot), and record the entry's index on the symbol as its
          `coding-category-index' property.  */
3839   {
3840     int i;
3841     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3842       {
3843         coding_category_table[i] = intern (coding_category_name[i]);
3844         staticpro (&coding_category_table[i]);
3845         Fput (coding_category_table[i], Qcoding_category_index,
3846               make_number (i));
3847       }
3848   }
3849
3850   Qcharacter_unification_table = intern ("character-unification-table");
3851   staticpro (&Qcharacter_unification_table);
       /* Store 0 under the Qchar_table_extra_slots property --
          presumably the number of extra slots for char-tables of this
          subtype; confirm against the char-table code.  */
3852   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3853         make_number (0));
3854
3855   Qcharacter_unification_table_for_decode
3856     = intern ("character-unification-table-for-decode");
3857   staticpro (&Qcharacter_unification_table_for_decode);
3858
3859   Qcharacter_unification_table_for_encode
3860     = intern ("character-unification-table-for-encode");
3861   staticpro (&Qcharacter_unification_table_for_encode);
3862
3863   Qemacs_mule = intern ("emacs-mule");
3864   staticpro (&Qemacs_mule);
3865
       /* Hand each subr object to defsubr.  */
3866   defsubr (&Scoding_system_spec);
3867   defsubr (&Scoding_system_p);
3868   defsubr (&Sread_coding_system);
3869   defsubr (&Sread_non_nil_coding_system);
3870   defsubr (&Scheck_coding_system);
3871   defsubr (&Sdetect_coding_region);
3872   defsubr (&Sdecode_coding_region);
3873   defsubr (&Sencode_coding_region);
3874   defsubr (&Sdecode_coding_string);
3875   defsubr (&Sencode_coding_string);
3876   defsubr (&Sdecode_sjis_char);
3877   defsubr (&Sencode_sjis_char);
3878   defsubr (&Sdecode_big5_char);
3879   defsubr (&Sencode_big5_char);
3880   defsubr (&Sset_terminal_coding_system_internal);
3881   defsubr (&Sset_safe_terminal_coding_system_internal);
3882   defsubr (&Sterminal_coding_system);
3883   defsubr (&Sset_keyboard_coding_system_internal);
3884   defsubr (&Skeyboard_coding_system);
3885   defsubr (&Sfind_operation_coding_system);
3886
3887   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3888     "List of coding-categories (symbols) ordered by priority.");
       /* The initial list is consed up from the last index down to 0,
          so the resulting list holds the category symbols in
          increasing index order.  */
3889   {
3890     int i;
3891
3892     Vcoding_category_list = Qnil;
3893     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3894       Vcoding_category_list
3895         = Fcons (coding_category_table[i], Vcoding_category_list);
3896   }
3897
3898   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3899     "Specify the coding system for read operations.\n\
3900 It is useful to bind this variable with `let', but do not set it globally.\n\
3901 If the value is a coding system, it is used for decoding on read operation.\n\
3902 If not, an appropriate element is used from one of the coding system alists:\n\
3903 There are three such tables, `file-coding-system-alist',\n\
3904 `process-coding-system-alist', and `network-coding-system-alist'.");
3905   Vcoding_system_for_read = Qnil;
3906
3907   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3908     "Specify the coding system for write operations.\n\
3909 It is useful to bind this variable with `let', but do not set it globally.\n\
3910 If the value is a coding system, it is used for encoding on write operation.\n\
3911 If not, an appropriate element is used from one of the coding system alists:\n\
3912 There are three such tables, `file-coding-system-alist',\n\
3913 `process-coding-system-alist', and `network-coding-system-alist'.");
3914   Vcoding_system_for_write = Qnil;
3915
3916   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3917     "Coding system used in the latest file or process I/O.");
3918   Vlast_coding_system_used = Qnil;
3919
3920   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3921     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3922   inhibit_eol_conversion = 0;
3923
       /* The three alists below all start out empty (Qnil).  */
3924   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3925     "Alist to decide a coding system to use for a file I/O operation.\n\
3926 The format is ((PATTERN . VAL) ...),\n\
3927 where PATTERN is a regular expression matching a file name,\n\
3928 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3929 If VAL is a coding system, it is used for both decoding and encoding\n\
3930 the file contents.\n\
3931 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3932 and the cdr part is used for encoding.\n\
3933 If VAL is a function symbol, the function must return a coding system\n\
3934 or a cons of coding systems which are used as above.\n\
3935 \n\
3936 See also the function `find-operation-coding-system'.");
3937   Vfile_coding_system_alist = Qnil;
3938
3939   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3940     "Alist to decide a coding system to use for a process I/O operation.\n\
3941 The format is ((PATTERN . VAL) ...),\n\
3942 where PATTERN is a regular expression matching a program name,\n\
3943 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3944 If VAL is a coding system, it is used for both decoding what received\n\
3945 from the program and encoding what sent to the program.\n\
3946 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3947 and the cdr part is used for encoding.\n\
3948 If VAL is a function symbol, the function must return a coding system\n\
3949 or a cons of coding systems which are used as above.\n\
3950 \n\
3951 See also the function `find-operation-coding-system'.");
3952   Vprocess_coding_system_alist = Qnil;
3953
3954   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3955     "Alist to decide a coding system to use for a network I/O operation.\n\
3956 The format is ((PATTERN . VAL) ...),\n\
3957 where PATTERN is a regular expression matching a network service name\n\
3958 or is a port number to connect to,\n\
3959 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3960 If VAL is a coding system, it is used for both decoding what received\n\
3961 from the network stream and encoding what sent to the network stream.\n\
3962 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3963 and the cdr part is used for encoding.\n\
3964 If VAL is a function symbol, the function must return a coding system\n\
3965 or a cons of coding systems which are used as above.\n\
3966 \n\
3967 See also the function `find-operation-coding-system'.");
3968   Vnetwork_coding_system_alist = Qnil;
3969
       /* Default end-of-line mnemonic characters.  */
3970   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3971     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3972   eol_mnemonic_unix = ':';
3973
3974   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3975     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3976   eol_mnemonic_dos = '\\';
3977
3978   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3979     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3980   eol_mnemonic_mac = '/';
3981
3982   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3983     "Mnemonic character indicating end-of-line format is not yet decided.");
       /* NOTE(review): this is the same character as eol_mnemonic_unix
          above, so the two states share one mnemonic by default --
          confirm that this is intended.  */
3984   eol_mnemonic_undecided = ':';
3985
3986   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3987     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3988   Venable_character_unification = Qt;
3989
3990   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3991     &Vstandard_character_unification_table_for_decode,
3992     "Table for unifying characters when reading.");
3993   Vstandard_character_unification_table_for_decode = Qnil;
3994
3995   DEFVAR_LISP ("standard-character-unification-table-for-encode",
3996     &Vstandard_character_unification_table_for_encode,
3997     "Table for unifying characters when writing.");
3998   Vstandard_character_unification_table_for_encode = Qnil;
3999
       /* NOTE(review): the Lisp name says "table" while the C variable
          name and the doc string both say "alist".  */
4000   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4001     "Alist of charsets vs revision numbers.\n\
4002 While encoding, if a charset (car part of an element) is found,\n\
4003 designate it with the escape sequence identifing revision (cdr part of the element).");
4004   Vcharset_revision_alist = Qnil;
4005
4006   DEFVAR_LISP ("default-process-coding-system",
4007                &Vdefault_process_coding_system,
4008     "Cons of coding systems used for process I/O by default.\n\
4009 The car part is used for decoding a process output,\n\
4010 the cdr part is used for encoding a text to be sent to a process.");
4011   Vdefault_process_coding_system = Qnil;
4012
4013   DEFVAR_LISP ("special-microsoft-code-table", &Vmicrosoft_code_table,
4014     "Table of special Microsoft codes in the range 128..159 (inclusive).\n\
4015 This is a vector of length 256.\n\
4016 If Nth element is non-nil, the existence of code N in a file\n\
4017 (or output of subprocess) doesn't prevent it to be detected as\n\
4018 a coding system of ISO 2022 variant (e.g. iso-latin-1) on reading a file\n\
4019 or reading output of a subprocess.\n\
4020 Only 128th through 159th elements has a meaning.");
       /* Start with a 256-element vector whose elements are all nil,
          matching the length stated in the doc string.  */
4021   Vmicrosoft_code_table = Fmake_vector (make_number (256), Qnil);
4022 }
4023
4024 #endif /* emacs */