code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-internal), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Raw text
  71
  72   A coding system for a text containing random 8-bit code.  Emacs does
  73   no code conversion on such a text except for end-of-line format.
  74
  75   5. Other
  76
  77   If a user wants to read/write a text encoded in a coding system not
  78   listed above, he can supply a decoder and an encoder for it in CCL
  79   (Code Conversion Language) programs.  Emacs executes the CCL program
  80   while reading/writing.
  81
  82   Emacs represents a coding-system by a Lisp symbol that has a property
  83   `coding-system'.  But, before actually using the coding-system, the
  84   information about it is set in a structure of type `struct
  85   coding_system' for rapid processing.  See section 6 for more details.
  86
  87 */
  88
  89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  90
  91   How end-of-line of a text is encoded depends on a system.  For
  92   instance, Unix's format is just one byte of `line-feed' code,
  93   whereas DOS's format is two-byte sequence of `carriage-return' and
  94   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  95
  96   Since text characters encoding and end-of-line encoding are
  97   independent, any coding system described above can take
  98   any format of end-of-line.  So, Emacs has information of format of
  99   end-of-line in each coding-system.  See section 6 for more details.
 100
 101 */
 102
 103 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 104
 105   These functions check if a text between SRC and SRC_END is encoded
 106   in the coding system category XXX.  Each returns an integer value in
 107   which appropriate flag bits for the category XXX is set.  The flag
 108   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 109   template of these functions.  */
 110 #if 0
 111 int
 112 detect_coding_emacs_mule (src, src_end)
 113      unsigned char *src, *src_end;
 114 {
 115   ...
 116 }
 117 #endif
 118
 119 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 120
 121   These functions decode SRC_BYTES length text at SOURCE encoded in
 122   CODING to Emacs' internal format (emacs-mule).  The resulting text
 123   goes to a place pointed to by DESTINATION, the length of which should
 124   not exceed DST_BYTES.  The number of bytes actually processed is
 125   returned as *CONSUMED.  The return value is the length of the decoded
 126   text.  Below is a template of these functions.  */
 127 #if 0
 128 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 129      struct coding_system *coding;
 130      unsigned char *source, *destination;
 131      int src_bytes, dst_bytes;
 132      int *consumed;
 133 {
 134   ...
 135 }
 136 #endif
 137
 138 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 139
 140   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 141   internal format (emacs-mule) to CODING.  The resulting text goes to
 142   a place pointed to by DESTINATION, the length of which should not
 143   exceed DST_BYTES.  The number of bytes actually processed is
 144   returned as *CONSUMED.  The return value is the length of the
 145   encoded text.  Below is a template of these functions.  */
 146 #if 0
 147 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 148      struct coding_system *coding;
 149      unsigned char *source, *destination;
 150      int src_bytes, dst_bytes;
 151      int *consumed;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** COMMONLY USED MACROS ***/
 158
 159 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 160    THREE_MORE_BYTES safely get one, two, and three bytes from the
 161    source text respectively.  If there are not enough bytes in the
 162    source, they jump to `label_end_of_loop'.  The caller should set
 163    variables `src' and `src_end' to appropriate areas in advance.  */
 164
 165 #define ONE_MORE_BYTE(c1)       \
 166   do {                          \
 167     if (src < src_end)          \
 168       c1 = *src++;              \
 169     else                        \
 170       goto label_end_of_loop;   \
 171   } while (0)
 172
 173 #define TWO_MORE_BYTES(c1, c2)  \
 174   do {                          \
 175     if (src + 1 < src_end)      \
 176       c1 = *src++, c2 = *src++; \
 177     else                        \
 178       goto label_end_of_loop;   \
 179   } while (0)
 180
 181 #define THREE_MORE_BYTES(c1, c2, c3)            \
 182   do {                                          \
 183     if (src + 2 < src_end)                      \
 184       c1 = *src++, c2 = *src++, c3 = *src++;    \
 185     else                                        \
 186       goto label_end_of_loop;                   \
 187   } while (0)
 188
 189 /* The following three macros DECODE_CHARACTER_ASCII,
 190    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 191    the multi-byte form of a character of each class at the place
 192    pointed by `dst'.  The caller should set the variable `dst' to
 193    point to an appropriate area and the variable `coding' to point to
 194    the coding-system of the currently decoding text in advance.  */
 195
 196 /* Decode one ASCII character C.  */
 197
 198 #define DECODE_CHARACTER_ASCII(c)                               \
 199   do {                                                          \
 200     if (COMPOSING_P (coding->composing))                        \
 201       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 202     else                                                        \
 203       *dst++ = (c);                                             \
 204   } while (0)
 205
 206 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 207    position-code is C.  */
 208
 209 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 210   do {                                                                  \
 211     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 212     if (COMPOSING_P (coding->composing))                                \
 213       *dst++ = leading_code + 0x20;                                     \
 214     else                                                                \
 215       *dst++ = leading_code;                                            \
 216     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 217       *dst++ = leading_code;                                            \
 218     *dst++ = (c) | 0x80;                                                \
 219   } while (0)
 220
 221 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 222    position-codes are C1 and C2.  */
 223
 224 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 225   do {                                                  \
 226     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 227     *dst++ = (c2) | 0x80;                               \
 228   } while (0)
 229
 230 \f
 231 /*** 1. Preamble ***/
 232
 233 #include <stdio.h>
 234
 235 #ifdef emacs
 236
 237 #include <config.h>
 238 #include "lisp.h"
 239 #include "buffer.h"
 240 #include "charset.h"
 241 #include "ccl.h"
 242 #include "coding.h"
 243 #include "window.h"
 244
 245 #else  /* not emacs */
 246
 247 #include "mulelib.h"
 248
 249 #endif /* not emacs */
 250
 251 Lisp_Object Qcoding_system, Qeol_type;
 252 Lisp_Object Qbuffer_file_coding_system;
 253 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 254 Lisp_Object Qno_conversion, Qundecided;
 255 Lisp_Object Qcoding_system_history;
 256 Lisp_Object Qsafe_charsets;
 257
 258 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 259 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 260 Lisp_Object Qstart_process, Qopen_network_stream;
 261 Lisp_Object Qtarget_idx;
 262
 263 /* Mnemonic character of each format of end-of-line.  */
 264 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 265 /* Mnemonic character to indicate format of end-of-line is not yet
 266    decided.  */
 267 int eol_mnemonic_undecided;
 268
 269 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 270    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 271 int system_eol_type;
 272
 273 #ifdef emacs
 274
 275 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 276
 277 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 278
 279 /* Coding system emacs-mule is for converting only end-of-line format.  */
 280 Lisp_Object Qemacs_mule;
 281
 282 /* Coding-systems are handed between Emacs Lisp programs and C internal
 283    routines by the following three variables.  */
 284 /* Coding-system for reading files and receiving data from process.  */
 285 Lisp_Object Vcoding_system_for_read;
 286 /* Coding-system for writing files and sending data to process.  */
 287 Lisp_Object Vcoding_system_for_write;
 288 /* Coding-system actually used in the latest I/O.  */
 289 Lisp_Object Vlast_coding_system_used;
 290
 291 /* A vector of length 256 which contains information about special
 292    Latin codes (espepcially for dealing with Microsoft code).  */
 293 Lisp_Object Vlatin_extra_code_table;
 294
 295 /* Flag to inhibit code conversion of end-of-line format.  */
 296 int inhibit_eol_conversion;
 297
 298 /* Coding system to be used to encode text for terminal display.  */
 299 struct coding_system terminal_coding;
 300
 301 /* Coding system to be used to encode text for terminal display when
 302    terminal coding system is nil.  */
 303 struct coding_system safe_terminal_coding;
 304
 305 /* Coding system of what is sent from terminal keyboard.  */
 306 struct coding_system keyboard_coding;
 307
 308 Lisp_Object Vfile_coding_system_alist;
 309 Lisp_Object Vprocess_coding_system_alist;
 310 Lisp_Object Vnetwork_coding_system_alist;
 311
 312 #endif /* emacs */
 313
 314 Lisp_Object Qcoding_category_index;
 315
 316 /* List of symbols `coding-category-xxx' ordered by priority.  */
 317 Lisp_Object Vcoding_category_list;
 318
 319 /* Table of coding-systems currently assigned to each coding-category.  */
 320 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 321
 322 /* Table of names of symbol for each coding-category.  */
 323 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 324   "coding-category-emacs-mule",
 325   "coding-category-sjis",
 326   "coding-category-iso-7",
 327   "coding-category-iso-8-1",
 328   "coding-category-iso-8-2",
 329   "coding-category-iso-7-else",
 330   "coding-category-iso-8-else",
 331   "coding-category-big5",
 332   "coding-category-raw-text",
 333   "coding-category-binary"
 334 };
 335
 336 /* Flag to tell if we look up unification table on character code
 337    conversion.  */
 338 Lisp_Object Venable_character_unification;
 339 /* Standard unification table to look up on decoding (reading).  */
 340 Lisp_Object Vstandard_character_unification_table_for_decode;
 341 /* Standard unification table to look up on encoding (writing).  */
 342 Lisp_Object Vstandard_character_unification_table_for_encode;
 343
 344 Lisp_Object Qcharacter_unification_table;
 345 Lisp_Object Qcharacter_unification_table_for_decode;
 346 Lisp_Object Qcharacter_unification_table_for_encode;
 347
 348 /* Alist of charsets vs revision number.  */
 349 Lisp_Object Vcharset_revision_alist;
 350
 351 /* Default coding systems used for process I/O.  */
 352 Lisp_Object Vdefault_process_coding_system;
 353
 354 \f
 355 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 356
 357 /* Emacs' internal format for encoding multiple character sets is a
 358    kind of multi-byte encoding, i.e. characters are encoded by
 359    variable-length sequences of one-byte codes.  ASCII characters
 360    and control characters (e.g. `tab', `newline') are represented by
 361    one-byte sequences which are their ASCII codes, in the range 0x00
 362    through 0x7F.  The other characters are represented by a sequence
 363    of `base leading-code', optional `extended leading-code', and one
 364    or two `position-code's.  The length of the sequence is determined
 365    by the base leading-code.  Leading-code takes the range 0x80
 366    through 0x9F, whereas extended leading-code and position-code take
 367    the range 0xA0 through 0xFF.  See `charset.h' for more details
 368    about leading-code and position-code.
 369
 370    There's one exception to this rule.  Special leading-code
 371    `leading-code-composition' denotes that the following several
 372    characters should be composed into one character.  Leading-codes of
 373    components (except for ASCII) are added 0x20.  An ASCII character
 374    component is represented by a 2-byte sequence of `0xA0' and
 375    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 376    details of composite character.  Hence, we can summarize the code
 377    range as follows:
 378
 379    --- CODE RANGE of Emacs' internal format ---
 380    (character set)      (range)
 381    ASCII                0x00 .. 0x7F
 382    ELSE (1st byte)      0x80 .. 0x9F
 383         (rest bytes)    0xA0 .. 0xFF
 384    ---------------------------------------------
 385
 386   */
 387
 388 enum emacs_code_class_type emacs_code_class[256];
 389
 390 /* Go to the next statement only if *SRC is accessible and the code is
 391    greater than 0xA0.  */
 392 #define CHECK_CODE_RANGE_A0_FF  \
 393   do {                          \
 394     if (src >= src_end)         \
 395       goto label_end_of_switch; \
 396     else if (*src++ < 0xA0)     \
 397       return 0;                 \
 398   } while (0)
 399
 400 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 401    Check if a text is encoded in Emacs' internal format.  If it is,
 402    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 403
 404 int
 405 detect_coding_emacs_mule (src, src_end)
 406      unsigned char *src, *src_end;
 407 {
 408   unsigned char c;
 409   int composing = 0;
 410
 411   while (src < src_end)
 412     {
 413       c = *src++;
 414
 415       if (composing)
 416         {
 417           if (c < 0xA0)
 418             composing = 0;
 419           else
 420             c -= 0x20;
 421         }
 422
 423       switch (emacs_code_class[c])
 424         {
 425         case EMACS_ascii_code:
 426         case EMACS_linefeed_code:
 427           break;
 428
 429         case EMACS_control_code:
 430           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 431             return 0;
 432           break;
 433
 434         case EMACS_invalid_code:
 435           return 0;
 436
 437         case EMACS_leading_code_composition: /* c == 0x80 */
 438           if (composing)
 439             CHECK_CODE_RANGE_A0_FF;
 440           else
 441             composing = 1;
 442           break;
 443
 444         case EMACS_leading_code_4:
 445           CHECK_CODE_RANGE_A0_FF;
 446           /* fall down to check it two more times ...  */
 447
 448         case EMACS_leading_code_3:
 449           CHECK_CODE_RANGE_A0_FF;
 450           /* fall down to check it one more time ...  */
 451
 452         case EMACS_leading_code_2:
 453           CHECK_CODE_RANGE_A0_FF;
 454           break;
 455
 456         default:
 457         label_end_of_switch:
 458           break;
 459         }
 460     }
 461   return CODING_CATEGORY_MASK_EMACS_MULE;
 462 }
 463
 464 \f
 465 /*** 3. ISO2022 handlers ***/
 466
 467 /* The following note describes the coding system ISO2022 briefly.
 468    Since the intention of this note is to help in understanding of
 469    the programs in this file, some parts are NOT ACCURATE or OVERLY
 470    SIMPLIFIED.  For the thorough understanding, please refer to the
 471    original document of ISO2022.
 472
 473    ISO2022 provides many mechanisms to encode several character sets
 474    in 7-bit and 8-bit environment.  If one chooses 7-bite environment,
 475    all text is encoded by codes of less than 128.  This may make the
 476    encoded text a little bit longer, but the text gets more stability
 477    to pass through several gateways (some of them strip off the MSB).
 478
 479    There are two kinds of character set: control character set and
 480    graphic character set.  The former contains control characters such
 481    as `newline' and `escape' to provide control functions (control
 482    functions are provided also by escape sequences).  The latter
 483    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 484    two control character sets and many graphic character sets.
 485
 486    Graphic character sets are classified into one of the following
 487    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 488    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 489    bytes (DIMENSION) and the number of characters in one dimension
 490    (CHARS) of the set.  In addition, each character set is assigned an
 491    identification tag (called "final character" and denoted as <F>
 492    here after) which is unique in each class.  <F> of each character
 493    set is decided by ECMA(*) when it is registered in ISO.  Code range
 494    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 495
 496    Note (*): ECMA = European Computer Manufacturers Association
 497
 498    Here are examples of graphic character set [NAME(<F>)]:
 499         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 500         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 501         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 502         o DIMENSION2_CHARS96 -- none for the moment
 503
 504    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 505         C0 [0x00..0x1F] -- control character plane 0
 506         GL [0x20..0x7F] -- graphic character plane 0
 507         C1 [0x80..0x9F] -- control character plane 1
 508         GR [0xA0..0xFF] -- graphic character plane 1
 509
 510    A control character set is directly designated and invoked to C0 or
 511    C1 by an escape sequence.  The most common case is that ISO646's
 512    control character set is designated/invoked to C0 and ISO6429's
 513    control character set is designated/invoked to C1, and usually
 514    these designations/invocations are omitted in a coded text.  With
 515    7-bit environment, only C0 can be used, and a control character for
 516    C1 is encoded by an appropriate escape sequence to fit in the
 517    environment.  All control characters for C1 are defined the
 518    corresponding escape sequences.
 519
 520    A graphic character set is at first designated to one of four
 521    graphic registers (G0 through G3), then these graphic registers are
 522    invoked to GL or GR.  These designations and invocations can be
 523    done independently.  The most common case is that G0 is invoked to
 524    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 525    these invocations and designations are omitted in a coded text.
 526    With 7-bit environment, only GL can be used.
 527
 528    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 529    and 0x7F of GL area work as control characters SPACE and DEL
 530    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 531
 532    There are two ways of invocation: locking-shift and single-shift.
 533    With locking-shift, the invocation lasts until the next different
 534    invocation, whereas with single-shift, the invocation works only
 535    for the following character and doesn't affect locking-shift.
 536    Invocations are done by the following control characters or escape
 537    sequences.
 538
 539    ----------------------------------------------------------------------
 540    function             control char    escape sequence description
 541    ----------------------------------------------------------------------
 542    SI  (shift-in)               0x0F    none            invoke G0 to GL
 543    SO  (shift-out)              0x0E    none            invoke G1 to GL
 544    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 545    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 546    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 547    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 548    ----------------------------------------------------------------------
 549    The first four are for locking-shift.  Control characters for these
 550    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 551
 552    Designations are done by the following escape sequences.
 553    ----------------------------------------------------------------------
 554    escape sequence      description
 555    ----------------------------------------------------------------------
 556    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 557    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 558    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 559    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 560    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 561    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 562    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 563    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 564    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 565    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 566    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 567    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 568    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 569    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 570    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 571    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 572    ----------------------------------------------------------------------
 573
 574    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 575    of dimension 1, chars 94, and final character <F>, and etc.
 576
 577    Note (*): Although these designations are not allowed in ISO2022,
 578    Emacs accepts them on decoding, and produces them on encoding
 579    CHARS96 character set in a coding system which is characterized as
 580    7-bit environment, non-locking-shift, and non-single-shift.
 581
 582    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 583    '(' can be omitted.  We call this as "short-form" here after.
 584
 585    Now you may notice that there are a lot of ways for encoding the
 586    same multilingual text in ISO2022.  Actually, there exists many
 587    coding systems such as Compound Text (used in X's inter client
 588    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 589    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 590    localized platforms), and all of these are variants of ISO2022.
 591
 592    In addition to the above, Emacs handles two more kinds of escape
 593    sequences: ISO6429's direction specification and Emacs' private
 594    sequence for specifying character composition.
 595
 596    ISO6429's direction specification takes the following format:
 597         o CSI ']'      -- end of the current direction
 598         o CSI '0' ']'  -- end of the current direction
 599         o CSI '1' ']'  -- start of left-to-right text
 600         o CSI '2' ']'  -- start of right-to-left text
 601    The control character CSI (0x9B: control sequence introducer) is
 602    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 603
 604    Character composition specification takes the following format:
 605         o ESC '0' -- start character composition
 606         o ESC '1' -- end character composition
 607    Since these are not standard escape sequences of any ISO, the use
 608    of them for these meaning is restricted to Emacs only.  */
 609
 610 enum iso_code_class_type iso_code_class[256];
 611
 612 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 613    Check if a text is encoded in ISO2022.  If it is, returns an
 614    integer in which appropriate flag bits any of:
 615         CODING_CATEGORY_MASK_ISO_7
 616         CODING_CATEGORY_MASK_ISO_8_1
 617         CODING_CATEGORY_MASK_ISO_8_2
 618         CODING_CATEGORY_MASK_ISO_7_ELSE
 619         CODING_CATEGORY_MASK_ISO_8_ELSE
 620    are set.  If a code which should never appear in ISO2022 is found,
 621    returns 0.  */
 622
 623 int
 624 detect_coding_iso2022 (src, src_end)
 625      unsigned char *src, *src_end;
 626 {
 627   int mask = (CODING_CATEGORY_MASK_ISO_7
 628               | CODING_CATEGORY_MASK_ISO_8_1
 629               | CODING_CATEGORY_MASK_ISO_8_2
 630               | CODING_CATEGORY_MASK_ISO_7_ELSE
 631               | CODING_CATEGORY_MASK_ISO_8_ELSE
 632               );
 633   int g1 = 0;                   /* 1 iff designating to G1.  */
 634   int c, i;
 635   struct coding_system coding_iso_8_1, coding_iso_8_2;
 636
 637   /* Coding systems of these categories may accept latin extra codes.  */
 638   setup_coding_system
 639     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
 640      &coding_iso_8_1);
 641   setup_coding_system
 642     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
 643      &coding_iso_8_2);
 644
 645   while (mask && src < src_end)
 646     {
 647       c = *src++;
 648       switch (c)
 649         {
 650         case ISO_CODE_ESC:
 651           if (src >= src_end)
 652             break;
 653           c = *src++;
 654           if ((c >= '(' && c <= '/'))
 655             {
 656               /* Designation sequence for a charset of dimension 1.  */
 657               if (src >= src_end)
 658                 break;
 659               c = *src++;
 660               if (c < ' ' || c >= 0x80)
 661                 /* Invalid designation sequence.  */
 662                 return 0;
 663             }
 664           else if (c == '$')
 665             {
 666               /* Designation sequence for a charset of dimension 2.  */
 667               if (src >= src_end)
 668                 break;
 669               c = *src++;
 670               if (c >= '@' && c <= 'B')
 671                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 672                 ;
 673               else if (c >= '(' && c <= '/')
 674                 {
 675                   if (src >= src_end)
 676                     break;
 677                   c = *src++;
 678                   if (c < ' ' || c >= 0x80)
 679                     /* Invalid designation sequence.  */
 680                     return 0;
 681                 }
 682               else
 683                 /* Invalid designation sequence.  */
 684                 return 0;
 685             }
 686           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 687             /* Locking shift.  */
 688             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 689                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 690           else if (c == '0' || c == '1' || c == '2')
 691             /* Start/end composition.  */
 692             ;
 693           else
 694             /* Invalid escape sequence.  */
 695             return 0;
 696           break;
 697
 698         case ISO_CODE_SO:
 699           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 700                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 701           break;
 702
 703         case ISO_CODE_CSI:
 704         case ISO_CODE_SS2:
 705         case ISO_CODE_SS3:
 706           {
 707             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 708
 709             if (c != ISO_CODE_CSI)
 710               {
 711                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
 712                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 713                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
 714                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 715               }
 716             if (VECTORP (Vlatin_extra_code_table)
 717                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 718               {
 719                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 720                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 721                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 722                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 723               }
 724             mask &= newmask;
 725           }
 726           break;
 727
 728         default:
 729           if (c < 0x80)
 730             break;
 731           else if (c < 0xA0)
 732             {
 733               if (VECTORP (Vlatin_extra_code_table)
 734                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 735                 {
 736                   int newmask = 0;
 737
 738                   if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 739                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 740                   if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 741                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 742                   mask &= newmask;
 743                 }
 744               else
 745                 return 0;
 746             }
 747           else
 748             {
 749               unsigned char *src_begin = src;
 750
 751               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 752                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 753               while (src < src_end && *src >= 0xA0)
 754                 src++;
 755               if ((src - src_begin - 1) & 1 && src < src_end)
 756                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 757             }
 758           break;
 759         }
 760     }
 761
 762   return mask;
 763 }
 764
 765 /* Decode a character of which charset is CHARSET and the 1st position
 766    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 767    fetched from SRC and set to C2.  If CHARSET is negative, it means
 768    that we are decoding ill formed text, and what we can do is just to
 769    read C1 as is.  */
 770
 771 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 772   do {                                                                  \
 773     int c_alt, charset_alt = (charset);                                 \
 774     if (COMPOSING_HEAD_P (coding->composing))                           \
 775       {                                                                 \
 776         *dst++ = LEADING_CODE_COMPOSITION;                              \
 777         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 778           /* To tell composition rules are embeded.  */                 \
 779           *dst++ = 0xFF;                                                \
 780         coding->composing += 2;                                         \
 781       }                                                                 \
 782     if ((charset) >= 0)                                                 \
 783       {                                                                 \
 784         if (CHARSET_DIMENSION (charset) == 2)                           \
 785           {                                                             \
 786             ONE_MORE_BYTE (c2);                                         \
 787             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
 788                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
 789               {                                                         \
 790                 src--;                                                  \
 791                 c2 = ' ';                                               \
 792               }                                                         \
 793           }                                                             \
 794         if (!NILP (unification_table)                                   \
 795             && ((c_alt = unify_char (unification_table,                 \
 796                                      -1, (charset), c1, c2)) >= 0))     \
 797           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 798       }                                                                 \
 799     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 800       DECODE_CHARACTER_ASCII (c1);                                      \
 801     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 802       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 803     else                                                                \
 804       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 805     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 806       /* To tell a composition rule follows.  */                        \
 807       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 808   } while (0)
 809
 810 /* Set designation state into CODING.  */
 811 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 812   do {                                                                  \
 813     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 814                                      make_number (chars),               \
 815                                      make_number (final_char));         \
 816     if (charset >= 0)                                                   \
 817       {                                                                 \
 818         if (coding->direction == 1                                      \
 819             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 820           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 821         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 822       }                                                                 \
 823   } while (0)
 824
 825 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 826
 827 int
 828 decode_coding_iso2022 (coding, source, destination,
 829                        src_bytes, dst_bytes, consumed)
 830      struct coding_system *coding;
 831      unsigned char *source, *destination;
 832      int src_bytes, dst_bytes;
 833      int *consumed;
 834 {
 835   unsigned char *src = source;
 836   unsigned char *src_end = source + src_bytes;
 837   unsigned char *dst = destination;
 838   unsigned char *dst_end = destination + dst_bytes;
 839   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 840      from DST_END to assure that overflow checking is necessary only
 841      at the head of loop.  */
 842   unsigned char *adjusted_dst_end = dst_end - 6;
 843   int charset;
 844   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 845   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 846   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 847   Lisp_Object unification_table
 848       = coding->character_unification_table_for_decode;
 849
 850   if (!NILP (Venable_character_unification) && NILP (unification_table))
 851     unification_table = Vstandard_character_unification_table_for_decode;
 852
 853   while (src < src_end && dst < adjusted_dst_end)
 854     {
 855       /* SRC_BASE remembers the start position in source in each loop.
 856          The loop will be exited when there's not enough source text
 857          to analyze long escape sequence or 2-byte code (within macros
 858          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 859          to SRC_BASE before exiting.  */
 860       unsigned char *src_base = src;
 861       int c1 = *src++, c2;
 862
 863       switch (iso_code_class [c1])
 864         {
 865         case ISO_0x20_or_0x7F:
 866           if (!coding->composing
 867               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 868             {
 869               /* This is SPACE or DEL.  */
 870               *dst++ = c1;
 871               break;
 872             }
 873           /* This is a graphic character, we fall down ...  */
 874
 875         case ISO_graphic_plane_0:
 876           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 877             {
 878               /* This is a composition rule.  */
 879               *dst++ = c1 | 0x80;
 880               coding->composing = COMPOSING_WITH_RULE_TAIL;
 881             }
 882           else
 883             DECODE_ISO_CHARACTER (charset0, c1);
 884           break;
 885
 886         case ISO_0xA0_or_0xFF:
 887           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 888             {
 889               /* Invalid code.  */
 890               *dst++ = c1;
 891               break;
 892             }
 893           /* This is a graphic character, we fall down ... */
 894
 895         case ISO_graphic_plane_1:
 896           DECODE_ISO_CHARACTER (charset1, c1);
 897           break;
 898
 899         case ISO_control_code:
 900           /* All ISO2022 control characters in this class have the
 901              same representation in Emacs internal format.  */
 902           *dst++ = c1;
 903           break;
 904
 905         case ISO_carriage_return:
 906           if (coding->eol_type == CODING_EOL_CR)
 907             {
 908               *dst++ = '\n';
 909             }
 910           else if (coding->eol_type == CODING_EOL_CRLF)
 911             {
 912               ONE_MORE_BYTE (c1);
 913               if (c1 == ISO_CODE_LF)
 914                 *dst++ = '\n';
 915               else
 916                 {
 917                   src--;
 918                   *dst++ = c1;
 919                 }
 920             }
 921           else
 922             {
 923               *dst++ = c1;
 924             }
 925           break;
 926
 927         case ISO_shift_out:
 928           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 929             goto label_invalid_escape_sequence;
 930           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 931           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 932           break;
 933
 934         case ISO_shift_in:
 935           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 936           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 937           break;
 938
 939         case ISO_single_shift_2_7:
 940         case ISO_single_shift_2:
 941           /* SS2 is handled as an escape sequence of ESC 'N' */
 942           c1 = 'N';
 943           goto label_escape_sequence;
 944
 945         case ISO_single_shift_3:
 946           /* SS2 is handled as an escape sequence of ESC 'O' */
 947           c1 = 'O';
 948           goto label_escape_sequence;
 949
 950         case ISO_control_sequence_introducer:
 951           /* CSI is handled as an escape sequence of ESC '[' ...  */
 952           c1 = '[';
 953           goto label_escape_sequence;
 954
 955         case ISO_escape:
 956           ONE_MORE_BYTE (c1);
 957         label_escape_sequence:
 958           /* Escape sequences handled by Emacs are invocation,
 959              designation, direction specification, and character
 960              composition specification.  */
 961           switch (c1)
 962             {
 963             case '&':           /* revision of following character set */
 964               ONE_MORE_BYTE (c1);
 965               if (!(c1 >= '@' && c1 <= '~'))
 966                 goto label_invalid_escape_sequence;
 967               ONE_MORE_BYTE (c1);
 968               if (c1 != ISO_CODE_ESC)
 969                 goto label_invalid_escape_sequence;
 970               ONE_MORE_BYTE (c1);
 971               goto label_escape_sequence;
 972
 973             case '$':           /* designation of 2-byte character set */
 974               ONE_MORE_BYTE (c1);
 975               if (c1 >= '@' && c1 <= 'B')
 976                 {       /* designation of JISX0208.1978, GB2312.1980,
 977                                    or JISX0208.1980 */
 978                   DECODE_DESIGNATION (0, 2, 94, c1);
 979                 }
 980               else if (c1 >= 0x28 && c1 <= 0x2B)
 981                 {       /* designation of DIMENSION2_CHARS94 character set */
 982                   ONE_MORE_BYTE (c2);
 983                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 984                 }
 985               else if (c1 >= 0x2C && c1 <= 0x2F)
 986                 {       /* designation of DIMENSION2_CHARS96 character set */
 987                   ONE_MORE_BYTE (c2);
 988                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 989                 }
 990               else
 991                 goto label_invalid_escape_sequence;
 992               break;
 993
 994             case 'n':           /* invocation of locking-shift-2 */
 995               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 996                 goto label_invalid_escape_sequence;
 997               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 998               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 999               break;
1000
1001             case 'o':           /* invocation of locking-shift-3 */
1002               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1003                 goto label_invalid_escape_sequence;
1004               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1005               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1006               break;
1007
1008             case 'N':           /* invocation of single-shift-2 */
1009               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1010                 goto label_invalid_escape_sequence;
1011               ONE_MORE_BYTE (c1);
1012               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1013               DECODE_ISO_CHARACTER (charset, c1);
1014               break;
1015
1016             case 'O':           /* invocation of single-shift-3 */
1017               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1018                 goto label_invalid_escape_sequence;
1019               ONE_MORE_BYTE (c1);
1020               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1021               DECODE_ISO_CHARACTER (charset, c1);
1022               break;
1023
1024             case '0':           /* start composing without embeded rules */
1025               coding->composing = COMPOSING_NO_RULE_HEAD;
1026               break;
1027
1028             case '1':           /* end composing */
1029               coding->composing = COMPOSING_NO;
1030               break;
1031
1032             case '2':           /* start composing with embeded rules */
1033               coding->composing = COMPOSING_WITH_RULE_HEAD;
1034               break;
1035
1036             case '[':           /* specification of direction */
1037               /* For the moment, nested direction is not supported.
1038                  So, the value of `coding->direction' is 0 or 1: 0
1039                  means left-to-right, 1 means right-to-left.  */
1040               ONE_MORE_BYTE (c1);
1041               switch (c1)
1042                 {
1043                 case ']':       /* end of the current direction */
1044                   coding->direction = 0;
1045
1046                 case '0':       /* end of the current direction */
1047                 case '1':       /* start of left-to-right direction */
1048                   ONE_MORE_BYTE (c1);
1049                   if (c1 == ']')
1050                     coding->direction = 0;
1051                   else
1052                     goto label_invalid_escape_sequence;
1053                   break;
1054
1055                 case '2':       /* start of right-to-left direction */
1056                   ONE_MORE_BYTE (c1);
1057                   if (c1 == ']')
1058                     coding->direction= 1;
1059                   else
1060                     goto label_invalid_escape_sequence;
1061                   break;
1062
1063                 default:
1064                   goto label_invalid_escape_sequence;
1065                 }
1066               break;
1067
1068             default:
1069               if (c1 >= 0x28 && c1 <= 0x2B)
1070                 {       /* designation of DIMENSION1_CHARS94 character set */
1071                   ONE_MORE_BYTE (c2);
1072                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1073                 }
1074               else if (c1 >= 0x2C && c1 <= 0x2F)
1075                 {       /* designation of DIMENSION1_CHARS96 character set */
1076                   ONE_MORE_BYTE (c2);
1077                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1078                 }
1079               else
1080                 {
1081                   goto label_invalid_escape_sequence;
1082                 }
1083             }
1084           /* We must update these variables now.  */
1085           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1086           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1087           break;
1088
1089         label_invalid_escape_sequence:
1090           {
1091             int length = src - src_base;
1092
1093             bcopy (src_base, dst, length);
1094             dst += length;
1095           }
1096         }
1097       continue;
1098
1099     label_end_of_loop:
1100       coding->carryover_size = src - src_base;
1101       bcopy (src_base, coding->carryover, coding->carryover_size);
1102       src = src_base;
1103       break;
1104     }
1105
1106   /* If this is the last block of the text to be decoded, we had
1107      better just flush out all remaining codes in the text although
1108      they are not valid characters.  */
1109   if (coding->last_block)
1110     {
1111       bcopy (src, dst, src_end - src);
1112       dst += (src_end - src);
1113       src = src_end;
1114     }
1115   *consumed = src - source;
1116   return dst - destination;
1117 }
1118
1119 /* ISO2022 encoding stuff.  */
1120
1121 /*
1122    It is not enough to say just "ISO2022" on encoding, we have to
1123    specify more details.  In Emacs, each coding-system of ISO2022
1124    variant has the following specifications:
1125         1. Initial designation to G0 thru G3.
1126         2. Allows short-form designation?
1127         3. ASCII should be designated to G0 before control characters?
1128         4. ASCII should be designated to G0 at end of line?
1129         5. 7-bit environment or 8-bit environment?
1130         6. Use locking-shift?
1131         7. Use Single-shift?
1132    And the following two are only for Japanese:
1133         8. Use ASCII in place of JIS0201-1976-Roman?
1134         9. Use JISX0208-1983 in place of JISX0208-1978?
1135    These specifications are encoded in `coding->flags' as flag bits
1136    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1137    details.
1138 */
1139
1140 /* Produce codes (escape sequence) for designating CHARSET to graphic
1141    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1142    the coding system CODING allows, produce designation sequence of
1143    short-form.  */
1144
1145 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1146   do {                                                                  \
1147     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1148     char *intermediate_char_94 = "()*+";                                \
1149     char *intermediate_char_96 = ",-./";                                \
1150     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
1151     if (revision < 255)                                                 \
1152       {                                                                 \
1153         *dst++ = ISO_CODE_ESC;                                          \
1154         *dst++ = '&';                                                   \
1155         *dst++ = '@' + revision;                                        \
1156       }                                                                 \
1157     *dst++ = ISO_CODE_ESC;                                              \
1158     if (CHARSET_DIMENSION (charset) == 1)                               \
1159       {                                                                 \
1160         if (CHARSET_CHARS (charset) == 94)                              \
1161           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1162         else                                                            \
1163           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1164       }                                                                 \
1165     else                                                                \
1166       {                                                                 \
1167         *dst++ = '$';                                                   \
1168         if (CHARSET_CHARS (charset) == 94)                              \
1169           {                                                             \
1170             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1171                 || reg != 0                                             \
1172                 || final_char < '@' || final_char > 'B')                \
1173               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1174           }                                                             \
1175         else                                                            \
1176           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1177       }                                                                 \
1178     *dst++ = final_char;                                                \
1179     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1180   } while (0)
1181
1182 /* The following two macros produce codes (control character or escape
1183    sequence) for ISO2022 single-shift functions (single-shift-2 and
1184    single-shift-3).  */
1185
1186 #define ENCODE_SINGLE_SHIFT_2                           \
1187   do {                                                  \
1188     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1189       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1190     else                                                \
1191       *dst++ = ISO_CODE_SS2;                            \
1192     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1193   } while (0)
1194
1195 #define ENCODE_SINGLE_SHIFT_3                           \
1196   do {                                                  \
1197     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1198       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1199     else                                                \
1200       *dst++ = ISO_CODE_SS3;                            \
1201     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1202   } while (0)
1203
1204 /* The following four macros produce codes (control character or
1205    escape sequence) for ISO2022 locking-shift functions (shift-in,
1206    shift-out, locking-shift-2, and locking-shift-3).  */
1207
1208 #define ENCODE_SHIFT_IN                         \
1209   do {                                          \
1210     *dst++ = ISO_CODE_SI;                       \
1211     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1212   } while (0)
1213
1214 #define ENCODE_SHIFT_OUT                        \
1215   do {                                          \
1216     *dst++ = ISO_CODE_SO;                       \
1217     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1218   } while (0)
1219
1220 #define ENCODE_LOCKING_SHIFT_2                  \
1221   do {                                          \
1222     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1223     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1224   } while (0)
1225
1226 #define ENCODE_LOCKING_SHIFT_3                  \
1227   do {                                          \
1228     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1229     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1230   } while (0)
1231
1232 /* Produce codes for a DIMENSION1 character whose character set is
1233    CHARSET and whose position-code is C1.  Designation and invocation
1234    sequences are also produced in advance if necessary.  */
1235
1236
1237 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1238   do {                                                                  \
1239     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1240       {                                                                 \
1241         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1242           *dst++ = c1 & 0x7F;                                           \
1243         else                                                            \
1244           *dst++ = c1 | 0x80;                                           \
1245         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1246         break;                                                          \
1247       }                                                                 \
1248     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1249       {                                                                 \
1250         *dst++ = c1 & 0x7F;                                             \
1251         break;                                                          \
1252       }                                                                 \
1253     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1254       {                                                                 \
1255         *dst++ = c1 | 0x80;                                             \
1256         break;                                                          \
1257       }                                                                 \
1258     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1259              && !coding->safe_charsets[charset])                        \
1260       {                                                                 \
1261         /* We should not encode this character, instead produce one or  \
1262            two `?'s.  */                                                \
1263         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1264         if (CHARSET_WIDTH (charset) == 2)                               \
1265           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1266         break;                                                          \
1267       }                                                                 \
1268     else                                                                \
1269       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1270          must invoke it, or, at first, designate it to some graphic     \
1271          register.  Then repeat the loop to actually produce the        \
1272          character.  */                                                 \
1273       dst = encode_invocation_designation (charset, coding, dst);       \
1274   } while (1)
1275
1276 /* Produce codes for a DIMENSION2 character whose character set is
1277    CHARSET and whose position-codes are C1 and C2.  Designation and
1278    invocation codes are also produced in advance if necessary.  */
1279
1280 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1281   do {                                                                  \
1282     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1283       {                                                                 \
1284         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1285           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1286         else                                                            \
1287           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1288         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1289         break;                                                          \
1290       }                                                                 \
1291     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1292       {                                                                 \
1293         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1294         break;                                                          \
1295       }                                                                 \
1296     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1297       {                                                                 \
1298         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1299         break;                                                          \
1300       }                                                                 \
1301     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1302              && !coding->safe_charsets[charset])                        \
1303       {                                                                 \
1304         /* We should not encode this character, instead produce one or  \
1305            two `?'s.  */                                                \
1306         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1307         if (CHARSET_WIDTH (charset) == 2)                               \
1308           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1309         break;                                                          \
1310       }                                                                 \
1311     else                                                                \
1312       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1313          must invoke it, or, at first, designate it to some graphic     \
1314          register.  Then repeat the loop to actually produce the        \
1315          character.  */                                                 \
1316       dst = encode_invocation_designation (charset, coding, dst);       \
1317   } while (1)
1318
1319 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1320   do {                                                                    \
1321     int c_alt, charset_alt;                                               \
1322     if (!NILP (unification_table)                                         \
1323         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1324             >= 0))                                                        \
1325       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1326     else                                                                  \
1327       charset_alt = charset;                                              \
1328     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1329       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1330     else                                                                  \
1331       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1332   } while (0)
1333
1334 /* Produce designation and invocation codes at a place pointed by DST
1335    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1336    Return new DST.  */
1337
1338 unsigned char *
1339 encode_invocation_designation (charset, coding, dst)
1340      int charset;
1341      struct coding_system *coding;
1342      unsigned char *dst;
1343 {
1344   int reg;                      /* graphic register number */
1345
1346   /* At first, check designations.  */
1347   for (reg = 0; reg < 4; reg++)
1348     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1349       break;
1350
1351   if (reg >= 4)
1352     {
1353       /* CHARSET is not yet designated to any graphic registers.  */
1354       /* At first check the requested designation.  */
1355       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1356       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1357         /* Since CHARSET requests no special designation, designate it
1358            to graphic register 0.  */
1359         reg = 0;
1360
1361       ENCODE_DESIGNATION (charset, reg, coding);
1362     }
1363
1364   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1365       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1366     {
1367       /* Since the graphic register REG is not invoked to any graphic
1368          planes, invoke it to graphic plane 0.  */
1369       switch (reg)
1370         {
1371         case 0:                 /* graphic register 0 */
1372           ENCODE_SHIFT_IN;
1373           break;
1374
1375         case 1:                 /* graphic register 1 */
1376           ENCODE_SHIFT_OUT;
1377           break;
1378
1379         case 2:                 /* graphic register 2 */
1380           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1381             ENCODE_SINGLE_SHIFT_2;
1382           else
1383             ENCODE_LOCKING_SHIFT_2;
1384           break;
1385
1386         case 3:                 /* graphic register 3 */
1387           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1388             ENCODE_SINGLE_SHIFT_3;
1389           else
1390             ENCODE_LOCKING_SHIFT_3;
1391           break;
1392         }
1393     }
1394   return dst;
1395 }
1396
1397 /* The following two macros produce codes for indicating composition.  */
1398 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1399 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1400 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1401
1402 /* The following three macros produce codes for indicating direction
1403    of text.  */
1404 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1405   do {                                                  \
1406     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1407       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1408     else                                                \
1409       *dst++ = ISO_CODE_CSI;                            \
1410   } while (0)
1411
1412 #define ENCODE_DIRECTION_R2L    \
1413   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1414
1415 #define ENCODE_DIRECTION_L2R    \
1416   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1417
1418 /* Produce codes for designation and invocation to reset the graphic
1419    planes and registers to initial state.  */
1420 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1421   do {                                                                      \
1422     int reg;                                                                \
1423     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1424       ENCODE_SHIFT_IN;                                                      \
1425     for (reg = 0; reg < 4; reg++)                                           \
1426       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1427           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1428               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1429         ENCODE_DESIGNATION                                                  \
1430           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1431   } while (0)
1432
1433 /* Produce designation sequences of charsets in the line started from
1434    *SRC to a place pointed by DSTP.
1435
1436    If the current block ends before any end-of-line, we may fail to
1437    find all the necessary *designations.  */
1438 encode_designation_at_bol (coding, table, src, src_end, dstp)
1439      struct coding_system *coding;
1440      Lisp_Object table;
1441      unsigned char *src, *src_end, **dstp;
1442 {
1443   int charset, c, found = 0, reg;
1444   /* Table of charsets to be designated to each graphic register.  */
1445   int r[4];
1446   unsigned char *dst = *dstp;
1447
1448   for (reg = 0; reg < 4; reg++)
1449     r[reg] = -1;
1450
1451   while (src < src_end && *src != '\n' && found < 4)
1452     {
1453       int bytes = BYTES_BY_CHAR_HEAD (*src);
1454
1455       if (NILP (table))
1456         charset = CHARSET_AT (src);
1457       else
1458         {
1459           int c_alt;
1460           unsigned char c1, c2;
1461
1462           SPLIT_STRING(src, bytes, charset, c1, c2);
1463           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1464             charset = CHAR_CHARSET (c_alt);
1465         }
1466
1467       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1468       if (r[reg] < 0)
1469         {
1470           found++;
1471           r[reg] = charset;
1472         }
1473
1474       src += bytes;
1475     }
1476
1477   if (found)
1478     {
1479       for (reg = 0; reg < 4; reg++)
1480         if (r[reg] >= 0
1481             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1482           ENCODE_DESIGNATION (r[reg], reg, coding);
1483       *dstp = dst;
1484     }
1485 }
1486
1487 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1488
1489 int
1490 encode_coding_iso2022 (coding, source, destination,
1491                        src_bytes, dst_bytes, consumed)
1492      struct coding_system *coding;
1493      unsigned char *source, *destination;
1494      int src_bytes, dst_bytes;
1495      int *consumed;
1496 {
1497   unsigned char *src = source;
1498   unsigned char *src_end = source + src_bytes;
1499   unsigned char *dst = destination;
1500   unsigned char *dst_end = destination + dst_bytes;
1501   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1502      from DST_END to assure overflow checking is necessary only at the
1503      head of loop.  */
1504   unsigned char *adjusted_dst_end = dst_end - 19;
1505   Lisp_Object unification_table
1506       = coding->character_unification_table_for_encode;
1507
1508   if (!NILP (Venable_character_unification) && NILP (unification_table))
1509     unification_table = Vstandard_character_unification_table_for_encode;
1510
1511   while (src < src_end && dst < adjusted_dst_end)
1512     {
1513       /* SRC_BASE remembers the start position in source in each loop.
1514          The loop will be exited when there's not enough source text
1515          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1516          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1517          reset to SRC_BASE before exiting.  */
1518       unsigned char *src_base = src;
1519       int charset, c1, c2, c3, c4;
1520
1521       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1522           && CODING_SPEC_ISO_BOL (coding))
1523         {
1524           /* We have to produce designation sequences if any now.  */
1525           encode_designation_at_bol (coding, unification_table,
1526                                      src, src_end, &dst);
1527           CODING_SPEC_ISO_BOL (coding) = 0;
1528         }
1529
1530       c1 = *src++;
1531       /* If we are seeing a component of a composite character, we are
1532          seeing a leading-code specially encoded for composition, or a
1533          composition rule if composing with rule.  We must set C1
1534          to a normal leading-code or an ASCII code.  If we are not at
1535          a composed character, we must reset the composition state.  */
1536       if (COMPOSING_P (coding->composing))
1537         {
1538           if (c1 < 0xA0)
1539             {
1540               /* We are not in a composite character any longer.  */
1541               coding->composing = COMPOSING_NO;
1542               ENCODE_COMPOSITION_END;
1543             }
1544           else
1545             {
1546               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1547                 {
1548                   *dst++ = c1 & 0x7F;
1549                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1550                   continue;
1551                 }
1552               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1553                 coding->composing = COMPOSING_WITH_RULE_RULE;
1554               if (c1 == 0xA0)
1555                 {
1556                   /* This is an ASCII component.  */
1557                   ONE_MORE_BYTE (c1);
1558                   c1 &= 0x7F;
1559                 }
1560               else
1561                 /* This is a leading-code of non ASCII component.  */
1562                 c1 -= 0x20;
1563             }
1564         }
1565
1566       /* Now encode one character.  C1 is a control character, an
1567          ASCII character, or a leading-code of multi-byte character.  */
1568       switch (emacs_code_class[c1])
1569         {
1570         case EMACS_ascii_code:
1571           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1572           break;
1573
1574         case EMACS_control_code:
1575           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1576             ENCODE_RESET_PLANE_AND_REGISTER;
1577           *dst++ = c1;
1578           break;
1579
1580         case EMACS_carriage_return_code:
1581           if (!coding->selective)
1582             {
1583               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1584                 ENCODE_RESET_PLANE_AND_REGISTER;
1585               *dst++ = c1;
1586               break;
1587             }
1588           /* fall down to treat '\r' as '\n' ...  */
1589
1590         case EMACS_linefeed_code:
1591           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1592             ENCODE_RESET_PLANE_AND_REGISTER;
1593           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1594             bcopy (coding->spec.iso2022.initial_designation,
1595                    coding->spec.iso2022.current_designation,
1596                    sizeof coding->spec.iso2022.initial_designation);
1597           if (coding->eol_type == CODING_EOL_LF
1598               || coding->eol_type == CODING_EOL_UNDECIDED)
1599             *dst++ = ISO_CODE_LF;
1600           else if (coding->eol_type == CODING_EOL_CRLF)
1601             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1602           else
1603             *dst++ = ISO_CODE_CR;
1604           CODING_SPEC_ISO_BOL (coding) = 1;
1605           break;
1606
1607         case EMACS_leading_code_2:
1608           ONE_MORE_BYTE (c2);
1609           if (c2 < 0xA0)
1610             {
1611               /* invalid sequence */
1612               *dst++ = c1;
1613               *dst++ = c2;
1614             }
1615           else
1616             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1617           break;
1618
1619         case EMACS_leading_code_3:
1620           TWO_MORE_BYTES (c2, c3);
1621           if (c2 < 0xA0 || c3 < 0xA0)
1622             {
1623               /* invalid sequence */
1624               *dst++ = c1;
1625               *dst++ = c2;
1626               *dst++ = c3;
1627             }
1628           else if (c1 < LEADING_CODE_PRIVATE_11)
1629             ENCODE_ISO_CHARACTER (c1, c2, c3);
1630           else
1631             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1632           break;
1633
1634         case EMACS_leading_code_4:
1635           THREE_MORE_BYTES (c2, c3, c4);
1636           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1637             {
1638               /* invalid sequence */
1639               *dst++ = c1;
1640               *dst++ = c2;
1641               *dst++ = c3;
1642               *dst++ = c4;
1643             }
1644           else
1645             ENCODE_ISO_CHARACTER (c2, c3, c4);
1646           break;
1647
1648         case EMACS_leading_code_composition:
1649           ONE_MORE_BYTE (c2);
1650           if (c2 < 0xA0)
1651             {
1652               /* invalid sequence */
1653               *dst++ = c1;
1654               *dst++ = c2;
1655             }
1656           else if (c2 == 0xFF)
1657             {
1658               coding->composing = COMPOSING_WITH_RULE_HEAD;
1659               ENCODE_COMPOSITION_WITH_RULE_START;
1660             }
1661           else
1662             {
1663               /* Rewind one byte because it is a character code of
1664                  composition elements.  */
1665               src--;
1666               coding->composing = COMPOSING_NO_RULE_HEAD;
1667               ENCODE_COMPOSITION_NO_RULE_START;
1668             }
1669           break;
1670
1671         case EMACS_invalid_code:
1672           *dst++ = c1;
1673           break;
1674         }
1675       continue;
1676     label_end_of_loop:
1677       /* We reach here because the source date ends not at character
1678          boundary.  */
1679       coding->carryover_size = src_end - src_base;
1680       bcopy (src_base, coding->carryover, coding->carryover_size);
1681       src = src_end;
1682       break;
1683     }
1684
1685   /* If this is the last block of the text to be encoded, we must
1686      reset graphic planes and registers to the initial state.  */
1687   if (src >= src_end && coding->last_block)
1688     {
1689       ENCODE_RESET_PLANE_AND_REGISTER;
1690       if (coding->carryover_size > 0
1691           && coding->carryover_size < (dst_end - dst))
1692         {
1693           bcopy (coding->carryover, dst, coding->carryover_size);
1694           dst += coding->carryover_size;
1695           coding->carryover_size = 0;
1696         }
1697     }
1698   *consumed = src - source;
1699   return dst - destination;
1700 }
1701
1702 \f
1703 /*** 4. SJIS and BIG5 handlers ***/
1704
1705 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1706    quite widely.  So, for the moment, Emacs supports them in the bare
1707    C code.  But, in the future, they may be supported only by CCL.  */
1708
1709 /* SJIS is a coding system encoding three character sets: ASCII, right
1710    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1711    as is.  A character of charset katakana-jisx0201 is encoded by
1712    "position-code + 0x80".  A character of charset japanese-jisx0208
1713    is encoded in 2-byte but two position-codes are divided and shifted
1714    so that it fit in the range below.
1715
1716    --- CODE RANGE of SJIS ---
1717    (character set)      (range)
1718    ASCII                0x00 .. 0x7F
1719    KATAKANA-JISX0201    0xA0 .. 0xDF
1720    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1721             (2nd byte)  0x40 .. 0xFF
1722    -------------------------------
1723
1724 */
1725
1726 /* BIG5 is a coding system encoding two character sets: ASCII and
1727    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1728    character set and is encoded in two-byte.
1729
1730    --- CODE RANGE of BIG5 ---
1731    (character set)      (range)
1732    ASCII                0x00 .. 0x7F
1733    Big5 (1st byte)      0xA1 .. 0xFE
1734         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1735    --------------------------
1736
1737    Since the number of characters in Big5 is larger than maximum
1738    characters in Emacs' charset (96x96), it can't be handled as one
1739    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1740    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1741    contains frequently used characters and the latter contains less
1742    frequently used characters.  */
1743
1744 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1745    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1746    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1747    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1748
1749 /* Number of Big5 characters which have the same code in 1st byte.  */
1750 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1751
1752 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1753   do {                                                                  \
1754     unsigned int temp                                                   \
1755       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1756     if (b1 < 0xC9)                                                      \
1757       charset = charset_big5_1;                                         \
1758     else                                                                \
1759       {                                                                 \
1760         charset = charset_big5_2;                                       \
1761         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1762       }                                                                 \
1763     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1764     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1765   } while (0)
1766
1767 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1768   do {                                                                  \
1769     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1770     if (charset == charset_big5_2)                                      \
1771       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1772     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1773     b2 = temp % BIG5_SAME_ROW;                                          \
1774     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1775   } while (0)
1776
1777 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1778   do {                                                                  \
1779     int c_alt, charset_alt = (charset);                                 \
1780     if (!NILP (unification_table)                                       \
1781         && ((c_alt = unify_char (unification_table,                     \
1782                                  -1, (charset), c1, c2)) >= 0))         \
1783           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1784     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1785       DECODE_CHARACTER_ASCII (c1);                                      \
1786     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1787       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1788     else                                                                \
1789       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1790   } while (0)
1791
1792 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1793   do {                                                                    \
1794     int c_alt, charset_alt;                                               \
1795     if (!NILP (unification_table)                                         \
1796         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1797             >= 0))                                                        \
1798       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1799     else                                                                  \
1800       charset_alt = charset;                                              \
1801     if (charset_alt == charset_ascii)                                     \
1802       *dst++ = c1;                                                        \
1803     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1804       {                                                                   \
1805         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1806           *dst++ = c1;                                                    \
1807         else                                                              \
1808           *dst++ = charset_alt, *dst++ = c1;                              \
1809       }                                                                   \
1810     else                                                                  \
1811       {                                                                   \
1812         c1 &= 0x7F, c2 &= 0x7F;                                           \
1813         if (sjis_p && charset_alt == charset_jisx0208)                    \
1814           {                                                               \
1815             unsigned char s1, s2;                                         \
1816                                                                           \
1817             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1818             *dst++ = s1, *dst++ = s2;                                     \
1819           }                                                               \
1820         else if (!sjis_p                                                  \
1821                  && (charset_alt == charset_big5_1                        \
1822                      || charset_alt == charset_big5_2))                   \
1823           {                                                               \
1824             unsigned char b1, b2;                                         \
1825                                                                           \
1826             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1827             *dst++ = b1, *dst++ = b2;                                     \
1828           }                                                               \
1829         else                                                              \
1830           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1831       }                                                                   \
1832   } while (0);
1833
1834 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1835    Check if a text is encoded in SJIS.  If it is, return
1836    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1837
1838 int
1839 detect_coding_sjis (src, src_end)
1840      unsigned char *src, *src_end;
1841 {
1842   unsigned char c;
1843
1844   while (src < src_end)
1845     {
1846       c = *src++;
1847       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1848         return 0;
1849       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1850         {
1851           if (src < src_end && *src++ < 0x40)
1852             return 0;
1853         }
1854     }
1855   return CODING_CATEGORY_MASK_SJIS;
1856 }
1857
1858 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1859    Check if a text is encoded in BIG5.  If it is, return
1860    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1861
1862 int
1863 detect_coding_big5 (src, src_end)
1864      unsigned char *src, *src_end;
1865 {
1866   unsigned char c;
1867
1868   while (src < src_end)
1869     {
1870       c = *src++;
1871       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1872         return 0;
1873       if (c >= 0xA1)
1874         {
1875           if (src >= src_end)
1876             break;
1877           c = *src++;
1878           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1879             return 0;
1880         }
1881     }
1882   return CODING_CATEGORY_MASK_BIG5;
1883 }
1884
1885 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1886    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1887
1888 int
1889 decode_coding_sjis_big5 (coding, source, destination,
1890                          src_bytes, dst_bytes, consumed, sjis_p)
1891      struct coding_system *coding;
1892      unsigned char *source, *destination;
1893      int src_bytes, dst_bytes;
1894      int *consumed;
1895      int sjis_p;
1896 {
1897   unsigned char *src = source;
1898   unsigned char *src_end = source + src_bytes;
1899   unsigned char *dst = destination;
1900   unsigned char *dst_end = destination + dst_bytes;
1901   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1902      from DST_END to assure overflow checking is necessary only at the
1903      head of loop.  */
1904   unsigned char *adjusted_dst_end = dst_end - 3;
1905   Lisp_Object unification_table
1906       = coding->character_unification_table_for_decode;
1907
1908   if (!NILP (Venable_character_unification) && NILP (unification_table))
1909     unification_table = Vstandard_character_unification_table_for_decode;
1910
1911   while (src < src_end && dst < adjusted_dst_end)
1912     {
1913       /* SRC_BASE remembers the start position in source in each loop.
1914          The loop will be exited when there's not enough source text
1915          to analyze two-byte character (within macro ONE_MORE_BYTE).
1916          In that case, SRC is reset to SRC_BASE before exiting.  */
1917       unsigned char *src_base = src;
1918       unsigned char c1 = *src++, c2, c3, c4;
1919
1920       if (c1 == '\r')
1921         {
1922           if (coding->eol_type == CODING_EOL_CRLF)
1923             {
1924               ONE_MORE_BYTE (c2);
1925               if (c2 == '\n')
1926                 *dst++ = c2;
1927               else
1928                 /* To process C2 again, SRC is subtracted by 1.  */
1929                 *dst++ = c1, src--;
1930             }
1931           else
1932             *dst++ = c1;
1933         }
1934       else if (c1 < 0x20)
1935         *dst++ = c1;
1936       else if (c1 < 0x80)
1937         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1938       else if (c1 < 0xA0 || c1 >= 0xE0)
1939         {
1940           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1941           if (sjis_p)
1942             {
1943               ONE_MORE_BYTE (c2);
1944               DECODE_SJIS (c1, c2, c3, c4);
1945               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1946             }
1947           else if (c1 >= 0xE0 && c1 < 0xFF)
1948             {
1949               int charset;
1950
1951               ONE_MORE_BYTE (c2);
1952               DECODE_BIG5 (c1, c2, charset, c3, c4);
1953               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1954             }
1955           else                  /* Invalid code */
1956             *dst++ = c1;
1957         }
1958       else
1959         {
1960           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1961           if (sjis_p)
1962             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1963           else
1964             {
1965               int charset;
1966
1967               ONE_MORE_BYTE (c2);
1968               DECODE_BIG5 (c1, c2, charset, c3, c4);
1969               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1970             }
1971         }
1972       continue;
1973
1974     label_end_of_loop:
1975       coding->carryover_size = src - src_base;
1976       bcopy (src_base, coding->carryover, coding->carryover_size);
1977       src = src_base;
1978       break;
1979     }
1980
1981   *consumed = src - source;
1982   return dst - destination;
1983 }
1984
1985 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1986    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1987    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1988    sure that all these charsets are registered as official charset
1989    (i.e. do not have extended leading-codes).  Characters of other
1990    charsets are produced without any encoding.  If SJIS_P is 1, encode
1991    SJIS text, else encode BIG5 text.  */
1992
1993 int
1994 encode_coding_sjis_big5 (coding, source, destination,
1995                          src_bytes, dst_bytes, consumed, sjis_p)
1996      struct coding_system *coding;
1997      unsigned char *source, *destination;
1998      int src_bytes, dst_bytes;
1999      int *consumed;
2000      int sjis_p;
2001 {
2002   unsigned char *src = source;
2003   unsigned char *src_end = source + src_bytes;
2004   unsigned char *dst = destination;
2005   unsigned char *dst_end = destination + dst_bytes;
2006   /* Since the maximum bytes produced by each loop is 2, we subtract 1
2007      from DST_END to assure overflow checking is necessary only at the
2008      head of loop.  */
2009   unsigned char *adjusted_dst_end = dst_end - 1;
2010   Lisp_Object unification_table
2011       = coding->character_unification_table_for_encode;
2012
2013   if (!NILP (Venable_character_unification) && NILP (unification_table))
2014     unification_table = Vstandard_character_unification_table_for_encode;
2015
2016   while (src < src_end && dst < adjusted_dst_end)
2017     {
2018       /* SRC_BASE remembers the start position in source in each loop.
2019          The loop will be exited when there's not enough source text
2020          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2021          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2022          before exiting.  */
2023       unsigned char *src_base = src;
2024       unsigned char c1 = *src++, c2, c3, c4;
2025
2026       if (coding->composing)
2027         {
2028           if (c1 == 0xA0)
2029             {
2030               ONE_MORE_BYTE (c1);
2031               c1 &= 0x7F;
2032             }
2033           else if (c1 >= 0xA0)
2034             c1 -= 0x20;
2035           else
2036             coding->composing = 0;
2037         }
2038
2039       switch (emacs_code_class[c1])
2040         {
2041         case EMACS_ascii_code:
2042           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2043           break;
2044
2045         case EMACS_control_code:
2046           *dst++ = c1;
2047           break;
2048
2049         case EMACS_carriage_return_code:
2050           if (!coding->selective)
2051             {
2052               *dst++ = c1;
2053               break;
2054             }
2055           /* fall down to treat '\r' as '\n' ...  */
2056
2057         case EMACS_linefeed_code:
2058           if (coding->eol_type == CODING_EOL_LF
2059               || coding->eol_type == CODING_EOL_UNDECIDED)
2060             *dst++ = '\n';
2061           else if (coding->eol_type == CODING_EOL_CRLF)
2062             *dst++ = '\r', *dst++ = '\n';
2063           else
2064             *dst++ = '\r';
2065           break;
2066
2067         case EMACS_leading_code_2:
2068           ONE_MORE_BYTE (c2);
2069           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2070           break;
2071
2072         case EMACS_leading_code_3:
2073           TWO_MORE_BYTES (c2, c3);
2074           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2075           break;
2076
2077         case EMACS_leading_code_4:
2078           THREE_MORE_BYTES (c2, c3, c4);
2079           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2080           break;
2081
2082         case EMACS_leading_code_composition:
2083           coding->composing = 1;
2084           break;
2085
2086         default:                /* i.e. case EMACS_invalid_code: */
2087           *dst++ = c1;
2088         }
2089       continue;
2090
2091     label_end_of_loop:
2092       coding->carryover_size = src_end - src_base;
2093       bcopy (src_base, coding->carryover, coding->carryover_size);
2094       src = src_end;
2095       break;
2096     }
2097
2098   *consumed = src - source;
2099   return dst - destination;
2100 }
2101
2102 \f
2103 /*** 5. End-of-line handlers ***/
2104
2105 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2106    This function is called only when `coding->eol_type' is
2107    CODING_EOL_CRLF or CODING_EOL_CR.  */
2108
2109 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2110      struct coding_system *coding;
2111      unsigned char *source, *destination;
2112      int src_bytes, dst_bytes;
2113      int *consumed;
2114 {
2115   unsigned char *src = source;
2116   unsigned char *src_end = source + src_bytes;
2117   unsigned char *dst = destination;
2118   unsigned char *dst_end = destination + dst_bytes;
2119   int produced;
2120
2121   switch (coding->eol_type)
2122     {
2123     case CODING_EOL_CRLF:
2124       {
2125         /* Since the maximum bytes produced by each loop is 2, we
2126            subtract 1 from DST_END to assure overflow checking is
2127            necessary only at the head of loop.  */
2128         unsigned char *adjusted_dst_end = dst_end - 1;
2129
2130         while (src < src_end && dst < adjusted_dst_end)
2131           {
2132             unsigned char *src_base = src;
2133             unsigned char c = *src++;
2134             if (c == '\r')
2135               {
2136                 ONE_MORE_BYTE (c);
2137                 if (c != '\n')
2138                   *dst++ = '\r';
2139                 *dst++ = c;
2140               }
2141             else
2142               *dst++ = c;
2143             continue;
2144
2145           label_end_of_loop:
2146             coding->carryover_size = src - src_base;
2147             bcopy (src_base, coding->carryover, coding->carryover_size);
2148             src = src_base;
2149             break;
2150           }
2151         *consumed = src - source;
2152         produced = dst - destination;
2153         break;
2154       }
2155
2156     case CODING_EOL_CR:
2157       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2158       bcopy (source, destination, produced);
2159       dst_end = destination + produced;
2160       while (dst < dst_end)
2161         if (*dst++ == '\r') dst[-1] = '\n';
2162       *consumed = produced;
2163       break;
2164
2165     default:                    /* i.e. case: CODING_EOL_LF */
2166       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2167       bcopy (source, destination, produced);
2168       *consumed = produced;
2169       break;
2170     }
2171
2172   return produced;
2173 }
2174
2175 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2176    format of end-of-line according to `coding->eol_type'.  If
2177    `coding->selective' is 1, code '\r' in source text also means
2178    end-of-line.  */
2179
2180 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2181      struct coding_system *coding;
2182      unsigned char *source, *destination;
2183      int src_bytes, dst_bytes;
2184      int *consumed;
2185 {
2186   unsigned char *src = source;
2187   unsigned char *dst = destination;
2188   int produced;
2189
2190   if (src_bytes <= 0)
2191     return 0;
2192
2193   switch (coding->eol_type)
2194     {
2195     case CODING_EOL_LF:
2196     case CODING_EOL_UNDECIDED:
2197       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2198       bcopy (source, destination, produced);
2199       if (coding->selective)
2200         {
2201           int i = produced;
2202           while (i--)
2203             if (*dst++ == '\r') dst[-1] = '\n';
2204         }
2205       *consumed = produced;
2206
2207     case CODING_EOL_CRLF:
2208       {
2209         unsigned char c;
2210         unsigned char *src_end = source + src_bytes;
2211         unsigned char *dst_end = destination + dst_bytes;
2212         /* Since the maximum bytes produced by each loop is 2, we
2213            subtract 1 from DST_END to assure overflow checking is
2214            necessary only at the head of loop.  */
2215         unsigned char *adjusted_dst_end = dst_end - 1;
2216
2217         while (src < src_end && dst < adjusted_dst_end)
2218           {
2219             c = *src++;
2220             if (c == '\n' || (c == '\r' && coding->selective))
2221               *dst++ = '\r', *dst++ = '\n';
2222             else
2223               *dst++ = c;
2224           }
2225         produced = dst - destination;
2226         *consumed = src - source;
2227         break;
2228       }
2229
2230     default:                    /* i.e. case CODING_EOL_CR: */
2231       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2232       bcopy (source, destination, produced);
2233       {
2234         int i = produced;
2235         while (i--)
2236           if (*dst++ == '\n') dst[-1] = '\r';
2237       }
2238       *consumed = produced;
2239     }
2240
2241   return produced;
2242 }
2243
2244 \f
2245 /*** 6. C library functions ***/
2246
2247 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2248    has a property `coding-system'.  The value of this property is a
2249    vector of length 5 (called as coding-vector).  Among elements of
2250    this vector, the first (element[0]) and the fifth (element[4])
2251    carry important information for decoding/encoding.  Before
2252    decoding/encoding, this information should be set in fields of a
2253    structure of type `coding_system'.
2254
2255    A value of property `coding-system' can be a symbol of another
2256    subsidiary coding-system.  In that case, Emacs gets coding-vector
2257    from that symbol.
2258
2259    `element[0]' contains information to be set in `coding->type'.  The
2260    value and its meaning is as follows:
2261
2262    0 -- coding_type_emacs_mule
2263    1 -- coding_type_sjis
2264    2 -- coding_type_iso2022
2265    3 -- coding_type_big5
2266    4 -- coding_type_ccl encoder/decoder written in CCL
2267    nil -- coding_type_no_conversion
2268    t -- coding_type_undecided (automatic conversion on decoding,
2269                                no-conversion on encoding)
2270
2271    `element[4]' contains information to be set in `coding->flags' and
2272    `coding->spec'.  The meaning varies by `coding->type'.
2273
2274    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2275    of length 32 (of which the first 13 sub-elements are used now).
2276    Meanings of these sub-elements are:
2277
2278    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2279         If the value is an integer of valid charset, the charset is
2280         assumed to be designated to graphic register N initially.
2281
2282         If the value is minus, it is a minus value of charset which
2283         reserves graphic register N, which means that the charset is
2284         not designated initially but should be designated to graphic
2285         register N just before encoding a character in that charset.
2286
2287         If the value is nil, graphic register N is never used on
2288         encoding.
2289
2290    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2291         Each value takes t or nil.  See the section ISO2022 of
2292         `coding.h' for more information.
2293
2294    If `coding->type' is `coding_type_big5', element[4] is t to denote
2295    BIG5-ETen or nil to denote BIG5-HKU.
2296
2297    If `coding->type' takes the other value, element[4] is ignored.
2298
2299    Emacs Lisp's coding system also carries information about format of
2300    end-of-line in a value of property `eol-type'.  If the value is
2301    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2302    means CODING_EOL_CR.  If it is not integer, it should be a vector
2303    of subsidiary coding systems of which property `eol-type' has one
2304    of above values.
2305
2306 */
2307
2308 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2309    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2310    is setup so that no conversion is necessary and return -1, else
2311    return 0.  */
2312
2313 int
2314 setup_coding_system (coding_system, coding)
2315      Lisp_Object coding_system;
2316      struct coding_system *coding;
2317 {
2318   Lisp_Object coding_spec, plist, type, eol_type;
2319   Lisp_Object val;
2320   int i;
2321
2322   /* At first, set several fields to default values.  */
2323   coding->last_block = 0;
2324   coding->selective = 0;
2325   coding->composing = 0;
2326   coding->direction = 0;
2327   coding->carryover_size = 0;
2328   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2329   coding->character_unification_table_for_decode = Qnil;
2330   coding->character_unification_table_for_encode = Qnil;
2331
2332   Vlast_coding_system_used = coding->symbol = coding_system;
2333   eol_type = Qnil;
2334
2335   /* Get values of property `coding-system' and `eol-type'.
2336      Also get values of coding system properties:
2337      `post-read-conversion', `pre-write-conversion',
2338      `character-unification-table-for-decode',
2339      `character-unification-table-for-encode'.  */
2340   coding_spec = Fget (coding_system, Qcoding_system);
2341   if (!VECTORP (coding_spec)
2342       || XVECTOR (coding_spec)->size != 5
2343       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2344     goto label_invalid_coding_system;
2345   if (!inhibit_eol_conversion)
2346     eol_type = Fget (coding_system, Qeol_type);
2347
2348   plist = XVECTOR (coding_spec)->contents[3];
2349   coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2350   coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2351   val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2352   if (SYMBOLP (val))
2353     val = Fget (val, Qcharacter_unification_table_for_decode);
2354   coding->character_unification_table_for_decode
2355     = CHAR_TABLE_P (val) ? val : Qnil;
2356   val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2357   if (SYMBOLP (val))
2358     val = Fget (val, Qcharacter_unification_table_for_encode);
2359   coding->character_unification_table_for_encode
2360     = CHAR_TABLE_P (val) ? val : Qnil;
2361
2362   val = Fplist_get (plist, Qsafe_charsets);
2363   if (EQ (val, Qt))
2364     {
2365       for (i = 0; i <= MAX_CHARSET; i++)
2366         coding->safe_charsets[i] = 1;
2367     }
2368   else
2369     {
2370       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2371       while (CONSP (val))
2372         {
2373           if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2374             coding->safe_charsets[i] = 1;
2375           val = XCONS (val)->cdr;
2376         }
2377     }
2378
2379   if (VECTORP (eol_type))
2380     {
2381       coding->eol_type = CODING_EOL_UNDECIDED;
2382       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2383     }
2384   else if (XFASTINT (eol_type) == 1)
2385     {
2386       coding->eol_type = CODING_EOL_CRLF;
2387       coding->common_flags
2388         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2389     }
2390   else if (XFASTINT (eol_type) == 2)
2391     {
2392       coding->eol_type = CODING_EOL_CR;
2393       coding->common_flags
2394         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2395     }
2396   else
2397     {
2398       coding->eol_type = CODING_EOL_LF;
2399       coding->common_flags = 0;
2400     }
2401
2402   type = XVECTOR (coding_spec)->contents[0];
2403   switch (XFASTINT (type))
2404     {
2405     case 0:
2406       coding->type = coding_type_emacs_mule;
2407       if (!NILP (coding->post_read_conversion))
2408         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2409       if (!NILP (coding->pre_write_conversion))
2410         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2411       break;
2412
2413     case 1:
2414       coding->type = coding_type_sjis;
2415       coding->common_flags
2416         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2417       break;
2418
2419     case 2:
2420       coding->type = coding_type_iso2022;
2421       coding->common_flags
2422         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2423       {
2424         Lisp_Object val, temp;
2425         Lisp_Object *flags;
2426         int i, charset, default_reg_bits = 0;
2427
2428         val = XVECTOR (coding_spec)->contents[4];
2429
2430         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2431           goto label_invalid_coding_system;
2432
2433         flags = XVECTOR (val)->contents;
2434         coding->flags
2435           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2436              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2437              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2438              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2439              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2440              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2441              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2442              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2443              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2444              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2445              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2446              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2447              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2448              );
2449
2450         /* Invoke graphic register 0 to plane 0.  */
2451         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2452         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2453         CODING_SPEC_ISO_INVOCATION (coding, 1)
2454           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2455         /* Not single shifting at first.  */
2456         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2457         /* Beginning of buffer should also be regarded as bol. */
2458         CODING_SPEC_ISO_BOL (coding) = 1;
2459
2460         for (charset = 0; charset <= MAX_CHARSET; charset++)
2461           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2462         val = Vcharset_revision_alist;
2463         while (CONSP (val))
2464           {
2465             charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2466             if (charset >= 0
2467                 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2468                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2469               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2470             val = XCONS (val)->cdr;
2471           }
2472
2473         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2474            FLAGS[REG] can be one of below:
2475                 integer CHARSET: CHARSET occupies register I,
2476                 t: designate nothing to REG initially, but can be used
2477                   by any charsets,
2478                 list of integer, nil, or t: designate the first
2479                   element (if integer) to REG initially, the remaining
2480                   elements (if integer) is designated to REG on request,
2481                   if an element is t, REG can be used by any charset,
2482                 nil: REG is never used.  */
2483         for (charset = 0; charset <= MAX_CHARSET; charset++)
2484           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2485             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2486         for (i = 0; i < 4; i++)
2487           {
2488             if (INTEGERP (flags[i])
2489                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2490                 || (charset = get_charset_id (flags[i])) >= 0)
2491               {
2492                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2493                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2494               }
2495             else if (EQ (flags[i], Qt))
2496               {
2497                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2498                 default_reg_bits |= 1 << i;
2499               }
2500             else if (CONSP (flags[i]))
2501               {
2502                 Lisp_Object tail = flags[i];
2503
2504                 if (INTEGERP (XCONS (tail)->car)
2505                     && (charset = XINT (XCONS (tail)->car),
2506                         CHARSET_VALID_P (charset))
2507                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2508                   {
2509                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2510                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2511                   }
2512                 else
2513                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2514                 tail = XCONS (tail)->cdr;
2515                 while (CONSP (tail))
2516                   {
2517                     if (INTEGERP (XCONS (tail)->car)
2518                         && (charset = XINT (XCONS (tail)->car),
2519                             CHARSET_VALID_P (charset))
2520                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2521                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2522                         = i;
2523                     else if (EQ (XCONS (tail)->car, Qt))
2524                       default_reg_bits |= 1 << i;
2525                     tail = XCONS (tail)->cdr;
2526                   }
2527               }
2528             else
2529               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2530
2531             CODING_SPEC_ISO_DESIGNATION (coding, i)
2532               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2533           }
2534
2535         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2536           {
2537             /* REG 1 can be used only by locking shift in 7-bit env.  */
2538             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2539               default_reg_bits &= ~2;
2540             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2541               /* Without any shifting, only REG 0 and 1 can be used.  */
2542               default_reg_bits &= 3;
2543           }
2544
2545         for (charset = 0; charset <= MAX_CHARSET; charset++)
2546           if (CHARSET_VALID_P (charset)
2547               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2548                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2549             {
2550               /* We have not yet decided where to designate CHARSET.  */
2551               int reg_bits = default_reg_bits;
2552
2553               if (CHARSET_CHARS (charset) == 96)
2554                 /* A charset of CHARS96 can't be designated to REG 0.  */
2555                 reg_bits &= ~1;
2556
2557               if (reg_bits)
2558                 /* There exist some default graphic register.  */
2559                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2560                   = (reg_bits & 1
2561                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2562               else
2563                 /* We anyway have to designate CHARSET to somewhere.  */
2564                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2565                   = (CHARSET_CHARS (charset) == 94
2566                      ? 0
2567                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2568                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2569                         ? 1
2570                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2571                            ? 2 : 0)));
2572             }
2573       }
2574       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2575       break;
2576
2577     case 3:
2578       coding->type = coding_type_big5;
2579       coding->common_flags
2580         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2581       coding->flags
2582         = (NILP (XVECTOR (coding_spec)->contents[4])
2583            ? CODING_FLAG_BIG5_HKU
2584            : CODING_FLAG_BIG5_ETEN);
2585       break;
2586
2587     case 4:
2588       coding->type = coding_type_ccl;
2589       coding->common_flags
2590         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2591       {
2592         Lisp_Object val = XVECTOR (coding_spec)->contents[4];
2593         if (CONSP  (val)
2594             && VECTORP (XCONS (val)->car)
2595             && VECTORP (XCONS (val)->cdr))
2596           {
2597             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2598             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2599           }
2600         else
2601           goto label_invalid_coding_system;
2602       }
2603       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2604       break;
2605
2606     case 5:
2607       coding->type = coding_type_raw_text;
2608       break;
2609
2610     default:
2611       if (EQ (type, Qt))
2612         {
2613           coding->type = coding_type_undecided;
2614           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2615         }
2616       else
2617         coding->type = coding_type_no_conversion;
2618       break;
2619     }
2620   return 0;
2621
2622  label_invalid_coding_system:
2623   coding->type = coding_type_no_conversion;
2624   coding->common_flags = 0;
2625   coding->eol_type = CODING_EOL_LF;
2626   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2627     = Qnil;
2628   return -1;
2629 }
2630
2631 /* Emacs has a mechanism to automatically detect a coding system if it
2632    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2633    it's impossible to distinguish some coding systems accurately
2634    because they use the same range of codes.  So, at first, coding
2635    systems are categorized into 7, those are:
2636
2637    o coding-category-emacs-mule
2638
2639         The category for a coding system which has the same code range
2640         as Emacs' internal format.  Assigned the coding-system (Lisp
2641         symbol) `emacs-mule' by default.
2642
2643    o coding-category-sjis
2644
2645         The category for a coding system which has the same code range
2646         as SJIS.  Assigned the coding-system (Lisp
2647         symbol) `japanese-shift-jis' by default.
2648
2649    o coding-category-iso-7
2650
2651         The category for a coding system which has the same code range
2652         as ISO2022 of 7-bit environment.  This doesn't use any locking
2653         shift and single shift functions.  Assigned the coding-system
2654         (Lisp symbol) `iso-2022-7bit' by default.
2655
2656    o coding-category-iso-8-1
2657
2658         The category for a coding system which has the same code range
2659         as ISO2022 of 8-bit environment and graphic plane 1 used only
2660         for DIMENSION1 charset.  This doesn't use any locking shift
2661         and single shift functions.  Assigned the coding-system (Lisp
2662         symbol) `iso-latin-1' by default.
2663
2664    o coding-category-iso-8-2
2665
2666         The category for a coding system which has the same code range
2667         as ISO2022 of 8-bit environment and graphic plane 1 used only
2668         for DIMENSION2 charset.  This doesn't use any locking shift
2669         and single shift functions.  Assigned the coding-system (Lisp
2670         symbol) `japanese-iso-8bit' by default.
2671
2672    o coding-category-iso-7-else
2673
2674         The category for a coding system which has the same code range
2675         as ISO2022 of 7-bit environemnt but uses locking shift or
2676         single shift functions.  Assigned the coding-system (Lisp
2677         symbol) `iso-2022-7bit-lock' by default.
2678
2679    o coding-category-iso-8-else
2680
2681         The category for a coding system which has the same code range
2682         as ISO2022 of 8-bit environemnt but uses locking shift or
2683         single shift functions.  Assigned the coding-system (Lisp
2684         symbol) `iso-2022-8bit-ss2' by default.
2685
2686    o coding-category-big5
2687
2688         The category for a coding system which has the same code range
2689         as BIG5.  Assigned the coding-system (Lisp symbol)
2690         `cn-big5' by default.
2691
2692    o coding-category-binary
2693
2694         The category for a coding system not categorized in any of the
2695         above.  Assigned the coding-system (Lisp symbol)
2696         `no-conversion' by default.
2697
2698    Each of them is a Lisp symbol and the value is an actual
2699    `coding-system's (this is also a Lisp symbol) assigned by a user.
2700    What Emacs does actually is to detect a category of coding system.
2701    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2702    decide only one possible category, it selects a category of the
2703    highest priority.  Priorities of categories are also specified by a
2704    user in a Lisp variable `coding-category-list'.
2705
2706 */
2707
2708 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2709    If it detects possible coding systems, return an integer in which
2710    appropriate flag bits are set.  Flag bits are defined by macros
2711    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2712
2713 int
2714 detect_coding_mask (src, src_bytes)
2715      unsigned char *src;
2716      int src_bytes;
2717 {
2718   register unsigned char c;
2719   unsigned char *src_end = src + src_bytes;
2720   int mask;
2721
2722   /* At first, skip all ASCII characters and control characters except
2723      for three ISO2022 specific control characters.  */
2724  label_loop_detect_coding:
2725   while (src < src_end)
2726     {
2727       c = *src;
2728       if (c >= 0x80
2729           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2730         break;
2731       src++;
2732     }
2733
2734   if (src >= src_end)
2735     /* We found nothing other than ASCII.  There's nothing to do.  */
2736     return CODING_CATEGORY_MASK_ANY;
2737
2738   /* The text seems to be encoded in some multilingual coding system.
2739      Now, try to find in which coding system the text is encoded.  */
2740   if (c < 0x80)
2741     {
2742       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2743       /* C is an ISO2022 specific control code of C0.  */
2744       mask = detect_coding_iso2022 (src, src_end);
2745       src++;
2746       if (mask == 0)
2747         /* No valid ISO2022 code follows C.  Try again.  */
2748         goto label_loop_detect_coding;
2749       mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2750     }
2751   else if (c < 0xA0)
2752     {
2753       /* If C is a special latin extra code,
2754          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2755          or is an ISO2022 control-sequence-introducer (CSI),
2756          we should also consider the possibility of ISO2022 codings.  */
2757       if ((VECTORP (Vlatin_extra_code_table)
2758            && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2759           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2760           || (c == ISO_CODE_CSI
2761               && (src < src_end
2762                   && (*src == ']'
2763                       || (src + 1 < src_end
2764                           && src[1] == ']'
2765                           && (*src == '0' || *src == '1' || *src == '2'))))))
2766         mask = (detect_coding_iso2022 (src, src_end)
2767                 | detect_coding_sjis (src, src_end)
2768                 | detect_coding_emacs_mule (src, src_end)
2769                 | CODING_CATEGORY_MASK_RAW_TEXT);
2770
2771       else
2772         /* C is the first byte of SJIS character code,
2773            or a leading-code of Emacs' internal format (emacs-mule).  */
2774         mask = (detect_coding_sjis (src, src_end)
2775                 | detect_coding_emacs_mule (src, src_end)
2776                 | CODING_CATEGORY_MASK_RAW_TEXT);
2777     }
2778   else
2779     /* C is a character of ISO2022 in graphic plane right,
2780        or a SJIS's 1-byte character code (i.e. JISX0201),
2781        or the first byte of BIG5's 2-byte code.  */
2782     mask = (detect_coding_iso2022 (src, src_end)
2783             | detect_coding_sjis (src, src_end)
2784             | detect_coding_big5 (src, src_end)
2785             | CODING_CATEGORY_MASK_RAW_TEXT);
2786
2787   return mask;
2788 }
2789
2790 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2791    The information of the detected coding system is set in CODING.  */
2792
2793 void
2794 detect_coding (coding, src, src_bytes)
2795      struct coding_system *coding;
2796      unsigned char *src;
2797      int src_bytes;
2798 {
2799   int mask = detect_coding_mask (src, src_bytes);
2800   int idx;
2801   Lisp_Object val = Vcoding_category_list;
2802
2803   if (mask == CODING_CATEGORY_MASK_ANY)
2804     /* We found nothing other than ASCII.  There's nothing to do.  */
2805     return;
2806
2807   /* We found some plausible coding systems.  Let's use a coding
2808      system of the highest priority.  */
2809
2810   if (CONSP (val))
2811     while (!NILP (val))
2812       {
2813         idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2814         if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2815           break;
2816         val = XCONS (val)->cdr;
2817       }
2818   else
2819     val = Qnil;
2820
2821   if (NILP (val))
2822     {
2823       /* For unknown reason, `Vcoding_category_list' contains none of
2824          found categories.  Let's use any of them.  */
2825       for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2826         if (mask & (1 << idx))
2827           break;
2828     }
2829   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2830 }
2831
2832 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2833    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2834    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2835
2836 #define MAX_EOL_CHECK_COUNT 3
2837
2838 int
2839 detect_eol_type (src, src_bytes)
2840      unsigned char *src;
2841      int src_bytes;
2842 {
2843   unsigned char *src_end = src + src_bytes;
2844   unsigned char c;
2845   int total = 0;                /* How many end-of-lines are found so far.  */
2846   int eol_type = CODING_EOL_UNDECIDED;
2847   int this_eol_type;
2848
2849   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2850     {
2851       c = *src++;
2852       if (c == '\n' || c == '\r')
2853         {
2854           total++;
2855           if (c == '\n')
2856             this_eol_type = CODING_EOL_LF;
2857           else if (src >= src_end || *src != '\n')
2858             this_eol_type = CODING_EOL_CR;
2859           else
2860             this_eol_type = CODING_EOL_CRLF, src++;
2861
2862           if (eol_type == CODING_EOL_UNDECIDED)
2863             /* This is the first end-of-line.  */
2864             eol_type = this_eol_type;
2865           else if (eol_type != this_eol_type)
2866             /* The found type is different from what found before.
2867                Let's notice the caller about this inconsistency.  */
2868             return CODING_EOL_INCONSISTENT;
2869         }
2870     }
2871
2872   return eol_type;
2873 }
2874
2875 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2876    is encoded.  If it detects an appropriate format of end-of-line, it
2877    sets the information in *CODING.  */
2878
2879 void
2880 detect_eol (coding, src, src_bytes)
2881      struct coding_system *coding;
2882      unsigned char *src;
2883      int src_bytes;
2884 {
2885   Lisp_Object val;
2886   int eol_type = detect_eol_type (src, src_bytes);
2887
2888   if (eol_type == CODING_EOL_UNDECIDED)
2889     /*  We found no end-of-line in the source text.  */
2890     return;
2891
2892   if (eol_type == CODING_EOL_INCONSISTENT)
2893     {
2894 #if 0
2895       /* This code is suppressed until we find a better way to
2896          distinguish raw text file and binary file.  */
2897
2898       /* If we have already detected that the coding is raw-text, the
2899          coding should actually be no-conversion.  */
2900       if (coding->type == coding_type_raw_text)
2901         {
2902           setup_coding_system (Qno_conversion, coding);
2903           return;
2904         }
2905       /* Else, let's decode only text code anyway.  */
2906 #endif /* 0 */
2907       eol_type = CODING_EOL_LF;
2908     }
2909
2910   val = Fget (coding->symbol, Qeol_type);
2911   if (VECTORP (val) && XVECTOR (val)->size == 3)
2912     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2913 }
2914
2915 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2916    decoding, it may detect coding system and format of end-of-line if
2917    those are not yet decided.  */
2918
2919 int
2920 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2921      struct coding_system *coding;
2922      unsigned char *source, *destination;
2923      int src_bytes, dst_bytes;
2924      int *consumed;
2925 {
2926   int produced;
2927
2928   if (src_bytes <= 0)
2929     {
2930       *consumed = 0;
2931       return 0;
2932     }
2933
2934   if (coding->type == coding_type_undecided)
2935     detect_coding (coding, source, src_bytes);
2936
2937   if (coding->eol_type == CODING_EOL_UNDECIDED)
2938     detect_eol (coding, source, src_bytes);
2939
2940   coding->carryover_size = 0;
2941   switch (coding->type)
2942     {
2943     case coding_type_no_conversion:
2944     label_no_conversion:
2945       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2946       bcopy (source, destination, produced);
2947       *consumed = produced;
2948       break;
2949
2950     case coding_type_emacs_mule:
2951     case coding_type_undecided:
2952     case coding_type_raw_text:
2953       if (coding->eol_type == CODING_EOL_LF
2954           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2955         goto label_no_conversion;
2956       produced = decode_eol (coding, source, destination,
2957                              src_bytes, dst_bytes, consumed);
2958       break;
2959
2960     case coding_type_sjis:
2961       produced = decode_coding_sjis_big5 (coding, source, destination,
2962                                           src_bytes, dst_bytes, consumed,
2963                                           1);
2964       break;
2965
2966     case coding_type_iso2022:
2967       produced = decode_coding_iso2022 (coding, source, destination,
2968                                         src_bytes, dst_bytes, consumed);
2969       break;
2970
2971     case coding_type_big5:
2972       produced = decode_coding_sjis_big5 (coding, source, destination,
2973                                           src_bytes, dst_bytes, consumed,
2974                                           0);
2975       break;
2976
2977     case coding_type_ccl:
2978       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2979                              src_bytes, dst_bytes, consumed);
2980       break;
2981     }
2982
2983   return produced;
2984 }
2985
2986 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2987
2988 int
2989 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2990      struct coding_system *coding;
2991      unsigned char *source, *destination;
2992      int src_bytes, dst_bytes;
2993      int *consumed;
2994 {
2995   int produced;
2996
2997   switch (coding->type)
2998     {
2999     case coding_type_no_conversion:
3000     label_no_conversion:
3001       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
3002       if (produced > 0)
3003         {
3004           bcopy (source, destination, produced);
3005           if (coding->selective)
3006             {
3007               unsigned char *p = destination, *pend = destination + produced;
3008               while (p < pend)
3009                 if (*p++ == '\015') p[-1] = '\n';
3010             }
3011         }
3012       *consumed = produced;
3013       break;
3014
3015     case coding_type_emacs_mule:
3016     case coding_type_undecided:
3017     case coding_type_raw_text:
3018       if (coding->eol_type == CODING_EOL_LF
3019           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3020         goto label_no_conversion;
3021       produced = encode_eol (coding, source, destination,
3022                              src_bytes, dst_bytes, consumed);
3023       break;
3024
3025     case coding_type_sjis:
3026       produced = encode_coding_sjis_big5 (coding, source, destination,
3027                                           src_bytes, dst_bytes, consumed,
3028                                           1);
3029       break;
3030
3031     case coding_type_iso2022:
3032       produced = encode_coding_iso2022 (coding, source, destination,
3033                                         src_bytes, dst_bytes, consumed);
3034       break;
3035
3036     case coding_type_big5:
3037       produced = encode_coding_sjis_big5 (coding, source, destination,
3038                                           src_bytes, dst_bytes, consumed,
3039                                           0);
3040       break;
3041
3042     case coding_type_ccl:
3043       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
3044                              src_bytes, dst_bytes, consumed);
3045       break;
3046     }
3047
3048   return produced;
3049 }
3050
3051 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3052
3053 /* Return maximum size (bytes) of a buffer enough for decoding
3054    SRC_BYTES of text encoded in CODING.  */
3055
3056 int
3057 decoding_buffer_size (coding, src_bytes)
3058      struct coding_system *coding;
3059      int src_bytes;
3060 {
3061   int magnification;
3062
3063   if (coding->type == coding_type_iso2022)
3064     magnification = 3;
3065   else if (coding->type == coding_type_ccl)
3066     magnification = coding->spec.ccl.decoder.buf_magnification;
3067   else
3068     magnification = 2;
3069
3070   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3071 }
3072
3073 /* Return maximum size (bytes) of a buffer enough for encoding
3074    SRC_BYTES of text to CODING.  */
3075
3076 int
3077 encoding_buffer_size (coding, src_bytes)
3078      struct coding_system *coding;
3079      int src_bytes;
3080 {
3081   int magnification;
3082
3083   if (coding->type == coding_type_ccl)
3084     magnification = coding->spec.ccl.encoder.buf_magnification;
3085   else
3086     magnification = 3;
3087
3088   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3089 }
3090
3091 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3092 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3093 #endif
3094
3095 char *conversion_buffer;
3096 int conversion_buffer_size;
3097
3098 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3099    or decoding.  Sufficient memory is allocated automatically.  If we
3100    run out of memory, return NULL.  */
3101
3102 char *
3103 get_conversion_buffer (size)
3104      int size;
3105 {
3106   if (size > conversion_buffer_size)
3107     {
3108       char *buf;
3109       int real_size = conversion_buffer_size * 2;
3110
3111       while (real_size < size) real_size *= 2;
3112       buf = (char *) xmalloc (real_size);
3113       xfree (conversion_buffer);
3114       conversion_buffer = buf;
3115       conversion_buffer_size = real_size;
3116     }
3117   return conversion_buffer;
3118 }
3119
3120 \f
3121 #ifdef emacs
3122 /*** 7. Emacs Lisp library functions ***/
3123
3124 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3125   "Return t if OBJECT is nil or a coding-system.\n\
3126 See document of make-coding-system for coding-system object.")
3127   (obj)
3128      Lisp_Object obj;
3129 {
3130   if (NILP (obj))
3131     return Qt;
3132   if (!SYMBOLP (obj))
3133     return Qnil;
3134   /* Get coding-spec vector for OBJ.  */
3135   obj = Fget (obj, Qcoding_system);
3136   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
3137           ? Qt : Qnil);
3138 }
3139
3140 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3141        Sread_non_nil_coding_system, 1, 1, 0,
3142   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3143   (prompt)
3144      Lisp_Object prompt;
3145 {
3146   Lisp_Object val;
3147   do
3148     {
3149       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
3150                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
3151     }
3152   while (XSTRING (val)->size == 0);
3153   return (Fintern (val, Qnil));
3154 }
3155
3156 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3157   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3158 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3159   (prompt, default_coding_system)
3160      Lisp_Object prompt, default_coding_system;
3161 {
3162   Lisp_Object val;
3163   if (SYMBOLP (default_coding_system))
3164     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
3165   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
3166                           Qt, Qnil, Qcoding_system_history,
3167                           default_coding_system, Qnil);
3168   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3169 }
3170
3171 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3172        1, 1, 0,
3173   "Check validity of CODING-SYSTEM.\n\
3174 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3175 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3176 The value of property should be a vector of length 5.")
3177   (coding_system)
3178      Lisp_Object coding_system;
3179 {
3180   CHECK_SYMBOL (coding_system, 0);
3181   if (!NILP (Fcoding_system_p (coding_system)))
3182     return coding_system;
3183   while (1)
3184     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3185 }
3186
3187 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3188        2, 2, 0,
3189   "Detect coding system of the text in the region between START and END.\n\
3190 Return a list of possible coding systems ordered by priority.\n\
3191 If only ASCII characters are found, it returns `undecided'\n\
3192  or its subsidiary coding system according to a detected end-of-line format.")
3193   (b, e)
3194      Lisp_Object b, e;
3195 {
3196   int coding_mask, eol_type;
3197   Lisp_Object val;
3198   int beg, end;
3199
3200   validate_region (&b, &e);
3201   beg = XINT (b), end = XINT (e);
3202   if (beg < GPT && end >= GPT) move_gap (end);
3203
3204   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3205   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3206
3207   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3208     {
3209       val = Qundecided;
3210       if (eol_type != CODING_EOL_UNDECIDED
3211           && eol_type != CODING_EOL_INCONSISTENT)
3212         {
3213           Lisp_Object val2;
3214           val2 = Fget (Qundecided, Qeol_type);
3215           if (VECTORP (val2))
3216             val = XVECTOR (val2)->contents[eol_type];
3217         }
3218     }
3219   else
3220     {
3221       Lisp_Object val2;
3222
3223       /* At first, gather possible coding-systems in VAL in a reverse
3224          order.  */
3225       val = Qnil;
3226       for (val2 = Vcoding_category_list;
3227            !NILP (val2);
3228            val2 = XCONS (val2)->cdr)
3229         {
3230           int idx
3231             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3232           if (coding_mask & (1 << idx))
3233             {
3234 #if 0
3235               /* This code is suppressed until we find a better way to
3236                  distinguish raw text file and binary file.  */
3237
3238               if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3239                   && eol_type == CODING_EOL_INCONSISTENT)
3240                 val = Fcons (Qno_conversion, val);
3241               else
3242 #endif /* 0 */
3243                 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3244             }
3245         }
3246
3247       /* Then, change the order of the list, while getting subsidiary
3248          coding-systems.  */
3249       val2 = val;
3250       val = Qnil;
3251       if (eol_type == CODING_EOL_INCONSISTENT)
3252         eol_type == CODING_EOL_UNDECIDED;
3253       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3254         {
3255           if (eol_type == CODING_EOL_UNDECIDED)
3256             val = Fcons (XCONS (val2)->car, val);
3257           else
3258             {
3259               Lisp_Object val3;
3260               val3 = Fget (XCONS (val2)->car, Qeol_type);
3261               if (VECTORP (val3))
3262                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3263               else
3264                 val = Fcons (XCONS (val2)->car, val);
3265             }
3266         }
3267     }
3268
3269   return val;
3270 }
3271
3272 /* Scan text in the region between *BEGP and *ENDP, skip characters
3273    which we never have to encode to (iff ENCODEP is 1) or decode from
3274    coding system CODING at the head and tail, then set BEGP and ENDP
3275    to the addresses of start and end of the text we actually convert.  */
3276
3277 void
3278 shrink_conversion_area (begp, endp, coding, encodep)
3279      unsigned char **begp, **endp;
3280      struct coding_system *coding;
3281      int encodep;
3282 {
3283   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3284
3285   if (coding->eol_type != CODING_EOL_LF
3286       && coding->eol_type != CODING_EOL_UNDECIDED)
3287     /* Since we anyway have to convert end-of-line format, it is not
3288        worth skipping at most 100 bytes or so.  */
3289     return;
3290
3291   if (encodep)                  /* for encoding */
3292     {
3293       switch (coding->type)
3294         {
3295         case coding_type_no_conversion:
3296         case coding_type_emacs_mule:
3297         case coding_type_undecided:
3298         case coding_type_raw_text:
3299           /* We need no conversion.  */
3300           *begp = *endp;
3301           return;
3302         case coding_type_ccl:
3303           /* We can't skip any data.  */
3304           return;
3305         case coding_type_iso2022:
3306           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3307             {
3308               unsigned char *bol = beg_addr;
3309               while (beg_addr < end_addr && *beg_addr < 0x80)
3310                 {
3311                   beg_addr++;
3312                   if (*(beg_addr - 1) == '\n')
3313                     bol = beg_addr;
3314                 }
3315               beg_addr = bol;
3316               goto label_skip_tail;
3317             }
3318           /* fall down ... */
3319         default:
3320           /* We can skip all ASCII characters at the head and tail.  */
3321           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3322         label_skip_tail:
3323           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3324           break;
3325         }
3326     }
3327   else                          /* for decoding */
3328     {
3329       switch (coding->type)
3330         {
3331         case coding_type_no_conversion:
3332           /* We need no conversion.  */
3333           *begp = *endp;
3334           return;
3335         case coding_type_emacs_mule:
3336         case coding_type_raw_text:
3337           if (coding->eol_type == CODING_EOL_LF)
3338             {
3339               /* We need no conversion.  */
3340               *begp = *endp;
3341               return;
3342             }
3343           /* We can skip all but carriage-return.  */
3344           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3345           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3346           break;
3347         case coding_type_sjis:
3348         case coding_type_big5:
3349           /* We can skip all ASCII characters at the head.  */
3350           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3351           /* We can skip all ASCII characters at the tail except for
3352              the second byte of SJIS or BIG5 code.  */
3353           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3354           if (end_addr != *endp)
3355             end_addr++;
3356           break;
3357         case coding_type_ccl:
3358           /* We can't skip any data.  */
3359           return;
3360         default:                /* i.e. case coding_type_iso2022: */
3361           {
3362             unsigned char c;
3363
3364             /* We can skip all ASCII characters except for a few
3365                control codes at the head.  */
3366             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3367                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3368                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3369               beg_addr++;
3370           }
3371           break;
3372         }
3373     }
3374   *begp = beg_addr;
3375   *endp = end_addr;
3376   return;
3377 }
3378
3379 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3380    text between B and E.  B and E are buffer position.  */
3381
3382 Lisp_Object
3383 code_convert_region (b, e, coding, encodep)
3384      Lisp_Object b, e;
3385      struct coding_system *coding;
3386      int encodep;
3387 {
3388   int beg, end, len, consumed, produced;
3389   char *buf;
3390   unsigned char *begp, *endp;
3391   int pos = PT;
3392
3393   validate_region (&b, &e);
3394   beg = XINT (b), end = XINT (e);
3395   if (beg < GPT && end >= GPT)
3396     move_gap (end);
3397
3398   if (encodep && !NILP (coding->pre_write_conversion))
3399     {
3400       /* We must call a pre-conversion function which may put a new
3401          text to be converted in a new buffer.  */
3402       struct buffer *old = current_buffer, *new;
3403
3404       TEMP_SET_PT (beg);
3405       call2 (coding->pre_write_conversion, b, e);
3406       if (old != current_buffer)
3407         {
3408           /* Replace the original text by the text just generated.  */
3409           len = ZV - BEGV;
3410           new = current_buffer;
3411           set_buffer_internal (old);
3412           del_range (beg, end);
3413           insert_from_buffer (new, 1, len, 0);
3414           end = beg + len;
3415         }
3416     }
3417
3418   /* We may be able to shrink the conversion region.  */
3419   begp = POS_ADDR (beg); endp = begp + (end - beg);
3420   shrink_conversion_area (&begp, &endp, coding, encodep);
3421
3422   if (begp == endp)
3423     /* We need no conversion.  */
3424     len = end - beg;
3425   else
3426     {
3427       beg += begp - POS_ADDR (beg);
3428       end =  beg + (endp - begp);
3429
3430       if (encodep)
3431         len = encoding_buffer_size (coding, end - beg);
3432       else
3433         len = decoding_buffer_size (coding, end - beg);
3434       buf = get_conversion_buffer (len);
3435
3436       coding->last_block = 1;
3437       produced = (encodep
3438                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3439                                    &consumed)
3440                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3441                                    &consumed));
3442
3443       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3444
3445       TEMP_SET_PT (beg);
3446       insert (buf, produced);
3447       del_range (PT, PT + end - beg);
3448       if (pos >= end)
3449         pos = PT + (pos - end);
3450       else if (pos > beg)
3451         pos = beg;
3452       TEMP_SET_PT (pos);
3453   }
3454
3455   if (!encodep && !NILP (coding->post_read_conversion))
3456     {
3457       /* We must call a post-conversion function which may alter
3458          the text just converted.  */
3459       Lisp_Object insval;
3460
3461       beg = XINT (b);
3462       TEMP_SET_PT (beg);
3463       insval = call1 (coding->post_read_conversion, make_number (len));
3464       CHECK_NUMBER (insval, 0);
3465       if (pos >= beg + len)
3466         pos += XINT (insval) - len;
3467       else if (pos > beg)
3468         pos = beg;
3469       TEMP_SET_PT (pos);
3470       len = XINT (insval);
3471     }
3472
3473   return make_number (len);
3474 }
3475
3476 Lisp_Object
3477 code_convert_string (str, coding, encodep, nocopy)
3478      Lisp_Object str, nocopy;
3479      struct coding_system *coding;
3480      int encodep;
3481 {
3482   int len, consumed, produced;
3483   char *buf;
3484   unsigned char *begp, *endp;
3485   int head_skip, tail_skip;
3486   struct gcpro gcpro1;
3487
3488   if (encodep && !NILP (coding->pre_write_conversion)
3489       || !encodep && !NILP (coding->post_read_conversion))
3490     {
3491       /* Since we have to call Lisp functions which assume target text
3492          is in a buffer, after setting a temporary buffer, call
3493          code_convert_region.  */
3494       int count = specpdl_ptr - specpdl;
3495       int len = XSTRING (str)->size;
3496       Lisp_Object result;
3497       struct buffer *old = current_buffer;
3498
3499       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3500       temp_output_buffer_setup (" *code-converting-work*");
3501       set_buffer_internal (XBUFFER (Vstandard_output));
3502       insert_from_string (str, 0, len, 0);
3503       code_convert_region (make_number (BEGV), make_number (ZV),
3504                            coding, encodep);
3505       result = make_buffer_string (BEGV, ZV, 0);
3506       set_buffer_internal (old);
3507       return unbind_to (count, result);
3508     }
3509
3510   /* We may be able to shrink the conversion region.  */
3511   begp = XSTRING (str)->data;
3512   endp = begp + XSTRING (str)->size;
3513   shrink_conversion_area (&begp, &endp, coding, encodep);
3514
3515   if (begp == endp)
3516     /* We need no conversion.  */
3517     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3518
3519   head_skip = begp - XSTRING (str)->data;
3520   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3521
3522   GCPRO1 (str);
3523
3524   if (encodep)
3525     len = encoding_buffer_size (coding, endp - begp);
3526   else
3527     len = decoding_buffer_size (coding, endp - begp);
3528   buf = get_conversion_buffer (len + head_skip + tail_skip);
3529
3530   bcopy (XSTRING (str)->data, buf, head_skip);
3531   coding->last_block = 1;
3532   produced = (encodep
3533               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3534                                buf + head_skip, endp - begp, len, &consumed)
3535               : decode_coding (coding, XSTRING (str)->data + head_skip,
3536                                buf + head_skip, endp - begp, len, &consumed));
3537   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3538          buf + head_skip + produced,
3539          tail_skip);
3540
3541   UNGCPRO;
3542
3543   return make_string (buf, head_skip + produced + tail_skip);
3544 }
3545
3546 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3547        3, 3, "r\nzCoding system: ",
3548   "Decode current region by specified coding system.\n\
3549 When called from a program, takes three arguments:\n\
3550 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3551 Return length of decoded text.")
3552   (b, e, coding_system)
3553      Lisp_Object b, e, coding_system;
3554 {
3555   struct coding_system coding;
3556
3557   CHECK_NUMBER_COERCE_MARKER (b, 0);
3558   CHECK_NUMBER_COERCE_MARKER (e, 1);
3559   CHECK_SYMBOL (coding_system, 2);
3560
3561   if (NILP (coding_system))
3562     return make_number (XFASTINT (e) - XFASTINT (b));
3563   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3564     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3565
3566   return code_convert_region (b, e, &coding, 0);
3567 }
3568
3569 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3570        3, 3, "r\nzCoding system: ",
3571   "Encode current region by specified coding system.\n\
3572 When called from a program, takes three arguments:\n\
3573 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3574 Return length of encoded text.")
3575   (b, e, coding_system)
3576      Lisp_Object b, e, coding_system;
3577 {
3578   struct coding_system coding;
3579
3580   CHECK_NUMBER_COERCE_MARKER (b, 0);
3581   CHECK_NUMBER_COERCE_MARKER (e, 1);
3582   CHECK_SYMBOL (coding_system, 2);
3583
3584   if (NILP (coding_system))
3585     return make_number (XFASTINT (e) - XFASTINT (b));
3586   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3587     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3588
3589   return code_convert_region (b, e, &coding, 1);
3590 }
3591
3592 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3593        2, 3, 0,
3594   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3595 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3596 if the decoding operation is trivial.")
3597   (string, coding_system, nocopy)
3598      Lisp_Object string, coding_system, nocopy;
3599 {
3600   struct coding_system coding;
3601
3602   CHECK_STRING (string, 0);
3603   CHECK_SYMBOL (coding_system, 1);
3604
3605   if (NILP (coding_system))
3606     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3607   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3608     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3609
3610   return code_convert_string (string, &coding, 0, nocopy);
3611 }
3612
3613 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3614        2, 3, 0,
3615   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3616 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3617 if the encoding operation is trivial.")
3618   (string, coding_system, nocopy)
3619      Lisp_Object string, coding_system, nocopy;
3620 {
3621   struct coding_system coding;
3622
3623   CHECK_STRING (string, 0);
3624   CHECK_SYMBOL (coding_system, 1);
3625
3626   if (NILP (coding_system))
3627     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3628   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3629     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3630
3631   return code_convert_string (string, &coding, 1, nocopy);
3632 }
3633
3634 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3635   "Decode a JISX0208 character of shift-jis encoding.\n\
3636 CODE is the character code in SJIS.\n\
3637 Return the corresponding character.")
3638   (code)
3639      Lisp_Object code;
3640 {
3641   unsigned char c1, c2, s1, s2;
3642   Lisp_Object val;
3643
3644   CHECK_NUMBER (code, 0);
3645   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3646   DECODE_SJIS (s1, s2, c1, c2);
3647   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3648   return val;
3649 }
3650
3651 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3652   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3653 Return the corresponding character code in SJIS.")
3654   (ch)
3655      Lisp_Object ch;
3656 {
3657   int charset, c1, c2, s1, s2;
3658   Lisp_Object val;
3659
3660   CHECK_NUMBER (ch, 0);
3661   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3662   if (charset == charset_jisx0208)
3663     {
3664       ENCODE_SJIS (c1, c2, s1, s2);
3665       XSETFASTINT (val, (s1 << 8) | s2);
3666     }
3667   else
3668     XSETFASTINT (val, 0);
3669   return val;
3670 }
3671
3672 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3673   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3674 CODE is the character code in BIG5.\n\
3675 Return the corresponding character.")
3676   (code)
3677      Lisp_Object code;
3678 {
3679   int charset;
3680   unsigned char b1, b2, c1, c2;
3681   Lisp_Object val;
3682
3683   CHECK_NUMBER (code, 0);
3684   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3685   DECODE_BIG5 (b1, b2, charset, c1, c2);
3686   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3687   return val;
3688 }
3689
3690 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3691   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3692 Return the corresponding character code in Big5.")
3693   (ch)
3694      Lisp_Object ch;
3695 {
3696   int charset, c1, c2, b1, b2;
3697   Lisp_Object val;
3698
3699   CHECK_NUMBER (ch, 0);
3700   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3701   if (charset == charset_big5_1 || charset == charset_big5_2)
3702     {
3703       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3704       XSETFASTINT (val, (b1 << 8) | b2);
3705     }
3706   else
3707     XSETFASTINT (val, 0);
3708   return val;
3709 }
3710
3711 DEFUN ("set-terminal-coding-system-internal",
3712        Fset_terminal_coding_system_internal,
3713        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3714   (coding_system)
3715      Lisp_Object coding_system;
3716 {
3717   CHECK_SYMBOL (coding_system, 0);
3718   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3719   /* We had better not send unsafe characters to terminal.  */
3720   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3721
3722   return Qnil;
3723 }
3724
3725 DEFUN ("set-safe-terminal-coding-system-internal",
3726        Fset_safe_terminal_coding_system_internal,
3727        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3728   (coding_system)
3729      Lisp_Object coding_system;
3730 {
3731   CHECK_SYMBOL (coding_system, 0);
3732   setup_coding_system (Fcheck_coding_system (coding_system),
3733                        &safe_terminal_coding);
3734   return Qnil;
3735 }
3736
3737 DEFUN ("terminal-coding-system",
3738        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3739   "Return coding-system of your terminal.")
3740   ()
3741 {
3742   return terminal_coding.symbol;
3743 }
3744
3745 DEFUN ("set-keyboard-coding-system-internal",
3746        Fset_keyboard_coding_system_internal,
3747        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3748   (coding_system)
3749      Lisp_Object coding_system;
3750 {
3751   CHECK_SYMBOL (coding_system, 0);
3752   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3753   return Qnil;
3754 }
3755
3756 DEFUN ("keyboard-coding-system",
3757        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3758   "Return coding-system of what is sent from terminal keyboard.")
3759   ()
3760 {
3761   return keyboard_coding.symbol;
3762 }
3763
3764 \f
3765 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3766        Sfind_operation_coding_system,  1, MANY, 0,
3767   "Choose a coding system for an operation based on the target name.\n\
3768 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3769 DECODING-SYSTEM is the coding system to use for decoding\n\
3770 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3771 for encoding (in case OPERATION does encoding).\n\
3772 \n\
3773 The first argument OPERATION specifies an I/O primitive:\n\
3774   For file I/O, `insert-file-contents' or `write-region'.\n\
3775   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3776   For network I/O, `open-network-stream'.\n\
3777 \n\
3778 The remaining arguments should be the same arguments that were passed\n\
3779 to the primitive.  Depending on which primitive, one of those arguments\n\
3780 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3781 whichever argument specifies the file name is TARGET.\n\
3782 \n\
3783 TARGET has a meaning which depends on OPERATION:\n\
3784   For file I/O, TARGET is a file name.\n\
3785   For process I/O, TARGET is a process name.\n\
3786   For network I/O, TARGET is a service name or a port number\n\
3787 \n\
3788 This function looks up what specified for TARGET in,\n\
3789 `file-coding-system-alist', `process-coding-system-alist',\n\
3790 or `network-coding-system-alist' depending on OPERATION.\n\
3791 They may specify a coding system, a cons of coding systems,\n\
3792 or a function symbol to call.\n\
3793 In the last case, we call the function with one argument,\n\
3794 which is a list of all the arguments given to this function.")
3795   (nargs, args)
3796      int nargs;
3797      Lisp_Object *args;
3798 {
3799   Lisp_Object operation, target_idx, target, val;
3800   register Lisp_Object chain;
3801
3802   if (nargs < 2)
3803     error ("Too few arguments");
3804   operation = args[0];
3805   if (!SYMBOLP (operation)
3806       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3807     error ("Invalid first arguement");
3808   if (nargs < 1 + XINT (target_idx))
3809     error ("Too few arguments for operation: %s",
3810            XSYMBOL (operation)->name->data);
3811   target = args[XINT (target_idx) + 1];
3812   if (!(STRINGP (target)
3813         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3814     error ("Invalid %dth argument", XINT (target_idx) + 1);
3815
3816   chain = ((EQ (operation, Qinsert_file_contents)
3817             || EQ (operation, Qwrite_region))
3818            ? Vfile_coding_system_alist
3819            : (EQ (operation, Qopen_network_stream)
3820               ? Vnetwork_coding_system_alist
3821               : Vprocess_coding_system_alist));
3822   if (NILP (chain))
3823     return Qnil;
3824
3825   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3826     {
3827       Lisp_Object elt;
3828       elt = XCONS (chain)->car;
3829
3830       if (CONSP (elt)
3831           && ((STRINGP (target)
3832                && STRINGP (XCONS (elt)->car)
3833                && fast_string_match (XCONS (elt)->car, target) >= 0)
3834               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3835         {
3836           val = XCONS (elt)->cdr;
3837           /* Here, if VAL is both a valid coding system and a valid
3838              function symbol, we return VAL as a coding system.  */
3839           if (CONSP (val))
3840             return val;
3841           if (! SYMBOLP (val))
3842             return Qnil;
3843           if (! NILP (Fcoding_system_p (val)))
3844             return Fcons (val, val);
3845           if (! NILP (Ffboundp (val)))
3846             {
3847               val = call1 (val, Flist (nargs, args));
3848               if (CONSP (val))
3849                 return val;
3850               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
3851                 return Fcons (val, val);
3852             }
3853           return Qnil;
3854         }
3855     }
3856   return Qnil;
3857 }
3858
3859 #endif /* emacs */
3860
3861 \f
3862 /*** 8. Post-amble ***/
3863
3864 init_coding_once ()
3865 {
3866   int i;
3867
3868   /* Emacs' internal format specific initialize routine.  */
3869   for (i = 0; i <= 0x20; i++)
3870     emacs_code_class[i] = EMACS_control_code;
3871   emacs_code_class[0x0A] = EMACS_linefeed_code;
3872   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3873   for (i = 0x21 ; i < 0x7F; i++)
3874     emacs_code_class[i] = EMACS_ascii_code;
3875   emacs_code_class[0x7F] = EMACS_control_code;
3876   emacs_code_class[0x80] = EMACS_leading_code_composition;
3877   for (i = 0x81; i < 0xFF; i++)
3878     emacs_code_class[i] = EMACS_invalid_code;
3879   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3880   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3881   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3882   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3883
3884   /* ISO2022 specific initialize routine.  */
3885   for (i = 0; i < 0x20; i++)
3886     iso_code_class[i] = ISO_control_code;
3887   for (i = 0x21; i < 0x7F; i++)
3888     iso_code_class[i] = ISO_graphic_plane_0;
3889   for (i = 0x80; i < 0xA0; i++)
3890     iso_code_class[i] = ISO_control_code;
3891   for (i = 0xA1; i < 0xFF; i++)
3892     iso_code_class[i] = ISO_graphic_plane_1;
3893   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3894   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3895   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3896   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3897   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3898   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3899   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3900   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3901   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3902   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3903
3904   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3905   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3906
3907   setup_coding_system (Qnil, &keyboard_coding);
3908   setup_coding_system (Qnil, &terminal_coding);
3909   setup_coding_system (Qnil, &safe_terminal_coding);
3910
3911 #if defined (MSDOS) || defined (WINDOWSNT)
3912   system_eol_type = CODING_EOL_CRLF;
3913 #else
3914   system_eol_type = CODING_EOL_LF;
3915 #endif
3916 }
3917
3918 #ifdef emacs
3919
3920 syms_of_coding ()
3921 {
3922   Qtarget_idx = intern ("target-idx");
3923   staticpro (&Qtarget_idx);
3924
3925   Qcoding_system_history = intern ("coding-system-history");
3926   staticpro (&Qcoding_system_history);
3927   Fset (Qcoding_system_history, Qnil);
3928
3929   /* Target FILENAME is the first argument.  */
3930   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3931   /* Target FILENAME is the third argument.  */
3932   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3933
3934   Qcall_process = intern ("call-process");
3935   staticpro (&Qcall_process);
3936   /* Target PROGRAM is the first argument.  */
3937   Fput (Qcall_process, Qtarget_idx, make_number (0));
3938
3939   Qcall_process_region = intern ("call-process-region");
3940   staticpro (&Qcall_process_region);
3941   /* Target PROGRAM is the third argument.  */
3942   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3943
3944   Qstart_process = intern ("start-process");
3945   staticpro (&Qstart_process);
3946   /* Target PROGRAM is the third argument.  */
3947   Fput (Qstart_process, Qtarget_idx, make_number (2));
3948
3949   Qopen_network_stream = intern ("open-network-stream");
3950   staticpro (&Qopen_network_stream);
3951   /* Target SERVICE is the fourth argument.  */
3952   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3953
3954   Qcoding_system = intern ("coding-system");
3955   staticpro (&Qcoding_system);
3956
3957   Qeol_type = intern ("eol-type");
3958   staticpro (&Qeol_type);
3959
3960   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3961   staticpro (&Qbuffer_file_coding_system);
3962
3963   Qpost_read_conversion = intern ("post-read-conversion");
3964   staticpro (&Qpost_read_conversion);
3965
3966   Qpre_write_conversion = intern ("pre-write-conversion");
3967   staticpro (&Qpre_write_conversion);
3968
3969   Qno_conversion = intern ("no-conversion");
3970   staticpro (&Qno_conversion);
3971
3972   Qundecided = intern ("undecided");
3973   staticpro (&Qundecided);
3974
3975   Qcoding_system_p = intern ("coding-system-p");
3976   staticpro (&Qcoding_system_p);
3977
3978   Qcoding_system_error = intern ("coding-system-error");
3979   staticpro (&Qcoding_system_error);
3980
3981   Fput (Qcoding_system_error, Qerror_conditions,
3982         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3983   Fput (Qcoding_system_error, Qerror_message,
3984         build_string ("Invalid coding system"));
3985
3986   Qcoding_category_index = intern ("coding-category-index");
3987   staticpro (&Qcoding_category_index);
3988
3989   {
3990     int i;
3991     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3992       {
3993         coding_category_table[i] = intern (coding_category_name[i]);
3994         staticpro (&coding_category_table[i]);
3995         Fput (coding_category_table[i], Qcoding_category_index,
3996               make_number (i));
3997       }
3998   }
3999
4000   Qcharacter_unification_table = intern ("character-unification-table");
4001   staticpro (&Qcharacter_unification_table);
4002   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
4003         make_number (0));
4004
4005   Qcharacter_unification_table_for_decode
4006     = intern ("character-unification-table-for-decode");
4007   staticpro (&Qcharacter_unification_table_for_decode);
4008
4009   Qcharacter_unification_table_for_encode
4010     = intern ("character-unification-table-for-encode");
4011   staticpro (&Qcharacter_unification_table_for_encode);
4012
4013   Qsafe_charsets = intern ("safe-charsets");
4014   staticpro (&Qsafe_charsets);
4015
4016   Qemacs_mule = intern ("emacs-mule");
4017   staticpro (&Qemacs_mule);
4018
4019   defsubr (&Scoding_system_p);
4020   defsubr (&Sread_coding_system);
4021   defsubr (&Sread_non_nil_coding_system);
4022   defsubr (&Scheck_coding_system);
4023   defsubr (&Sdetect_coding_region);
4024   defsubr (&Sdecode_coding_region);
4025   defsubr (&Sencode_coding_region);
4026   defsubr (&Sdecode_coding_string);
4027   defsubr (&Sencode_coding_string);
4028   defsubr (&Sdecode_sjis_char);
4029   defsubr (&Sencode_sjis_char);
4030   defsubr (&Sdecode_big5_char);
4031   defsubr (&Sencode_big5_char);
4032   defsubr (&Sset_terminal_coding_system_internal);
4033   defsubr (&Sset_safe_terminal_coding_system_internal);
4034   defsubr (&Sterminal_coding_system);
4035   defsubr (&Sset_keyboard_coding_system_internal);
4036   defsubr (&Skeyboard_coding_system);
4037   defsubr (&Sfind_operation_coding_system);
4038
4039   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
4040     "List of coding systems.\n\
4041 \n\
4042 Do not alter the value of this variable manually.  This variable should be\n\
4043 updated by the functions `make-coding-system' and\n\
4044 `define-coding-system-alias'.");
4045   Vcoding_system_list = Qnil;
4046
4047   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
4048     "Alist of coding system names.\n\
4049 Each element is one element list of coding system name.\n\
4050 This variable is given to `completing-read' as TABLE argument.\n\
4051 \n\
4052 Do not alter the value of this variable manually.  This variable should be\n\
4053 updated by the functions `make-coding-system' and\n\
4054 `define-coding-system-alias'.");
4055   Vcoding_system_alist = Qnil;
4056
4057   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
4058     "List of coding-categories (symbols) ordered by priority.");
4059   {
4060     int i;
4061
4062     Vcoding_category_list = Qnil;
4063     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4064       Vcoding_category_list
4065         = Fcons (coding_category_table[i], Vcoding_category_list);
4066   }
4067
4068   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
4069     "Specify the coding system for read operations.\n\
4070 It is useful to bind this variable with `let', but do not set it globally.\n\
4071 If the value is a coding system, it is used for decoding on read operation.\n\
4072 If not, an appropriate element is used from one of the coding system alists:\n\
4073 There are three such tables, `file-coding-system-alist',\n\
4074 `process-coding-system-alist', and `network-coding-system-alist'.");
4075   Vcoding_system_for_read = Qnil;
4076
4077   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
4078     "Specify the coding system for write operations.\n\
4079 It is useful to bind this variable with `let', but do not set it globally.\n\
4080 If the value is a coding system, it is used for encoding on write operation.\n\
4081 If not, an appropriate element is used from one of the coding system alists:\n\
4082 There are three such tables, `file-coding-system-alist',\n\
4083 `process-coding-system-alist', and `network-coding-system-alist'.");
4084   Vcoding_system_for_write = Qnil;
4085
4086   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4087     "Coding system used in the latest file or process I/O.");
4088   Vlast_coding_system_used = Qnil;
4089
4090   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4091     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4092   inhibit_eol_conversion = 0;
4093
4094   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4095     "Alist to decide a coding system to use for a file I/O operation.\n\
4096 The format is ((PATTERN . VAL) ...),\n\
4097 where PATTERN is a regular expression matching a file name,\n\
4098 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4099 If VAL is a coding system, it is used for both decoding and encoding\n\
4100 the file contents.\n\
4101 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4102 and the cdr part is used for encoding.\n\
4103 If VAL is a function symbol, the function must return a coding system\n\
4104 or a cons of coding systems which are used as above.\n\
4105 \n\
4106 See also the function `find-operation-coding-system'.");
4107   Vfile_coding_system_alist = Qnil;
4108
4109   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4110     "Alist to decide a coding system to use for a process I/O operation.\n\
4111 The format is ((PATTERN . VAL) ...),\n\
4112 where PATTERN is a regular expression matching a program name,\n\
4113 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4114 If VAL is a coding system, it is used for both decoding what received\n\
4115 from the program and encoding what sent to the program.\n\
4116 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4117 and the cdr part is used for encoding.\n\
4118 If VAL is a function symbol, the function must return a coding system\n\
4119 or a cons of coding systems which are used as above.\n\
4120 \n\
4121 See also the function `find-operation-coding-system'.");
4122   Vprocess_coding_system_alist = Qnil;
4123
4124   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4125     "Alist to decide a coding system to use for a network I/O operation.\n\
4126 The format is ((PATTERN . VAL) ...),\n\
4127 where PATTERN is a regular expression matching a network service name\n\
4128 or is a port number to connect to,\n\
4129 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4130 If VAL is a coding system, it is used for both decoding what received\n\
4131 from the network stream and encoding what sent to the network stream.\n\
4132 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4133 and the cdr part is used for encoding.\n\
4134 If VAL is a function symbol, the function must return a coding system\n\
4135 or a cons of coding systems which are used as above.\n\
4136 \n\
4137 See also the function `find-operation-coding-system'.");
4138   Vnetwork_coding_system_alist = Qnil;
4139
4140   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4141     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4142   eol_mnemonic_unix = ':';
4143
4144   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4145     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4146   eol_mnemonic_dos = '\\';
4147
4148   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4149     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4150   eol_mnemonic_mac = '/';
4151
4152   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4153     "Mnemonic character indicating end-of-line format is not yet decided.");
4154   eol_mnemonic_undecided = ':';
4155
4156   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4157     "Non-nil means ISO 2022 encoder/decoder do character unification.");
4158   Venable_character_unification = Qt;
4159
4160   DEFVAR_LISP ("standard-character-unification-table-for-decode",
4161     &Vstandard_character_unification_table_for_decode,
4162     "Table for unifying characters when reading.");
4163   Vstandard_character_unification_table_for_decode = Qnil;
4164
4165   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4166     &Vstandard_character_unification_table_for_encode,
4167     "Table for unifying characters when writing.");
4168   Vstandard_character_unification_table_for_encode = Qnil;
4169
4170   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4171     "Alist of charsets vs revision numbers.\n\
4172 While encoding, if a charset (car part of an element) is found,\n\
4173 designate it with the escape sequence identifing revision (cdr part of the element).");
4174   Vcharset_revision_alist = Qnil;
4175
4176   DEFVAR_LISP ("default-process-coding-system",
4177                &Vdefault_process_coding_system,
4178     "Cons of coding systems used for process I/O by default.\n\
4179 The car part is used for decoding a process output,\n\
4180 the cdr part is used for encoding a text to be sent to a process.");
4181   Vdefault_process_coding_system = Qnil;
4182
4183   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4184     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4185 This is a vector of length 256.\n\
4186 If Nth element is non-nil, the existence of code N in a file\n\
4187 \(or output of subprocess) doesn't prevent it to be detected as\n\
4188 a coding system of ISO 2022 variant which has a flag\n\
4189 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4190 or reading output of a subprocess.\n\
4191 Only 128th through 159th elements has a meaning.");
4192   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4193 }
4194
4195 #endif /* emacs */