code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-internal), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Other
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more details.
  81
  82 */
  83
  84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  85
  86   How end-of-line of a text is encoded depends on a system.  For
  87   instance, Unix's format is just one byte of `line-feed' code,
  88   whereas DOS's format is two-byte sequence of `carriage-return' and
  89   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  90
  91   Since text characters encoding and end-of-line encoding are
  92   independent, any coding system described above can take
  93   any format of end-of-line.  So, Emacs has information of format of
  94   end-of-line in each coding-system.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
 102   which appropriate flag bits for the category XXX is set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_emacs_mule (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format (emacs-mule).  The resulting text
 118   goes to a place pointed to by DESTINATION, the length of which should
 119   not exceed DST_BYTES.  The number of bytes actually processed is
 120   returned as *CONSUMED.  The return value is the length of the decoded
 121   text.  Below is a template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 136   internal format (emacs-mule) to CODING.  The resulting text goes to
 137   a place pointed to by DESTINATION, the length of which should not
 138   exceed DST_BYTES.  The number of bytes actually processed is
 139   returned as *CONSUMED.  The return value is the length of the
 140   encoded text.  Below is a template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
 154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 155    THREE_MORE_BYTES safely get one, two, and three bytes from the
 156    source text respectively.  If there are not enough bytes in the
 157    source, they jump to `label_end_of_loop'.  The caller should set
 158    variables `src' and `src_end' to appropriate areas in advance.  */
 159
 160 #define ONE_MORE_BYTE(c1)       \
 161   do {                          \
 162     if (src < src_end)          \
 163       c1 = *src++;              \
 164     else                        \
 165       goto label_end_of_loop;   \
 166   } while (0)
 167
 168 #define TWO_MORE_BYTES(c1, c2)  \
 169   do {                          \
 170     if (src + 1 < src_end)      \
 171       c1 = *src++, c2 = *src++; \
 172     else                        \
 173       goto label_end_of_loop;   \
 174   } while (0)
 175
 176 #define THREE_MORE_BYTES(c1, c2, c3)            \
 177   do {                                          \
 178     if (src + 2 < src_end)                      \
 179       c1 = *src++, c2 = *src++, c3 = *src++;    \
 180     else                                        \
 181       goto label_end_of_loop;                   \
 182   } while (0)
 183
 184 /* The following three macros DECODE_CHARACTER_ASCII,
 185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 186    the multi-byte form of a character of each class at the place
 187    pointed by `dst'.  The caller should set the variable `dst' to
 188    point to an appropriate area and the variable `coding' to point to
 189    the coding-system of the currently decoding text in advance.  */
 190
 191 /* Decode one ASCII character C.  */
 192
 193 #define DECODE_CHARACTER_ASCII(c)                               \
 194   do {                                                          \
 195     if (COMPOSING_P (coding->composing))                        \
 196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 197     else                                                        \
 198       *dst++ = (c);                                             \
 199   } while (0)
 200
 201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 202    position-code is C.  */
 203
 204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 205   do {                                                                  \
 206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 207     if (COMPOSING_P (coding->composing))                                \
 208       *dst++ = leading_code + 0x20;                                     \
 209     else                                                                \
 210       *dst++ = leading_code;                                            \
 211     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 212       *dst++ = leading_code;                                            \
 213     *dst++ = (c) | 0x80;                                                \
 214   } while (0)
 215
 216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 217    position-codes are C1 and C2.  */
 218
 219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 220   do {                                                  \
 221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 222     *dst++ = (c2) | 0x80;                               \
 223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
 246 Lisp_Object Qcoding_system, Qeol_type;
 247 Lisp_Object Qbuffer_file_coding_system;
 248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 249
 250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 252 Lisp_Object Qstart_process, Qopen_network_stream;
 253 Lisp_Object Qtarget_idx;
 254
 255 /* Mnemonic character of each format of end-of-line.  */
 256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 257 /* Mnemonic character to indicate format of end-of-line is not yet
 258    decided.  */
 259 int eol_mnemonic_undecided;
 260
 261 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 262    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 263 int system_eol_type;
 264
 265 #ifdef emacs
 266
 267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 268
 269 /* Coding system emacs-mule is for converting only end-of-line format.  */
 270 Lisp_Object Qemacs_mule;
 271
 272 /* Coding-systems are handed between Emacs Lisp programs and C internal
 273    routines by the following three variables.  */
 274 /* Coding-system for reading files and receiving data from process.  */
 275 Lisp_Object Vcoding_system_for_read;
 276 /* Coding-system for writing files and sending data to process.  */
 277 Lisp_Object Vcoding_system_for_write;
 278 /* Coding-system actually used in the latest I/O.  */
 279 Lisp_Object Vlast_coding_system_used;
 280
 281 /* A vector of length 256 which contains information about special
 282    Microsoft codes.  */
 283 Lisp_Object Vmicrosoft_code_table;
 284
 285 /* Flag to inhibit code conversion of end-of-line format.  */
 286 int inhibit_eol_conversion;
 287
 288 /* Coding system to be used to encode text for terminal display.  */
 289 struct coding_system terminal_coding;
 290
 291 /* Coding system to be used to encode text for terminal display when
 292    terminal coding system is nil.  */
 293 struct coding_system safe_terminal_coding;
 294
 295 /* Coding system of what is sent from terminal keyboard.  */
 296 struct coding_system keyboard_coding;
 297
 298 Lisp_Object Vfile_coding_system_alist;
 299 Lisp_Object Vprocess_coding_system_alist;
 300 Lisp_Object Vnetwork_coding_system_alist;
 301
 302 #endif /* emacs */
 303
 304 Lisp_Object Qcoding_category_index;
 305
 306 /* List of symbols `coding-category-xxx' ordered by priority.  */
 307 Lisp_Object Vcoding_category_list;
 308
 309 /* Table of coding-systems currently assigned to each coding-category.  */
 310 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 311
 312 /* Table of names of symbol for each coding-category.  */
 313 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 314   "coding-category-emacs-mule",
 315   "coding-category-sjis",
 316   "coding-category-iso-7",
 317   "coding-category-iso-8-1",
 318   "coding-category-iso-8-2",
 319   "coding-category-iso-7-else",
 320   "coding-category-iso-8-else",
 321   "coding-category-big5",
 322   "coding-category-binary"
 323 };
 324
 325 /* Flag to tell if we look up unification table on character code
 326    conversion.  */
 327 Lisp_Object Venable_character_unification;
 328 /* Standard unification table to look up on decoding (reading).  */
 329 Lisp_Object Vstandard_character_unification_table_for_decode;
 330 /* Standard unification table to look up on encoding (writing).  */
 331 Lisp_Object Vstandard_character_unification_table_for_encode;
 332
 333 Lisp_Object Qcharacter_unification_table;
 334 Lisp_Object Qcharacter_unification_table_for_decode;
 335 Lisp_Object Qcharacter_unification_table_for_encode;
 336
 337 /* Alist of charsets vs revision number.  */
 338 Lisp_Object Vcharset_revision_alist;
 339
 340 /* Default coding systems used for process I/O.  */
 341 Lisp_Object Vdefault_process_coding_system;
 342
 343 \f
 344 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 345
 346 /* Emacs' internal format for encoding multiple character sets is a
 347    kind of multi-byte encoding, i.e. characters are encoded by
 348    variable-length sequences of one-byte codes.  ASCII characters
 349    and control characters (e.g. `tab', `newline') are represented by
 350    one-byte sequences which are their ASCII codes, in the range 0x00
 351    through 0x7F.  The other characters are represented by a sequence
 352    of `base leading-code', optional `extended leading-code', and one
 353    or two `position-code's.  The length of the sequence is determined
 354    by the base leading-code.  Leading-code takes the range 0x80
 355    through 0x9F, whereas extended leading-code and position-code take
 356    the range 0xA0 through 0xFF.  See `charset.h' for more details
 357    about leading-code and position-code.
 358
 359    There's one exception to this rule.  Special leading-code
 360    `leading-code-composition' denotes that the following several
 361    characters should be composed into one character.  Leading-codes of
 362    components (except for ASCII) are added 0x20.  An ASCII character
 363    component is represented by a 2-byte sequence of `0xA0' and
 364    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 365    details of composite character.  Hence, we can summarize the code
 366    range as follows:
 367
 368    --- CODE RANGE of Emacs' internal format ---
 369    (character set)      (range)
 370    ASCII                0x00 .. 0x7F
 371    ELSE (1st byte)      0x80 .. 0x9F
 372         (rest bytes)    0xA0 .. 0xFF
 373    ---------------------------------------------
 374
 375   */
 376
 377 enum emacs_code_class_type emacs_code_class[256];
 378
 379 /* Go to the next statement only if *SRC is accessible and the code is
 380    greater than 0xA0.  */
 381 #define CHECK_CODE_RANGE_A0_FF  \
 382   do {                          \
 383     if (src >= src_end)         \
 384       goto label_end_of_switch; \
 385     else if (*src++ < 0xA0)     \
 386       return 0;                 \
 387   } while (0)
 388
 389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 390    Check if a text is encoded in Emacs' internal format.  If it is,
 391    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 392
 393 int
 394 detect_coding_emacs_mule (src, src_end)
 395      unsigned char *src, *src_end;
 396 {
 397   unsigned char c;
 398   int composing = 0;
 399
 400   while (src < src_end)
 401     {
 402       c = *src++;
 403
 404       if (composing)
 405         {
 406           if (c < 0xA0)
 407             composing = 0;
 408           else
 409             c -= 0x20;
 410         }
 411
 412       switch (emacs_code_class[c])
 413         {
 414         case EMACS_ascii_code:
 415         case EMACS_linefeed_code:
 416           break;
 417
 418         case EMACS_control_code:
 419           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 420             return 0;
 421           break;
 422
 423         case EMACS_invalid_code:
 424           return 0;
 425
 426         case EMACS_leading_code_composition: /* c == 0x80 */
 427           if (composing)
 428             CHECK_CODE_RANGE_A0_FF;
 429           else
 430             composing = 1;
 431           break;
 432
 433         case EMACS_leading_code_4:
 434           CHECK_CODE_RANGE_A0_FF;
 435           /* fall down to check it two more times ...  */
 436
 437         case EMACS_leading_code_3:
 438           CHECK_CODE_RANGE_A0_FF;
 439           /* fall down to check it one more time ...  */
 440
 441         case EMACS_leading_code_2:
 442           CHECK_CODE_RANGE_A0_FF;
 443           break;
 444
 445         default:
 446         label_end_of_switch:
 447           break;
 448         }
 449     }
 450   return CODING_CATEGORY_MASK_EMACS_MULE;
 451 }
 452
 453 \f
 454 /*** 3. ISO2022 handlers ***/
 455
 456 /* The following note describes the coding system ISO2022 briefly.
 457    Since the intention of this note is to help in understanding of
 458    the programs in this file, some parts are NOT ACCURATE or OVERLY
 459    SIMPLIFIED.  For the thorough understanding, please refer to the
 460    original document of ISO2022.
 461
 462    ISO2022 provides many mechanisms to encode several character sets
 463    in 7-bit and 8-bit environment.  If one chooses 7-bite environment,
 464    all text is encoded by codes of less than 128.  This may make the
 465    encoded text a little bit longer, but the text gets more stability
 466    to pass through several gateways (some of them strip off the MSB).
 467
 468    There are two kinds of character set: control character set and
 469    graphic character set.  The former contains control characters such
 470    as `newline' and `escape' to provide control functions (control
 471    functions are provided also by escape sequences).  The latter
 472    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 473    two control character sets and many graphic character sets.
 474
 475    Graphic character sets are classified into one of the following
 476    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 477    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 478    bytes (DIMENSION) and the number of characters in one dimension
 479    (CHARS) of the set.  In addition, each character set is assigned an
 480    identification tag (called "final character" and denoted as <F>
 481    here after) which is unique in each class.  <F> of each character
 482    set is decided by ECMA(*) when it is registered in ISO.  Code range
 483    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 484
 485    Note (*): ECMA = European Computer Manufacturers Association
 486
 487    Here are examples of graphic character set [NAME(<F>)]:
 488         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 489         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 490         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 491         o DIMENSION2_CHARS96 -- none for the moment
 492
 493    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 494         C0 [0x00..0x1F] -- control character plane 0
 495         GL [0x20..0x7F] -- graphic character plane 0
 496         C1 [0x80..0x9F] -- control character plane 1
 497         GR [0xA0..0xFF] -- graphic character plane 1
 498
 499    A control character set is directly designated and invoked to C0 or
 500    C1 by an escape sequence.  The most common case is that ISO646's
 501    control character set is designated/invoked to C0 and ISO6429's
 502    control character set is designated/invoked to C1, and usually
 503    these designations/invocations are omitted in a coded text.  With
 504    7-bit environment, only C0 can be used, and a control character for
 505    C1 is encoded by an appropriate escape sequence to fit in the
 506    environment.  All control characters for C1 are defined the
 507    corresponding escape sequences.
 508
 509    A graphic character set is at first designated to one of four
 510    graphic registers (G0 through G3), then these graphic registers are
 511    invoked to GL or GR.  These designations and invocations can be
 512    done independently.  The most common case is that G0 is invoked to
 513    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 514    these invocations and designations are omitted in a coded text.
 515    With 7-bit environment, only GL can be used.
 516
 517    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 518    and 0x7F of GL area work as control characters SPACE and DEL
 519    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 520
 521    There are two ways of invocation: locking-shift and single-shift.
 522    With locking-shift, the invocation lasts until the next different
 523    invocation, whereas with single-shift, the invocation works only
 524    for the following character and doesn't affect locking-shift.
 525    Invocations are done by the following control characters or escape
 526    sequences.
 527
 528    ----------------------------------------------------------------------
 529    function             control char    escape sequence description
 530    ----------------------------------------------------------------------
 531    SI  (shift-in)               0x0F    none            invoke G0 to GL
 532    SO  (shift-out)              0x0E    none            invoke G1 to GL
 533    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 534    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 535    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 536    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 537    ----------------------------------------------------------------------
 538    The first four are for locking-shift.  Control characters for these
 539    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 540
 541    Designations are done by the following escape sequences.
 542    ----------------------------------------------------------------------
 543    escape sequence      description
 544    ----------------------------------------------------------------------
 545    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 546    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 547    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 548    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 549    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 550    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 551    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 552    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 553    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 554    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 555    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 556    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 557    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 558    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 559    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 560    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 561    ----------------------------------------------------------------------
 562
 563    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 564    of dimension 1, chars 94, and final character <F>, and etc.
 565
 566    Note (*): Although these designations are not allowed in ISO2022,
 567    Emacs accepts them on decoding, and produces them on encoding
 568    CHARS96 character set in a coding system which is characterized as
 569    7-bit environment, non-locking-shift, and non-single-shift.
 570
 571    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 572    '(' can be omitted.  We call this as "short-form" here after.
 573
 574    Now you may notice that there are a lot of ways for encoding the
 575    same multilingual text in ISO2022.  Actually, there exists many
 576    coding systems such as Compound Text (used in X's inter client
 577    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 578    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 579    localized platforms), and all of these are variants of ISO2022.
 580
 581    In addition to the above, Emacs handles two more kinds of escape
 582    sequences: ISO6429's direction specification and Emacs' private
 583    sequence for specifying character composition.
 584
 585    ISO6429's direction specification takes the following format:
 586         o CSI ']'      -- end of the current direction
 587         o CSI '0' ']'  -- end of the current direction
 588         o CSI '1' ']'  -- start of left-to-right text
 589         o CSI '2' ']'  -- start of right-to-left text
 590    The control character CSI (0x9B: control sequence introducer) is
 591    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 592
 593    Character composition specification takes the following format:
 594         o ESC '0' -- start character composition
 595         o ESC '1' -- end character composition
 596    Since these are not standard escape sequences of any ISO, the use
 597    of them for these meaning is restricted to Emacs only.  */
 598
 599 enum iso_code_class_type iso_code_class[256];
 600
 601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 602    Check if a text is encoded in ISO2022.  If it is, returns an
 603    integer in which appropriate flag bits any of:
 604         CODING_CATEGORY_MASK_ISO_7
 605         CODING_CATEGORY_MASK_ISO_8_1
 606         CODING_CATEGORY_MASK_ISO_8_2
 607         CODING_CATEGORY_MASK_ISO_7_ELSE
 608         CODING_CATEGORY_MASK_ISO_8_ELSE
 609    are set.  If a code which should never appear in ISO2022 is found,
 610    returns 0.  */
 611
 612 int
 613 detect_coding_iso2022 (src, src_end)
 614      unsigned char *src, *src_end;
 615 {
 616   int mask = (CODING_CATEGORY_MASK_ISO_7
 617               | CODING_CATEGORY_MASK_ISO_8_1
 618               | CODING_CATEGORY_MASK_ISO_8_2
 619               | CODING_CATEGORY_MASK_ISO_7_ELSE
 620               | CODING_CATEGORY_MASK_ISO_8_ELSE
 621               );
 622   int g1 = 0;                   /* 1 iff designating to G1.  */
 623   int c, i;
 624
 625   while (src < src_end)
 626     {
 627       c = *src++;
 628       switch (c)
 629         {
 630         case ISO_CODE_ESC:
 631           if (src >= src_end)
 632             break;
 633           c = *src++;
 634           if ((c >= '(' && c <= '/'))
 635             {
 636               /* Designation sequence for a charset of dimension 1.  */
 637               if (src >= src_end)
 638                 break;
 639               c = *src++;
 640               if (c < ' ' || c >= 0x80)
 641                 /* Invalid designation sequence.  */
 642                 return 0;
 643             }
 644           else if (c == '$')
 645             {
 646               /* Designation sequence for a charset of dimension 2.  */
 647               if (src >= src_end)
 648                 break;
 649               c = *src++;
 650               if (c >= '@' && c <= 'B')
 651                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 652                 ;
 653               else if (c >= '(' && c <= '/')
 654                 {
 655                   if (src >= src_end)
 656                     break;
 657                   c = *src++;
 658                   if (c < ' ' || c >= 0x80)
 659                     /* Invalid designation sequence.  */
 660                     return 0;
 661                 }
 662               else
 663                 /* Invalid designation sequence.  */
 664                 return 0;
 665             }
 666           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 667             /* Locking shift.  */
 668             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 669                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 670           else if (c == '0' || c == '1' || c == '2')
 671             /* Start/end composition.  */
 672             ;
 673           else
 674             /* Invalid escape sequence.  */
 675             return 0;
 676           break;
 677
 678         case ISO_CODE_SO:
 679           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 680                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 681           break;
 682
 683         case ISO_CODE_CSI:
 684         case ISO_CODE_SS2:
 685         case ISO_CODE_SS3:
 686           return CODING_CATEGORY_MASK_ISO_8_ELSE;
 687
 688         default:
 689           if (c < 0x80)
 690             break;
 691           else if (c < 0xA0)
 692             {
 693               if (VECTORP (Vmicrosoft_code_table)
 694                   && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
 695                 {
 696                   mask &= ~(CODING_CATEGORY_MASK_ISO_7
 697                             | CODING_CATEGORY_MASK_ISO_7_ELSE);
 698                   break;
 699                 }
 700               return 0;
 701             }
 702           else
 703             {
 704               unsigned char *src_begin = src;
 705
 706               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 707                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 708               while (src < src_end && *src >= 0xA0)
 709                 src++;
 710               if ((src - src_begin - 1) & 1 && src < src_end)
 711                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 712             }
 713           break;
 714         }
 715     }
 716
 717   return mask;
 718 }
 719
 720 /* Decode a character of which charset is CHARSET and the 1st position
 721    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 722    fetched from SRC and set to C2.  If CHARSET is negative, it means
 723    that we are decoding ill formed text, and what we can do is just to
 724    read C1 as is.  */
 725
 726 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 727   do {                                                                  \
 728     int c_alt, charset_alt = (charset);                                 \
 729     if (COMPOSING_HEAD_P (coding->composing))                           \
 730       {                                                                 \
 731         *dst++ = LEADING_CODE_COMPOSITION;                              \
 732         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 733           /* To tell composition rules are embeded.  */                 \
 734           *dst++ = 0xFF;                                                \
 735         coding->composing += 2;                                         \
 736       }                                                                 \
 737     if ((charset) >= 0)                                                 \
 738       {                                                                 \
 739         if (CHARSET_DIMENSION (charset) == 2)                           \
 740           ONE_MORE_BYTE (c2);                                           \
 741         if (!NILP (unification_table)                                   \
 742             && ((c_alt = unify_char (unification_table,                 \
 743                                      -1, (charset), c1, c2)) >= 0))     \
 744           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 745       }                                                                 \
 746     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 747       DECODE_CHARACTER_ASCII (c1);                                      \
 748     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 749       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 750     else                                                                \
 751       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 752     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 753       /* To tell a composition rule follows.  */                        \
 754       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 755   } while (0)
 756
 757 /* Set designation state into CODING.  */
 758 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 759   do {                                                                  \
 760     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 761                                      make_number (chars),               \
 762                                      make_number (final_char));         \
 763     if (charset >= 0)                                                   \
 764       {                                                                 \
 765         if (coding->direction == 1                                      \
 766             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 767           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 768         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 769       }                                                                 \
 770   } while (0)
 771
 772 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 773
 774 int
 775 decode_coding_iso2022 (coding, source, destination,
 776                        src_bytes, dst_bytes, consumed)
 777      struct coding_system *coding;
 778      unsigned char *source, *destination;
 779      int src_bytes, dst_bytes;
 780      int *consumed;
 781 {
 782   unsigned char *src = source;
 783   unsigned char *src_end = source + src_bytes;
 784   unsigned char *dst = destination;
 785   unsigned char *dst_end = destination + dst_bytes;
 786   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 787      from DST_END to assure that overflow checking is necessary only
 788      at the head of loop.  */
 789   unsigned char *adjusted_dst_end = dst_end - 6;
 790   int charset;
 791   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 792   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 793   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 794   Lisp_Object unification_table
 795       = coding->character_unification_table_for_decode;
 796
 797   if (!NILP (Venable_character_unification) && NILP (unification_table))
 798     unification_table = Vstandard_character_unification_table_for_decode;
 799
 800   while (src < src_end && dst < adjusted_dst_end)
 801     {
 802       /* SRC_BASE remembers the start position in source in each loop.
 803          The loop will be exited when there's not enough source text
 804          to analyze long escape sequence or 2-byte code (within macros
 805          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 806          to SRC_BASE before exiting.  */
 807       unsigned char *src_base = src;
 808       int c1 = *src++, c2;
 809
 810       switch (iso_code_class [c1])
 811         {
 812         case ISO_0x20_or_0x7F:
 813           if (!coding->composing
 814               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 815             {
 816               /* This is SPACE or DEL.  */
 817               *dst++ = c1;
 818               break;
 819             }
 820           /* This is a graphic character, we fall down ...  */
 821
 822         case ISO_graphic_plane_0:
 823           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 824             {
 825               /* This is a composition rule.  */
 826               *dst++ = c1 | 0x80;
 827               coding->composing = COMPOSING_WITH_RULE_TAIL;
 828             }
 829           else
 830             DECODE_ISO_CHARACTER (charset0, c1);
 831           break;
 832
 833         case ISO_0xA0_or_0xFF:
 834           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 835             {
 836               /* Invalid code.  */
 837               *dst++ = c1;
 838               break;
 839             }
 840           /* This is a graphic character, we fall down ... */
 841
 842         case ISO_graphic_plane_1:
 843           DECODE_ISO_CHARACTER (charset1, c1);
 844           break;
 845
 846         case ISO_control_code:
 847           /* All ISO2022 control characters in this class have the
 848              same representation in Emacs internal format.  */
 849           *dst++ = c1;
 850           break;
 851
 852         case ISO_carriage_return:
 853           if (coding->eol_type == CODING_EOL_CR)
 854             {
 855               *dst++ = '\n';
 856             }
 857           else if (coding->eol_type == CODING_EOL_CRLF)
 858             {
 859               ONE_MORE_BYTE (c1);
 860               if (c1 == ISO_CODE_LF)
 861                 *dst++ = '\n';
 862               else
 863                 {
 864                   src--;
 865                   *dst++ = c1;
 866                 }
 867             }
 868           else
 869             {
 870               *dst++ = c1;
 871             }
 872           break;
 873
 874         case ISO_shift_out:
 875           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 876             goto label_invalid_escape_sequence;
 877           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 878           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 879           break;
 880
 881         case ISO_shift_in:
 882           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 883           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 884           break;
 885
 886         case ISO_single_shift_2_7:
 887         case ISO_single_shift_2:
 888           /* SS2 is handled as an escape sequence of ESC 'N' */
 889           c1 = 'N';
 890           goto label_escape_sequence;
 891
 892         case ISO_single_shift_3:
 893           /* SS2 is handled as an escape sequence of ESC 'O' */
 894           c1 = 'O';
 895           goto label_escape_sequence;
 896
 897         case ISO_control_sequence_introducer:
 898           /* CSI is handled as an escape sequence of ESC '[' ...  */
 899           c1 = '[';
 900           goto label_escape_sequence;
 901
 902         case ISO_escape:
 903           ONE_MORE_BYTE (c1);
 904         label_escape_sequence:
 905           /* Escape sequences handled by Emacs are invocation,
 906              designation, direction specification, and character
 907              composition specification.  */
 908           switch (c1)
 909             {
 910             case '&':           /* revision of following character set */
 911               ONE_MORE_BYTE (c1);
 912               if (!(c1 >= '@' && c1 <= '~'))
 913                 goto label_invalid_escape_sequence;
 914               ONE_MORE_BYTE (c1);
 915               if (c1 != ISO_CODE_ESC)
 916                 goto label_invalid_escape_sequence;
 917               ONE_MORE_BYTE (c1);
 918               goto label_escape_sequence;
 919
 920             case '$':           /* designation of 2-byte character set */
 921               ONE_MORE_BYTE (c1);
 922               if (c1 >= '@' && c1 <= 'B')
 923                 {       /* designation of JISX0208.1978, GB2312.1980,
 924                                    or JISX0208.1980 */
 925                   DECODE_DESIGNATION (0, 2, 94, c1);
 926                 }
 927               else if (c1 >= 0x28 && c1 <= 0x2B)
 928                 {       /* designation of DIMENSION2_CHARS94 character set */
 929                   ONE_MORE_BYTE (c2);
 930                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 931                 }
 932               else if (c1 >= 0x2C && c1 <= 0x2F)
 933                 {       /* designation of DIMENSION2_CHARS96 character set */
 934                   ONE_MORE_BYTE (c2);
 935                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 936                 }
 937               else
 938                 goto label_invalid_escape_sequence;
 939               break;
 940
 941             case 'n':           /* invocation of locking-shift-2 */
 942               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 943                 goto label_invalid_escape_sequence;
 944               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 945               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 946               break;
 947
 948             case 'o':           /* invocation of locking-shift-3 */
 949               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 950                 goto label_invalid_escape_sequence;
 951               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 952               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 953               break;
 954
 955             case 'N':           /* invocation of single-shift-2 */
 956               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 957                 goto label_invalid_escape_sequence;
 958               ONE_MORE_BYTE (c1);
 959               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 960               DECODE_ISO_CHARACTER (charset, c1);
 961               break;
 962
 963             case 'O':           /* invocation of single-shift-3 */
 964               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 965                 goto label_invalid_escape_sequence;
 966               ONE_MORE_BYTE (c1);
 967               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 968               DECODE_ISO_CHARACTER (charset, c1);
 969               break;
 970
 971             case '0':           /* start composing without embeded rules */
 972               coding->composing = COMPOSING_NO_RULE_HEAD;
 973               break;
 974
 975             case '1':           /* end composing */
 976               coding->composing = COMPOSING_NO;
 977               break;
 978
 979             case '2':           /* start composing with embeded rules */
 980               coding->composing = COMPOSING_WITH_RULE_HEAD;
 981               break;
 982
 983             case '[':           /* specification of direction */
 984               /* For the moment, nested direction is not supported.
 985                  So, the value of `coding->direction' is 0 or 1: 0
 986                  means left-to-right, 1 means right-to-left.  */
 987               ONE_MORE_BYTE (c1);
 988               switch (c1)
 989                 {
 990                 case ']':       /* end of the current direction */
 991                   coding->direction = 0;
 992
 993                 case '0':       /* end of the current direction */
 994                 case '1':       /* start of left-to-right direction */
 995                   ONE_MORE_BYTE (c1);
 996                   if (c1 == ']')
 997                     coding->direction = 0;
 998                   else
 999                     goto label_invalid_escape_sequence;
1000                   break;
1001
1002                 case '2':       /* start of right-to-left direction */
1003                   ONE_MORE_BYTE (c1);
1004                   if (c1 == ']')
1005                     coding->direction= 1;
1006                   else
1007                     goto label_invalid_escape_sequence;
1008                   break;
1009
1010                 default:
1011                   goto label_invalid_escape_sequence;
1012                 }
1013               break;
1014
1015             default:
1016               if (c1 >= 0x28 && c1 <= 0x2B)
1017                 {       /* designation of DIMENSION1_CHARS94 character set */
1018                   ONE_MORE_BYTE (c2);
1019                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1020                 }
1021               else if (c1 >= 0x2C && c1 <= 0x2F)
1022                 {       /* designation of DIMENSION1_CHARS96 character set */
1023                   ONE_MORE_BYTE (c2);
1024                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1025                 }
1026               else
1027                 {
1028                   goto label_invalid_escape_sequence;
1029                 }
1030             }
1031           /* We must update these variables now.  */
1032           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1033           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1034           break;
1035
1036         label_invalid_escape_sequence:
1037           {
1038             int length = src - src_base;
1039
1040             bcopy (src_base, dst, length);
1041             dst += length;
1042           }
1043         }
1044       continue;
1045
1046     label_end_of_loop:
1047       coding->carryover_size = src - src_base;
1048       bcopy (src_base, coding->carryover, coding->carryover_size);
1049       src = src_base;
1050       break;
1051     }
1052
1053   /* If this is the last block of the text to be decoded, we had
1054      better just flush out all remaining codes in the text although
1055      they are not valid characters.  */
1056   if (coding->last_block)
1057     {
1058       bcopy (src, dst, src_end - src);
1059       dst += (src_end - src);
1060       src = src_end;
1061     }
1062   *consumed = src - source;
1063   return dst - destination;
1064 }
1065
1066 /* ISO2022 encoding stuff.  */
1067
1068 /*
1069    It is not enough to say just "ISO2022" on encoding, we have to
1070    specify more details.  In Emacs, each coding-system of ISO2022
1071    variant has the following specifications:
1072         1. Initial designation to G0 thru G3.
1073         2. Allows short-form designation?
1074         3. ASCII should be designated to G0 before control characters?
1075         4. ASCII should be designated to G0 at end of line?
1076         5. 7-bit environment or 8-bit environment?
1077         6. Use locking-shift?
1078         7. Use Single-shift?
1079    And the following two are only for Japanese:
1080         8. Use ASCII in place of JIS0201-1976-Roman?
1081         9. Use JISX0208-1983 in place of JISX0208-1978?
1082    These specifications are encoded in `coding->flags' as flag bits
1083    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1084    details.
1085 */
1086
1087 /* Produce codes (escape sequence) for designating CHARSET to graphic
1088    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1089    the coding system CODING allows, produce designation sequence of
1090    short-form.  */
1091
1092 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1093   do {                                                                  \
1094     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1095     char *intermediate_char_94 = "()*+";                                \
1096     char *intermediate_char_96 = ",-./";                                \
1097     Lisp_Object temp                                                    \
1098       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1099     if (! NILP (temp))                                                  \
1100         {                                                               \
1101         *dst++ = ISO_CODE_ESC;                                          \
1102         *dst++ = '&';                                                   \
1103         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1104       }                                                                 \
1105     *dst++ = ISO_CODE_ESC;                                              \
1106     if (CHARSET_DIMENSION (charset) == 1)                               \
1107       {                                                                 \
1108         if (CHARSET_CHARS (charset) == 94)                              \
1109           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1110         else                                                            \
1111           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1112       }                                                                 \
1113     else                                                                \
1114       {                                                                 \
1115         *dst++ = '$';                                                   \
1116         if (CHARSET_CHARS (charset) == 94)                              \
1117           {                                                             \
1118             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1119                 || reg != 0                                             \
1120                 || final_char < '@' || final_char > 'B')                \
1121               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1122           }                                                             \
1123         else                                                            \
1124           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1125       }                                                                 \
1126     *dst++ = final_char;                                                \
1127     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1128   } while (0)
1129
1130 /* The following two macros produce codes (control character or escape
1131    sequence) for ISO2022 single-shift functions (single-shift-2 and
1132    single-shift-3).  */
1133
1134 #define ENCODE_SINGLE_SHIFT_2                           \
1135   do {                                                  \
1136     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1137       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1138     else                                                \
1139       *dst++ = ISO_CODE_SS2;                            \
1140     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1141   } while (0)
1142
1143 #define ENCODE_SINGLE_SHIFT_3                           \
1144   do {                                                  \
1145     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1146       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1147     else                                                \
1148       *dst++ = ISO_CODE_SS3;                            \
1149     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1150   } while (0)
1151
1152 /* The following four macros produce codes (control character or
1153    escape sequence) for ISO2022 locking-shift functions (shift-in,
1154    shift-out, locking-shift-2, and locking-shift-3).  */
1155
1156 #define ENCODE_SHIFT_IN                         \
1157   do {                                          \
1158     *dst++ = ISO_CODE_SI;                       \
1159     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1160   } while (0)
1161
1162 #define ENCODE_SHIFT_OUT                        \
1163   do {                                          \
1164     *dst++ = ISO_CODE_SO;                       \
1165     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1166   } while (0)
1167
1168 #define ENCODE_LOCKING_SHIFT_2                  \
1169   do {                                          \
1170     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1171     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1172   } while (0)
1173
1174 #define ENCODE_LOCKING_SHIFT_3                  \
1175   do {                                          \
1176     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1177     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1178   } while (0)
1179
1180 /* Produce codes for a DIMENSION1 character whose character set is
1181    CHARSET and whose position-code is C1.  Designation and invocation
1182    sequences are also produced in advance if necessary.  */
1183
1184
1185 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1186   do {                                                                  \
1187     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1188       {                                                                 \
1189         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1190           *dst++ = c1 & 0x7F;                                           \
1191         else                                                            \
1192           *dst++ = c1 | 0x80;                                           \
1193         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1194         break;                                                          \
1195       }                                                                 \
1196     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1197       {                                                                 \
1198         *dst++ = c1 & 0x7F;                                             \
1199         break;                                                          \
1200       }                                                                 \
1201     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1202       {                                                                 \
1203         *dst++ = c1 | 0x80;                                             \
1204         break;                                                          \
1205       }                                                                 \
1206     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1207              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1208       {                                                                 \
1209         /* We should not encode this character, instead produce one or  \
1210            two `?'s.  */                                                \
1211         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1212         if (CHARSET_WIDTH (charset) == 2)                               \
1213           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1214         break;                                                          \
1215       }                                                                 \
1216     else                                                                \
1217       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1218          must invoke it, or, at first, designate it to some graphic     \
1219          register.  Then repeat the loop to actually produce the        \
1220          character.  */                                                 \
1221       dst = encode_invocation_designation (charset, coding, dst);       \
1222   } while (1)
1223
1224 /* Produce codes for a DIMENSION2 character whose character set is
1225    CHARSET and whose position-codes are C1 and C2.  Designation and
1226    invocation codes are also produced in advance if necessary.  */
1227
1228 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1229   do {                                                                  \
1230     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1231       {                                                                 \
1232         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1233           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1234         else                                                            \
1235           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1236         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1237         break;                                                          \
1238       }                                                                 \
1239     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1240       {                                                                 \
1241         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1242         break;                                                          \
1243       }                                                                 \
1244     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1245       {                                                                 \
1246         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1247         break;                                                          \
1248       }                                                                 \
1249     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1250              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1251       {                                                                 \
1252         /* We should not encode this character, instead produce one or  \
1253            two `?'s.  */                                                \
1254         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1255         if (CHARSET_WIDTH (charset) == 2)                               \
1256           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1257         break;                                                          \
1258       }                                                                 \
1259     else                                                                \
1260       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1261          must invoke it, or, at first, designate it to some graphic     \
1262          register.  Then repeat the loop to actually produce the        \
1263          character.  */                                                 \
1264       dst = encode_invocation_designation (charset, coding, dst);       \
1265   } while (1)
1266
1267 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1268   do {                                                                    \
1269     int c_alt, charset_alt;                                               \
1270     if (!NILP (unification_table)                                         \
1271         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1272             >= 0))                                                        \
1273       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1274     else                                                                  \
1275       charset_alt = charset;                                              \
1276     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1277       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1278     else                                                                  \
1279       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1280   } while (0)
1281
1282 /* Produce designation and invocation codes at a place pointed by DST
1283    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1284    Return new DST.  */
1285
1286 unsigned char *
1287 encode_invocation_designation (charset, coding, dst)
1288      int charset;
1289      struct coding_system *coding;
1290      unsigned char *dst;
1291 {
1292   int reg;                      /* graphic register number */
1293
1294   /* At first, check designations.  */
1295   for (reg = 0; reg < 4; reg++)
1296     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1297       break;
1298
1299   if (reg >= 4)
1300     {
1301       /* CHARSET is not yet designated to any graphic registers.  */
1302       /* At first check the requested designation.  */
1303       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1304       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1305         /* Since CHARSET requests no special designation, designate it
1306            to graphic register 0.  */
1307         reg = 0;
1308
1309       ENCODE_DESIGNATION (charset, reg, coding);
1310     }
1311
1312   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1313       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1314     {
1315       /* Since the graphic register REG is not invoked to any graphic
1316          planes, invoke it to graphic plane 0.  */
1317       switch (reg)
1318         {
1319         case 0:                 /* graphic register 0 */
1320           ENCODE_SHIFT_IN;
1321           break;
1322
1323         case 1:                 /* graphic register 1 */
1324           ENCODE_SHIFT_OUT;
1325           break;
1326
1327         case 2:                 /* graphic register 2 */
1328           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1329             ENCODE_SINGLE_SHIFT_2;
1330           else
1331             ENCODE_LOCKING_SHIFT_2;
1332           break;
1333
1334         case 3:                 /* graphic register 3 */
1335           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1336             ENCODE_SINGLE_SHIFT_3;
1337           else
1338             ENCODE_LOCKING_SHIFT_3;
1339           break;
1340         }
1341     }
1342   return dst;
1343 }
1344
1345 /* The following two macros produce codes for indicating composition.  */
1346 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1347 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1348 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1349
1350 /* The following three macros produce codes for indicating direction
1351    of text.  */
1352 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1353   do {                                                  \
1354     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1355       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1356     else                                                \
1357       *dst++ = ISO_CODE_CSI;                            \
1358   } while (0)
1359
1360 #define ENCODE_DIRECTION_R2L    \
1361   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1362
1363 #define ENCODE_DIRECTION_L2R    \
1364   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1365
1366 /* Produce codes for designation and invocation to reset the graphic
1367    planes and registers to initial state.  */
1368 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1369   do {                                                                      \
1370     int reg;                                                                \
1371     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1372       ENCODE_SHIFT_IN;                                                      \
1373     for (reg = 0; reg < 4; reg++)                                           \
1374       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1375           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1376               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1377         ENCODE_DESIGNATION                                                  \
1378           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1379   } while (0)
1380
1381 /* Produce designation sequences of charsets in the line started from
1382    *SRC to a place pointed by DSTP.
1383
1384    If the current block ends before any end-of-line, we may fail to
1385    find all the necessary *designations.  */
1386 encode_designation_at_bol (coding, table, src, src_end, dstp)
1387      struct coding_system *coding;
1388      Lisp_Object table;
1389      unsigned char *src, *src_end, **dstp;
1390 {
1391   int charset, c, found = 0, reg;
1392   /* Table of charsets to be designated to each graphic register.  */
1393   int r[4];
1394   unsigned char *dst = *dstp;
1395
1396   for (reg = 0; reg < 4; reg++)
1397     r[reg] = -1;
1398
1399   while (src < src_end && *src != '\n' && found < 4)
1400     {
1401       int bytes = BYTES_BY_CHAR_HEAD (*src);
1402
1403       if (NILP (table))
1404         charset = CHARSET_AT (src);
1405       else
1406         {
1407           int c_alt, c1, c2;
1408
1409           SPLIT_STRING(src, bytes, charset, c1, c2);
1410           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1411             charset = CHAR_CHARSET (c_alt);
1412         }
1413
1414       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1415       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1416         {
1417           found++;
1418           r[reg] = charset;
1419         }
1420
1421       src += bytes;
1422     }
1423
1424   if (found)
1425     {
1426       for (reg = 0; reg < 4; reg++)
1427         if (r[reg] >= 0
1428             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1429           ENCODE_DESIGNATION (r[reg], reg, coding);
1430       *dstp = dst;
1431     }
1432 }
1433
1434 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1435
1436 int
1437 encode_coding_iso2022 (coding, source, destination,
1438                        src_bytes, dst_bytes, consumed)
1439      struct coding_system *coding;
1440      unsigned char *source, *destination;
1441      int src_bytes, dst_bytes;
1442      int *consumed;
1443 {
1444   unsigned char *src = source;
1445   unsigned char *src_end = source + src_bytes;
1446   unsigned char *dst = destination;
1447   unsigned char *dst_end = destination + dst_bytes;
1448   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1449      from DST_END to assure overflow checking is necessary only at the
1450      head of loop.  */
1451   unsigned char *adjusted_dst_end = dst_end - 19;
1452   Lisp_Object unification_table
1453       = coding->character_unification_table_for_encode;
1454
1455   if (!NILP (Venable_character_unification) && NILP (unification_table))
1456     unification_table = Vstandard_character_unification_table_for_encode;
1457
1458   while (src < src_end && dst < adjusted_dst_end)
1459     {
1460       /* SRC_BASE remembers the start position in source in each loop.
1461          The loop will be exited when there's not enough source text
1462          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1463          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1464          reset to SRC_BASE before exiting.  */
1465       unsigned char *src_base = src;
1466       int charset, c1, c2, c3, c4;
1467
1468       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1469           && CODING_SPEC_ISO_BOL (coding))
1470         {
1471           /* We have to produce designation sequences if any now.  */
1472           encode_designation_at_bol (coding, unification_table,
1473                                      src, src_end, &dst);
1474           CODING_SPEC_ISO_BOL (coding) = 0;
1475         }
1476
1477       c1 = *src++;
1478       /* If we are seeing a component of a composite character, we are
1479          seeing a leading-code specially encoded for composition, or a
1480          composition rule if composing with rule.  We must set C1
1481          to a normal leading-code or an ASCII code.  If we are not at
1482          a composed character, we must reset the composition state.  */
1483       if (COMPOSING_P (coding->composing))
1484         {
1485           if (c1 < 0xA0)
1486             {
1487               /* We are not in a composite character any longer.  */
1488               coding->composing = COMPOSING_NO;
1489               ENCODE_COMPOSITION_END;
1490             }
1491           else
1492             {
1493               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1494                 {
1495                   *dst++ = c1 & 0x7F;
1496                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1497                   continue;
1498                 }
1499               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1500                 coding->composing = COMPOSING_WITH_RULE_RULE;
1501               if (c1 == 0xA0)
1502                 {
1503                   /* This is an ASCII component.  */
1504                   ONE_MORE_BYTE (c1);
1505                   c1 &= 0x7F;
1506                 }
1507               else
1508                 /* This is a leading-code of non ASCII component.  */
1509                 c1 -= 0x20;
1510             }
1511         }
1512
1513       /* Now encode one character.  C1 is a control character, an
1514          ASCII character, or a leading-code of multi-byte character.  */
1515       switch (emacs_code_class[c1])
1516         {
1517         case EMACS_ascii_code:
1518           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1519           break;
1520
1521         case EMACS_control_code:
1522           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1523             ENCODE_RESET_PLANE_AND_REGISTER;
1524           *dst++ = c1;
1525           break;
1526
1527         case EMACS_carriage_return_code:
1528           if (!coding->selective)
1529             {
1530               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1531                 ENCODE_RESET_PLANE_AND_REGISTER;
1532               *dst++ = c1;
1533               break;
1534             }
1535           /* fall down to treat '\r' as '\n' ...  */
1536
1537         case EMACS_linefeed_code:
1538           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1539             ENCODE_RESET_PLANE_AND_REGISTER;
1540           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1541             bcopy (coding->spec.iso2022.initial_designation,
1542                    coding->spec.iso2022.current_designation,
1543                    sizeof coding->spec.iso2022.initial_designation);
1544           if (coding->eol_type == CODING_EOL_LF
1545               || coding->eol_type == CODING_EOL_UNDECIDED)
1546             *dst++ = ISO_CODE_LF;
1547           else if (coding->eol_type == CODING_EOL_CRLF)
1548             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1549           else
1550             *dst++ = ISO_CODE_CR;
1551           CODING_SPEC_ISO_BOL (coding) = 1;
1552           break;
1553
1554         case EMACS_leading_code_2:
1555           ONE_MORE_BYTE (c2);
1556           if (c2 < 0xA0)
1557             {
1558               /* invalid sequence */
1559               *dst++ = c1;
1560               *dst++ = c2;
1561             }
1562           else
1563             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1564           break;
1565
1566         case EMACS_leading_code_3:
1567           TWO_MORE_BYTES (c2, c3);
1568           if (c2 < 0xA0 || c3 < 0xA0)
1569             {
1570               /* invalid sequence */
1571               *dst++ = c1;
1572               *dst++ = c2;
1573               *dst++ = c3;
1574             }
1575           else if (c1 < LEADING_CODE_PRIVATE_11)
1576             ENCODE_ISO_CHARACTER (c1, c2, c3);
1577           else
1578             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1579           break;
1580
1581         case EMACS_leading_code_4:
1582           THREE_MORE_BYTES (c2, c3, c4);
1583           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1584             {
1585               /* invalid sequence */
1586               *dst++ = c1;
1587               *dst++ = c2;
1588               *dst++ = c3;
1589               *dst++ = c4;
1590             }
1591           else
1592             ENCODE_ISO_CHARACTER (c2, c3, c4);
1593           break;
1594
1595         case EMACS_leading_code_composition:
1596           ONE_MORE_BYTE (c2);
1597           if (c2 < 0xA0)
1598             {
1599               /* invalid sequence */
1600               *dst++ = c1;
1601               *dst++ = c2;
1602             }
1603           else if (c2 == 0xFF)
1604             {
1605               coding->composing = COMPOSING_WITH_RULE_HEAD;
1606               ENCODE_COMPOSITION_WITH_RULE_START;
1607             }
1608           else
1609             {
1610               /* Rewind one byte because it is a character code of
1611                  composition elements.  */
1612               src--;
1613               coding->composing = COMPOSING_NO_RULE_HEAD;
1614               ENCODE_COMPOSITION_NO_RULE_START;
1615             }
1616           break;
1617
1618         case EMACS_invalid_code:
1619           *dst++ = c1;
1620           break;
1621         }
1622       continue;
1623     label_end_of_loop:
1624       /* We reach here because the source date ends not at character
1625          boundary.  */
1626       coding->carryover_size = src_end - src_base;
1627       bcopy (src_base, coding->carryover, coding->carryover_size);
1628       src = src_end;
1629       break;
1630     }
1631
1632   /* If this is the last block of the text to be encoded, we must
1633      reset graphic planes and registers to the initial state.  */
1634   if (src >= src_end && coding->last_block)
1635     {
1636       ENCODE_RESET_PLANE_AND_REGISTER;
1637       if (coding->carryover_size > 0
1638           && coding->carryover_size < (dst_end - dst))
1639         {
1640           bcopy (coding->carryover, dst, coding->carryover_size);
1641           dst += coding->carryover_size;
1642           coding->carryover_size = 0;
1643         }
1644     }
1645   *consumed = src - source;
1646   return dst - destination;
1647 }
1648
1649 \f
1650 /*** 4. SJIS and BIG5 handlers ***/
1651
1652 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1653    quite widely.  So, for the moment, Emacs supports them in the bare
1654    C code.  But, in the future, they may be supported only by CCL.  */
1655
1656 /* SJIS is a coding system encoding three character sets: ASCII, right
1657    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1658    as is.  A character of charset katakana-jisx0201 is encoded by
1659    "position-code + 0x80".  A character of charset japanese-jisx0208
1660    is encoded in 2-byte but two position-codes are divided and shifted
1661    so that it fit in the range below.
1662
1663    --- CODE RANGE of SJIS ---
1664    (character set)      (range)
1665    ASCII                0x00 .. 0x7F
1666    KATAKANA-JISX0201    0xA0 .. 0xDF
1667    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1668             (2nd byte)  0x40 .. 0xFF
1669    -------------------------------
1670
1671 */
1672
1673 /* BIG5 is a coding system encoding two character sets: ASCII and
1674    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1675    character set and is encoded in two-byte.
1676
1677    --- CODE RANGE of BIG5 ---
1678    (character set)      (range)
1679    ASCII                0x00 .. 0x7F
1680    Big5 (1st byte)      0xA1 .. 0xFE
1681         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1682    --------------------------
1683
1684    Since the number of characters in Big5 is larger than maximum
1685    characters in Emacs' charset (96x96), it can't be handled as one
1686    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1687    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1688    contains frequently used characters and the latter contains less
1689    frequently used characters.  */
1690
1691 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1692    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1693    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1694    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1695
1696 /* Number of Big5 characters which have the same code in 1st byte.  */
1697 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1698
1699 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1700   do {                                                                  \
1701     unsigned int temp                                                   \
1702       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1703     if (b1 < 0xC9)                                                      \
1704       charset = charset_big5_1;                                         \
1705     else                                                                \
1706       {                                                                 \
1707         charset = charset_big5_2;                                       \
1708         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1709       }                                                                 \
1710     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1711     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1712   } while (0)
1713
1714 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1715   do {                                                                  \
1716     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1717     if (charset == charset_big5_2)                                      \
1718       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1719     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1720     b2 = temp % BIG5_SAME_ROW;                                          \
1721     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1722   } while (0)
1723
1724 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1725   do {                                                                  \
1726     int c_alt, charset_alt = (charset);                                 \
1727     if (!NILP (unification_table)                                       \
1728         && ((c_alt = unify_char (unification_table,                     \
1729                                  -1, (charset), c1, c2)) >= 0))         \
1730           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1731     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1732       DECODE_CHARACTER_ASCII (c1);                                      \
1733     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1734       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1735     else                                                                \
1736       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1737   } while (0)
1738
1739 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1740   do {                                                                    \
1741     int c_alt, charset_alt;                                               \
1742     if (!NILP (unification_table)                                         \
1743         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1744             >= 0))                                                        \
1745       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1746     else                                                                  \
1747       charset_alt = charset;                                              \
1748     if (charset_alt == charset_ascii)                                     \
1749       *dst++ = c1;                                                        \
1750     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1751       {                                                                   \
1752         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1753           *dst++ = c1;                                                    \
1754         else                                                              \
1755           *dst++ = charset_alt, *dst++ = c1;                              \
1756       }                                                                   \
1757     else                                                                  \
1758       {                                                                   \
1759         c1 &= 0x7F, c2 &= 0x7F;                                           \
1760         if (sjis_p && charset_alt == charset_jisx0208)                    \
1761           {                                                               \
1762             unsigned char s1, s2;                                         \
1763                                                                           \
1764             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1765             *dst++ = s1, *dst++ = s2;                                     \
1766           }                                                               \
1767         else if (!sjis_p                                                  \
1768                  && (charset_alt == charset_big5_1                        \
1769                      || charset_alt == charset_big5_2))                   \
1770           {                                                               \
1771             unsigned char b1, b2;                                         \
1772                                                                           \
1773             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1774             *dst++ = b1, *dst++ = b2;                                     \
1775           }                                                               \
1776         else                                                              \
1777           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1778       }                                                                   \
1779   } while (0);
1780
1781 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1782    Check if a text is encoded in SJIS.  If it is, return
1783    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1784
1785 int
1786 detect_coding_sjis (src, src_end)
1787      unsigned char *src, *src_end;
1788 {
1789   unsigned char c;
1790
1791   while (src < src_end)
1792     {
1793       c = *src++;
1794       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1795         return 0;
1796       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1797         {
1798           if (src < src_end && *src++ < 0x40)
1799             return 0;
1800         }
1801     }
1802   return CODING_CATEGORY_MASK_SJIS;
1803 }
1804
1805 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1806    Check if a text is encoded in BIG5.  If it is, return
1807    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1808
1809 int
1810 detect_coding_big5 (src, src_end)
1811      unsigned char *src, *src_end;
1812 {
1813   unsigned char c;
1814
1815   while (src < src_end)
1816     {
1817       c = *src++;
1818       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1819         return 0;
1820       if (c >= 0xA1)
1821         {
1822           if (src >= src_end)
1823             break;
1824           c = *src++;
1825           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1826             return 0;
1827         }
1828     }
1829   return CODING_CATEGORY_MASK_BIG5;
1830 }
1831
1832 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1833    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1834
1835 int
1836 decode_coding_sjis_big5 (coding, source, destination,
1837                          src_bytes, dst_bytes, consumed, sjis_p)
1838      struct coding_system *coding;
1839      unsigned char *source, *destination;
1840      int src_bytes, dst_bytes;
1841      int *consumed;
1842      int sjis_p;
1843 {
1844   unsigned char *src = source;
1845   unsigned char *src_end = source + src_bytes;
1846   unsigned char *dst = destination;
1847   unsigned char *dst_end = destination + dst_bytes;
1848   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1849      from DST_END to assure overflow checking is necessary only at the
1850      head of loop.  */
1851   unsigned char *adjusted_dst_end = dst_end - 3;
1852   Lisp_Object unification_table
1853       = coding->character_unification_table_for_decode;
1854
1855   if (!NILP (Venable_character_unification) && NILP (unification_table))
1856     unification_table = Vstandard_character_unification_table_for_decode;
1857
1858   while (src < src_end && dst < adjusted_dst_end)
1859     {
1860       /* SRC_BASE remembers the start position in source in each loop.
1861          The loop will be exited when there's not enough source text
1862          to analyze two-byte character (within macro ONE_MORE_BYTE).
1863          In that case, SRC is reset to SRC_BASE before exiting.  */
1864       unsigned char *src_base = src;
1865       unsigned char c1 = *src++, c2, c3, c4;
1866
1867       if (c1 == '\r')
1868         {
1869           if (coding->eol_type == CODING_EOL_CRLF)
1870             {
1871               ONE_MORE_BYTE (c2);
1872               if (c2 == '\n')
1873                 *dst++ = c2;
1874               else
1875                 /* To process C2 again, SRC is subtracted by 1.  */
1876                 *dst++ = c1, src--;
1877             }
1878           else
1879             *dst++ = c1;
1880         }
1881       else if (c1 < 0x20)
1882         *dst++ = c1;
1883       else if (c1 < 0x80)
1884         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1885       else if (c1 < 0xA0 || c1 >= 0xE0)
1886         {
1887           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1888           if (sjis_p)
1889             {
1890               ONE_MORE_BYTE (c2);
1891               DECODE_SJIS (c1, c2, c3, c4);
1892               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1893             }
1894           else if (c1 >= 0xE0 && c1 < 0xFF)
1895             {
1896               int charset;
1897
1898               ONE_MORE_BYTE (c2);
1899               DECODE_BIG5 (c1, c2, charset, c3, c4);
1900               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1901             }
1902           else                  /* Invalid code */
1903             *dst++ = c1;
1904         }
1905       else
1906         {
1907           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1908           if (sjis_p)
1909             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1910           else
1911             {
1912               int charset;
1913
1914               ONE_MORE_BYTE (c2);
1915               DECODE_BIG5 (c1, c2, charset, c3, c4);
1916               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1917             }
1918         }
1919       continue;
1920
1921     label_end_of_loop:
1922       coding->carryover_size = src - src_base;
1923       bcopy (src_base, coding->carryover, coding->carryover_size);
1924       src = src_base;
1925       break;
1926     }
1927
1928   *consumed = src - source;
1929   return dst - destination;
1930 }
1931
1932 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1933    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1934    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1935    sure that all these charsets are registered as official charset
1936    (i.e. do not have extended leading-codes).  Characters of other
1937    charsets are produced without any encoding.  If SJIS_P is 1, encode
1938    SJIS text, else encode BIG5 text.  */
1939
1940 int
1941 encode_coding_sjis_big5 (coding, source, destination,
1942                          src_bytes, dst_bytes, consumed, sjis_p)
1943      struct coding_system *coding;
1944      unsigned char *source, *destination;
1945      int src_bytes, dst_bytes;
1946      int *consumed;
1947      int sjis_p;
1948 {
1949   unsigned char *src = source;
1950   unsigned char *src_end = source + src_bytes;
1951   unsigned char *dst = destination;
1952   unsigned char *dst_end = destination + dst_bytes;
1953   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1954      from DST_END to assure overflow checking is necessary only at the
1955      head of loop.  */
1956   unsigned char *adjusted_dst_end = dst_end - 1;
1957   Lisp_Object unification_table
1958       = coding->character_unification_table_for_encode;
1959
1960   if (!NILP (Venable_character_unification) && NILP (unification_table))
1961     unification_table = Vstandard_character_unification_table_for_encode;
1962
1963   while (src < src_end && dst < adjusted_dst_end)
1964     {
1965       /* SRC_BASE remembers the start position in source in each loop.
1966          The loop will be exited when there's not enough source text
1967          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1968          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1969          before exiting.  */
1970       unsigned char *src_base = src;
1971       unsigned char c1 = *src++, c2, c3, c4;
1972
1973       if (coding->composing)
1974         {
1975           if (c1 == 0xA0)
1976             {
1977               ONE_MORE_BYTE (c1);
1978               c1 &= 0x7F;
1979             }
1980           else if (c1 >= 0xA0)
1981             c1 -= 0x20;
1982           else
1983             coding->composing = 0;
1984         }
1985
1986       switch (emacs_code_class[c1])
1987         {
1988         case EMACS_ascii_code:
1989           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1990           break;
1991
1992         case EMACS_control_code:
1993           *dst++ = c1;
1994           break;
1995
1996         case EMACS_carriage_return_code:
1997           if (!coding->selective)
1998             {
1999               *dst++ = c1;
2000               break;
2001             }
2002           /* fall down to treat '\r' as '\n' ...  */
2003
2004         case EMACS_linefeed_code:
2005           if (coding->eol_type == CODING_EOL_LF
2006               || coding->eol_type == CODING_EOL_UNDECIDED)
2007             *dst++ = '\n';
2008           else if (coding->eol_type == CODING_EOL_CRLF)
2009             *dst++ = '\r', *dst++ = '\n';
2010           else
2011             *dst++ = '\r';
2012           break;
2013
2014         case EMACS_leading_code_2:
2015           ONE_MORE_BYTE (c2);
2016           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2017           break;
2018
2019         case EMACS_leading_code_3:
2020           TWO_MORE_BYTES (c2, c3);
2021           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2022           break;
2023
2024         case EMACS_leading_code_4:
2025           THREE_MORE_BYTES (c2, c3, c4);
2026           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2027           break;
2028
2029         case EMACS_leading_code_composition:
2030           coding->composing = 1;
2031           break;
2032
2033         default:                /* i.e. case EMACS_invalid_code: */
2034           *dst++ = c1;
2035         }
2036       continue;
2037
2038     label_end_of_loop:
2039       coding->carryover_size = src_end - src_base;
2040       bcopy (src_base, coding->carryover, coding->carryover_size);
2041       src = src_end;
2042       break;
2043     }
2044
2045   *consumed = src - source;
2046   return dst - destination;
2047 }
2048
2049 \f
2050 /*** 5. End-of-line handlers ***/
2051
2052 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2053    This function is called only when `coding->eol_type' is
2054    CODING_EOL_CRLF or CODING_EOL_CR.  */
2055
2056 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2057      struct coding_system *coding;
2058      unsigned char *source, *destination;
2059      int src_bytes, dst_bytes;
2060      int *consumed;
2061 {
2062   unsigned char *src = source;
2063   unsigned char *src_end = source + src_bytes;
2064   unsigned char *dst = destination;
2065   unsigned char *dst_end = destination + dst_bytes;
2066   int produced;
2067
2068   switch (coding->eol_type)
2069     {
2070     case CODING_EOL_CRLF:
2071       {
2072         /* Since the maximum bytes produced by each loop is 2, we
2073            subtract 1 from DST_END to assure overflow checking is
2074            necessary only at the head of loop.  */
2075         unsigned char *adjusted_dst_end = dst_end - 1;
2076
2077         while (src < src_end && dst < adjusted_dst_end)
2078           {
2079             unsigned char *src_base = src;
2080             unsigned char c = *src++;
2081             if (c == '\r')
2082               {
2083                 ONE_MORE_BYTE (c);
2084                 if (c != '\n')
2085                   *dst++ = '\r';
2086                 *dst++ = c;
2087               }
2088             else
2089               *dst++ = c;
2090             continue;
2091
2092           label_end_of_loop:
2093             coding->carryover_size = src - src_base;
2094             bcopy (src_base, coding->carryover, coding->carryover_size);
2095             src = src_base;
2096             break;
2097           }
2098         *consumed = src - source;
2099         produced = dst - destination;
2100         break;
2101       }
2102
2103     case CODING_EOL_CR:
2104       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2105       bcopy (source, destination, produced);
2106       dst_end = destination + produced;
2107       while (dst < dst_end)
2108         if (*dst++ == '\r') dst[-1] = '\n';
2109       *consumed = produced;
2110       break;
2111
2112     default:                    /* i.e. case: CODING_EOL_LF */
2113       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2114       bcopy (source, destination, produced);
2115       *consumed = produced;
2116       break;
2117     }
2118
2119   return produced;
2120 }
2121
2122 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2123    format of end-of-line according to `coding->eol_type'.  If
2124    `coding->selective' is 1, code '\r' in source text also means
2125    end-of-line.  */
2126
2127 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2128      struct coding_system *coding;
2129      unsigned char *source, *destination;
2130      int src_bytes, dst_bytes;
2131      int *consumed;
2132 {
2133   unsigned char *src = source;
2134   unsigned char *dst = destination;
2135   int produced;
2136
2137   if (src_bytes <= 0)
2138     return 0;
2139
2140   switch (coding->eol_type)
2141     {
2142     case CODING_EOL_LF:
2143     case CODING_EOL_UNDECIDED:
2144       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2145       bcopy (source, destination, produced);
2146       if (coding->selective)
2147         {
2148           int i = produced;
2149           while (i--)
2150             if (*dst++ == '\r') dst[-1] = '\n';
2151         }
2152       *consumed = produced;
2153
2154     case CODING_EOL_CRLF:
2155       {
2156         unsigned char c;
2157         unsigned char *src_end = source + src_bytes;
2158         unsigned char *dst_end = destination + dst_bytes;
2159         /* Since the maximum bytes produced by each loop is 2, we
2160            subtract 1 from DST_END to assure overflow checking is
2161            necessary only at the head of loop.  */
2162         unsigned char *adjusted_dst_end = dst_end - 1;
2163
2164         while (src < src_end && dst < adjusted_dst_end)
2165           {
2166             c = *src++;
2167             if (c == '\n' || (c == '\r' && coding->selective))
2168               *dst++ = '\r', *dst++ = '\n';
2169             else
2170               *dst++ = c;
2171           }
2172         produced = dst - destination;
2173         *consumed = src - source;
2174         break;
2175       }
2176
2177     default:                    /* i.e. case CODING_EOL_CR: */
2178       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2179       bcopy (source, destination, produced);
2180       {
2181         int i = produced;
2182         while (i--)
2183           if (*dst++ == '\n') dst[-1] = '\r';
2184       }
2185       *consumed = produced;
2186     }
2187
2188   return produced;
2189 }
2190
2191 \f
2192 /*** 6. C library functions ***/
2193
2194 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2195    has a property `coding-system'.  The value of this property is a
2196    vector of length 5 (called as coding-vector).  Among elements of
2197    this vector, the first (element[0]) and the fifth (element[4])
2198    carry important information for decoding/encoding.  Before
2199    decoding/encoding, this information should be set in fields of a
2200    structure of type `coding_system'.
2201
2202    A value of property `coding-system' can be a symbol of another
2203    subsidiary coding-system.  In that case, Emacs gets coding-vector
2204    from that symbol.
2205
2206    `element[0]' contains information to be set in `coding->type'.  The
2207    value and its meaning is as follows:
2208
2209    0 -- coding_type_emacs_mule
2210    1 -- coding_type_sjis
2211    2 -- coding_type_iso2022
2212    3 -- coding_type_big5
2213    4 -- coding_type_ccl encoder/decoder written in CCL
2214    nil -- coding_type_no_conversion
2215    t -- coding_type_undecided (automatic conversion on decoding,
2216                                no-conversion on encoding)
2217
2218    `element[4]' contains information to be set in `coding->flags' and
2219    `coding->spec'.  The meaning varies by `coding->type'.
2220
2221    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2222    of length 32 (of which the first 13 sub-elements are used now).
2223    Meanings of these sub-elements are:
2224
2225    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2226         If the value is an integer of valid charset, the charset is
2227         assumed to be designated to graphic register N initially.
2228
2229         If the value is minus, it is a minus value of charset which
2230         reserves graphic register N, which means that the charset is
2231         not designated initially but should be designated to graphic
2232         register N just before encoding a character in that charset.
2233
2234         If the value is nil, graphic register N is never used on
2235         encoding.
2236
2237    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2238         Each value takes t or nil.  See the section ISO2022 of
2239         `coding.h' for more information.
2240
2241    If `coding->type' is `coding_type_big5', element[4] is t to denote
2242    BIG5-ETen or nil to denote BIG5-HKU.
2243
2244    If `coding->type' takes the other value, element[4] is ignored.
2245
2246    Emacs Lisp's coding system also carries information about format of
2247    end-of-line in a value of property `eol-type'.  If the value is
2248    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2249    means CODING_EOL_CR.  If it is not integer, it should be a vector
2250    of subsidiary coding systems of which property `eol-type' has one
2251    of above values.
2252
2253 */
2254
2255 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2256    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2257    is setup so that no conversion is necessary and return -1, else
2258    return 0.  */
2259
2260 int
2261 setup_coding_system (coding_system, coding)
2262      Lisp_Object coding_system;
2263      struct coding_system *coding;
2264 {
2265   Lisp_Object type, eol_type;
2266
2267   /* At first, set several fields to default values.  */
2268   coding->require_flushing = 0;
2269   coding->last_block = 0;
2270   coding->selective = 0;
2271   coding->composing = 0;
2272   coding->direction = 0;
2273   coding->carryover_size = 0;
2274   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2275   coding->character_unification_table_for_decode = Qnil;
2276   coding->character_unification_table_for_encode = Qnil;
2277
2278   Vlast_coding_system_used = coding->symbol = coding_system;
2279   eol_type = Qnil;
2280   /* Get value of property `coding-system' until we get a vector.
2281      While doing that, also get values of properties
2282      `post-read-conversion', `pre-write-conversion',
2283      `character-unification-table-for-decode',
2284      `character-unification-table-for-encode' and `eol-type'.  */
2285   while (!NILP (coding_system) && SYMBOLP (coding_system))
2286     {
2287       if (NILP (coding->post_read_conversion))
2288         coding->post_read_conversion = Fget (coding_system,
2289                                              Qpost_read_conversion);
2290       if (NILP (coding->pre_write_conversion))
2291         coding->pre_write_conversion = Fget (coding_system,
2292                                              Qpre_write_conversion);
2293       if (!inhibit_eol_conversion && NILP (eol_type))
2294         eol_type = Fget (coding_system, Qeol_type);
2295
2296       if (NILP (coding->character_unification_table_for_decode))
2297         coding->character_unification_table_for_decode
2298           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2299
2300       if (NILP (coding->character_unification_table_for_encode))
2301         coding->character_unification_table_for_encode
2302           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2303
2304       coding_system = Fget (coding_system, Qcoding_system);
2305     }
2306
2307   while (!NILP (coding->character_unification_table_for_decode)
2308          && SYMBOLP (coding->character_unification_table_for_decode))
2309         coding->character_unification_table_for_decode
2310           = Fget (coding->character_unification_table_for_decode,
2311                   Qcharacter_unification_table_for_decode);
2312   if (!NILP (coding->character_unification_table_for_decode)
2313       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2314       coding->character_unification_table_for_decode = Qnil;
2315
2316   while (!NILP (coding->character_unification_table_for_encode)
2317          && SYMBOLP (coding->character_unification_table_for_encode))
2318         coding->character_unification_table_for_encode
2319           = Fget (coding->character_unification_table_for_encode,
2320                   Qcharacter_unification_table_for_encode);
2321   if (!NILP (coding->character_unification_table_for_encode)
2322       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2323       coding->character_unification_table_for_encode = Qnil;
2324
2325   if (!VECTORP (coding_system)
2326       || XVECTOR (coding_system)->size != 5)
2327     goto label_invalid_coding_system;
2328
2329   if (VECTORP (eol_type))
2330     coding->eol_type = CODING_EOL_UNDECIDED;
2331   else if (XFASTINT (eol_type) == 1)
2332     coding->eol_type = CODING_EOL_CRLF;
2333   else if (XFASTINT (eol_type) == 2)
2334     coding->eol_type = CODING_EOL_CR;
2335   else
2336     coding->eol_type = CODING_EOL_LF;
2337
2338   type = XVECTOR (coding_system)->contents[0];
2339   switch (XFASTINT (type))
2340     {
2341     case 0:
2342       coding->type = coding_type_emacs_mule;
2343       break;
2344
2345     case 1:
2346       coding->type = coding_type_sjis;
2347       break;
2348
2349     case 2:
2350       coding->type = coding_type_iso2022;
2351       {
2352         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2353         Lisp_Object *flags;
2354         int i, charset, default_reg_bits = 0;
2355
2356         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2357           goto label_invalid_coding_system;
2358
2359         flags = XVECTOR (val)->contents;
2360         coding->flags
2361           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2362              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2363              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2364              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2365              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2366              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2367              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2368              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2369              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2370              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2371              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2372              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2373              );
2374
2375         /* Invoke graphic register 0 to plane 0.  */
2376         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2377         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2378         CODING_SPEC_ISO_INVOCATION (coding, 1)
2379           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2380         /* Not single shifting at first.  */
2381         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2382         /* Beginning of buffer should also be regarded as bol. */
2383         CODING_SPEC_ISO_BOL (coding) = 1;
2384
2385         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2386            FLAGS[REG] can be one of below:
2387                 integer CHARSET: CHARSET occupies register I,
2388                 t: designate nothing to REG initially, but can be used
2389                   by any charsets,
2390                 list of integer, nil, or t: designate the first
2391                   element (if integer) to REG initially, the remaining
2392                   elements (if integer) is designated to REG on request,
2393                   if an element is t, REG can be used by any charset,
2394                 nil: REG is never used.  */
2395         for (charset = 0; charset <= MAX_CHARSET; charset++)
2396           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2397             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2398         bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2399         for (i = 0; i < 4; i++)
2400           {
2401             if (INTEGERP (flags[i])
2402                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2403                 || (charset = get_charset_id (flags[i])) >= 0)
2404               {
2405                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2406                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2407                 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2408               }
2409             else if (EQ (flags[i], Qt))
2410               {
2411                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2412                 default_reg_bits |= 1 << i;
2413               }
2414             else if (CONSP (flags[i]))
2415               {
2416                 Lisp_Object tail = flags[i];
2417
2418                 if (INTEGERP (XCONS (tail)->car)
2419                     && (charset = XINT (XCONS (tail)->car),
2420                         CHARSET_VALID_P (charset))
2421                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2422                   {
2423                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2424                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2425                     CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2426                   }
2427                 else
2428                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2429                 tail = XCONS (tail)->cdr;
2430                 while (CONSP (tail))
2431                   {
2432                     if (INTEGERP (XCONS (tail)->car)
2433                         && (charset = XINT (XCONS (tail)->car),
2434                             CHARSET_VALID_P (charset))
2435                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2436                       {
2437                         CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2438                           = i;
2439                         CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2440                           = 1;
2441                       }
2442                     else if (EQ (XCONS (tail)->car, Qt))
2443                       default_reg_bits |= 1 << i;
2444                     tail = XCONS (tail)->cdr;
2445                   }
2446               }
2447             else
2448               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2449
2450             CODING_SPEC_ISO_DESIGNATION (coding, i)
2451               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2452           }
2453
2454         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2455           {
2456             /* REG 1 can be used only by locking shift in 7-bit env.  */
2457             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2458               default_reg_bits &= ~2;
2459             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2460               /* Without any shifting, only REG 0 and 1 can be used.  */
2461               default_reg_bits &= 3;
2462           }
2463
2464         for (charset = 0; charset <= MAX_CHARSET; charset++)
2465           if (CHARSET_VALID_P (charset)
2466               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2467                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2468             {
2469               /* We have not yet decided where to designate CHARSET.  */
2470               int reg_bits = default_reg_bits;
2471
2472               if (CHARSET_CHARS (charset) == 96)
2473                 /* A charset of CHARS96 can't be designated to REG 0.  */
2474                 reg_bits &= ~1;
2475
2476               if (reg_bits)
2477                 /* There exist some default graphic register.  */
2478                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2479                   = (reg_bits & 1
2480                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2481               else
2482                 /* We anyway have to designate CHARSET to somewhere.  */
2483                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2484                   = (CHARSET_CHARS (charset) == 94
2485                      ? 0
2486                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2487                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2488                         ? 1
2489                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2490                            ? 2 : 0)));
2491             }
2492       }
2493       coding->require_flushing = 1;
2494       break;
2495
2496     case 3:
2497       coding->type = coding_type_big5;
2498       coding->flags
2499         = (NILP (XVECTOR (coding_system)->contents[4])
2500            ? CODING_FLAG_BIG5_HKU
2501            : CODING_FLAG_BIG5_ETEN);
2502       break;
2503
2504     case 4:
2505       coding->type = coding_type_ccl;
2506       {
2507         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2508         if (CONSP  (val)
2509             && VECTORP (XCONS (val)->car)
2510             && VECTORP (XCONS (val)->cdr))
2511           {
2512             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2513             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2514           }
2515         else
2516           goto label_invalid_coding_system;
2517       }
2518       coding->require_flushing = 1;
2519       break;
2520
2521     default:
2522       if (EQ (type, Qt))
2523         coding->type = coding_type_undecided;
2524       else
2525         coding->type = coding_type_no_conversion;
2526       break;
2527     }
2528   return 0;
2529
2530  label_invalid_coding_system:
2531   coding->type = coding_type_no_conversion;
2532   coding->eol_type = CODING_EOL_LF;
2533   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2534     = Qnil;
2535   return -1;
2536 }
2537
2538 /* Emacs has a mechanism to automatically detect a coding system if it
2539    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2540    it's impossible to distinguish some coding systems accurately
2541    because they use the same range of codes.  So, at first, coding
2542    systems are categorized into 7, those are:
2543
2544    o coding-category-emacs-mule
2545
2546         The category for a coding system which has the same code range
2547         as Emacs' internal format.  Assigned the coding-system (Lisp
2548         symbol) `emacs-mule' by default.
2549
2550    o coding-category-sjis
2551
2552         The category for a coding system which has the same code range
2553         as SJIS.  Assigned the coding-system (Lisp
2554         symbol) `japanese-shift-jis' by default.
2555
2556    o coding-category-iso-7
2557
2558         The category for a coding system which has the same code range
2559         as ISO2022 of 7-bit environment.  This doesn't use any locking
2560         shift and single shift functions.  Assigned the coding-system
2561         (Lisp symbol) `iso-2022-7bit' by default.
2562
2563    o coding-category-iso-8-1
2564
2565         The category for a coding system which has the same code range
2566         as ISO2022 of 8-bit environment and graphic plane 1 used only
2567         for DIMENSION1 charset.  This doesn't use any locking shift
2568         and single shift functions.  Assigned the coding-system (Lisp
2569         symbol) `iso-latin-1' by default.
2570
2571    o coding-category-iso-8-2
2572
2573         The category for a coding system which has the same code range
2574         as ISO2022 of 8-bit environment and graphic plane 1 used only
2575         for DIMENSION2 charset.  This doesn't use any locking shift
2576         and single shift functions.  Assigned the coding-system (Lisp
2577         symbol) `japanese-iso-8bit' by default.
2578
2579    o coding-category-iso-7-else
2580
2581         The category for a coding system which has the same code range
2582         as ISO2022 of 7-bit environemnt but uses locking shift or
2583         single shift functions.  Assigned the coding-system (Lisp
2584         symbol) `iso-2022-7bit-lock' by default.
2585
2586    o coding-category-iso-8-else
2587
2588         The category for a coding system which has the same code range
2589         as ISO2022 of 8-bit environemnt but uses locking shift or
2590         single shift functions.  Assigned the coding-system (Lisp
2591         symbol) `iso-2022-8bit-ss2' by default.
2592
2593    o coding-category-big5
2594
2595         The category for a coding system which has the same code range
2596         as BIG5.  Assigned the coding-system (Lisp symbol)
2597         `cn-big5' by default.
2598
2599    o coding-category-binary
2600
2601         The category for a coding system not categorized in any of the
2602         above.  Assigned the coding-system (Lisp symbol)
2603         `no-conversion' by default.
2604
2605    Each of them is a Lisp symbol and the value is an actual
2606    `coding-system's (this is also a Lisp symbol) assigned by a user.
2607    What Emacs does actually is to detect a category of coding system.
2608    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2609    decide only one possible category, it selects a category of the
2610    highest priority.  Priorities of categories are also specified by a
2611    user in a Lisp variable `coding-category-list'.
2612
2613 */
2614
2615 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2616    If it detects possible coding systems, return an integer in which
2617    appropriate flag bits are set.  Flag bits are defined by macros
2618    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2619
2620 int
2621 detect_coding_mask (src, src_bytes)
2622      unsigned char *src;
2623      int src_bytes;
2624 {
2625   register unsigned char c;
2626   unsigned char *src_end = src + src_bytes;
2627   int mask;
2628
2629   /* At first, skip all ASCII characters and control characters except
2630      for three ISO2022 specific control characters.  */
2631  label_loop_detect_coding:
2632   while (src < src_end)
2633     {
2634       c = *src;
2635       if (c >= 0x80
2636           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2637         break;
2638       src++;
2639     }
2640
2641   if (src >= src_end)
2642     /* We found nothing other than ASCII.  There's nothing to do.  */
2643     return CODING_CATEGORY_MASK_ANY;
2644
2645   /* The text seems to be encoded in some multilingual coding system.
2646      Now, try to find in which coding system the text is encoded.  */
2647   if (c < 0x80)
2648     {
2649       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2650       /* C is an ISO2022 specific control code of C0.  */
2651       mask = detect_coding_iso2022 (src, src_end);
2652       src++;
2653       if (mask == CODING_CATEGORY_MASK_ANY)
2654         /* No valid ISO2022 code follows C.  Try again.  */
2655         goto label_loop_detect_coding;
2656     }
2657   else if (c < 0xA0)
2658     {
2659       /* If C is a special Microsoft code,
2660          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2661          or is an ISO2022 control-sequence-introducer (CSI),
2662          we should also consider the possibility of someof ISO2022 codings.  */
2663       if ((VECTORP (Vmicrosoft_code_table)
2664            && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
2665           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2666           || (c == ISO_CODE_CSI
2667               && (src < src_end
2668                   && (*src == ']'
2669                       || (src + 1 < src_end
2670                           && src[1] == ']'
2671                           && (*src == '0' || *src == '1' || *src == '2'))))))
2672         mask = (detect_coding_iso2022 (src, src_end)
2673                 | detect_coding_sjis (src, src_end)
2674                 | detect_coding_emacs_mule (src, src_end)
2675                 | CODING_CATEGORY_MASK_BINARY);
2676
2677       else
2678         /* C is the first byte of SJIS character code, or a
2679            leading-code of Emacs.  */
2680         mask = (detect_coding_sjis (src, src_end)
2681                 | detect_coding_emacs_mule (src, src_end)
2682                 | CODING_CATEGORY_MASK_BINARY);
2683     }
2684   else
2685     /* C is a character of ISO2022 in graphic plane right,
2686        or a SJIS's 1-byte character code (i.e. JISX0201),
2687        or the first byte of BIG5's 2-byte code.  */
2688     mask = (detect_coding_iso2022 (src, src_end)
2689             | detect_coding_sjis (src, src_end)
2690             | detect_coding_big5 (src, src_end)
2691             | CODING_CATEGORY_MASK_BINARY);
2692
2693   return mask;
2694 }
2695
2696 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2697    The information of the detected coding system is set in CODING.  */
2698
2699 void
2700 detect_coding (coding, src, src_bytes)
2701      struct coding_system *coding;
2702      unsigned char *src;
2703      int src_bytes;
2704 {
2705   int mask = detect_coding_mask (src, src_bytes);
2706   int idx;
2707
2708   if (mask == CODING_CATEGORY_MASK_ANY)
2709     /* We found nothing other than ASCII.  There's nothing to do.  */
2710     return;
2711
2712   if (!mask)
2713     /* The source text seems to be encoded in unknown coding system.
2714        Emacs regards the category of such a kind of coding system as
2715        `coding-category-binary'.  We assume that a user has assigned
2716        an appropriate coding system for a `coding-category-binary'.  */
2717     idx = CODING_CATEGORY_IDX_BINARY;
2718   else
2719     {
2720       /* We found some plausible coding systems.  Let's use a coding
2721          system of the highest priority.  */
2722       Lisp_Object val = Vcoding_category_list;
2723
2724       if (CONSP (val))
2725         while (!NILP (val))
2726           {
2727             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2728             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2729               break;
2730             val = XCONS (val)->cdr;
2731           }
2732       else
2733         val = Qnil;
2734
2735       if (NILP (val))
2736         {
2737           /* For unknown reason, `Vcoding_category_list' contains none
2738              of found categories.  Let's use any of them.  */
2739           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2740             if (mask & (1 << idx))
2741               break;
2742         }
2743     }
2744   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2745 }
2746
2747 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2748    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2749    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2750
2751 #define MAX_EOL_CHECK_COUNT 3
2752
2753 int
2754 detect_eol_type (src, src_bytes)
2755      unsigned char *src;
2756      int src_bytes;
2757 {
2758   unsigned char *src_end = src + src_bytes;
2759   unsigned char c;
2760   int total = 0;                /* How many end-of-lines are found so far.  */
2761   int eol_type = CODING_EOL_UNDECIDED;
2762   int this_eol_type;
2763
2764   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2765     {
2766       c = *src++;
2767       if (c == '\n' || c == '\r')
2768         {
2769           total++;
2770           if (c == '\n')
2771             this_eol_type = CODING_EOL_LF;
2772           else if (src >= src_end || *src != '\n')
2773             this_eol_type = CODING_EOL_CR;
2774           else
2775             this_eol_type = CODING_EOL_CRLF, src++;
2776
2777           if (eol_type == CODING_EOL_UNDECIDED)
2778             /* This is the first end-of-line.  */
2779             eol_type = this_eol_type;
2780           else if (eol_type != this_eol_type)
2781             /* The found type is different from what found before.
2782                We had better not decode end-of-line.  */
2783             return CODING_EOL_LF;
2784         }
2785     }
2786
2787   return eol_type;
2788 }
2789
2790 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2791    is encoded.  If it detects an appropriate format of end-of-line, it
2792    sets the information in *CODING.  */
2793
2794 void
2795 detect_eol (coding, src, src_bytes)
2796      struct coding_system *coding;
2797      unsigned char *src;
2798      int src_bytes;
2799 {
2800   Lisp_Object val;
2801   int eol_type = detect_eol_type (src, src_bytes);
2802
2803   if (eol_type == CODING_EOL_UNDECIDED)
2804     /*  We found no end-of-line in the source text.  */
2805     return;
2806
2807   val = Fget (coding->symbol, Qeol_type);
2808   if (VECTORP (val) && XVECTOR (val)->size == 3)
2809     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2810 }
2811
2812 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2813    decoding, it may detect coding system and format of end-of-line if
2814    those are not yet decided.  */
2815
2816 int
2817 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2818      struct coding_system *coding;
2819      unsigned char *source, *destination;
2820      int src_bytes, dst_bytes;
2821      int *consumed;
2822 {
2823   int produced;
2824
2825   if (src_bytes <= 0)
2826     {
2827       *consumed = 0;
2828       return 0;
2829     }
2830
2831   if (coding->type == coding_type_undecided)
2832     detect_coding (coding, source, src_bytes);
2833
2834   if (coding->eol_type == CODING_EOL_UNDECIDED)
2835     detect_eol (coding, source, src_bytes);
2836
2837   coding->carryover_size = 0;
2838   switch (coding->type)
2839     {
2840     case coding_type_no_conversion:
2841     label_no_conversion:
2842       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2843       bcopy (source, destination, produced);
2844       *consumed = produced;
2845       break;
2846
2847     case coding_type_emacs_mule:
2848     case coding_type_undecided:
2849       if (coding->eol_type == CODING_EOL_LF
2850           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2851         goto label_no_conversion;
2852       produced = decode_eol (coding, source, destination,
2853                              src_bytes, dst_bytes, consumed);
2854       break;
2855
2856     case coding_type_sjis:
2857       produced = decode_coding_sjis_big5 (coding, source, destination,
2858                                           src_bytes, dst_bytes, consumed,
2859                                           1);
2860       break;
2861
2862     case coding_type_iso2022:
2863       produced = decode_coding_iso2022 (coding, source, destination,
2864                                         src_bytes, dst_bytes, consumed);
2865       break;
2866
2867     case coding_type_big5:
2868       produced = decode_coding_sjis_big5 (coding, source, destination,
2869                                           src_bytes, dst_bytes, consumed,
2870                                           0);
2871       break;
2872
2873     case coding_type_ccl:
2874       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2875                              src_bytes, dst_bytes, consumed);
2876       break;
2877     }
2878
2879   return produced;
2880 }
2881
2882 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2883
2884 int
2885 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2886      struct coding_system *coding;
2887      unsigned char *source, *destination;
2888      int src_bytes, dst_bytes;
2889      int *consumed;
2890 {
2891   int produced;
2892
2893   switch (coding->type)
2894     {
2895     case coding_type_no_conversion:
2896     label_no_conversion:
2897       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2898       if (produced > 0)
2899         {
2900           bcopy (source, destination, produced);
2901           if (coding->selective)
2902             {
2903               unsigned char *p = destination, *pend = destination + produced;
2904               while (p < pend)
2905                 if (*p++ == '\015') p[-1] = '\n';
2906             }
2907         }
2908       *consumed = produced;
2909       break;
2910
2911     case coding_type_emacs_mule:
2912     case coding_type_undecided:
2913       if (coding->eol_type == CODING_EOL_LF
2914           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2915         goto label_no_conversion;
2916       produced = encode_eol (coding, source, destination,
2917                              src_bytes, dst_bytes, consumed);
2918       break;
2919
2920     case coding_type_sjis:
2921       produced = encode_coding_sjis_big5 (coding, source, destination,
2922                                           src_bytes, dst_bytes, consumed,
2923                                           1);
2924       break;
2925
2926     case coding_type_iso2022:
2927       produced = encode_coding_iso2022 (coding, source, destination,
2928                                         src_bytes, dst_bytes, consumed);
2929       break;
2930
2931     case coding_type_big5:
2932       produced = encode_coding_sjis_big5 (coding, source, destination,
2933                                           src_bytes, dst_bytes, consumed,
2934                                           0);
2935       break;
2936
2937     case coding_type_ccl:
2938       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2939                              src_bytes, dst_bytes, consumed);
2940       break;
2941     }
2942
2943   return produced;
2944 }
2945
2946 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2947
2948 /* Return maximum size (bytes) of a buffer enough for decoding
2949    SRC_BYTES of text encoded in CODING.  */
2950
2951 int
2952 decoding_buffer_size (coding, src_bytes)
2953      struct coding_system *coding;
2954      int src_bytes;
2955 {
2956   int magnification;
2957
2958   if (coding->type == coding_type_iso2022)
2959     magnification = 3;
2960   else if (coding->type == coding_type_ccl)
2961     magnification = coding->spec.ccl.decoder.buf_magnification;
2962   else
2963     magnification = 2;
2964
2965   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2966 }
2967
2968 /* Return maximum size (bytes) of a buffer enough for encoding
2969    SRC_BYTES of text to CODING.  */
2970
2971 int
2972 encoding_buffer_size (coding, src_bytes)
2973      struct coding_system *coding;
2974      int src_bytes;
2975 {
2976   int magnification;
2977
2978   if (coding->type == coding_type_ccl)
2979     magnification = coding->spec.ccl.encoder.buf_magnification;
2980   else
2981     magnification = 3;
2982
2983   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2984 }
2985
2986 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2987 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2988 #endif
2989
2990 char *conversion_buffer;
2991 int conversion_buffer_size;
2992
2993 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2994    or decoding.  Sufficient memory is allocated automatically.  If we
2995    run out of memory, return NULL.  */
2996
2997 char *
2998 get_conversion_buffer (size)
2999      int size;
3000 {
3001   if (size > conversion_buffer_size)
3002     {
3003       char *buf;
3004       int real_size = conversion_buffer_size * 2;
3005
3006       while (real_size < size) real_size *= 2;
3007       buf = (char *) xmalloc (real_size);
3008       xfree (conversion_buffer);
3009       conversion_buffer = buf;
3010       conversion_buffer_size = real_size;
3011     }
3012   return conversion_buffer;
3013 }
3014
3015 \f
3016 #ifdef emacs
3017 /*** 7. Emacs Lisp library functions ***/
3018
3019 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3020        1, 1, 0,
3021   "Return coding-spec of CODING-SYSTEM.\n\
3022 If CODING-SYSTEM is not a valid coding-system, return nil.")
3023   (obj)
3024      Lisp_Object obj;
3025 {
3026   while (SYMBOLP (obj) && !NILP (obj))
3027     obj = Fget (obj, Qcoding_system);
3028   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3029           ? Qnil : obj);
3030 }
3031
3032 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3033   "Return t if OBJECT is nil or a coding-system.\n\
3034 See document of make-coding-system for coding-system object.")
3035   (obj)
3036      Lisp_Object obj;
3037 {
3038   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3039 }
3040
3041 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3042        Sread_non_nil_coding_system, 1, 1, 0,
3043   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3044   (prompt)
3045      Lisp_Object prompt;
3046 {
3047   Lisp_Object val;
3048   do
3049     {
3050       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3051                               Qt, Qnil, Qnil, Qnil);
3052     }
3053   while (XSTRING (val)->size == 0);
3054   return (Fintern (val, Qnil));
3055 }
3056
3057 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3058   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3059   (prompt)
3060      Lisp_Object prompt;
3061 {
3062   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3063                                       Qt, Qnil, Qnil, Qnil);
3064   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3065 }
3066
3067 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3068        1, 1, 0,
3069   "Check validity of CODING-SYSTEM.\n\
3070 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3071 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3072 The value of property should be a vector of length 5.")
3073   (coding_system)
3074      Lisp_Object coding_system;
3075 {
3076   CHECK_SYMBOL (coding_system, 0);
3077   if (!NILP (Fcoding_system_p (coding_system)))
3078     return coding_system;
3079   while (1)
3080     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3081 }
3082
3083 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3084        2, 2, 0,
3085   "Detect coding system of the text in the region between START and END.\n\
3086 Return a list of possible coding systems ordered by priority.\n\
3087 If only ASCII characters are found, it returns `undecided'\n\
3088  or its subsidiary coding system according to a detected end-of-line format.")
3089   (b, e)
3090      Lisp_Object b, e;
3091 {
3092   int coding_mask, eol_type;
3093   Lisp_Object val;
3094   int beg, end;
3095
3096   validate_region (&b, &e);
3097   beg = XINT (b), end = XINT (e);
3098   if (beg < GPT && end >= GPT) move_gap (end);
3099
3100   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3101   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3102
3103   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3104     {
3105       val = intern ("undecided");
3106       if (eol_type != CODING_EOL_UNDECIDED)
3107         {
3108           Lisp_Object val2 = Fget (val, Qeol_type);
3109           if (VECTORP (val2))
3110             val = XVECTOR (val2)->contents[eol_type];
3111         }
3112     }
3113   else
3114     {
3115       Lisp_Object val2;
3116
3117       /* At first, gather possible coding-systems in VAL in a reverse
3118          order.  */
3119       val = Qnil;
3120       for (val2 = Vcoding_category_list;
3121            !NILP (val2);
3122            val2 = XCONS (val2)->cdr)
3123         {
3124           int idx
3125             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3126           if (coding_mask & (1 << idx))
3127             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3128         }
3129
3130       /* Then, change the order of the list, while getting subsidiary
3131          coding-systems.  */
3132       val2 = val;
3133       val = Qnil;
3134       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3135         {
3136           if (eol_type == CODING_EOL_UNDECIDED)
3137             val = Fcons (XCONS (val2)->car, val);
3138           else
3139             {
3140               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3141               if (VECTORP (val3))
3142                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3143               else
3144                 val = Fcons (XCONS (val2)->car, val);
3145             }
3146         }
3147     }
3148
3149   return val;
3150 }
3151
3152 /* Scan text in the region between *BEGP and *ENDP, skip characters
3153    which we never have to encode to (iff ENCODEP is 1) or decode from
3154    coding system CODING at the head and tail, then set BEGP and ENDP
3155    to the addresses of start and end of the text we actually convert.  */
3156
3157 void
3158 shrink_conversion_area (begp, endp, coding, encodep)
3159      unsigned char **begp, **endp;
3160      struct coding_system *coding;
3161      int encodep;
3162 {
3163   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3164
3165   if (coding->eol_type != CODING_EOL_LF
3166       && coding->eol_type != CODING_EOL_UNDECIDED)
3167     /* Since we anyway have to convert end-of-line format, it is not
3168        worth skipping at most 100 bytes or so.  */
3169     return;
3170
3171   if (encodep)                  /* for encoding */
3172     {
3173       switch (coding->type)
3174         {
3175         case coding_type_no_conversion:
3176         case coding_type_emacs_mule:
3177         case coding_type_undecided:
3178           /* We need no conversion.  */
3179           *begp = *endp;
3180           return;
3181         case coding_type_ccl:
3182           /* We can't skip any data.  */
3183           return;
3184         case coding_type_iso2022:
3185           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3186             {
3187               unsigned char *bol = beg_addr;
3188               while (beg_addr < end_addr && *beg_addr < 0x80)
3189                 {
3190                   beg_addr++;
3191                   if (*(beg_addr - 1) == '\n')
3192                     bol = beg_addr;
3193                 }
3194               beg_addr = bol;
3195               goto label_skip_tail;
3196             }
3197           /* fall down ... */
3198         default:
3199           /* We can skip all ASCII characters at the head and tail.  */
3200           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3201         label_skip_tail:
3202           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3203           break;
3204         }
3205     }
3206   else                          /* for decoding */
3207     {
3208       switch (coding->type)
3209         {
3210         case coding_type_no_conversion:
3211           /* We need no conversion.  */
3212           *begp = *endp;
3213           return;
3214         case coding_type_emacs_mule:
3215           if (coding->eol_type == CODING_EOL_LF)
3216             {
3217               /* We need no conversion.  */
3218               *begp = *endp;
3219               return;
3220             }
3221           /* We can skip all but carriage-return.  */
3222           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3223           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3224           break;
3225         case coding_type_sjis:
3226         case coding_type_big5:
3227           /* We can skip all ASCII characters at the head.  */
3228           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3229           /* We can skip all ASCII characters at the tail except for
3230              the second byte of SJIS or BIG5 code.  */
3231           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3232           if (end_addr != *endp)
3233             end_addr++;
3234           break;
3235         case coding_type_ccl:
3236           /* We can't skip any data.  */
3237           return;
3238         default:                /* i.e. case coding_type_iso2022: */
3239           {
3240             unsigned char c;
3241
3242             /* We can skip all ASCII characters except for a few
3243                control codes at the head.  */
3244             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3245                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3246                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3247               beg_addr++;
3248           }
3249           break;
3250         }
3251     }
3252   *begp = beg_addr;
3253   *endp = end_addr;
3254   return;
3255 }
3256
3257 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3258    text between B and E.  B and E are buffer position.  */
3259
3260 Lisp_Object
3261 code_convert_region (b, e, coding, encodep)
3262      Lisp_Object b, e;
3263      struct coding_system *coding;
3264      int encodep;
3265 {
3266   int beg, end, len, consumed, produced;
3267   char *buf;
3268   unsigned char *begp, *endp;
3269   int pos = PT;
3270
3271   validate_region (&b, &e);
3272   beg = XINT (b), end = XINT (e);
3273   if (beg < GPT && end >= GPT)
3274     move_gap (end);
3275
3276   if (encodep && !NILP (coding->pre_write_conversion))
3277     {
3278       /* We must call a pre-conversion function which may put a new
3279          text to be converted in a new buffer.  */
3280       struct buffer *old = current_buffer, *new;
3281
3282       TEMP_SET_PT (beg);
3283       call2 (coding->pre_write_conversion, b, e);
3284       if (old != current_buffer)
3285         {
3286           /* Replace the original text by the text just generated.  */
3287           len = ZV - BEGV;
3288           new = current_buffer;
3289           set_buffer_internal (old);
3290           del_range (beg, end);
3291           insert_from_buffer (new, 1, len, 0);
3292           end = beg + len;
3293         }
3294     }
3295
3296   /* We may be able to shrink the conversion region.  */
3297   begp = POS_ADDR (beg); endp = begp + (end - beg);
3298   shrink_conversion_area (&begp, &endp, coding, encodep);
3299
3300   if (begp == endp)
3301     /* We need no conversion.  */
3302     len = end - beg;
3303   else
3304     {
3305       beg += begp - POS_ADDR (beg);
3306       end =  beg + (endp - begp);
3307
3308       if (encodep)
3309         len = encoding_buffer_size (coding, end - beg);
3310       else
3311         len = decoding_buffer_size (coding, end - beg);
3312       buf = get_conversion_buffer (len);
3313
3314       coding->last_block = 1;
3315       produced = (encodep
3316                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3317                                    &consumed)
3318                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3319                                    &consumed));
3320
3321       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3322
3323       TEMP_SET_PT (beg);
3324       insert (buf, produced);
3325       del_range (PT, PT + end - beg);
3326       if (pos >= end)
3327         pos = PT + (pos - end);
3328       else if (pos > beg)
3329         pos = beg;
3330       TEMP_SET_PT (pos);
3331   }
3332
3333   if (!encodep && !NILP (coding->post_read_conversion))
3334     {
3335       /* We must call a post-conversion function which may alter
3336          the text just converted.  */
3337       Lisp_Object insval;
3338
3339       beg = XINT (b);
3340       TEMP_SET_PT (beg);
3341       insval = call1 (coding->post_read_conversion, make_number (len));
3342       CHECK_NUMBER (insval, 0);
3343       len = XINT (insval);
3344     }
3345
3346   return make_number (len);
3347 }
3348
3349 Lisp_Object
3350 code_convert_string (str, coding, encodep, nocopy)
3351      Lisp_Object str, nocopy;
3352      struct coding_system *coding;
3353      int encodep;
3354 {
3355   int len, consumed, produced;
3356   char *buf;
3357   unsigned char *begp, *endp;
3358   int head_skip, tail_skip;
3359   struct gcpro gcpro1;
3360
3361   if (encodep && !NILP (coding->pre_write_conversion)
3362       || !encodep && !NILP (coding->post_read_conversion))
3363     {
3364       /* Since we have to call Lisp functions which assume target text
3365          is in a buffer, after setting a temporary buffer, call
3366          code_convert_region.  */
3367       int count = specpdl_ptr - specpdl;
3368       int len = XSTRING (str)->size;
3369       Lisp_Object result;
3370       struct buffer *old = current_buffer;
3371
3372       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3373       temp_output_buffer_setup (" *code-converting-work*");
3374       set_buffer_internal (XBUFFER (Vstandard_output));
3375       insert_from_string (str, 0, len, 0);
3376       code_convert_region (make_number (BEGV), make_number (ZV),
3377                            coding, encodep);
3378       result = make_buffer_string (BEGV, ZV, 0);
3379       set_buffer_internal (old);
3380       return unbind_to (count, result);
3381     }
3382
3383   /* We may be able to shrink the conversion region.  */
3384   begp = XSTRING (str)->data;
3385   endp = begp + XSTRING (str)->size;
3386   shrink_conversion_area (&begp, &endp, coding, encodep);
3387
3388   if (begp == endp)
3389     /* We need no conversion.  */
3390     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3391
3392   head_skip = begp - XSTRING (str)->data;
3393   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3394
3395   GCPRO1 (str);
3396
3397   if (encodep)
3398     len = encoding_buffer_size (coding, endp - begp);
3399   else
3400     len = decoding_buffer_size (coding, endp - begp);
3401   buf = get_conversion_buffer (len + head_skip + tail_skip);
3402
3403   bcopy (XSTRING (str)->data, buf, head_skip);
3404   coding->last_block = 1;
3405   produced = (encodep
3406               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3407                                buf + head_skip, endp - begp, len, &consumed)
3408               : decode_coding (coding, XSTRING (str)->data + head_skip,
3409                                buf + head_skip, endp - begp, len, &consumed));
3410   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3411          buf + head_skip + produced,
3412          tail_skip);
3413
3414   UNGCPRO;
3415
3416   return make_string (buf, head_skip + produced + tail_skip);
3417 }
3418
3419 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3420        3, 3, "r\nzCoding system: ",
3421   "Decode current region by specified coding system.\n\
3422 When called from a program, takes three arguments:\n\
3423 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3424 Return length of decoded text.")
3425   (b, e, coding_system)
3426      Lisp_Object b, e, coding_system;
3427 {
3428   struct coding_system coding;
3429
3430   CHECK_NUMBER_COERCE_MARKER (b, 0);
3431   CHECK_NUMBER_COERCE_MARKER (e, 1);
3432   CHECK_SYMBOL (coding_system, 2);
3433
3434   if (NILP (coding_system))
3435     return make_number (XFASTINT (e) - XFASTINT (b));
3436   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3437     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3438
3439   return code_convert_region (b, e, &coding, 0);
3440 }
3441
3442 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3443        3, 3, "r\nzCoding system: ",
3444   "Encode current region by specified coding system.\n\
3445 When called from a program, takes three arguments:\n\
3446 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3447 Return length of encoded text.")
3448   (b, e, coding_system)
3449      Lisp_Object b, e, coding_system;
3450 {
3451   struct coding_system coding;
3452
3453   CHECK_NUMBER_COERCE_MARKER (b, 0);
3454   CHECK_NUMBER_COERCE_MARKER (e, 1);
3455   CHECK_SYMBOL (coding_system, 2);
3456
3457   if (NILP (coding_system))
3458     return make_number (XFASTINT (e) - XFASTINT (b));
3459   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3460     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3461
3462   return code_convert_region (b, e, &coding, 1);
3463 }
3464
3465 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3466        2, 3, 0,
3467   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3468 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3469 of decoding.")
3470   (string, coding_system, nocopy)
3471      Lisp_Object string, coding_system, nocopy;
3472 {
3473   struct coding_system coding;
3474
3475   CHECK_STRING (string, 0);
3476   CHECK_SYMBOL (coding_system, 1);
3477
3478   if (NILP (coding_system))
3479     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3480   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3481     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3482
3483   return code_convert_string (string, &coding, 0, nocopy);
3484 }
3485
3486 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3487        2, 3, 0,
3488   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3489 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3490 of encoding.")
3491   (string, coding_system, nocopy)
3492      Lisp_Object string, coding_system, nocopy;
3493 {
3494   struct coding_system coding;
3495
3496   CHECK_STRING (string, 0);
3497   CHECK_SYMBOL (coding_system, 1);
3498
3499   if (NILP (coding_system))
3500     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3501   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3502     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3503
3504   return code_convert_string (string, &coding, 1, nocopy);
3505 }
3506
3507 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3508   "Decode a JISX0208 character of shift-jis encoding.\n\
3509 CODE is the character code in SJIS.\n\
3510 Return the corresponding character.")
3511   (code)
3512      Lisp_Object code;
3513 {
3514   unsigned char c1, c2, s1, s2;
3515   Lisp_Object val;
3516
3517   CHECK_NUMBER (code, 0);
3518   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3519   DECODE_SJIS (s1, s2, c1, c2);
3520   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3521   return val;
3522 }
3523
3524 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3525   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3526 Return the corresponding character code in SJIS.")
3527   (ch)
3528      Lisp_Object ch;
3529 {
3530   int charset, c1, c2, s1, s2;
3531   Lisp_Object val;
3532
3533   CHECK_NUMBER (ch, 0);
3534   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3535   if (charset == charset_jisx0208)
3536     {
3537       ENCODE_SJIS (c1, c2, s1, s2);
3538       XSETFASTINT (val, (s1 << 8) | s2);
3539     }
3540   else
3541     XSETFASTINT (val, 0);
3542   return val;
3543 }
3544
3545 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3546   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3547 CODE is the character code in BIG5.\n\
3548 Return the corresponding character.")
3549   (code)
3550      Lisp_Object code;
3551 {
3552   int charset;
3553   unsigned char b1, b2, c1, c2;
3554   Lisp_Object val;
3555
3556   CHECK_NUMBER (code, 0);
3557   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3558   DECODE_BIG5 (b1, b2, charset, c1, c2);
3559   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3560   return val;
3561 }
3562
3563 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3564   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3565 Return the corresponding character code in Big5.")
3566   (ch)
3567      Lisp_Object ch;
3568 {
3569   int charset, c1, c2, b1, b2;
3570   Lisp_Object val;
3571
3572   CHECK_NUMBER (ch, 0);
3573   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3574   if (charset == charset_big5_1 || charset == charset_big5_2)
3575     {
3576       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3577       XSETFASTINT (val, (b1 << 8) | b2);
3578     }
3579   else
3580     XSETFASTINT (val, 0);
3581   return val;
3582 }
3583
3584 DEFUN ("set-terminal-coding-system-internal",
3585        Fset_terminal_coding_system_internal,
3586        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3587   (coding_system)
3588      Lisp_Object coding_system;
3589 {
3590   CHECK_SYMBOL (coding_system, 0);
3591   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3592   /* We had better not send unexpected characters to terminal.  */
3593   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3594
3595   return Qnil;
3596 }
3597
3598 DEFUN ("set-safe-terminal-coding-system-internal",
3599        Fset_safe_terminal_coding_system_internal,
3600        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3601   (coding_system)
3602      Lisp_Object coding_system;
3603 {
3604   CHECK_SYMBOL (coding_system, 0);
3605   setup_coding_system (Fcheck_coding_system (coding_system),
3606                        &safe_terminal_coding);
3607   return Qnil;
3608 }
3609
3610 DEFUN ("terminal-coding-system",
3611        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3612   "Return coding-system of your terminal.")
3613   ()
3614 {
3615   return terminal_coding.symbol;
3616 }
3617
3618 DEFUN ("set-keyboard-coding-system-internal",
3619        Fset_keyboard_coding_system_internal,
3620        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3621   (coding_system)
3622      Lisp_Object coding_system;
3623 {
3624   CHECK_SYMBOL (coding_system, 0);
3625   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3626   return Qnil;
3627 }
3628
3629 DEFUN ("keyboard-coding-system",
3630        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3631   "Return coding-system of what is sent from terminal keyboard.")
3632   ()
3633 {
3634   return keyboard_coding.symbol;
3635 }
3636
3637 \f
3638 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3639        Sfind_operation_coding_system,  1, MANY, 0,
3640   "Choose a coding system for an operation based on the target name.\n\
3641 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3642 DECODING-SYSTEM is the coding system to use for decoding\n\
3643 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3644 for encoding (in case OPERATION does encoding).\n\
3645 \n\
3646 The first argument OPERATION specifies an I/O primitive:\n\
3647   For file I/O, `insert-file-contents' or `write-region'.\n\
3648   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3649   For network I/O, `open-network-stream'.\n\
3650 \n\
3651 The remaining arguments should be the same arguments that were passed\n\
3652 to the primitive.  Depending on which primitive, one of those arguments\n\
3653 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3654 whichever argument specifies the file name is TARGET.\n\
3655 \n\
3656 TARGET has a meaning which depends on OPERATION:\n\
3657   For file I/O, TARGET is a file name.\n\
3658   For process I/O, TARGET is a process name.\n\
3659   For network I/O, TARGET is a service name or a port number\n\
3660 \n\
3661 This function looks up what specified for TARGET in,\n\
3662 `file-coding-system-alist', `process-coding-system-alist',\n\
3663 or `network-coding-system-alist' depending on OPERATION.\n\
3664 They may specify a coding system, a cons of coding systems,\n\
3665 or a function symbol to call.\n\
3666 In the last case, we call the function with one argument,\n\
3667 which is a list of all the arguments given to this function.")
3668   (nargs, args)
3669      int nargs;
3670      Lisp_Object *args;
3671 {
3672   Lisp_Object operation, target_idx, target, val;
3673   register Lisp_Object chain;
3674
3675   if (nargs < 2)
3676     error ("Too few arguments");
3677   operation = args[0];
3678   if (!SYMBOLP (operation)
3679       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3680     error ("Invalid first arguement");
3681   if (nargs < 1 + XINT (target_idx))
3682     error ("Too few arguments for operation: %s",
3683            XSYMBOL (operation)->name->data);
3684   target = args[XINT (target_idx) + 1];
3685   if (!(STRINGP (target)
3686         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3687     error ("Invalid %dth argument", XINT (target_idx) + 1);
3688
3689   chain = ((EQ (operation, Qinsert_file_contents)
3690             || EQ (operation, Qwrite_region))
3691            ? Vfile_coding_system_alist
3692            : (EQ (operation, Qopen_network_stream)
3693               ? Vnetwork_coding_system_alist
3694               : Vprocess_coding_system_alist));
3695   if (NILP (chain))
3696     return Qnil;
3697
3698   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3699     {
3700       Lisp_Object elt = XCONS (chain)->car;
3701
3702       if (CONSP (elt)
3703           && ((STRINGP (target)
3704                && STRINGP (XCONS (elt)->car)
3705                && fast_string_match (XCONS (elt)->car, target) >= 0)
3706               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3707         {
3708           val = XCONS (elt)->cdr;
3709           if (CONSP (val))
3710             return val;
3711           if (! SYMBOLP (val))
3712             return Qnil;
3713           if (! NILP (Fcoding_system_p (val)))
3714             return Fcons (val, val);
3715           if (!NILP (Ffboundp (val)))
3716             return call1 (val, Flist (nargs, args));
3717           return Qnil;
3718         }
3719     }
3720   return Qnil;
3721 }
3722
3723 #endif /* emacs */
3724
3725 \f
3726 /*** 8. Post-amble ***/
3727
3728 init_coding_once ()
3729 {
3730   int i;
3731
3732   /* Emacs' internal format specific initialize routine.  */
3733   for (i = 0; i <= 0x20; i++)
3734     emacs_code_class[i] = EMACS_control_code;
3735   emacs_code_class[0x0A] = EMACS_linefeed_code;
3736   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3737   for (i = 0x21 ; i < 0x7F; i++)
3738     emacs_code_class[i] = EMACS_ascii_code;
3739   emacs_code_class[0x7F] = EMACS_control_code;
3740   emacs_code_class[0x80] = EMACS_leading_code_composition;
3741   for (i = 0x81; i < 0xFF; i++)
3742     emacs_code_class[i] = EMACS_invalid_code;
3743   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3744   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3745   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3746   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3747
3748   /* ISO2022 specific initialize routine.  */
3749   for (i = 0; i < 0x20; i++)
3750     iso_code_class[i] = ISO_control_code;
3751   for (i = 0x21; i < 0x7F; i++)
3752     iso_code_class[i] = ISO_graphic_plane_0;
3753   for (i = 0x80; i < 0xA0; i++)
3754     iso_code_class[i] = ISO_control_code;
3755   for (i = 0xA1; i < 0xFF; i++)
3756     iso_code_class[i] = ISO_graphic_plane_1;
3757   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3758   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3759   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3760   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3761   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3762   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3763   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3764   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3765   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3766   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3767
3768   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3769   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3770
3771   setup_coding_system (Qnil, &keyboard_coding);
3772   setup_coding_system (Qnil, &terminal_coding);
3773   setup_coding_system (Qnil, &safe_terminal_coding);
3774
3775 #if defined (MSDOS) || defined (WINDOWSNT)
3776   system_eol_type = CODING_EOL_CRLF;
3777 #else
3778   system_eol_type = CODING_EOL_LF;
3779 #endif
3780 }
3781
3782 #ifdef emacs
3783
3784 syms_of_coding ()
3785 {
3786   Qtarget_idx = intern ("target-idx");
3787   staticpro (&Qtarget_idx);
3788
3789   /* Target FILENAME is the first argument.  */
3790   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3791   /* Target FILENAME is the third argument.  */
3792   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3793
3794   Qcall_process = intern ("call-process");
3795   staticpro (&Qcall_process);
3796   /* Target PROGRAM is the first argument.  */
3797   Fput (Qcall_process, Qtarget_idx, make_number (0));
3798
3799   Qcall_process_region = intern ("call-process-region");
3800   staticpro (&Qcall_process_region);
3801   /* Target PROGRAM is the third argument.  */
3802   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3803
3804   Qstart_process = intern ("start-process");
3805   staticpro (&Qstart_process);
3806   /* Target PROGRAM is the third argument.  */
3807   Fput (Qstart_process, Qtarget_idx, make_number (2));
3808
3809   Qopen_network_stream = intern ("open-network-stream");
3810   staticpro (&Qopen_network_stream);
3811   /* Target SERVICE is the fourth argument.  */
3812   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3813
3814   Qcoding_system = intern ("coding-system");
3815   staticpro (&Qcoding_system);
3816
3817   Qeol_type = intern ("eol-type");
3818   staticpro (&Qeol_type);
3819
3820   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3821   staticpro (&Qbuffer_file_coding_system);
3822
3823   Qpost_read_conversion = intern ("post-read-conversion");
3824   staticpro (&Qpost_read_conversion);
3825
3826   Qpre_write_conversion = intern ("pre-write-conversion");
3827   staticpro (&Qpre_write_conversion);
3828
3829   Qcoding_system_spec = intern ("coding-system-spec");
3830   staticpro (&Qcoding_system_spec);
3831
3832   Qcoding_system_p = intern ("coding-system-p");
3833   staticpro (&Qcoding_system_p);
3834
3835   Qcoding_system_error = intern ("coding-system-error");
3836   staticpro (&Qcoding_system_error);
3837
3838   Fput (Qcoding_system_error, Qerror_conditions,
3839         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3840   Fput (Qcoding_system_error, Qerror_message,
3841         build_string ("Invalid coding system"));
3842
3843   Qcoding_category_index = intern ("coding-category-index");
3844   staticpro (&Qcoding_category_index);
3845
3846   {
3847     int i;
3848     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3849       {
3850         coding_category_table[i] = intern (coding_category_name[i]);
3851         staticpro (&coding_category_table[i]);
3852         Fput (coding_category_table[i], Qcoding_category_index,
3853               make_number (i));
3854       }
3855   }
3856
3857   Qcharacter_unification_table = intern ("character-unification-table");
3858   staticpro (&Qcharacter_unification_table);
3859   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3860         make_number (0));
3861
3862   Qcharacter_unification_table_for_decode
3863     = intern ("character-unification-table-for-decode");
3864   staticpro (&Qcharacter_unification_table_for_decode);
3865
3866   Qcharacter_unification_table_for_encode
3867     = intern ("character-unification-table-for-encode");
3868   staticpro (&Qcharacter_unification_table_for_encode);
3869
3870   Qemacs_mule = intern ("emacs-mule");
3871   staticpro (&Qemacs_mule);
3872
3873   defsubr (&Scoding_system_spec);
3874   defsubr (&Scoding_system_p);
3875   defsubr (&Sread_coding_system);
3876   defsubr (&Sread_non_nil_coding_system);
3877   defsubr (&Scheck_coding_system);
3878   defsubr (&Sdetect_coding_region);
3879   defsubr (&Sdecode_coding_region);
3880   defsubr (&Sencode_coding_region);
3881   defsubr (&Sdecode_coding_string);
3882   defsubr (&Sencode_coding_string);
3883   defsubr (&Sdecode_sjis_char);
3884   defsubr (&Sencode_sjis_char);
3885   defsubr (&Sdecode_big5_char);
3886   defsubr (&Sencode_big5_char);
3887   defsubr (&Sset_terminal_coding_system_internal);
3888   defsubr (&Sset_safe_terminal_coding_system_internal);
3889   defsubr (&Sterminal_coding_system);
3890   defsubr (&Sset_keyboard_coding_system_internal);
3891   defsubr (&Skeyboard_coding_system);
3892   defsubr (&Sfind_operation_coding_system);
3893
3894   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3895     "List of coding-categories (symbols) ordered by priority.");
3896   {
3897     int i;
3898
3899     Vcoding_category_list = Qnil;
3900     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3901       Vcoding_category_list
3902         = Fcons (coding_category_table[i], Vcoding_category_list);
3903   }
3904
3905   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3906     "Specify the coding system for read operations.\n\
3907 It is useful to bind this variable with `let', but do not set it globally.\n\
3908 If the value is a coding system, it is used for decoding on read operation.\n\
3909 If not, an appropriate element is used from one of the coding system alists:\n\
3910 There are three such tables, `file-coding-system-alist',\n\
3911 `process-coding-system-alist', and `network-coding-system-alist'.");
3912   Vcoding_system_for_read = Qnil;
3913
3914   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3915     "Specify the coding system for write operations.\n\
3916 It is useful to bind this variable with `let', but do not set it globally.\n\
3917 If the value is a coding system, it is used for encoding on write operation.\n\
3918 If not, an appropriate element is used from one of the coding system alists:\n\
3919 There are three such tables, `file-coding-system-alist',\n\
3920 `process-coding-system-alist', and `network-coding-system-alist'.");
3921   Vcoding_system_for_write = Qnil;
3922
3923   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3924     "Coding system used in the latest file or process I/O.");
3925   Vlast_coding_system_used = Qnil;
3926
3927   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3928     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3929   inhibit_eol_conversion = 0;
3930
3931   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3932     "Alist to decide a coding system to use for a file I/O operation.\n\
3933 The format is ((PATTERN . VAL) ...),\n\
3934 where PATTERN is a regular expression matching a file name,\n\
3935 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3936 If VAL is a coding system, it is used for both decoding and encoding\n\
3937 the file contents.\n\
3938 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3939 and the cdr part is used for encoding.\n\
3940 If VAL is a function symbol, the function must return a coding system\n\
3941 or a cons of coding systems which are used as above.\n\
3942 \n\
3943 See also the function `find-operation-coding-system'.");
3944   Vfile_coding_system_alist = Qnil;
3945
3946   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3947     "Alist to decide a coding system to use for a process I/O operation.\n\
3948 The format is ((PATTERN . VAL) ...),\n\
3949 where PATTERN is a regular expression matching a program name,\n\
3950 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3951 If VAL is a coding system, it is used for both decoding what received\n\
3952 from the program and encoding what sent to the program.\n\
3953 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3954 and the cdr part is used for encoding.\n\
3955 If VAL is a function symbol, the function must return a coding system\n\
3956 or a cons of coding systems which are used as above.\n\
3957 \n\
3958 See also the function `find-operation-coding-system'.");
3959   Vprocess_coding_system_alist = Qnil;
3960
3961   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3962     "Alist to decide a coding system to use for a network I/O operation.\n\
3963 The format is ((PATTERN . VAL) ...),\n\
3964 where PATTERN is a regular expression matching a network service name\n\
3965 or is a port number to connect to,\n\
3966 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3967 If VAL is a coding system, it is used for both decoding what received\n\
3968 from the network stream and encoding what sent to the network stream.\n\
3969 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3970 and the cdr part is used for encoding.\n\
3971 If VAL is a function symbol, the function must return a coding system\n\
3972 or a cons of coding systems which are used as above.\n\
3973 \n\
3974 See also the function `find-operation-coding-system'.");
3975   Vnetwork_coding_system_alist = Qnil;
3976
3977   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3978     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3979   eol_mnemonic_unix = ':';
3980
3981   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3982     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3983   eol_mnemonic_dos = '\\';
3984
3985   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3986     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3987   eol_mnemonic_mac = '/';
3988
3989   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3990     "Mnemonic character indicating end-of-line format is not yet decided.");
3991   eol_mnemonic_undecided = ':';
3992
3993   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3994     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3995   Venable_character_unification = Qt;
3996
3997   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3998     &Vstandard_character_unification_table_for_decode,
3999     "Table for unifying characters when reading.");
4000   Vstandard_character_unification_table_for_decode = Qnil;
4001
4002   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4003     &Vstandard_character_unification_table_for_encode,
4004     "Table for unifying characters when writing.");
4005   Vstandard_character_unification_table_for_encode = Qnil;
4006
4007   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4008     "Alist of charsets vs revision numbers.\n\
4009 While encoding, if a charset (car part of an element) is found,\n\
4010 designate it with the escape sequence identifing revision (cdr part of the element).");
4011   Vcharset_revision_alist = Qnil;
4012
4013   DEFVAR_LISP ("default-process-coding-system",
4014                &Vdefault_process_coding_system,
4015     "Cons of coding systems used for process I/O by default.\n\
4016 The car part is used for decoding a process output,\n\
4017 the cdr part is used for encoding a text to be sent to a process.");
4018   Vdefault_process_coding_system = Qnil;
4019
4020   DEFVAR_LISP ("special-microsoft-code-table", &Vmicrosoft_code_table,
4021     "Table of special Microsoft codes in the range 128..159 (inclusive).\n\
4022 This is a vector of length 256.\n\
4023 If Nth element is non-nil, the existence of code N in a file\n\
4024 (or output of subprocess) doesn't prevent it to be detected as\n\
4025 a coding system of ISO 2022 variant (e.g. iso-latin-1) on reading a file\n\
4026 or reading output of a subprocess.\n\
4027 Only 128th through 159th elements has a meaning.");
4028   Vmicrosoft_code_table = Fmake_vector (make_number (256), Qnil);
4029 }
4030
4031 #endif /* emacs */