code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
  45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  */
 116 #if 0
 117 int
 118 detect_coding_emacs_mule (src, src_end)
 119      unsigned char *src, *src_end;
 120 {
 121   ...
 122 }
 123 #endif
 124
 125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 126
 127   These functions decode SRC_BYTES length of unibyte text at SOURCE
 128   encoded in CODING to Emacs' internal format.  The resulting
 129   multibyte text goes to a place pointed to by DESTINATION, the length
 130   of which should not exceed DST_BYTES.
 131
 132   These functions set the information of original and decoded texts in
 133   the members produced, produced_char, consumed, and consumed_char of
 134   the structure *CODING.  They also set the member result to one of
 135   CODING_FINISH_XXX indicating how the decoding finished.
 136
 137   DST_BYTES zero means that source area and destination area are
 138   overlapped, which means that we can produce a decoded text until it
 139   reaches the head of not-yet-decoded source text.
 140
 141   Below is a template of these functions.  */
 142 #if 0
 143 static void
 144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148 {
 149   ...
 150 }
 151 #endif
 152
 153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 154
 155   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 156   internal multibyte format to CODING.  The resulting unibyte text
 157   goes to a place pointed to by DESTINATION, the length of which
 158   should not exceed DST_BYTES.
 159
 160   These functions set the information of original and encoded texts in
 161   the members produced, produced_char, consumed, and consumed_char of
 162   the structure *CODING.  They also set the member result to one of
 163   CODING_FINISH_XXX indicating how the encoding finished.
 164
 165   DST_BYTES zero means that source area and destination area are
 166   overlapped, which means that we can produce an encoded text until it
 167   reaches the head of not-yet-encoded source text.
 168
 169   Below is a template of these functions.  */
 170 #if 0
 171 static void
 172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 173      struct coding_system *coding;
 174      unsigned char *source, *destination;
 175      int src_bytes, dst_bytes;
 176 {
 177   ...
 178 }
 179 #endif
 180
 181 /*** COMMONLY USED MACROS ***/
 182
 183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 184    get one and two bytes respectively from the source text.  If there
 185    are not enough bytes in the source, they set `coding->result' to
 186    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
 187    The caller should set variables `coding', `src' and `src_end' to
 188    appropriate pointers in advance.  These macros are called from
 189    decoding routines, thus the source text is assumed to be unibyte.  */
 190
 191 #define ONE_MORE_BYTE(c1)                                       \
 192   do {                                                          \
 193     if (src >= src_end)                                         \
 194       {                                                         \
 195         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 196         goto label_end_of_loop;                                 \
 197       }                                                         \
 198     c1 = *src++;                                                \
 199   } while (0)
 200
 201 #define TWO_MORE_BYTES(c1, c2)                                  \
 202   do {                                                          \
 203     if (src + 1 >= src_end)                                     \
 204       {                                                         \
 205         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 206         goto label_end_of_loop;                                 \
 207       }                                                         \
 208     c1 = *src++;                                                \
 209     c2 = *src++;                                                \
 210   } while (0)
 211
 212
 213 /* Set C to the next character at the source text pointed by `src'.
 214    If there are not enough characters in the source, set
 215    `coding->result' to CODING_FINISH_INSUFFICIENT_SRC and jump to
 216    `label_end_of_loop'.  The caller should set variables `coding',
 217    `src', `src_end', and `translation_table' in advance; C is passed
 218    through `translation_table' unless that is nil.  This macro is
 219    used in encoding routines `encode_coding_XXX', thus the source is
 220    assumed multibyte, except that 8-bit characters are multibyte only
 221    if coding->src_multibyte is nonzero, else a single byte each.  */
 222
 223 #define ONE_MORE_CHAR(c)                                        \
 224   do {                                                          \
 225     int len = src_end - src;                                    \
 226     int bytes;                                                  \
 227     if (len <= 0)                                               \
 228       {                                                         \
 229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 230         goto label_end_of_loop;                                 \
 231       }                                                         \
 232     if (coding->src_multibyte                                   \
 233         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 234       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 235     else                                                        \
 236       c = *src, bytes = 1;                                      \
 237     if (!NILP (translation_table))                              \
 238       c = translate_char (translation_table, c, 0, 0, 0);       \
 239     src += bytes;                                               \
 240   } while (0)
 241
 242
 243 /* Produce a multibyte form of character C to `dst'.  If `dst' lacks
 244    space, record that in coding->result and jump to `label_end_of_loop'.
 245
 246    If we are now in the middle of a composition sequence, the decoded
 247    character may be ALTCHAR (for the current composition).  In that
 248    case, the character goes to coding->cmp_data->data instead of
 249    `dst'.
 250
 251    This macro is used in decoding routines.  */
 252
 253 #define EMIT_CHAR(c)                                                    \
 254   do {                                                                  \
 255     if (! COMPOSING_P (coding)                                          \
 256         || coding->composing == COMPOSITION_RELATIVE                    \
 257         || coding->composing == COMPOSITION_WITH_RULE)                  \
 258       {                                                                 \
 259         int bytes = CHAR_BYTES (c);                                     \
 260         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 261           {                                                             \
 262             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 263             goto label_end_of_loop;                                     \
 264           }                                                             \
 265         dst += CHAR_STRING (c, dst);                                    \
 266         coding->produced_char++;                                        \
 267       }                                                                 \
 268                                                                         \
 269     if (COMPOSING_P (coding)                                            \
 270         && coding->composing != COMPOSITION_RELATIVE)                   \
 271       {                                                                 \
 272         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 273         coding->composition_rule_follows                                \
 274           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 275       }                                                                 \
 276   } while (0)
 277
 278 /* Store byte C at `dst', or bail out to `label_end_of_loop' if full.  */
 279 #define EMIT_ONE_BYTE(c)                                        \
 280   do {                                                          \
 281     if (dst >= (dst_bytes ? dst_end : src))                     \
 282       {                                                         \
 283         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 284         goto label_end_of_loop;                                 \
 285       }                                                         \
 286     *dst++ = c;                                                 \
 287   } while (0)
 288 /* Store bytes C1 and C2 at `dst' if both fit, else bail out likewise.  */
 289 #define EMIT_TWO_BYTES(c1, c2)                                  \
 290   do {                                                          \
 291     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 292       {                                                         \
 293         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 294         goto label_end_of_loop;                                 \
 295       }                                                         \
 296     *dst++ = c1, *dst++ = c2;                                   \
 297   } while (0)
 298 /* Copy bytes FROM..TO (exclusive) to `dst', advancing FROM, if they fit.  */
 299 #define EMIT_BYTES(from, to)                                    \
 300   do {                                                          \
 301     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     while (from < to)                                           \
 307       *dst++ = *from++;                                         \
 308   } while (0)
 309
 310 \f
 311 /*** 1. Preamble ***/
 312
 313 #ifdef emacs
 314 #include <config.h>
 315 #endif
 316
 317 #include <stdio.h>
 318
 319 #ifdef emacs
 320
 321 #include "lisp.h"
 322 #include "buffer.h"
 323 #include "charset.h"
 324 #include "composite.h"
 325 #include "ccl.h"
 326 #include "coding.h"
 327 #include "window.h"
 328
 329 #else  /* not emacs */
 330
 331 #include "mulelib.h"
 332
 333 #endif /* not emacs */
 334
 335 Lisp_Object Qcoding_system, Qeol_type;
 336 Lisp_Object Qbuffer_file_coding_system;
 337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 338 Lisp_Object Qno_conversion, Qundecided;
 339 Lisp_Object Qcoding_system_history;
 340 Lisp_Object Qsafe_charsets;
 341 Lisp_Object Qvalid_codes;
 342
 343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 345 Lisp_Object Qstart_process, Qopen_network_stream;
 346 Lisp_Object Qtarget_idx;
 347
 348 Lisp_Object Vselect_safe_coding_system_function;
 349
 350 /* Mnemonic string for each format of end-of-line.  */
 351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 352 /* Mnemonic string to indicate format of end-of-line is not yet
 353    decided.  */
 354 Lisp_Object eol_mnemonic_undecided;
 355
 356 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 357    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 358 int system_eol_type;
 359
 360 #ifdef emacs
 361
 362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 363
 364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 365
 366 /* Coding system emacs-mule and raw-text are for converting only
 367    end-of-line format.  */
 368 Lisp_Object Qemacs_mule, Qraw_text;
 369
 370 /* Coding-systems are handed between Emacs Lisp programs and C internal
 371    routines by the following three variables.  */
 372 /* Coding-system for reading files and receiving data from process.  */
 373 Lisp_Object Vcoding_system_for_read;
 374 /* Coding-system for writing files and sending data to process.  */
 375 Lisp_Object Vcoding_system_for_write;
 376 /* Coding-system actually used in the latest I/O.  */
 377 Lisp_Object Vlast_coding_system_used;
 378
 379 /* A vector of length 256 which contains information about special
 380    Latin codes (especially for dealing with Microsoft codes).  */
 381 Lisp_Object Vlatin_extra_code_table;
 382
 383 /* Flag to inhibit code conversion of end-of-line format.  */
 384 int inhibit_eol_conversion;
 385
 386 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 387 int inherit_process_coding_system;
 388
 389 /* Coding system to be used to encode text for terminal display.  */
 390 struct coding_system terminal_coding;
 391
 392 /* Coding system to be used to encode text for terminal display when
 393    terminal coding system is nil.  */
 394 struct coding_system safe_terminal_coding;
 395
 396 /* Coding system of what is sent from terminal keyboard.  */
 397 struct coding_system keyboard_coding;
 398
 399 /* Default coding system to be used to write a file.  */
 400 struct coding_system default_buffer_file_coding;
 401
 402 Lisp_Object Vfile_coding_system_alist;
 403 Lisp_Object Vprocess_coding_system_alist;
 404 Lisp_Object Vnetwork_coding_system_alist;
 405
 406 Lisp_Object Vlocale_coding_system;
 407
 408 #endif /* emacs */
 409
 410 Lisp_Object Qcoding_category, Qcoding_category_index;
 411
 412 /* List of symbols `coding-category-xxx' ordered by priority.  */
 413 Lisp_Object Vcoding_category_list;
 414
 415 /* Table of coding categories (Lisp symbols).  */
 416 Lisp_Object Vcoding_category_table;
 417
 418 /* Table of symbol names, one for each coding category.  */
 419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 420   "coding-category-emacs-mule",
 421   "coding-category-sjis",
 422   "coding-category-iso-7",
 423   "coding-category-iso-7-tight",
 424   "coding-category-iso-8-1",
 425   "coding-category-iso-8-2",
 426   "coding-category-iso-7-else",
 427   "coding-category-iso-8-else",
 428   "coding-category-ccl",
 429   "coding-category-big5",
 430   "coding-category-utf-8",
 431   "coding-category-utf-16-be",
 432   "coding-category-utf-16-le",
 433   "coding-category-raw-text",
 434   "coding-category-binary"
 435 };
 436
 437 /* Table of pointers to coding systems corresponding to each coding
 438    categories.  */
 439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 440
 441 /* Table of coding category masks.  Nth element is a mask for a coding
 442    category of which priority is Nth.  */
 443 static
 444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 445
 446 /* Flag to tell if we look up translation table on character code
 447    conversion.  */
 448 Lisp_Object Venable_character_translation;
 449 /* Standard translation table to look up on decoding (reading).  */
 450 Lisp_Object Vstandard_translation_table_for_decode;
 451 /* Standard translation table to look up on encoding (writing).  */
 452 Lisp_Object Vstandard_translation_table_for_encode;
 453
 454 Lisp_Object Qtranslation_table;
 455 Lisp_Object Qtranslation_table_id;
 456 Lisp_Object Qtranslation_table_for_decode;
 457 Lisp_Object Qtranslation_table_for_encode;
 458
 459 /* Alist of charsets vs revision number.  */
 460 Lisp_Object Vcharset_revision_alist;
 461
 462 /* Default coding systems used for process I/O.  */
 463 Lisp_Object Vdefault_process_coding_system;
 464
 465 /* Global flag to tell that we can't call post-read-conversion and
 466    pre-write-conversion functions.  Usually the value is zero, but it
 467    is set to 1 temporarily while such functions are running.  This is
 468    to avoid infinite recursive call.  */
 469 static int inhibit_pre_post_conversion;
 470
 471 \f
 472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 473
 474 /* Emacs' internal format for encoding multiple character sets is a
 475    kind of multi-byte encoding, i.e. characters are encoded by
 476    variable-length sequences of one-byte codes.
 477
 478    ASCII characters and control characters (e.g. `tab', `newline') are
 479    represented by one-byte sequences which are their ASCII codes, in
 480    the range 0x00 through 0x7F.
 481
 482    8-bit characters of the range 0x80..0x9F are represented by
 483    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 484    code + 0x20).
 485
 486    8-bit characters of the range 0xA0..0xFF are represented by
 487    one-byte sequences which are their 8-bit code.
 488
 489    The other characters are represented by a sequence of `base
 490    leading-code', optional `extended leading-code', and one or two
 491    `position-code's.  The length of the sequence is determined by the
 492    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 493    whereas extended leading-code and position-code take the range 0xA0
 494    through 0xFF.  See `charset.h' for more details about leading-code
 495    and position-code.
 496
 497    --- CODE RANGE of Emacs' internal format ---
 498    character set        range
 499    -------------        -----
 500    ascii                0x00..0x7F
 501    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 502    eight-bit-graphic    0xA0..0xFF
 503    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 504    ---------------------------------------------
 505
 506   */
 507 /* NOTE(review): presumably the code class of each byte value -- confirm.  */
 508 enum emacs_code_class_type emacs_code_class[256];
 509
 510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 511    Check if the text between SRC and SRC_END is in Emacs' internal
 512    format.  If so, return CODING_CATEGORY_MASK_EMACS_MULE, else 0.  */
 513
 514 int
 515 detect_coding_emacs_mule (src, src_end)
 516       unsigned char *src, *src_end;
 517 {
 518   unsigned char c;
 519   int composing = 0;
 520   /* Dummy for ONE_MORE_BYTE, which stores a result code in *CODING.  */
 521   struct coding_system dummy_coding;
 522   struct coding_system *coding = &dummy_coding;
 523   /* Scan until ONE_MORE_BYTE runs out of source and jumps to the end.  */
 524   while (1)
 525     {
 526       ONE_MORE_BYTE (c);
 527       /* Within a composition, C is adjusted before the checks below.  */
 528       if (composing)
 529         {
 530           if (c < 0xA0)
 531             composing = 0;
 532           else if (c == 0xA0)
 533             {
 534               ONE_MORE_BYTE (c);
 535               c &= 0x7F;
 536             }
 537           else
 538             c -= 0x20;
 539         }
 540       /* ESC, SI or SO means the text is not in Emacs' internal format.  */
 541       if (c < 0x20)
 542         {
 543           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 544             return 0;
 545         }
 546       else if (c >= 0x80 && c < 0xA0)
 547         {
 548           if (c == 0x80)
 549             /* Old leading code for a composite character.  */
 550             composing = 1;
 551           else
 552             {
 553               unsigned char *src_base = src - 1;
 554               int bytes;
 555               /* Skip the sequence starting at C, or give up if rejected.  */
 556               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 557                                                bytes))
 558                 return 0;
 559               src = src_base + bytes;
 560             }
 561         }
 562     }
 563  label_end_of_loop:
 564   return CODING_CATEGORY_MASK_EMACS_MULE;
 565 }
 566
 567 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
 568    A byte not starting a valid multibyte form is emitted via CHAR_STRING.  */
 569
 570 static void
 571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 572      struct coding_system *coding;
 573      unsigned char *source, *destination;
 574      int src_bytes, dst_bytes;
 575 {
 576   unsigned char *src = source;
 577   unsigned char *src_end = source + src_bytes;
 578   unsigned char *dst = destination;
 579   unsigned char *dst_end = destination + dst_bytes;
 580   /* SRC_BASE remembers the start position in source in each loop.
 581      It is updated in the loop condition, so after a normal exit it
 582      points past the last consumed character, and after a break it
 583      points to the character that did not fit in the destination.  */
 584   unsigned char *src_base;
 585
 586   coding->produced_char = 0;
 587   while ((src_base = src) < src_end)
 588     {
 589       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 590       int bytes;
 591
 592       /* P gets the BYTES bytes to copy: the source itself or TMP.  */
 593       if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 594         {
 595           p = src;
 596           src += bytes;
 597         }
 598       else
 599         {
 600           bytes = CHAR_STRING (*src, tmp);
 601           p = tmp;
 602           src++;
 603         }
 604       if (dst + bytes >= (dst_bytes ? dst_end : src))
 605         {
 606           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 607           break;
 608         }
 609       while (bytes--) *dst++ = *p++;
 610       coding->produced_char++;
 611     }
 612   coding->consumed = coding->consumed_char = src_base - source;
 613   coding->produced = dst - destination;
 614 }
 615 /* Encoding to emacs-mule is delegated entirely to encode_eol.  */
 616 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
 617   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 618
 619
 620 \f
 621 /*** 3. ISO2022 handlers ***/
 622
 623 /* The following note describes the coding system ISO2022 briefly.
 624    Since the intention of this note is to help understand the
 625    functions in this file, some parts are NOT ACCURATE or OVERLY
 626    SIMPLIFIED.  For thorough understanding, please refer to the
 627    original document of ISO2022.
 628
 629    ISO2022 provides many mechanisms to encode several character sets
 630    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 631    is encoded using bytes less than 128.  This may make the encoded
 632    text a little bit longer, but the text passes more easily through
 633    several gateways, some of which strip off MSB (Most Significant Bit).
 634
 635    There are two kinds of character sets: control character set and
 636    graphic character set.  The former contains control characters such
 637    as `newline' and `escape' to provide control functions (control
 638    functions are also provided by escape sequences).  The latter
 639    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 640    two control character sets and many graphic character sets.
 641
 642    Graphic character sets are classified into one of the following
 643    four classes, according to the number of bytes (DIMENSION) and
 644    number of characters in one dimension (CHARS) of the set:
 645    - DIMENSION1_CHARS94
 646    - DIMENSION1_CHARS96
 647    - DIMENSION2_CHARS94
 648    - DIMENSION2_CHARS96
 649
 650    In addition, each character set is assigned an identification tag,
 651    unique for each set, called "final character" (denoted as <F>
 652    hereafter).  The <F> of each character set is decided by ECMA(*)
 653    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 654    (0x30..0x3F are for private use only).
 655
 656    Note (*): ECMA = European Computer Manufacturers Association
 657
 658    Here are examples of graphic character set [NAME(<F>)]:
 659         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 660         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 661         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 662         o DIMENSION2_CHARS96 -- none for the moment
 663
 664    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 665         C0 [0x00..0x1F] -- control character plane 0
 666         GL [0x20..0x7F] -- graphic character plane 0
 667         C1 [0x80..0x9F] -- control character plane 1
 668         GR [0xA0..0xFF] -- graphic character plane 1
 669
 670    A control character set is directly designated and invoked to C0 or
 671    C1 by an escape sequence.  The most common case is that:
 672    - ISO646's  control character set is designated/invoked to C0, and
 673    - ISO6429's control character set is designated/invoked to C1,
 674    and usually these designations/invocations are omitted in encoded
 675    text.  In a 7-bit environment, only C0 can be used, and a control
 676    character for C1 is encoded by an appropriate escape sequence to
 677    fit into the environment.  All control characters for C1 are
 678    defined to have corresponding escape sequences.
 679
 680    A graphic character set is at first designated to one of four
 681    graphic registers (G0 through G3), then these graphic registers are
 682    invoked to GL or GR.  These designations and invocations can be
 683    done independently.  The most common case is that G0 is invoked to
 684    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 685    these invocations and designations are omitted in encoded text.
 686    In a 7-bit environment, only GL can be used.
 687
 688    When a graphic character set of CHARS94 is invoked to GL, codes
 689    0x20 and 0x7F of the GL area work as control characters SPACE and
 690    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 691    be used.
 692
 693    There are two ways of invocation: locking-shift and single-shift.
 694    With locking-shift, the invocation lasts until the next different
 695    invocation, whereas with single-shift, the invocation affects the
 696    following character only and doesn't affect the locking-shift
 697    state.  Invocations are done by the following control characters or
 698    escape sequences:
 699
 700    ----------------------------------------------------------------------
 701    abbrev  function                  cntrl escape seq   description
 702    ----------------------------------------------------------------------
 703    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 704    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 705    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 706    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 707    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 708    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 709    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 710    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 711    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 712    ----------------------------------------------------------------------
 713    (*) These are not used by any known coding system.
 714
 715    Control characters for these functions are defined by macros
 716    ISO_CODE_XXX in `coding.h'.
 717
 718    Designations are done by the following escape sequences:
 719    ----------------------------------------------------------------------
 720    escape sequence      description
 721    ----------------------------------------------------------------------
 722    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 723    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 724    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 725    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 726    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 727    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 728    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 729    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 730    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 731    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 732    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 733    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 734    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 735    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 736    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 737    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 738    ----------------------------------------------------------------------
 739
 740    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 741    of dimension 1, chars 94, and final character <F>, etc...
 742
 743    Note (*): Although these designations are not allowed in ISO2022,
 744    Emacs accepts them on decoding, and produces them on encoding
 745    CHARS96 character sets in a coding system which is characterized as
 746    7-bit environment, non-locking-shift, and non-single-shift.
 747
 748    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 749    '(' can be omitted.  We refer to this as "short-form" hereafter.
 750
 751    Now you may notice that there are a lot of ways for encoding the
 752    same multilingual text in ISO2022.  Actually, there exist many
 753    coding systems such as Compound Text (used in X11's inter-client
 754    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 755    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 756    localized platforms), and all of these are variants of ISO2022.
 757
 758    In addition to the above, Emacs handles two more kinds of escape
 759    sequences: ISO6429's direction specification and Emacs' private
 760    sequence for specifying character composition.
 761
 762    ISO6429's direction specification takes the following form:
 763         o CSI ']'      -- end of the current direction
 764         o CSI '0' ']'  -- end of the current direction
 765         o CSI '1' ']'  -- start of left-to-right text
 766         o CSI '2' ']'  -- start of right-to-left text
 767    The control character CSI (0x9B: control sequence introducer) is
 768    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 769
 770    Character composition specification takes the following form:
 771         o ESC '0' -- start relative composition
 772         o ESC '1' -- end composition
 773         o ESC '2' -- start rule-base composition (*)
 774         o ESC '3' -- start relative composition with alternate chars  (**)
 775         o ESC '4' -- start rule-base composition with alternate chars  (**)
 776   Since these are not standard escape sequences of any ISO standard,
 777   their use with these meanings is restricted to Emacs only.
 778
 779   (*) This form is used only in Emacs 20.5 and the older versions,
 780   but the newer versions can safely decode it.
 781   (**) This form is used only in Emacs 21.1 and the newer versions,
 782   and the older versions can't decode it.
 783
 784   Here's a list of example usages of these composition escape
 785   sequences (categorized by `enum composition_method').
 786
 787   COMPOSITION_RELATIVE:
 788         ESC 0 CHAR [ CHAR ] ESC 1
 789   COMPOSITION_WITH_RULE:
 790         ESC 2 CHAR [ RULE CHAR ] ESC 1
 791   COMPOSITION_WITH_ALTCHARS:
 792         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 793   COMPOSITION_WITH_RULE_ALTCHARS:
 794         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 795
     /* Table with one entry per possible byte value (0..255) giving the
        class of that byte as an ISO2022 code.  decode_coding_iso2022
        dispatches on iso_code_class[c1] for each byte C1 it reads.  */
 796 enum iso_code_class_type iso_code_class[256];
 797
     /* Nonzero if a coding system is registered for category IDX and it
        can handle CHARSET, i.e. CHARSET is marked in its safe_charsets or
        a designation has been requested for CHARSET.  A null table entry
        yields zero.  */
 798 #define CHARSET_OK(idx, charset)                                \
 799   (coding_system_table[idx] != 0                                \
 800    && (coding_system_table[idx]->safe_charsets[charset] != 0    \
 801        || ((CODING_SPEC_ISO_REQUESTED_DESIGNATION               \
 802              (coding_system_table[idx], charset))               \
 803            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 804
     /* Nonzero if a coding system is registered for category IDX and its
        initial designation for graphic register 1 (G1) is non-negative.
        As in CHARSET_OK, a null coding_system_table entry yields zero
        instead of being dereferenced.  */
 805 #define SHIFT_OUT_OK(idx) \
 806   (coding_system_table[idx] \
        && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
            >= 0))
 807
 808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 809    Check if the text from SRC up to (not including) SRC_END is
 810    encoded in ISO2022.  If it is, return an integer in which any of
     the following flag bits that apply:
 811         CODING_CATEGORY_MASK_ISO_7
 812         CODING_CATEGORY_MASK_ISO_7_TIGHT
 813         CODING_CATEGORY_MASK_ISO_8_1
 814         CODING_CATEGORY_MASK_ISO_8_2
 815         CODING_CATEGORY_MASK_ISO_7_ELSE
 816         CODING_CATEGORY_MASK_ISO_8_ELSE
 817    are set.  If a code which should never appear in ISO2022 is found,
 818    return 0.  MASK keeps the categories that have not been ruled out
        so far and MASK_FOUND collects the categories for which some
        evidence was seen; the value returned is their intersection.  */
 819
 820 int
 821 detect_coding_iso2022 (src, src_end)
 822      unsigned char *src, *src_end;
 823 {
 824   int mask = CODING_CATEGORY_MASK_ISO;
 825   int mask_found = 0;
 826   int reg[4], shift_out = 0, single_shifting = 0;
 827   int c, c1, charset;
 828   /* Dummy for ONE_MORE_BYTE.  */
 829   struct coding_system dummy_coding;
 830   struct coding_system *coding = &dummy_coding;
       /* Coding systems of the two 8-bit categories whose flags are
          consulted below.  Either may be null (CHARSET_OK guards against
          that too), so they are always tested before use.  */
       struct coding_system *iso_8_1
         = coding_system_table[CODING_CATEGORY_IDX_ISO_8_1];
       struct coding_system *iso_8_2
         = coding_system_table[CODING_CATEGORY_IDX_ISO_8_2];
 831
       /* REG[n] is the charset currently designated to Gn, or -1.  */
 832   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 833   while (mask && src < src_end)
 834     {
 835       ONE_MORE_BYTE (c);
 836       switch (c)
 837         {
 838         case ISO_CODE_ESC:
 839           single_shifting = 0;
 840           ONE_MORE_BYTE (c);
 841           if (c >= '(' && c <= '/')
 842             {
 843               /* Designation sequence for a charset of dimension 1.  */
 844               ONE_MORE_BYTE (c1);
 845               if (c1 < ' ' || c1 >= 0x80
 846                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 847                 /* Invalid designation sequence.  Just ignore.  */
 848                 break;
 849               reg[(c - '(') % 4] = charset;
 850             }
 851           else if (c == '$')
 852             {
 853               /* Designation sequence for a charset of dimension 2.  */
 854               ONE_MORE_BYTE (c);
 855               if (c >= '@' && c <= 'B')
                 {
 856                   /* Short-form designation for JISX0208.1978, GB2312,
                        or JISX0208.  */
 857                   charset = iso_charset_table[1][0][c];
                     if (charset < 0)
                       /* No such charset is defined.  Just ignore, as
                          for the long form below.  */
                       break;
                     reg[0] = charset;
                 }
 858               else if (c >= '(' && c <= '/')
 859                 {
 860                   ONE_MORE_BYTE (c1);
 861                   if (c1 < ' ' || c1 >= 0x80
 862                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 863                     /* Invalid designation sequence.  Just ignore.  */
 864                     break;
 865                   reg[(c - '(') % 4] = charset;
 866                 }
 867               else
 868                 /* Invalid designation sequence.  Just ignore.  */
 869                 break;
 870             }
 871           else if (c == 'N' || c == 'O')
 872             {
 873               /* ESC <Fe> for SS2 or SS3.  */
 874               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 875               break;
 876             }
 877           else if (c >= '0' && c <= '4')
 878             {
 879               /* ESC <Fp> for start/end composition.  */
 880               mask_found |= CODING_CATEGORY_MASK_ISO;
 881               break;
 882             }
 883           else
 884             /* Invalid escape sequence.  Just ignore.  */
 885             break;
 886
 887           /* We found a valid designation sequence for CHARSET.  */
 888           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 889           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 890             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 891           else
 892             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 893           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 894             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 895           else
 896             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 897           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 898             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 899           else
 900             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 901           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 902             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 903           else
 904             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 905           break;
 906
 907         case ISO_CODE_SO:
 908           single_shifting = 0;
 909           if (shift_out == 0
 910               && (reg[1] >= 0
 911                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 912                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 913             {
 914               /* Locking shift out.  Remember it so that the matching
                    SI is recognized.  */
                 shift_out = 1;
 915               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 916               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 917             }
 918           break;
 919
 920         case ISO_CODE_SI:
 921           single_shifting = 0;
 922           if (shift_out == 1)
 923             {
 924               /* Locking shift in.  */
                 shift_out = 0;
 925               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 926               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 927             }
 928           break;
 929
 930         case ISO_CODE_CSI:
 931           single_shifting = 0;
               /* Fall through.  */
 932         case ISO_CODE_SS2:
 933         case ISO_CODE_SS3:
 934           {
 935             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 936
 937             if (c != ISO_CODE_CSI)
 938               {
                   /* 8-bit single shift: keep the 8-bit categories whose
                      coding system allows single-shift.  */
 939                 if (iso_8_1
 940                     && (iso_8_1->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
 941                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 942                 if (iso_8_2
 943                     && (iso_8_2->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
 944                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 945                 single_shifting = 1;
 946               }
 947             if (VECTORP (Vlatin_extra_code_table)
 948                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 949               {
 950                 if (iso_8_1
 951                     && (iso_8_1->flags & CODING_FLAG_ISO_LATIN_EXTRA))
 952                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 953                 if (iso_8_2
 954                     && (iso_8_2->flags & CODING_FLAG_ISO_LATIN_EXTRA))
 955                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 956               }
 957             mask &= newmask;
 958             mask_found |= newmask;
 959           }
 960           break;
 961
 962         default:
 963           if (c < 0x80)
 964             {
 965               single_shifting = 0;
 966               break;
 967             }
 968           else if (c < 0xA0)
 969             {
 970               single_shifting = 0;
 971               if (VECTORP (Vlatin_extra_code_table)
 972                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 973                 {
 974                   int newmask = 0;
 975
 976                   if (iso_8_1
 977                       && (iso_8_1->flags & CODING_FLAG_ISO_LATIN_EXTRA))
 978                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 979                   if (iso_8_2
 980                       && (iso_8_2->flags & CODING_FLAG_ISO_LATIN_EXTRA))
 981                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 982                   mask &= newmask;
 983                   mask_found |= newmask;
 984                 }
 985               else
 986                 return 0;
 987             }
 988           else
 989             {
 990               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 991                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 992               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 993               /* Check the length of succeeding codes of the range
 994                  0xA0..0xFF.  If the byte length is odd, we exclude
 995                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 996                  when we are not single shifting.  */
 997               if (!single_shifting
 998                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
 999                 {
 1000                   int run_length = 1;
                        /* Nonzero if the run was ended by a byte below
                           0xA0, which has then been read past.  */
                        int terminated = 0;
 
 1001                   while (src < src_end)
 1002                     {
 1003                       ONE_MORE_BYTE (c);
 1004                       if (c < 0xA0)
                              {
                                terminated = 1;
 1005                           break;
                              }
 1006                       run_length++;
 1007                     }
 1008
 1009                   if (run_length & 1 && src < src_end)
 1010                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 1011                   else
 1012                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
                        if (terminated)
                          /* Push the terminating byte back so that the
                             main loop classifies it (it may be ESC, SO,
                             SI, or an invalid control code) instead of
                             silently dropping it.  */
                          src--;
 1013                 }
 1014             }
 1015           break;
 1016         }
 1017     }
       /* ONE_MORE_BYTE is expected to jump here when the source is
          exhausted in the middle of a sequence -- NOTE(review): the macro
          is defined outside this part of the file; confirm.  */
 1018  label_end_of_loop:
 1019   return (mask & mask_found);
 1020 }
1021
1022 /* Decode a character of which charset is CHARSET, the 1st position
1023    code is C1, the 2nd position code is C2, and return the decoded
1024    character code.  If the variable `translation_table' is non-nil,
1025    returned the translated code.  */
1026
1027 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1028   (NILP (translation_table)                     \
1029    ? MAKE_CHAR (charset, c1, c2)                \
1030    : translate_char (translation_table, -1, charset, c1, c2))
1031
 1032 /* Set designation state into CODING: designate the charset
         identified by DIMENSION, CHARS and FINAL_CHAR to graphic register
         REG.

         This is a statement macro that relies on the variable `coding' and
         the label `label_invalid_code' of the function it is expanded in.
         It jumps to label_invalid_code when:
           - FINAL_CHAR is below '0' or not below 128;
           - no charset is found, or the charset found is neither requested
             for REG nor marked in coding->safe_charsets (REG is then
             remembered in last_invalid_designation_register);
           - ASCII is designated to register 0 while the last invalid
             designation was also for register 0 (see the comment inside).
         Otherwise last_invalid_designation_register is reset to -1 and the
         charset is stored as the designation of REG; when the
         CODING_MODE_DIRECTION bit is set in coding->mode and the charset
         has a reverse charset, the reverse charset is stored instead.

         NOTE(review): the arguments are used unparenthesized and more than
         once, so they should be simple expressions without side effects.  */
 1033 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
 1034   do {                                                                     \
 1035     int charset;                                                           \
 1036                                                                            \
 1037     if (final_char < '0' || final_char >= 128)                             \
 1038       goto label_invalid_code;                                             \
 1039     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
 1040                                  make_number (chars),                      \
 1041                                  make_number (final_char));                \
 1042     if (charset >= 0                                                       \
 1043         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
 1044             || coding->safe_charsets[charset]))                            \
 1045       {                                                                    \
 1046         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
 1047             && reg == 0                                                    \
 1048             && charset == CHARSET_ASCII)                                   \
 1049           {                                                                \
 1050             /* We should insert this designation sequence as is so         \
 1051                that it is surely written back to a file.  */               \
 1052             coding->spec.iso2022.last_invalid_designation_register = -1;   \
 1053             goto label_invalid_code;                                       \
 1054           }                                                                \
 1055         coding->spec.iso2022.last_invalid_designation_register = -1;       \
 1056         if ((coding->mode & CODING_MODE_DIRECTION)                         \
 1057             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
 1058           charset = CHARSET_REVERSE_CHARSET (charset);                     \
 1059         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
 1060       }                                                                    \
 1061     else                                                                   \
 1062       {                                                                    \
 1063         coding->spec.iso2022.last_invalid_designation_register = reg;      \
 1064         goto label_invalid_code;                                           \
 1065       }                                                                    \
 1066   } while (0)
1067
1068 /* Allocate a memory block for storing information about compositions.
1069    The block is chained to the already allocated blocks.  */
1070
1071 void
1072 coding_allocate_composition_data (coding, char_offset)
1073      struct coding_system *coding;
1074      int char_offset;
1075 {
1076   struct composition_data *cmp_data
1077     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1078
1079   cmp_data->char_offset = char_offset;
1080   cmp_data->used = 0;
1081   cmp_data->prev = coding->cmp_data;
1082   cmp_data->next = NULL;
1083   if (coding->cmp_data)
1084     coding->cmp_data->next = cmp_data;
1085   coding->cmp_data = cmp_data;
1086   coding->cmp_data_start = 0;
1087 }
1088
 1089 /* Record the starting position START and METHOD of one composition.
         A new four-element record is opened at the end of the used part of
         coding->cmp_data->data, and its index is remembered in
         coding->cmp_data_start:
           data[0] = -1 for now (CODING_ADD_COMPOSITION_END stores the
                     length of the record there),
           data[1] = START plus the block's char_offset,
           data[2] = not set here (CODING_ADD_COMPOSITION_END stores the
                     ending position there),
           data[3] = METHOD.
         NOTE(review): no check is made here that four more elements fit in
         the data array; presumably the caller ensures that beforehand.  */
 1090
 1091 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 1092   do {                                                          \
 1093     struct composition_data *cmp_data = coding->cmp_data;       \
 1094     int *data = cmp_data->data + cmp_data->used;                \
 1095     coding->cmp_data_start = cmp_data->used;                    \
 1096     data[0] = -1;                                               \
 1097     data[1] = cmp_data->char_offset + start;                    \
 1098     data[3] = (int) method;                                     \
 1099     cmp_data->used += 4;                                        \
 1100   } while (0)
1101
 1102 /* Record the ending position END of the current composition.  The
         record that starts at index coding->cmp_data_start is completed:
         data[0] receives the number of elements used since the start of
         the record, and data[2] receives END plus the block's
         char_offset.  */
 1103
 1104 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 1105   do {                                                          \
 1106     struct composition_data *cmp_data = coding->cmp_data;       \
 1107     int *data = cmp_data->data + coding->cmp_data_start;        \
 1108     data[0] = cmp_data->used - coding->cmp_data_start;          \
 1109     data[2] = cmp_data->char_offset + end;                      \
 1110   } while (0)
1111
 1112 /* Record one COMPONENT (alternate character or composition rule).
         COMPONENT is appended at index `used' of coding->cmp_data->data and
         `used' is incremented; the value of the expression is COMPONENT.
         NOTE(review): no bounds check is done here.  */
 1113
 1114 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
 1115   (coding->cmp_data->data[coding->cmp_data->used++] = component)
1116
 1117 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
         C1 is the character that followed ESC.  This statement macro uses
         the variables `coding' and `dst' and the label `label_end_of_loop'
         of the function it is expanded in.

         - If composition is disabled for CODING, the two bytes ESC and
           C1 (with the 8th bit cleared) are copied to DST as is.
           NOTE(review): DST is not checked against the end of the
           destination here; presumably the caller guarantees room.
         - If no composition is in progress, a new one is started with the
           method selected by C1, unless coding->cmp_data lacks room, in
           which case coding->result is set to
           CODING_FINISH_INSUFFICIENT_CMP and control jumps to
           label_end_of_loop.
         - If a composition with alternate chars is in progress, the method
           is switched to COMPOSITION_RELATIVE; any other composition in
           progress is left untouched.  */
 1118
 1119 #define DECODE_COMPOSITION_START(c1)                                       \
 1120   do {                                                                     \
 1121     if (coding->composing == COMPOSITION_DISABLED)                         \
 1122       {                                                                    \
 1123         *dst++ = ISO_CODE_ESC;                                             \
 1124         *dst++ = c1 & 0x7f;                                                \
 1125         coding->produced_char += 2;                                        \
 1126       }                                                                    \
 1127     else if (!COMPOSING_P (coding))                                        \
 1128       {                                                                    \
 1129         /* This is surely the start of a composition.  We must be sure     \
 1130            that coding->cmp_data has enough space to store the             \
 1131            information about the composition.  If not, terminate the       \
 1132            current decoding loop, allocate one more memory block for       \
 1133            coding->cmp_data in the caller, then start the decoding         \
 1134            loop again.  We can't allocate memory here directly because     \
 1135            it may cause buffer/string relocation.  */                      \
 1136         if (!coding->cmp_data                                              \
 1137             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
 1138                 >= COMPOSITION_DATA_SIZE))                                 \
 1139           {                                                                \
 1140             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
 1141             goto label_end_of_loop;                                        \
 1142           }                                                                \
 1143         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
 1144                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
 1145                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
 1146                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
 1147         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
 1148                                       coding->composing);                  \
 1149         coding->composition_rule_follows = 0;                              \
 1150       }                                                                    \
 1151     else                                                                   \
 1152       {                                                                    \
 1153         /* We are already handling a composition.  If the method is        \
 1154            the following two, the codes following the current escape       \
 1155            sequence are actual characters stored in a buffer.  */          \
 1156         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
 1157             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
 1158           {                                                                \
 1159             coding->composing = COMPOSITION_RELATIVE;                      \
 1160             coding->composition_rule_follows = 0;                          \
 1161           }                                                                \
 1162       }                                                                    \
 1163   } while (0)
1164
 1165 /* Handle composition end sequence ESC 1.  C1 is the character that
         followed ESC.  This statement macro uses the variables `coding' and
         `dst' of the function it is expanded in.  If composition is
         disabled for CODING, the two bytes ESC and C1 are copied to DST as
         is; otherwise the current position (coding->produced_char) is
         recorded as the end of the composition and coding->composing is
         set to COMPOSITION_NO.  */
 1166
 1167 #define DECODE_COMPOSITION_END(c1)                                      \
 1168   do {                                                                  \
 1169     if (coding->composing == COMPOSITION_DISABLED)                      \
 1170       {                                                                 \
 1171         *dst++ = ISO_CODE_ESC;                                          \
 1172         *dst++ = c1;                                                    \
 1173         coding->produced_char += 2;                                     \
 1174       }                                                                 \
 1175     else                                                                \
 1176       {                                                                 \
 1177         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 1178         coding->composing = COMPOSITION_NO;                             \
 1179       }                                                                 \
 1180   } while (0)
1181
 1182 /* Decode a composition rule from the byte C1 (and maybe one more byte
 1183    from SRC) and store one encoded composition rule in
 1184    coding->cmp_data.

         C1 must be a modifiable variable: 32 is subtracted from it in
         place.  The result is then interpreted as follows:
           - below 81: old one-byte format; the value is split into two
             reference points (quotient and remainder by 9), where a
             reference point of 4 is replaced by 10;
           - 81..92: new two-byte format; one more byte is read into the
             variable `c2' of the enclosing function with ONE_MORE_BYTE,
             and the rule is built from C1 - 81 and C2 - 32;
           - otherwise: the rule recorded is 0.
         Afterwards coding->composition_rule_follows is cleared.  */
 1185
 1186 #define DECODE_COMPOSITION_RULE(c1)                                     \
 1187   do {                                                                  \
 1188     int rule = 0;                                                       \
 1189     (c1) -= 32;                                                         \
 1190     if (c1 < 81)                /* old format (before ver.21) */        \
 1191       {                                                                 \
 1192         int gref = (c1) / 9;                                            \
 1193         int nref = (c1) % 9;                                            \
 1194         if (gref == 4) gref = 10;                                       \
 1195         if (nref == 4) nref = 10;                                       \
 1196         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
 1197       }                                                                 \
 1198     else if (c1 < 93)           /* new format (after ver.21) */         \
 1199       {                                                                 \
 1200         ONE_MORE_BYTE (c2);                                             \
 1201         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
 1202       }                                                                 \
 1203     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
 1204     coding->composition_rule_follows = 0;                               \
 1205   } while (0)
1206
1207
1208 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1209
1210 static void
1211 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1212      struct coding_system *coding;
1213      unsigned char *source, *destination;
1214      int src_bytes, dst_bytes;
1215 {
1216   unsigned char *src = source;
1217   unsigned char *src_end = source + src_bytes;
1218   unsigned char *dst = destination;
1219   unsigned char *dst_end = destination + dst_bytes;
1220   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1221   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1222   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1223   /* SRC_BASE remembers the start position in source in each loop.
1224      The loop will be exited when there's not enough source code
1225      (within macro ONE_MORE_BYTE), or when there's not enough
1226      destination area to produce a character (within macro
1227      EMIT_CHAR).  */
1228   unsigned char *src_base;
1229   int c, charset;
1230   Lisp_Object translation_table;
1231
1232   if (NILP (Venable_character_translation))
1233     translation_table = Qnil;
1234   else
1235     {
1236       translation_table = coding->translation_table_for_decode;
1237       if (NILP (translation_table))
1238         translation_table = Vstandard_translation_table_for_decode;
1239     }
1240
1241   coding->result = CODING_FINISH_NORMAL;
1242
1243   while (1)
1244     {
1245       int c1, c2;
1246
1247       src_base = src;
1248       ONE_MORE_BYTE (c1);
1249
1250       /* We produce no character or one character.  */
1251       switch (iso_code_class [c1])
1252         {
1253         case ISO_0x20_or_0x7F:
1254           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1255             {
1256               DECODE_COMPOSITION_RULE (c1);
1257               continue;
1258             }
1259           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1260             {
1261               /* This is SPACE or DEL.  */
1262               charset = CHARSET_ASCII;
1263               break;
1264             }
1265           /* This is a graphic character, we fall down ...  */
1266
1267         case ISO_graphic_plane_0:
1268           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1269             {
1270               DECODE_COMPOSITION_RULE (c1);
1271               continue;
1272             }
1273           charset = charset0;
1274           break;
1275
1276         case ISO_0xA0_or_0xFF:
1277           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1278               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1279             goto label_invalid_code;
1280           /* This is a graphic character, we fall down ... */
1281
1282         case ISO_graphic_plane_1:
1283           if (charset1 < 0)
1284             goto label_invalid_code;
1285           charset = charset1;
1286           break;
1287
1288         case ISO_control_0:
1289           if (COMPOSING_P (coding))
1290             DECODE_COMPOSITION_END ('1');
1291
1292           /* All ISO2022 control characters in this class have the
1293              same representation in Emacs internal format.  */
1294           if (c1 == '\n'
1295               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1296               && (coding->eol_type == CODING_EOL_CR
1297                   || coding->eol_type == CODING_EOL_CRLF))
1298             {
1299               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1300               goto label_end_of_loop;
1301             }
1302           charset = CHARSET_ASCII;
1303           break;
1304
1305         case ISO_control_1:
1306           if (COMPOSING_P (coding))
1307             DECODE_COMPOSITION_END ('1');
1308           goto label_invalid_code;
1309
1310         case ISO_carriage_return:
1311           if (COMPOSING_P (coding))
1312             DECODE_COMPOSITION_END ('1');
1313
1314           if (coding->eol_type == CODING_EOL_CR)
1315             c1 = '\n';
1316           else if (coding->eol_type == CODING_EOL_CRLF)
1317             {
1318               ONE_MORE_BYTE (c1);
1319               if (c1 != ISO_CODE_LF)
1320                 {
1321                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1322                     {
1323                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1324                       goto label_end_of_loop;
1325                     }
1326                   src--;
1327                   c1 = '\r';
1328                 }
1329             }
1330           charset = CHARSET_ASCII;
1331           break;
1332
1333         case ISO_shift_out:
1334           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1335               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1336             goto label_invalid_code;
1337           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1338           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1339           continue;
1340
1341         case ISO_shift_in:
1342           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1343             goto label_invalid_code;
1344           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1345           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1346           continue;
1347
1348         case ISO_single_shift_2_7:
1349         case ISO_single_shift_2:
1350           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1351             goto label_invalid_code;
1352           /* SS2 is handled as an escape sequence of ESC 'N' */
1353           c1 = 'N';
1354           goto label_escape_sequence;
1355
1356         case ISO_single_shift_3:
1357           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1358             goto label_invalid_code;
 1359           /* SS3 is handled as an escape sequence of ESC 'O' */
1360           c1 = 'O';
1361           goto label_escape_sequence;
1362
1363         case ISO_control_sequence_introducer:
1364           /* CSI is handled as an escape sequence of ESC '[' ...  */
1365           c1 = '[';
1366           goto label_escape_sequence;
1367
1368         case ISO_escape:
1369           ONE_MORE_BYTE (c1);
1370         label_escape_sequence:
1371           /* Escape sequences handled by Emacs are invocation,
1372              designation, direction specification, and character
1373              composition specification.  */
1374           switch (c1)
1375             {
1376             case '&':           /* revision of following character set */
1377               ONE_MORE_BYTE (c1);
1378               if (!(c1 >= '@' && c1 <= '~'))
1379                 goto label_invalid_code;
1380               ONE_MORE_BYTE (c1);
1381               if (c1 != ISO_CODE_ESC)
1382                 goto label_invalid_code;
1383               ONE_MORE_BYTE (c1);
1384               goto label_escape_sequence;
1385
1386             case '$':           /* designation of 2-byte character set */
1387               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1388                 goto label_invalid_code;
1389               ONE_MORE_BYTE (c1);
1390               if (c1 >= '@' && c1 <= 'B')
1391                 {       /* designation of JISX0208.1978, GB2312.1980,
1392                            or JISX0208.1980 */
1393                   DECODE_DESIGNATION (0, 2, 94, c1);
1394                 }
1395               else if (c1 >= 0x28 && c1 <= 0x2B)
1396                 {       /* designation of DIMENSION2_CHARS94 character set */
1397                   ONE_MORE_BYTE (c2);
1398                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1399                 }
1400               else if (c1 >= 0x2C && c1 <= 0x2F)
1401                 {       /* designation of DIMENSION2_CHARS96 character set */
1402                   ONE_MORE_BYTE (c2);
1403                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1404                 }
1405               else
1406                 goto label_invalid_code;
1407               /* We must update these variables now.  */
1408               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1409               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1410               continue;
1411
1412             case 'n':           /* invocation of locking-shift-2 */
1413               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1414                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1415                 goto label_invalid_code;
1416               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1417               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1418               continue;
1419
1420             case 'o':           /* invocation of locking-shift-3 */
1421               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1422                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1423                 goto label_invalid_code;
1424               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1425               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1426               continue;
1427
1428             case 'N':           /* invocation of single-shift-2 */
1429               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1430                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1431                 goto label_invalid_code;
1432               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1433               ONE_MORE_BYTE (c1);
1434               break;
1435
1436             case 'O':           /* invocation of single-shift-3 */
1437               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1438                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1439                 goto label_invalid_code;
1440               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1441               ONE_MORE_BYTE (c1);
1442               break;
1443
1444             case '0': case '2': case '3': case '4': /* start composition */
1445               DECODE_COMPOSITION_START (c1);
1446               continue;
1447
1448             case '1':           /* end composition */
1449               DECODE_COMPOSITION_END (c1);
1450               continue;
1451
1452             case '[':           /* specification of direction */
1453               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1454                 goto label_invalid_code;
1455               /* For the moment, nested direction is not supported.
1456                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1457                  left-to-right, and nonzero means right-to-left.  */
1458               ONE_MORE_BYTE (c1);
1459               switch (c1)
1460                 {
1461                 case ']':       /* end of the current direction */
1462                   coding->mode &= ~CODING_MODE_DIRECTION;
1463
1464                 case '0':       /* end of the current direction */
1465                 case '1':       /* start of left-to-right direction */
1466                   ONE_MORE_BYTE (c1);
1467                   if (c1 == ']')
1468                     coding->mode &= ~CODING_MODE_DIRECTION;
1469                   else
1470                     goto label_invalid_code;
1471                   break;
1472
1473                 case '2':       /* start of right-to-left direction */
1474                   ONE_MORE_BYTE (c1);
1475                   if (c1 == ']')
1476                     coding->mode |= CODING_MODE_DIRECTION;
1477                   else
1478                     goto label_invalid_code;
1479                   break;
1480
1481                 default:
1482                   goto label_invalid_code;
1483                 }
1484               continue;
1485
1486             default:
1487               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1488                 goto label_invalid_code;
1489               if (c1 >= 0x28 && c1 <= 0x2B)
1490                 {       /* designation of DIMENSION1_CHARS94 character set */
1491                   ONE_MORE_BYTE (c2);
1492                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1493                 }
1494               else if (c1 >= 0x2C && c1 <= 0x2F)
1495                 {       /* designation of DIMENSION1_CHARS96 character set */
1496                   ONE_MORE_BYTE (c2);
1497                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1498                 }
1499               else
1500                 goto label_invalid_code;
1501               /* We must update these variables now.  */
1502               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1503               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1504               continue;
1505             }
1506         }
1507
1508       /* Now we know CHARSET and 1st position code C1 of a character.
1509          Produce a multibyte sequence for that character while getting
1510          2nd position code C2 if necessary.  */
1511       if (CHARSET_DIMENSION (charset) == 2)
1512         {
1513           ONE_MORE_BYTE (c2);
1514           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1515             /* C2 is not in a valid range.  */
1516             goto label_invalid_code;
1517         }
1518       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1519       EMIT_CHAR (c);
1520       continue;
1521
1522     label_invalid_code:
1523       coding->errors++;
1524       if (COMPOSING_P (coding))
1525         DECODE_COMPOSITION_END ('1');
1526       src = src_base;
1527       c = *src++;
1528       EMIT_CHAR (c);
1529     }
1530
1531  label_end_of_loop:
1532   coding->consumed = coding->consumed_char = src_base - source;
1533   coding->produced = dst - destination;
1534   return;
1535 }
1536
1537
1538 /* ISO2022 encoding stuff.  */
1539
1540 /*
1541    It is not enough to say just "ISO2022" on encoding, we have to
1542    specify more details.  In Emacs, each coding system of ISO2022
1543    variant has the following specifications:
1544         1. Initial designation to G0 thru G3.
1545         2. Allows short-form designation?
1546         3. ASCII should be designated to G0 before control characters?
1547         4. ASCII should be designated to G0 at end of line?
1548         5. 7-bit environment or 8-bit environment?
1549         6. Use locking-shift?
1550         7. Use Single-shift?
1551    And the following two are only for Japanese:
1552         8. Use ASCII in place of JIS0201-1976-Roman?
1553         9. Use JISX0208-1983 in place of JISX0208-1978?
1554    These specifications are encoded in `coding->flags' as flag bits
1555    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1556    details.
1557 */
1558
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.

   When the revision number CODING keeps for CHARSET is below 255, the
   sequence is preceded by `ESC & F' where F is '@' plus that revision.
   The intermediate character is taken from "()*+" for a 94-character
   set and from ",-./" for any other set, indexed by REG.  It is left
   out (short form) only for a 2-dimension 94-character set designated
   to register 0 whose <final-char> is '@', 'A', or 'B', and only when
   CODING has CODING_FLAG_ISO_SHORT_FORM set.

   The macro uses the variable `dst' of the expansion site and
   evaluates its arguments more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL_CHAR (charset);       \
    int desig_revision                                                  \
      = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);               \
    int desig_short_form = 0;                                           \
                                                                        \
    if (desig_revision < 255)                                           \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + desig_revision;                                  \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        *dst++ = '$';                                                   \
        /* Only this combination may drop the intermediate char.  */    \
        desig_short_form                                                \
          = (CHARSET_CHARS (charset) == 94                              \
             && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)            \
             && reg == 0                                                \
             && desig_final >= '@' && desig_final <= 'B');              \
      }                                                                 \
    if (! desig_short_form)                                             \
      *dst++ = (unsigned char) ((CHARSET_CHARS (charset) == 94          \
                                 ? "()*+" : ",-./")[reg]);              \
    *dst++ = desig_final;                                               \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1601
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 emit the ISO2022
   single-shift-2 and single-shift-3 functions at `dst'.  When
   `coding' has CODING_FLAG_ISO_SEVEN_BITS set the function is written
   as an escape sequence (ESC 'N' or ESC 'O'); otherwise it is written
   as the single control code ISO_CODE_SS2 or ISO_CODE_SS3.  Both
   macros then set the single-shifting state of `coding' to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1623
/* Locking-shift functions.  Each macro below records in `coding' the
   graphic register now invoked to graphic plane 0 and writes the
   corresponding code at `dst':

     ENCODE_SHIFT_IN          ISO_CODE_SI     register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO     register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'         register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'         register 3  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1651
/* Emit at `dst' the code of a DIMENSION1 character of CHARSET whose
   position-code is C1.

   The body is a loop that is left as soon as something was emitted:
     - while single-shifting is pending, C1 is written (7-bit or 8-bit
       form according to CODING_FLAG_ISO_SEVEN_BITS) and the pending
       state is cleared;
     - if CHARSET is on graphic plane 0, C1 is written with the high
       bit cleared; if it is on graphic plane 1, with the high bit set;
     - if CODING_FLAG_ISO_SAFE is set and CHARSET is not marked in
       coding->safe_charsets, substitution bytes are written instead;
     - otherwise encode_invocation_designation is called to designate
       and/or invoke CHARSET, and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded.  Write the substitution  \
           byte once, or twice when CHARSET_WIDTH is 2.  */             \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is on neither graphic plane yet: designate and/or        \
       invoke it, then go round again to emit the character.  */        \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1694
/* Emit at `dst' the codes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2.

   Works like ENCODE_ISO_CHARACTER_DIMENSION1 but writes two bytes:
   while single-shifting is pending both bytes are written in 7-bit or
   8-bit form according to CODING_FLAG_ISO_SEVEN_BITS; for a charset on
   graphic plane 0 both have the high bit cleared, on graphic plane 1
   both have it set.  An unsafe charset under CODING_FLAG_ISO_SAFE
   yields substitution bytes.  Otherwise
   encode_invocation_designation is called and the checks are
   repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded.  Write the substitution  \
           byte once, or twice when CHARSET_WIDTH is 2.  */             \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is on neither graphic plane yet: designate and/or        \
       invoke it, then go round again to emit the character.  */        \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1737
/* Emit at `dst' the character of CHARSET whose position-codes are C1
   and C2.

   If CHARSET is not defined, C1 is written as is, followed by C2 when
   C2 is not negative.  Otherwise the character goes through
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to CHARSET_DIMENSION, after these substitutions:
     - CHARSET_ASCII becomes charset_latin_jisx0201 when `coding' has
       CODING_FLAG_ISO_USE_ROMAN set;
     - charset_jisx0208 becomes charset_jisx0208_1978 when `coding'
       has CODING_FLAG_ISO_USE_OLDJIS set.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int enc_alt_charset = charset;                                      \
                                                                        \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if ((charset == CHARSET_ASCII)                                  \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))             \
          enc_alt_charset = charset_latin_jisx0201;                     \
        ENCODE_ISO_CHARACTER_DIMENSION1 (enc_alt_charset, c1);          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((charset == charset_jisx0208)                               \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))            \
          enc_alt_charset = charset_jisx0208_1978;                      \
        ENCODE_ISO_CHARACTER_DIMENSION2 (enc_alt_charset, c1, c2);      \
      }                                                                 \
  } while (0)
1766
1767 /* Produce designation and invocation codes at a place pointed by DST
1768    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1769    Return new DST.  */
1770
1771 unsigned char *
1772 encode_invocation_designation (charset, coding, dst)
1773      int charset;
1774      struct coding_system *coding;
1775      unsigned char *dst;
1776 {
1777   int reg;                      /* graphic register number */
1778
1779   /* At first, check designations.  */
1780   for (reg = 0; reg < 4; reg++)
1781     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1782       break;
1783
1784   if (reg >= 4)
1785     {
1786       /* CHARSET is not yet designated to any graphic registers.  */
1787       /* At first check the requested designation.  */
1788       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1789       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1790         /* Since CHARSET requests no special designation, designate it
1791            to graphic register 0.  */
1792         reg = 0;
1793
1794       ENCODE_DESIGNATION (charset, reg, coding);
1795     }
1796
1797   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1798       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1799     {
1800       /* Since the graphic register REG is not invoked to any graphic
1801          planes, invoke it to graphic plane 0.  */
1802       switch (reg)
1803         {
1804         case 0:                 /* graphic register 0 */
1805           ENCODE_SHIFT_IN;
1806           break;
1807
1808         case 1:                 /* graphic register 1 */
1809           ENCODE_SHIFT_OUT;
1810           break;
1811
1812         case 2:                 /* graphic register 2 */
1813           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1814             ENCODE_SINGLE_SHIFT_2;
1815           else
1816             ENCODE_LOCKING_SHIFT_2;
1817           break;
1818
1819         case 3:                 /* graphic register 3 */
1820           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1821             ENCODE_SINGLE_SHIFT_3;
1822           else
1823             ENCODE_LOCKING_SHIFT_3;
1824           break;
1825         }
1826     }
1827
1828   return dst;
1829 }
1830
/* Write at `dst' the two bytes that encode composition rule RULE.
   RULE is split by COMPOSITION_DECODE_RULE into two values; the first
   is written offset by 32 + 81 and the second offset by 32.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
1840
/* Write at `dst' the escape sequence that opens a composition
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   describing the composition; see the comment in coding.h for its
   format.  DATA[3] is stored in CODING->composing and selects the
   sequence: COMPOSITION_RELATIVE gives ESC 0,
   COMPOSITION_WITH_ALTCHARS gives ESC 3, anything else gives ESC 4.
   For the latter two, CODING->cmp_data_index is set to
   CODING->cmp_data_start + 4 and CODING->composition_rule_follows is
   cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
1860
/* Write at `dst' the escape sequence ESC 1 that closes the current
   composition, and update CODING: composing becomes COMPOSITION_NO
   and cmp_data_start is advanced by DATA[0].  If cmp_data_start then
   equals the `used' count of the current composition data block and a
   next block exists, CODING switches to that block and cmp_data_start
   restarts at 0.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = '1';                                               \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data = coding->cmp_data->next;          \
            coding->cmp_data_start = 0;                         \
          }                                                     \
      }                                                         \
  } while (0)
1876
/* Write the composition start sequence ESC 0 at `dst' and mark CODING
   as composing with COMPOSITION_RELATIVE.  This sequence does not
   open a new composition: it signals that the components (alternate
   chars and composition rules) of the composition have just been
   produced and that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = '0';                               \
  } while (0)
1888
/* The following three macros produce codes for indicating direction
   of text.  All of them write at `dst' and read `coding->flags' of
   the expansion site, and each expands to a single statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the control sequence
   introducer: ESC '[' when CODING_FLAG_ISO_SEVEN_BITS is set in
   coding->flags, the single code ISO_CODE_CSI otherwise.  The flag is
   tested as a bit (with `&'), as the other 7-bit tests in this file
   do; comparing the whole flag word with `==' would fail whenever any
   other flag is set as well.

   ENCODE_DIRECTION_R2L writes the introducer followed by '2' ']'.
   ENCODE_DIRECTION_L2R writes the introducer followed by '0' ']'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* The introducer macro is object-like and expands to a statement, so
   it must be invoked without arguments and cannot be an operand of
   the comma operator; wrap the whole sequence in its own block.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1904
/* Write at `dst' the codes that bring the graphic planes and
   registers of `coding' back to their initial state: a shift-in when
   graphic plane 0 does not currently invoke register 0, then, for
   every register whose initial designation is not negative and
   differs from its current designation, a designation sequence for
   that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg;                                                          \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                         \
      {                                                                     \
        int reset_charset                                                   \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg);        \
                                                                            \
        if (reset_charset < 0)                                              \
          continue;                                                         \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)                 \
            != reset_charset)                                               \
          ENCODE_DESIGNATION (reset_charset, reset_reg, coding);            \
      }                                                                     \
  } while (0)
1919
1920 /* Produce designation sequences of charsets in the line started from
1921    SRC to a place pointed by DST, and return updated DST.
1922
1923    If the current block ends before any end-of-line, we may fail to
1924    find all the necessary designations.  */
1925
1926 static unsigned char *
1927 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1928      struct coding_system *coding;
1929      Lisp_Object translation_table;
1930      unsigned char *src, *src_end, *dst;
1931 {
1932   int charset, c, found = 0, reg;
1933   /* Table of charsets to be designated to each graphic register.  */
1934   int r[4];
1935
1936   for (reg = 0; reg < 4; reg++)
1937     r[reg] = -1;
1938
1939   while (found < 4)
1940     {
1941       ONE_MORE_CHAR (c);
1942       if (c == '\n')
1943         break;
1944
1945       charset = CHAR_CHARSET (c);
1946       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1947       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1948         {
1949           found++;
1950           r[reg] = charset;
1951         }
1952     }
1953
1954  label_end_of_loop:
1955   if (found)
1956     {
1957       for (reg = 0; reg < 4; reg++)
1958         if (r[reg] >= 0
1959             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1960           ENCODE_DESIGNATION (r[reg], reg, coding);
1961     }
1962
1963   return dst;
1964 }
1965
1966 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the SRC_BYTES bytes at SOURCE into DESTINATION, driven by
        the ISO2022 flags, designation state and composition data held in
        CODING.

        On entry CODING->consumed_char and CODING->errors are reset to 0.
        On exit CODING->consumed is the number of source bytes accounted
        for (SRC_BASE - SOURCE), and CODING->produced and
        CODING->produced_char are both the number of bytes stored in
        DESTINATION.  CODING->result is set to
        CODING_FINISH_INSUFFICIENT_DST when the destination limit is
        reached at the head of the loop; other exits go through
        `label_end_of_loop', presumably reached from inside the
        ONE_MORE_CHAR / ENCODE_* macros (their definitions are not in
        this part of the file).  */
1967
1968 static void
1969 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1970      struct coding_system *coding;
1971      unsigned char *source, *destination;
1972      int src_bytes, dst_bytes;
1973 {
1974   unsigned char *src = source;
1975   unsigned char *src_end = source + src_bytes;
1976   unsigned char *dst = destination;
1977   unsigned char *dst_end = destination + dst_bytes;
1978   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1979      from DST_END to assure overflow checking is necessary only at the
1980      head of loop.  */
1981   unsigned char *adjusted_dst_end = dst_end - 19;
1982   /* SRC_BASE remembers the start position in source in each loop.
1983      The loop will be exited when there's not enough source text to
1984      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1985      there's not enough destination area to produce encoded codes
1986      (within macro EMIT_BYTES).  */
1987   unsigned char *src_base;
1988   int c;
1989   Lisp_Object translation_table;
1990
       /* Pick the translation table: none when character translation is
          disabled, else the coding system's own encode table, falling
          back to the standard encode table when that one is nil.  */
1991   if (NILP (Venable_character_translation))
1992     translation_table = Qnil;
1993   else
1994     {
1995       translation_table = coding->translation_table_for_encode;
1996       if (NILP (translation_table))
1997         translation_table = Vstandard_translation_table_for_encode;
1998     }
1999
2000   coding->consumed_char = 0;
2001   coding->errors = 0;
2002   while (1)
2003     {
2004       int charset, c1, c2;
2005
2006       src_base = src;
2007
           /* Stop before producing anything more once DST is within 19
              bytes of the limit.  When DST_BYTES is zero the limit is
              taken relative to SRC instead -- presumably the case where
              the output overwrites already-consumed source in the same
              buffer; TODO confirm against the callers.  */
2008       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2009         {
2010           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2011           break;
2012         }
2013
2014       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2015           && CODING_SPEC_ISO_BOL (coding))
2016         {
2017           /* We have to produce designation sequences if any now.  */
2018           dst = encode_designation_at_bol (coding, translation_table,
2019                                            src, src_end, dst);
2020           CODING_SPEC_ISO_BOL (coding) = 0;
2021         }
2022
2023       /* Check composition start and end.  */
2024       if (coding->composing != COMPOSITION_DISABLED
2025           && coding->cmp_data_start < coding->cmp_data->used)
2026         {
2027           struct composition_data *cmp_data = coding->cmp_data;
2028           int *data = cmp_data->data + coding->cmp_data_start;
               /* Position of the next source character, expressed in the
                  same units as the positions stored in DATA.  */
2029           int this_pos = cmp_data->char_offset + coding->consumed_char;
2030
2031           if (coding->composing == COMPOSITION_RELATIVE)
2032             {
2033               if (this_pos == data[2])
2034                 {
2035                   ENCODE_COMPOSITION_END (coding, data);
                       /* Re-read CMP_DATA and DATA from CODING after the
                          macro, as the values cached above may be stale.  */
2036                   cmp_data = coding->cmp_data;
2037                   data = cmp_data->data + coding->cmp_data_start;
2038                 }
2039             }
2040           else if (COMPOSING_P (coding))
2041             {
2042               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS  */
2043               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2044                 /* We have consumed components of the composition.
2045                    What follows in SRC is the composition's base
2046                    text.  */
2047                 ENCODE_COMPOSITION_FAKE_START (coding);
2048               else
2049                 {
                       /* Emit the next stored component (a character or,
                          when a rule is pending, a composition rule).
                          Note this C shadows the function-level C.  No
                          source text is read here, so the `continue'
                          below skips the consumed_char increment.  */
2050                   int c = cmp_data->data[coding->cmp_data_index++];
2051                   if (coding->composition_rule_follows)
2052                     {
2053                       ENCODE_COMPOSITION_RULE (c);
2054                       coding->composition_rule_follows = 0;
2055                     }
2056                   else
2057                     {
2058                       SPLIT_CHAR (c, charset, c1, c2);
2059                       ENCODE_ISO_CHARACTER (charset, c1, c2);
2060                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2061                         coding->composition_rule_follows = 1;
2062                     }
2063                   continue;
2064                 }
2065             }
2066           if (!COMPOSING_P (coding))
2067             {
2068               if (this_pos == data[1])
2069                 {
2070                   ENCODE_COMPOSITION_START (coding, data);
2071                   continue;
2072                 }
2073             }
2074         }
2075
2076       ONE_MORE_CHAR (c);
2077
2078       /* Now encode the character C.  */
2079       if (c < 0x20 || c == 0x7F)
2080         {
2081           if (c == '\r')
2082             {
2083               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2084                 {
2085                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2086                     ENCODE_RESET_PLANE_AND_REGISTER;
2087                   *dst++ = c;
                       /* NOTE(review): this `continue' bypasses the
                          `coding->consumed_char++' at the bottom of the
                          loop although a source character was read just
                          above; every other plain character is counted.
                          Looks unintended -- confirm with the callers
                          that use consumed_char.  */
2088                   continue;
2089                 }
2090               /* fall down to treat '\r' as '\n' ...  */
2091               c = '\n';
2092             }
2093           if (c == '\n')
2094             {
2095               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2096                 ENCODE_RESET_PLANE_AND_REGISTER;
2097               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2098                 bcopy (coding->spec.iso2022.initial_designation,
2099                        coding->spec.iso2022.current_designation,
2100                        sizeof coding->spec.iso2022.initial_designation);
                   /* Write the end-of-line in the form selected by
                      CODING->eol_type: LF (also used when undecided),
                      CR LF, or CR.  */
2101               if (coding->eol_type == CODING_EOL_LF
2102                   || coding->eol_type == CODING_EOL_UNDECIDED)
2103                 *dst++ = ISO_CODE_LF;
2104               else if (coding->eol_type == CODING_EOL_CRLF)
2105                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2106               else
2107                 *dst++ = ISO_CODE_CR;
                   /* The next character starts a line, which re-enables
                      the designate-at-BOL step at the top of the loop.  */
2108               CODING_SPEC_ISO_BOL (coding) = 1;
2109             }
2110           else
2111             {
2112               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2113                 ENCODE_RESET_PLANE_AND_REGISTER;
2114               *dst++ = c;
2115             }
2116         }
2117       else if (ASCII_BYTE_P (c))
2118         ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2119       else if (SINGLE_BYTE_CHAR_P (c))
2120         {
               /* A non-ASCII single-byte character is copied through
                  unchanged and counted as an error.  */
2121           *dst++ = c;
2122           coding->errors++;
2123         }
2124       else
2125         {
2126           SPLIT_CHAR (c, charset, c1, c2);
2127           ENCODE_ISO_CHARACTER (charset, c1, c2);
2128         }
2129
2130       coding->consumed_char++;
2131     }
2132
2133  label_end_of_loop:
       /* SRC_BASE, not SRC, marks the end of the consumed text, so an
          iteration abandoned part-way is not reported as consumed.  */
2134   coding->consumed = src_base - source;
2135   coding->produced = coding->produced_char = dst - destination;
2136 }
2137
2138 \f
2139 /*** 4. SJIS and BIG5 handlers ***/
2140
2141 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2142    quite widely.  So, for the moment, Emacs supports them in the bare
2143    C code.  But, in the future, they may be supported only by CCL.  */
2144
2145 /* SJIS is a coding system encoding three character sets: ASCII, right
2146    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2147    as is.  A character of charset katakana-jisx0201 is encoded by
2148    "position-code + 0x80".  A character of charset japanese-jisx0208
2149    is encoded in 2 bytes, with the two position-codes divided and
2150    shifted so that they fit in the range below.
2151
2152    --- CODE RANGE of SJIS ---
2153    (character set)      (range)
2154    ASCII                0x00 .. 0x7F
2155    KATAKANA-JISX0201    0xA0 .. 0xDF
2156    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2157             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2158    -------------------------------
2159
2160 */
2161
2162 /* BIG5 is a coding system encoding two character sets: ASCII and
2163    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2164    character set and is encoded in two-byte.
2165
2166    --- CODE RANGE of BIG5 ---
2167    (character set)      (range)
2168    ASCII                0x00 .. 0x7F
2169    Big5 (1st byte)      0xA1 .. 0xFE
2170         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2171    --------------------------
2172
2173    Since the number of characters in Big5 is larger than maximum
2174    characters in Emacs' charset (96x96), it can't be handled as one
2175    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2176    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2177    contains frequently used characters and the latter contains less
2178    frequently used characters.  */
2179
2180 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2181    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2182    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2183    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2184
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0xFF - 0xA1 (94) values in 0xA1..0xFE plus
   0x7F - 0x40 (63) values in 0x40..0x7E, i.e. 157 in total.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2 into CHARSET (`charset_big5_1' when
   B1 < 0xC9, else `charset_big5_2') and the position-codes C1, C2.

   B1 and B2 are read completely before CHARSET, C1 or C2 are stored,
   so the caller may pass the same variables for input and output.
   Every argument is parenthesized so that arbitrary expressions can
   be passed; B1 and B2 are evaluated more than once.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: encode the character of CHARSET with
   position-codes C1, C2 into the BIG5 byte pair B1, B2.

   C1 and C2 are read completely before B1 or B2 are stored, so the
   caller may pass the same variables for input and output.  B2 must
   be an lvalue without side effects; it is evaluated several times.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2212
2213 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2214    Check if a text is encoded in SJIS.  If it is, return
2215    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2216
2217 int
2218 detect_coding_sjis (src, src_end)
2219      unsigned char *src, *src_end;
2220 {
2221   int c;
2222   /* Dummy for ONE_MORE_BYTE.  */
2223   struct coding_system dummy_coding;
2224   struct coding_system *coding = &dummy_coding;
2225
2226   while (1)
2227     {
2228       ONE_MORE_BYTE (c);
2229       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2230         {
2231           ONE_MORE_BYTE (c);
2232           if (c < 0x40)
2233             return 0;
2234         }
2235     }
2236  label_end_of_loop:
2237   return CODING_CATEGORY_MASK_SJIS;
2238 }
2239
2240 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2241    Check if a text is encoded in BIG5.  If it is, return
2242    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2243
2244 int
2245 detect_coding_big5 (src, src_end)
2246      unsigned char *src, *src_end;
2247 {
2248   int c;
2249   /* Dummy for ONE_MORE_BYTE.  */
2250   struct coding_system dummy_coding;
2251   struct coding_system *coding = &dummy_coding;
2252
2253   while (1)
2254     {
2255       ONE_MORE_BYTE (c);
2256       if (c >= 0xA1)
2257         {
2258           ONE_MORE_BYTE (c);
2259           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2260             return 0;
2261         }
2262     }
2263  label_end_of_loop:
2264   return CODING_CATEGORY_MASK_BIG5;
2265 }
2266
2267 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2268    Check if a text is encoded in UTF-8.  If it is, return
2269    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2270
2271 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2272 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2273 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2274 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2275 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2276 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2277 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2278
2279 int
2280 detect_coding_utf_8 (src, src_end)
2281      unsigned char *src, *src_end;
2282 {
2283   unsigned char c;
2284   int seq_maybe_bytes;
2285   /* Dummy for ONE_MORE_BYTE.  */
2286   struct coding_system dummy_coding;
2287   struct coding_system *coding = &dummy_coding;
2288
2289   while (1)
2290     {
2291       ONE_MORE_BYTE (c);
2292       if (UTF_8_1_OCTET_P (c))
2293         continue;
2294       else if (UTF_8_2_OCTET_LEADING_P (c))
2295         seq_maybe_bytes = 1;
2296       else if (UTF_8_3_OCTET_LEADING_P (c))
2297         seq_maybe_bytes = 2;
2298       else if (UTF_8_4_OCTET_LEADING_P (c))
2299         seq_maybe_bytes = 3;
2300       else if (UTF_8_5_OCTET_LEADING_P (c))
2301         seq_maybe_bytes = 4;
2302       else if (UTF_8_6_OCTET_LEADING_P (c))
2303         seq_maybe_bytes = 5;
2304       else
2305         return 0;
2306
2307       do
2308         {
2309           ONE_MORE_BYTE (c);
2310           if (!UTF_8_EXTRA_OCTET_P (c))
2311             return 0;
2312           seq_maybe_bytes--;
2313         }
2314       while (seq_maybe_bytes > 0);
2315     }
2316
2317  label_end_of_loop:
2318   return CODING_CATEGORY_MASK_UTF_8;
2319 }
2320
2321 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2322    Check if a text starts with a UTF-16 byte order mark.  If the
2323    first two bytes are 0xFE 0xFF, return
2324    CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
2325    CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
2326
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit value VAL is a high surrogate, i.e. lies in
   0xD800..0xDBFF.  The top six bits select the 1024-value block, so
   the mask must be 0xFC00; masking with 0xD800 would also accept low
   surrogates and values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is a low surrogate, i.e. lies in
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2336
2337 int
2338 detect_coding_utf_16 (src, src_end)
2339      unsigned char *src, *src_end;
2340 {
2341   unsigned char c1, c2;
2342   /* Dummy for TWO_MORE_BYTES.  */
2343   struct coding_system dummy_coding;
2344   struct coding_system *coding = &dummy_coding;
2345
2346   TWO_MORE_BYTES (c1, c2);
2347
2348   if ((c1 == 0xFF) && (c2 == 0xFE))
2349     return CODING_CATEGORY_MASK_UTF_16_LE;
2350   else if ((c1 == 0xFE) && (c2 == 0xFF))
2351     return CODING_CATEGORY_MASK_UTF_16_BE;
2352
2353  label_end_of_loop:
2354   return 0;
2355 }
2356
2357 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2358    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2359
2360 static void
2361 decode_coding_sjis_big5 (coding, source, destination,
2362                          src_bytes, dst_bytes, sjis_p)
2363      struct coding_system *coding;
2364      unsigned char *source, *destination;
2365      int src_bytes, dst_bytes;
2366      int sjis_p;
2367 {
2368   unsigned char *src = source;
2369   unsigned char *src_end = source + src_bytes;
2370   unsigned char *dst = destination;
2371   unsigned char *dst_end = destination + dst_bytes;
2372   /* SRC_BASE remembers the start position in source in each loop.
2373      The loop will be exited when there's not enough source code
2374      (within macro ONE_MORE_BYTE), or when there's not enough
2375      destination area to produce a character (within macro
2376      EMIT_CHAR).  */
2377   unsigned char *src_base;
2378   Lisp_Object translation_table;
2379
2380   if (NILP (Venable_character_translation))
2381     translation_table = Qnil;
2382   else
2383     {
2384       translation_table = coding->translation_table_for_decode;
2385       if (NILP (translation_table))
2386         translation_table = Vstandard_translation_table_for_decode;
2387     }
2388
2389   coding->produced_char = 0;
2390   while (1)
2391     {
2392       int c, charset, c1, c2;
2393
2394       src_base = src;
2395       ONE_MORE_BYTE (c1);
2396
2397       if (c1 < 0x80)
2398         {
2399           charset = CHARSET_ASCII;
2400           if (c1 < 0x20)
2401             {
2402               if (c1 == '\r')
2403                 {
2404                   if (coding->eol_type == CODING_EOL_CRLF)
2405                     {
2406                       ONE_MORE_BYTE (c2);
2407                       if (c2 == '\n')
2408                         c1 = c2;
2409                       else if (coding->mode
2410                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2411                         {
2412                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2413                           goto label_end_of_loop;
2414                         }
2415                       else
2416                         /* To process C2 again, SRC is subtracted by 1.  */
2417                         src--;
2418                     }
2419                   else if (coding->eol_type == CODING_EOL_CR)
2420                     c1 = '\n';
2421                 }
2422               else if (c1 == '\n'
2423                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2424                        && (coding->eol_type == CODING_EOL_CR
2425                            || coding->eol_type == CODING_EOL_CRLF))
2426                 {
2427                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2428                   goto label_end_of_loop;
2429                 }
2430             }
2431         }
2432       else
2433         {
2434           if (sjis_p)
2435             {
2436               if (c1 >= 0xF0)
2437                 goto label_invalid_code;
2438               if (c1 < 0xA0 || c1 >= 0xE0)
2439                 {
2440                   /* SJIS -> JISX0208 */
2441                   ONE_MORE_BYTE (c2);
2442                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2443                     goto label_invalid_code;
2444                   DECODE_SJIS (c1, c2, c1, c2);
2445                   charset = charset_jisx0208;
2446                 }
2447               else
2448                 /* SJIS -> JISX0201-Kana */
2449                 charset = charset_katakana_jisx0201;
2450             }
2451           else
2452             {
2453               /* BIG5 -> Big5 */
2454               if (c1 < 0xA1 || c1 > 0xFE)
2455                 goto label_invalid_code;
2456               ONE_MORE_BYTE (c2);
2457               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2458                 goto label_invalid_code;
2459               DECODE_BIG5 (c1, c2, charset, c1, c2);
2460             }
2461         }
2462
2463       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2464       EMIT_CHAR (c);
2465       continue;
2466
2467     label_invalid_code:
2468       coding->errors++;
2469       src = src_base;
2470       c = *src++;
2471       EMIT_CHAR (c);
2472     }
2473
2474  label_end_of_loop:
2475   coding->consumed = coding->consumed_char = src_base - source;
2476   coding->produced = dst - destination;
2477   return;
2478 }
2479
2480 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2481    This function can encode charsets `ascii', `katakana-jisx0201',
2482    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2483    are sure that all these charsets are registered as official charset
2484    (i.e. do not have extended leading-codes).  Characters of other
2485    charsets are produced without any encoding.  If SJIS_P is 1, encode
2486    SJIS text, else encode BIG5 text.  */
2487
2488 static void
2489 encode_coding_sjis_big5 (coding, source, destination,
2490                          src_bytes, dst_bytes, sjis_p)
2491      struct coding_system *coding;
2492      unsigned char *source, *destination;
2493      int src_bytes, dst_bytes;
2494      int sjis_p;
2495 {
2496   unsigned char *src = source;
2497   unsigned char *src_end = source + src_bytes;
2498   unsigned char *dst = destination;
2499   unsigned char *dst_end = destination + dst_bytes;
2500   /* SRC_BASE remembers the start position in source in each loop.
2501      The loop will be exited when there's not enough source text to
2502      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2503      there's not enough destination area to produce encoded codes
2504      (within macro EMIT_BYTES).  */
2505   unsigned char *src_base;
2506   Lisp_Object translation_table;
2507
2508   if (NILP (Venable_character_translation))
2509     translation_table = Qnil;
2510   else
2511     {
2512       translation_table = coding->translation_table_for_decode;
2513       if (NILP (translation_table))
2514         translation_table = Vstandard_translation_table_for_decode;
2515     }
2516
2517   while (1)
2518     {
2519       int c, charset, c1, c2;
2520
2521       src_base = src;
2522       ONE_MORE_CHAR (c);
2523
2524       /* Now encode the character C.  */
2525       if (SINGLE_BYTE_CHAR_P (c))
2526         {
2527           switch (c)
2528             {
2529             case '\r':
2530               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2531                 {
2532                   EMIT_ONE_BYTE (c);
2533                   break;
2534                 }
2535               c = '\n';
2536             case '\n':
2537               if (coding->eol_type == CODING_EOL_CRLF)
2538                 {
2539                   EMIT_TWO_BYTES ('\r', c);
2540                   break;
2541                 }
2542               else if (coding->eol_type == CODING_EOL_CR)
2543                 c = '\r';
2544             default:
2545               EMIT_ONE_BYTE (c);
2546             }
2547         }
2548       else
2549         {
2550           SPLIT_CHAR (c, charset, c1, c2);
2551           if (sjis_p)
2552             {
2553               if (charset == charset_jisx0208
2554                   || charset == charset_jisx0208_1978)
2555                 {
2556                   ENCODE_SJIS (c1, c2, c1, c2);
2557                   EMIT_TWO_BYTES (c1, c2);
2558                 }
2559               else if (charset == charset_latin_jisx0201)
2560                 EMIT_ONE_BYTE (c1);
2561               else
2562                 /* There's no way other than producing the internal
2563                    codes as is.  */
2564                 EMIT_BYTES (src_base, src);
2565             }
2566           else
2567             {
2568               if (charset == charset_big5_1 || charset == charset_big5_2)
2569                 {
2570                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2571                   EMIT_TWO_BYTES (c1, c2);
2572                 }
2573               else
2574                 /* There's no way other than producing the internal
2575                    codes as is.  */
2576                 EMIT_BYTES (src_base, src);
2577             }
2578         }
2579       coding->consumed_char++;
2580     }
2581
2582  label_end_of_loop:
2583   coding->consumed = src_base - source;
2584   coding->produced = coding->produced_char = dst - destination;
2585 }
2586
2587 \f
2588 /*** 5. CCL handlers ***/
2589
2590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2591    Check if a text is encoded in a coding system of which
2592    encoder/decoder are written in CCL program.  If it is, return
2593    CODING_CATEGORY_MASK_CCL, else return 0.  */
2594
2595 int
2596 detect_coding_ccl (src, src_end)
2597      unsigned char *src, *src_end;
2598 {
2599   unsigned char *valid;
2600   int c;
2601   /* Dummy for ONE_MORE_BYTE.  */
2602   struct coding_system dummy_coding;
2603   struct coding_system *coding = &dummy_coding;
2604
2605   /* No coding system is assigned to coding-category-ccl.  */
2606   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2607     return 0;
2608
2609   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2610   while (1)
2611     {
2612       ONE_MORE_BYTE (c);
2613       if (! valid[c])
2614         return 0;
2615     }
2616  label_end_of_loop:
2617   return CODING_CATEGORY_MASK_CCL;
2618 }
2619
2620 \f
2621 /*** 6. End-of-line handlers ***/
2622
2623 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2624
2625 static void
2626 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2627      struct coding_system *coding;
2628      unsigned char *source, *destination;
2629      int src_bytes, dst_bytes;
2630 {
2631   unsigned char *src = source;
2632   unsigned char *dst = destination;
2633   unsigned char *src_end = src + src_bytes;
2634   unsigned char *dst_end = dst + dst_bytes;
2635   Lisp_Object translation_table;
2636   /* SRC_BASE remembers the start position in source in each loop.
2637      The loop will be exited when there's not enough source code
2638      (within macro ONE_MORE_BYTE), or when there's not enough
2639      destination area to produce a character (within macro
2640      EMIT_CHAR).  */
2641   unsigned char *src_base;
2642   int c;
2643
2644   translation_table = Qnil;
2645   switch (coding->eol_type)
2646     {
2647     case CODING_EOL_CRLF:
2648       while (1)
2649         {
2650           src_base = src;
2651           ONE_MORE_BYTE (c);
2652           if (c == '\r')
2653             {
2654               ONE_MORE_BYTE (c);
2655               if (c != '\n')
2656                 {
2657                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2658                     {
2659                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2660                       goto label_end_of_loop;
2661                     }
2662                   src--;
2663                   c = '\r';
2664                 }
2665             }
2666           else if (c == '\n'
2667                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2668             {
2669               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2670               goto label_end_of_loop;
2671             }
2672           EMIT_CHAR (c);
2673         }
2674       break;
2675
2676     case CODING_EOL_CR:
2677       while (1)
2678         {
2679           src_base = src;
2680           ONE_MORE_BYTE (c);
2681           if (c == '\n')
2682             {
2683               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2684                 {
2685                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2686                   goto label_end_of_loop;
2687                 }
2688             }
2689           else if (c == '\r')
2690             c = '\n';
2691           EMIT_CHAR (c);
2692         }
2693       break;
2694
2695     default:                    /* no need for EOL handling */
2696       while (1)
2697         {
2698           src_base = src;
2699           ONE_MORE_BYTE (c);
2700           EMIT_CHAR (c);
2701         }
2702     }
2703
2704  label_end_of_loop:
2705   coding->consumed = coding->consumed_char = src_base - source;
2706   coding->produced = dst - destination;
2707   return;
2708 }
2709
2710 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2711    format of end-of-line according to `coding->eol_type'.  It also
2712    convert multibyte form 8-bit characers to unibyte if
2713    CODING->src_multibyte is nonzero.  If `coding->mode &
2714    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2715    also means end-of-line.  */
2716
2717 static void
2718 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2719      struct coding_system *coding;
2720      unsigned char *source, *destination;
2721      int src_bytes, dst_bytes;
2722 {
2723   unsigned char *src = source;
2724   unsigned char *dst = destination;
2725   unsigned char *src_end = src + src_bytes;
2726   unsigned char *dst_end = dst + dst_bytes;
2727   Lisp_Object translation_table;
2728   /* SRC_BASE remembers the start position in source in each loop.
2729      The loop will be exited when there's not enough source text to
2730      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2731      there's not enough destination area to produce encoded codes
2732      (within macro EMIT_BYTES).  */
2733   unsigned char *src_base;
2734   int c;
2735   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2736
2737   translation_table = Qnil;
2738   if (coding->src_multibyte
2739       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2740     {
2741       src_end--;
2742       src_bytes--;
2743       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2744     }
2745
2746   if (coding->eol_type == CODING_EOL_CRLF)
2747     {
2748       while (src < src_end)
2749         {
2750           src_base = src;
2751           c = *src++;
2752           if (c >= 0x20)
2753             EMIT_ONE_BYTE (c);
2754           else if (c == '\n' || (c == '\r' && selective_display))
2755             EMIT_TWO_BYTES ('\r', '\n');
2756           else
2757             EMIT_ONE_BYTE (c);
2758         }
2759       src_base = src;
2760     label_end_of_loop:
2761       ;
2762     }
2763   else
2764     {
2765       if (src_bytes <= dst_bytes)
2766         {
2767           safe_bcopy (src, dst, src_bytes);
2768           src_base = src_end;
2769           dst += src_bytes;
2770         }
2771       else
2772         {
2773           if (coding->src_multibyte
2774               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2775             dst_bytes--;
2776           safe_bcopy (src, dst, dst_bytes);
2777           src_base = src + dst_bytes;
2778           dst = destination + dst_bytes;
2779           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2780         }
2781       if (coding->eol_type == CODING_EOL_CR)
2782         {
2783           for (src = destination; src < dst; src++)
2784             if (*src == '\n') *src = '\r';
2785         }
2786       else if (selective_display)
2787         {
2788           for (src = destination; src < dst; src++)
2789             if (*src == '\r') *src = '\n';
2790         }
2791     }
2792   if (coding->src_multibyte)
2793     dst = destination + str_as_unibyte (destination, dst - destination);
2794
2795   coding->consumed = src_base - source;
2796   coding->produced = dst - destination;
2797 }
2798
2799 \f
2800 /*** 7. C library functions ***/
2801
2802 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2803    has a property `coding-system'.  The value of this property is a
2804    vector of length 5 (called as coding-vector).  Among elements of
2805    this vector, the first (element[0]) and the fifth (element[4])
2806    carry important information for decoding/encoding.  Before
2807    decoding/encoding, this information should be set in fields of a
2808    structure of type `coding_system'.
2809
2810    A value of property `coding-system' can be a symbol of another
2811    subsidiary coding-system.  In that case, Emacs gets coding-vector
2812    from that symbol.
2813
2814    `element[0]' contains information to be set in `coding->type'.  The
2815    value and its meaning is as follows:
2816
2817    0 -- coding_type_emacs_mule
2818    1 -- coding_type_sjis
2819    2 -- coding_type_iso2022
2820    3 -- coding_type_big5
2821    4 -- coding_type_ccl encoder/decoder written in CCL
2822    nil -- coding_type_no_conversion
2823    t -- coding_type_undecided (automatic conversion on decoding,
2824                                no-conversion on encoding)
2825
2826    `element[4]' contains information to be set in `coding->flags' and
2827    `coding->spec'.  The meaning varies by `coding->type'.
2828
2829    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2830    of length 32 (of which the first 13 sub-elements are used now).
2831    Meanings of these sub-elements are:
2832
2833    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2834         If the value is an integer of valid charset, the charset is
2835         assumed to be designated to graphic register N initially.
2836
2837         If the value is minus, it is a minus value of charset which
2838         reserves graphic register N, which means that the charset is
2839         not designated initially but should be designated to graphic
2840         register N just before encoding a character in that charset.
2841
2842         If the value is nil, graphic register N is never used on
2843         encoding.
2844
2845    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2846         Each value takes t or nil.  See the section ISO2022 of
2847         `coding.h' for more information.
2848
2849    If `coding->type' is `coding_type_big5', element[4] is t to denote
2850    BIG5-ETen or nil to denote BIG5-HKU.
2851
2852    If `coding->type' takes the other value, element[4] is ignored.
2853
2854    Emacs Lisp's coding system also carries information about format of
2855    end-of-line in a value of property `eol-type'.  If the value is
2856    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2857    means CODING_EOL_CR.  If it is not integer, it should be a vector
2858    of subsidiary coding systems of which property `eol-type' has one
2859    of above values.
2860
2861 */
2862
2863 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2864    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2865    is setup so that no conversion is necessary and return -1, else
2866    return 0.  */
2867
2868 int
2869 setup_coding_system (coding_system, coding)
2870      Lisp_Object coding_system;
2871      struct coding_system *coding;
2872 {
2873   Lisp_Object coding_spec, coding_type, eol_type, plist;
2874   Lisp_Object val;
2875   int i;
2876
2877   /* Initialize some fields required for all kinds of coding systems.  */
2878   coding->symbol = coding_system;
2879   coding->common_flags = 0;
2880   coding->mode = 0;
2881   coding->heading_ascii = -1;
2882   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2883   coding->composing = COMPOSITION_DISABLED;
2884   coding->cmp_data = NULL;
2885
2886   if (NILP (coding_system))
2887     goto label_invalid_coding_system;
2888
2889   coding_spec = Fget (coding_system, Qcoding_system);
2890
2891   if (!VECTORP (coding_spec)
2892       || XVECTOR (coding_spec)->size != 5
2893       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2894     goto label_invalid_coding_system;
2895
2896   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2897   if (VECTORP (eol_type))
2898     {
2899       coding->eol_type = CODING_EOL_UNDECIDED;
2900       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2901     }
2902   else if (XFASTINT (eol_type) == 1)
2903     {
2904       coding->eol_type = CODING_EOL_CRLF;
2905       coding->common_flags
2906         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2907     }
2908   else if (XFASTINT (eol_type) == 2)
2909     {
2910       coding->eol_type = CODING_EOL_CR;
2911       coding->common_flags
2912         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2913     }
2914   else
2915     coding->eol_type = CODING_EOL_LF;
2916
2917   coding_type = XVECTOR (coding_spec)->contents[0];
2918   /* Try short cut.  */
2919   if (SYMBOLP (coding_type))
2920     {
2921       if (EQ (coding_type, Qt))
2922         {
2923           coding->type = coding_type_undecided;
2924           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2925         }
2926       else
2927         coding->type = coding_type_no_conversion;
2928       return 0;
2929     }
2930
2931   /* Get values of coding system properties:
2932      `post-read-conversion', `pre-write-conversion',
2933      `translation-table-for-decode', `translation-table-for-encode'.  */
2934   plist = XVECTOR (coding_spec)->contents[3];
2935   /* Pre & post conversion functions should be disabled if
2936      inhibit_eol_conversion is nozero.  This is the case that a code
2937      conversion function is called while those functions are running.  */
2938   if (! inhibit_pre_post_conversion)
2939     {
2940       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2941       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2942     }
2943   val = Fplist_get (plist, Qtranslation_table_for_decode);
2944   if (SYMBOLP (val))
2945     val = Fget (val, Qtranslation_table_for_decode);
2946   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2947   val = Fplist_get (plist, Qtranslation_table_for_encode);
2948   if (SYMBOLP (val))
2949     val = Fget (val, Qtranslation_table_for_encode);
2950   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2951   val = Fplist_get (plist, Qcoding_category);
2952   if (!NILP (val))
2953     {
2954       val = Fget (val, Qcoding_category_index);
2955       if (INTEGERP (val))
2956         coding->category_idx = XINT (val);
2957       else
2958         goto label_invalid_coding_system;
2959     }
2960   else
2961     goto label_invalid_coding_system;
2962
2963   val = Fplist_get (plist, Qsafe_charsets);
2964   if (EQ (val, Qt))
2965     {
2966       for (i = 0; i <= MAX_CHARSET; i++)
2967         coding->safe_charsets[i] = 1;
2968     }
2969   else
2970     {
2971       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2972       while (CONSP (val))
2973         {
2974           if ((i = get_charset_id (XCAR (val))) >= 0)
2975             coding->safe_charsets[i] = 1;
2976           val = XCDR (val);
2977         }
2978     }
2979
2980   /* If the coding system has non-nil `composition' property, enable
2981      composition handling.  */
2982   val = Fplist_get (plist, Qcomposition);
2983   if (!NILP (val))
2984     coding->composing = COMPOSITION_NO;
2985
2986   switch (XFASTINT (coding_type))
2987     {
2988     case 0:
2989       coding->type = coding_type_emacs_mule;
2990       if (!NILP (coding->post_read_conversion))
2991         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2992       if (!NILP (coding->pre_write_conversion))
2993         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2994       break;
2995
2996     case 1:
2997       coding->type = coding_type_sjis;
2998       coding->common_flags
2999         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3000       break;
3001
3002     case 2:
3003       coding->type = coding_type_iso2022;
3004       coding->common_flags
3005         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3006       {
3007         Lisp_Object val, temp;
3008         Lisp_Object *flags;
3009         int i, charset, reg_bits = 0;
3010
3011         val = XVECTOR (coding_spec)->contents[4];
3012
3013         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3014           goto label_invalid_coding_system;
3015
3016         flags = XVECTOR (val)->contents;
3017         coding->flags
3018           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3019              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3020              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3021              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3022              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3023              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3024              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3025              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3026              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3027              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3028              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3029              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3030              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3031              );
3032
3033         /* Invoke graphic register 0 to plane 0.  */
3034         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3035         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3036         CODING_SPEC_ISO_INVOCATION (coding, 1)
3037           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3038         /* Not single shifting at first.  */
3039         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3040         /* Beginning of buffer should also be regarded as bol. */
3041         CODING_SPEC_ISO_BOL (coding) = 1;
3042
3043         for (charset = 0; charset <= MAX_CHARSET; charset++)
3044           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3045         val = Vcharset_revision_alist;
3046         while (CONSP (val))
3047           {
3048             charset = get_charset_id (Fcar_safe (XCAR (val)));
3049             if (charset >= 0
3050                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3051                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3052               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3053             val = XCDR (val);
3054           }
3055
3056         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3057            FLAGS[REG] can be one of below:
3058                 integer CHARSET: CHARSET occupies register I,
3059                 t: designate nothing to REG initially, but can be used
3060                   by any charsets,
3061                 list of integer, nil, or t: designate the first
3062                   element (if integer) to REG initially, the remaining
3063                   elements (if integer) is designated to REG on request,
3064                   if an element is t, REG can be used by any charsets,
3065                 nil: REG is never used.  */
3066         for (charset = 0; charset <= MAX_CHARSET; charset++)
3067           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3068             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3069         for (i = 0; i < 4; i++)
3070           {
3071             if (INTEGERP (flags[i])
3072                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3073                 || (charset = get_charset_id (flags[i])) >= 0)
3074               {
3075                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3076                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3077               }
3078             else if (EQ (flags[i], Qt))
3079               {
3080                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3081                 reg_bits |= 1 << i;
3082                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3083               }
3084             else if (CONSP (flags[i]))
3085               {
3086                 Lisp_Object tail;
3087                 tail = flags[i];
3088
3089                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3090                 if (INTEGERP (XCAR (tail))
3091                     && (charset = XINT (XCAR (tail)),
3092                         CHARSET_VALID_P (charset))
3093                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3094                   {
3095                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3096                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3097                   }
3098                 else
3099                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3100                 tail = XCDR (tail);
3101                 while (CONSP (tail))
3102                   {
3103                     if (INTEGERP (XCAR (tail))
3104                         && (charset = XINT (XCAR (tail)),
3105                             CHARSET_VALID_P (charset))
3106                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3107                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3108                         = i;
3109                     else if (EQ (XCAR (tail), Qt))
3110                       reg_bits |= 1 << i;
3111                     tail = XCDR (tail);
3112                   }
3113               }
3114             else
3115               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3116
3117             CODING_SPEC_ISO_DESIGNATION (coding, i)
3118               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3119           }
3120
3121         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3122           {
3123             /* REG 1 can be used only by locking shift in 7-bit env.  */
3124             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3125               reg_bits &= ~2;
3126             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3127               /* Without any shifting, only REG 0 and 1 can be used.  */
3128               reg_bits &= 3;
3129           }
3130
3131         if (reg_bits)
3132           for (charset = 0; charset <= MAX_CHARSET; charset++)
3133             {
3134               if (CHARSET_VALID_P (charset))
3135                 {
3136                   /* There exist some default graphic registers to be
3137                      used CHARSET.  */
3138
3139                   /* We had better avoid designating a charset of
3140                      CHARS96 to REG 0 as far as possible.  */
3141                   if (CHARSET_CHARS (charset) == 96)
3142                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3143                       = (reg_bits & 2
3144                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3145                   else
3146                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3147                       = (reg_bits & 1
3148                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3149                 }
3150             }
3151       }
3152       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3153       coding->spec.iso2022.last_invalid_designation_register = -1;
3154       break;
3155
3156     case 3:
3157       coding->type = coding_type_big5;
3158       coding->common_flags
3159         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3160       coding->flags
3161         = (NILP (XVECTOR (coding_spec)->contents[4])
3162            ? CODING_FLAG_BIG5_HKU
3163            : CODING_FLAG_BIG5_ETEN);
3164       break;
3165
3166     case 4:
3167       coding->type = coding_type_ccl;
3168       coding->common_flags
3169         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3170       {
3171         val = XVECTOR (coding_spec)->contents[4];
3172         if (! CONSP (val)
3173             || setup_ccl_program (&(coding->spec.ccl.decoder),
3174                                   XCAR (val)) < 0
3175             || setup_ccl_program (&(coding->spec.ccl.encoder),
3176                                   XCDR (val)) < 0)
3177           goto label_invalid_coding_system;
3178
3179         bzero (coding->spec.ccl.valid_codes, 256);
3180         val = Fplist_get (plist, Qvalid_codes);
3181         if (CONSP (val))
3182           {
3183             Lisp_Object this;
3184
3185             for (; CONSP (val); val = XCDR (val))
3186               {
3187                 this = XCAR (val);
3188                 if (INTEGERP (this)
3189                     && XINT (this) >= 0 && XINT (this) < 256)
3190                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3191                 else if (CONSP (this)
3192                          && INTEGERP (XCAR (this))
3193                          && INTEGERP (XCDR (this)))
3194                   {
3195                     int start = XINT (XCAR (this));
3196                     int end = XINT (XCDR (this));
3197
3198                     if (start >= 0 && start <= end && end < 256)
3199                       while (start <= end)
3200                         coding->spec.ccl.valid_codes[start++] = 1;
3201                   }
3202               }
3203           }
3204       }
3205       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3206       break;
3207
3208     case 5:
3209       coding->type = coding_type_raw_text;
3210       break;
3211
3212     default:
3213       goto label_invalid_coding_system;
3214     }
3215   return 0;
3216
3217  label_invalid_coding_system:
3218   coding->type = coding_type_no_conversion;
3219   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3220   coding->common_flags = 0;
3221   coding->eol_type = CODING_EOL_LF;
3222   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3223   return -1;
3224 }
3225
3226 /* Free memory blocks allocated for storing composition information.  */
3227
3228 void
3229 coding_free_composition_data (coding)
3230      struct coding_system *coding;
3231 {
3232   struct composition_data *cmp_data = coding->cmp_data, *next;
3233
3234   if (!cmp_data)
3235     return;
3236   /* Memory blocks are chained.  At first, rewind to the first, then,
3237      free blocks one by one.  */
3238   while (cmp_data->prev)
3239     cmp_data = cmp_data->prev;
3240   while (cmp_data)
3241     {
3242       next = cmp_data->next;
3243       xfree (cmp_data);
3244       cmp_data = next;
3245     }
3246   coding->cmp_data = NULL;
3247 }
3248
3249 /* Set `char_offset' member of all memory blocks pointed by
3250    coding->cmp_data to POS.  */
3251
3252 void
3253 coding_adjust_composition_offset (coding, pos)
3254      struct coding_system *coding;
3255      int pos;
3256 {
3257   struct composition_data *cmp_data;
3258
3259   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3260     cmp_data->char_offset = pos;
3261 }
3262
3263 /* Setup raw-text or one of its subsidiaries in the structure
3264    coding_system CODING according to the already setup value eol_type
3265    in CODING.  CODING should be setup for some coding system in
3266    advance.  */
3267
3268 void
3269 setup_raw_text_coding_system (coding)
3270      struct coding_system *coding;
3271 {
3272   if (coding->type != coding_type_raw_text)
3273     {
3274       coding->symbol = Qraw_text;
3275       coding->type = coding_type_raw_text;
3276       if (coding->eol_type != CODING_EOL_UNDECIDED)
3277         {
3278           Lisp_Object subsidiaries;
3279           subsidiaries = Fget (Qraw_text, Qeol_type);
3280
3281           if (VECTORP (subsidiaries)
3282               && XVECTOR (subsidiaries)->size == 3)
3283             coding->symbol
3284               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3285         }
3286       setup_coding_system (coding->symbol, coding);
3287     }
3288   return;
3289 }
3290
3291 /* Emacs has a mechanism to automatically detect a coding system if it
3292    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3293    it's impossible to distinguish some coding systems accurately
3294    because they use the same range of codes.  So, at first, coding
3295    systems are categorized into the following categories:
3296
3297    o coding-category-emacs-mule
3298
3299         The category for a coding system which has the same code range
3300         as Emacs' internal format.  Assigned the coding-system (Lisp
3301         symbol) `emacs-mule' by default.
3302
3303    o coding-category-sjis
3304
3305         The category for a coding system which has the same code range
3306         as SJIS.  Assigned the coding-system (Lisp
3307         symbol) `japanese-shift-jis' by default.
3308
3309    o coding-category-iso-7
3310
3311         The category for a coding system which has the same code range
3312         as ISO2022 of 7-bit environment.  This doesn't use any locking
3313         shift and single shift functions.  This can encode/decode all
3314         charsets.  Assigned the coding-system (Lisp symbol)
3315         `iso-2022-7bit' by default.
3316
3317    o coding-category-iso-7-tight
3318
3319         Same as coding-category-iso-7 except that this can
3320         encode/decode only the specified charsets.
3321
3322    o coding-category-iso-8-1
3323
3324         The category for a coding system which has the same code range
3325         as ISO2022 of 8-bit environment and graphic plane 1 used only
3326         for DIMENSION1 charset.  This doesn't use any locking shift
3327         and single shift functions.  Assigned the coding-system (Lisp
3328         symbol) `iso-latin-1' by default.
3329
3330    o coding-category-iso-8-2
3331
3332         The category for a coding system which has the same code range
3333         as ISO2022 of 8-bit environment and graphic plane 1 used only
3334         for DIMENSION2 charset.  This doesn't use any locking shift
3335         and single shift functions.  Assigned the coding-system (Lisp
3336         symbol) `japanese-iso-8bit' by default.
3337
3338    o coding-category-iso-7-else
3339
3340         The category for a coding system which has the same code range
3341         as ISO2022 of 7-bit environment but uses locking shift or
3342         single shift functions.  Assigned the coding-system (Lisp
3343         symbol) `iso-2022-7bit-lock' by default.
3344
3345    o coding-category-iso-8-else
3346
3347         The category for a coding system which has the same code range
3348         as ISO2022 of 8-bit environment but uses locking shift or
3349         single shift functions.  Assigned the coding-system (Lisp
3350         symbol) `iso-2022-8bit-ss2' by default.
3351
3352    o coding-category-big5
3353
3354         The category for a coding system which has the same code range
3355         as BIG5.  Assigned the coding-system (Lisp symbol)
3356         `cn-big5' by default.
3357
3358    o coding-category-utf-8
3359
3360         The category for a coding system which has the same code range
3361         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3362         symbol) `utf-8' by default.
3363
3364    o coding-category-utf-16-be
3365
3366         The category for a coding system in which a text has an
3367         Unicode signature (cf. Unicode Standard) in the order of BIG
3368         endian at the head.  Assigned the coding-system (Lisp symbol)
3369         `utf-16-be' by default.
3370
3371    o coding-category-utf-16-le
3372
3373         The category for a coding system in which a text has an
3374         Unicode signature (cf. Unicode Standard) in the order of
3375         LITTLE endian at the head.  Assigned the coding-system (Lisp
3376         symbol) `utf-16-le' by default.
3377
3378    o coding-category-ccl
3379
3380         The category for a coding system of which encoder/decoder is
3381         written in CCL programs.  The default value is nil, i.e., no
3382         coding system is assigned.
3383
3384    o coding-category-binary
3385
3386         The category for a coding system not categorized in any of the
3387         above.  Assigned the coding-system (Lisp symbol)
3388         `no-conversion' by default.
3389
3390    Each of them is a Lisp symbol and the value is an actual
3391    `coding-system's (this is also a Lisp symbol) assigned by a user.
3392    What Emacs does actually is to detect a category of coding system.
3393    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3394    decide only one possible category, it selects a category of the
3395    highest priority.  Priorities of categories are also specified by a
3396    user in a Lisp variable `coding-category-list'.
3397
3398 */
3399
/* Table indexed by byte value and consulted by detect_coding_mask:
   leading bytes whose entry is non-zero are skipped before the
   detection proper starts.  */
static int ascii_skip_code[256];
3402
3403 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3404    If it detects possible coding systems, return an integer in which
3405    appropriate flag bits are set.  Flag bits are defined by macros
3406    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3407    it should point the table `coding_priorities'.  In that case, only
3408    the flag bit for a coding system of the highest priority is set in
3409    the returned value.
3410
3411    How many ASCII characters are at the head is returned as *SKIP.  */
3412
3413 static int
3414 detect_coding_mask (source, src_bytes, priorities, skip)
3415      unsigned char *source;
3416      int src_bytes, *priorities, *skip;
3417 {
3418   register unsigned char c;
3419   unsigned char *src = source, *src_end = source + src_bytes;
3420   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3421   int i, idx;
3422
3423   /* At first, skip all ASCII characters and control characters except
3424      for three ISO2022 specific control characters.  */
3425   ascii_skip_code[ISO_CODE_SO] = 0;
3426   ascii_skip_code[ISO_CODE_SI] = 0;
3427   ascii_skip_code[ISO_CODE_ESC] = 0;
3428
3429  label_loop_detect_coding:
3430   while (src < src_end && ascii_skip_code[*src]) src++;
3431   *skip = src - source;
3432
3433   if (src >= src_end)
3434     /* We found nothing other than ASCII.  There's nothing to do.  */
3435     return 0;
3436
3437   c = *src;
3438   /* The text seems to be encoded in some multilingual coding system.
3439      Now, try to find in which coding system the text is encoded.  */
3440   if (c < 0x80)
3441     {
3442       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3443       /* C is an ISO2022 specific control code of C0.  */
3444       mask = detect_coding_iso2022 (src, src_end);
3445       if (mask == 0)
3446         {
3447           /* No valid ISO2022 code follows C.  Try again.  */
3448           src++;
3449           if (c == ISO_CODE_ESC)
3450             ascii_skip_code[ISO_CODE_ESC] = 1;
3451           else
3452             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3453           goto label_loop_detect_coding;
3454         }
3455       if (priorities)
3456         {
3457           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3458             {
3459               if (mask & priorities[i])
3460                 return priorities[i];
3461             }
3462           return CODING_CATEGORY_MASK_RAW_TEXT;
3463         }
3464     }
3465   else
3466     {
3467       int try;
3468
3469       if (c < 0xA0)
3470         {
3471           /* C is the first byte of SJIS character code,
3472              or a leading-code of Emacs' internal format (emacs-mule),
3473              or the first byte of UTF-16.  */
3474           try = (CODING_CATEGORY_MASK_SJIS
3475                   | CODING_CATEGORY_MASK_EMACS_MULE
3476                   | CODING_CATEGORY_MASK_UTF_16_BE
3477                   | CODING_CATEGORY_MASK_UTF_16_LE);
3478
3479           /* Or, if C is a special latin extra code,
3480              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3481              or is an ISO2022 control-sequence-introducer (CSI),
3482              we should also consider the possibility of ISO2022 codings.  */
3483           if ((VECTORP (Vlatin_extra_code_table)
3484                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3485               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3486               || (c == ISO_CODE_CSI
3487                   && (src < src_end
3488                       && (*src == ']'
3489                           || ((*src == '0' || *src == '1' || *src == '2')
3490                               && src + 1 < src_end
3491                               && src[1] == ']')))))
3492             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3493                      | CODING_CATEGORY_MASK_ISO_8BIT);
3494         }
3495       else
3496         /* C is a character of ISO2022 in graphic plane right,
3497            or a SJIS's 1-byte character code (i.e. JISX0201),
3498            or the first byte of BIG5's 2-byte code,
3499            or the first byte of UTF-8/16.  */
3500         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3501                 | CODING_CATEGORY_MASK_ISO_8BIT
3502                 | CODING_CATEGORY_MASK_SJIS
3503                 | CODING_CATEGORY_MASK_BIG5
3504                 | CODING_CATEGORY_MASK_UTF_8
3505                 | CODING_CATEGORY_MASK_UTF_16_BE
3506                 | CODING_CATEGORY_MASK_UTF_16_LE);
3507
3508       /* Or, we may have to consider the possibility of CCL.  */
3509       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3510           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511               ->spec.ccl.valid_codes)[c])
3512         try |= CODING_CATEGORY_MASK_CCL;
3513
3514       mask = 0;
3515       utf16_examined_p = iso2022_examined_p = 0;
3516       if (priorities)
3517         {
3518           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3519             {
3520               if (!iso2022_examined_p
3521                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3522                 {
3523                   mask |= detect_coding_iso2022 (src, src_end);
3524                   iso2022_examined_p = 1;
3525                 }
3526               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3527                 mask |= detect_coding_sjis (src, src_end);
3528               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3529                 mask |= detect_coding_utf_8 (src, src_end);
3530               else if (!utf16_examined_p
3531                        && (priorities[i] & try &
3532                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3533                 {
3534                   mask |= detect_coding_utf_16 (src, src_end);
3535                   utf16_examined_p = 1;
3536                 }
3537               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3538                 mask |= detect_coding_big5 (src, src_end);
3539               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3540                 mask |= detect_coding_emacs_mule (src, src_end);
3541               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3542                 mask |= detect_coding_ccl (src, src_end);
3543               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3544                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3545               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3546                 mask |= CODING_CATEGORY_MASK_BINARY;
3547               if (mask & priorities[i])
3548                 return priorities[i];
3549             }
3550           return CODING_CATEGORY_MASK_RAW_TEXT;
3551         }
3552       if (try & CODING_CATEGORY_MASK_ISO)
3553         mask |= detect_coding_iso2022 (src, src_end);
3554       if (try & CODING_CATEGORY_MASK_SJIS)
3555         mask |= detect_coding_sjis (src, src_end);
3556       if (try & CODING_CATEGORY_MASK_BIG5)
3557         mask |= detect_coding_big5 (src, src_end);
3558       if (try & CODING_CATEGORY_MASK_UTF_8)
3559         mask |= detect_coding_utf_8 (src, src_end);
3560       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3561         mask |= detect_coding_utf_16 (src, src_end);
3562       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3563         mask |= detect_coding_emacs_mule (src, src_end);
3564       if (try & CODING_CATEGORY_MASK_CCL)
3565         mask |= detect_coding_ccl (src, src_end);
3566     }
3567   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3568 }
3569
3570 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3571    The information of the detected coding system is set in CODING.  */
3572
3573 void
3574 detect_coding (coding, src, src_bytes)
3575      struct coding_system *coding;
3576      unsigned char *src;
3577      int src_bytes;
3578 {
3579   unsigned int idx;
3580   int skip, mask, i;
3581   Lisp_Object val;
3582
3583   val = Vcoding_category_list;
3584   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3585   coding->heading_ascii = skip;
3586
3587   if (!mask) return;
3588
3589   /* We found a single coding system of the highest priority in MASK.  */
3590   idx = 0;
3591   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3592   if (! mask)
3593     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3594
3595   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3596
3597   if (coding->eol_type != CODING_EOL_UNDECIDED)
3598     {
3599       Lisp_Object tmp;
3600
3601       tmp = Fget (val, Qeol_type);
3602       if (VECTORP (tmp))
3603         val = XVECTOR (tmp)->contents[coding->eol_type];
3604     }
3605
3606   /* Setup this new coding system while preserving some slots.  */
3607   {
3608     int src_multibyte = coding->src_multibyte;
3609     int dst_multibyte = coding->dst_multibyte;
3610
3611     setup_coding_system (val, coding);
3612     coding->src_multibyte = src_multibyte;
3613     coding->dst_multibyte = dst_multibyte;
3614     coding->heading_ascii = skip;
3615   }
3616 }
3617
3618 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3619    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3620    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3621
3622    How many non-eol characters are at the head is returned as *SKIP.  */
3623
3624 #define MAX_EOL_CHECK_COUNT 3
3625
3626 static int
3627 detect_eol_type (source, src_bytes, skip)
3628      unsigned char *source;
3629      int src_bytes, *skip;
3630 {
3631   unsigned char *src = source, *src_end = src + src_bytes;
3632   unsigned char c;
3633   int total = 0;                /* How many end-of-lines are found so far.  */
3634   int eol_type = CODING_EOL_UNDECIDED;
3635   int this_eol_type;
3636
3637   *skip = 0;
3638
3639   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3640     {
3641       c = *src++;
3642       if (c == '\n' || c == '\r')
3643         {
3644           if (*skip == 0)
3645             *skip = src - 1 - source;
3646           total++;
3647           if (c == '\n')
3648             this_eol_type = CODING_EOL_LF;
3649           else if (src >= src_end || *src != '\n')
3650             this_eol_type = CODING_EOL_CR;
3651           else
3652             this_eol_type = CODING_EOL_CRLF, src++;
3653
3654           if (eol_type == CODING_EOL_UNDECIDED)
3655             /* This is the first end-of-line.  */
3656             eol_type = this_eol_type;
3657           else if (eol_type != this_eol_type)
3658             {
3659               /* The found type is different from what found before.  */
3660               eol_type = CODING_EOL_INCONSISTENT;
3661               break;
3662             }
3663         }
3664     }
3665
3666   if (*skip == 0)
3667     *skip = src_end - source;
3668   return eol_type;
3669 }
3670
3671 /* Like detect_eol_type, but detect EOL type in 2-octet
3672    big-endian/little-endian format for coding systems utf-16-be and
3673    utf-16-le.  */
3674
3675 static int
3676 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3677      unsigned char *source;
3678      int src_bytes, *skip;
3679 {
3680   unsigned char *src = source, *src_end = src + src_bytes;
3681   unsigned int c1, c2;
3682   int total = 0;                /* How many end-of-lines are found so far.  */
3683   int eol_type = CODING_EOL_UNDECIDED;
3684   int this_eol_type;
3685   int msb, lsb;
3686
3687   if (big_endian_p)
3688     msb = 0, lsb = 1;
3689   else
3690     msb = 1, lsb = 0;
3691
3692   *skip = 0;
3693
3694   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3695     {
3696       c1 = (src[msb] << 8) | (src[lsb]);
3697       src += 2;
3698
3699       if (c1 == '\n' || c1 == '\r')
3700         {
3701           if (*skip == 0)
3702             *skip = src - 2 - source;
3703           total++;
3704           if (c1 == '\n')
3705             {
3706               this_eol_type = CODING_EOL_LF;
3707             }
3708           else
3709             {
3710               if ((src + 1) >= src_end)
3711                 {
3712                   this_eol_type = CODING_EOL_CR;
3713                 }
3714               else
3715                 {
3716                   c2 = (src[msb] << 8) | (src[lsb]);
3717                   if (c2 == '\n')
3718                     this_eol_type = CODING_EOL_CRLF, src += 2;
3719                   else
3720                     this_eol_type = CODING_EOL_CR;
3721                 }
3722             }
3723
3724           if (eol_type == CODING_EOL_UNDECIDED)
3725             /* This is the first end-of-line.  */
3726             eol_type = this_eol_type;
3727           else if (eol_type != this_eol_type)
3728             {
3729               /* The found type is different from what found before.  */
3730               eol_type = CODING_EOL_INCONSISTENT;
3731               break;
3732             }
3733         }
3734     }
3735
3736   if (*skip == 0)
3737     *skip = src_end - source;
3738   return eol_type;
3739 }
3740
3741 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3742    is encoded.  If it detects an appropriate format of end-of-line, it
3743    sets the information in *CODING.  */
3744
3745 void
3746 detect_eol (coding, src, src_bytes)
3747      struct coding_system *coding;
3748      unsigned char *src;
3749      int src_bytes;
3750 {
3751   Lisp_Object val;
3752   int skip;
3753   int eol_type;
3754
3755   switch (coding->category_idx)
3756     {
3757     case CODING_CATEGORY_IDX_UTF_16_BE:
3758       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3759       break;
3760     case CODING_CATEGORY_IDX_UTF_16_LE:
3761       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3762       break;
3763     default:
3764       eol_type = detect_eol_type (src, src_bytes, &skip);
3765       break;
3766     }
3767
3768   if (coding->heading_ascii > skip)
3769     coding->heading_ascii = skip;
3770   else
3771     skip = coding->heading_ascii;
3772
3773   if (eol_type == CODING_EOL_UNDECIDED)
3774     return;
3775   if (eol_type == CODING_EOL_INCONSISTENT)
3776     {
3777 #if 0
3778       /* This code is suppressed until we find a better way to
3779          distinguish raw text file and binary file.  */
3780
3781       /* If we have already detected that the coding is raw-text, the
3782          coding should actually be no-conversion.  */
3783       if (coding->type == coding_type_raw_text)
3784         {
3785           setup_coding_system (Qno_conversion, coding);
3786           return;
3787         }
3788       /* Else, let's decode only text code anyway.  */
3789 #endif /* 0 */
3790       eol_type = CODING_EOL_LF;
3791     }
3792
3793   val = Fget (coding->symbol, Qeol_type);
3794   if (VECTORP (val) && XVECTOR (val)->size == 3)
3795     {
3796       int src_multibyte = coding->src_multibyte;
3797       int dst_multibyte = coding->dst_multibyte;
3798
3799       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3800       coding->src_multibyte = src_multibyte;
3801       coding->dst_multibyte = dst_multibyte;
3802       coding->heading_ascii = skip;
3803     }
3804 }
3805
/* Number of bytes added on top of every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of the source is multiplied to estimate
   the size of the decoded text: 3 for ISO2022, the decoder's own
   buf_magnification for CCL, and 2 for everything else.  CODING is a
   pointer to struct coding_system; it is parenthesized so that any
   pointer expression may be passed.  */
#define DECODING_BUFFER_MAG(coding)                       \
  ((coding)->type == coding_type_iso2022                  \
   ? 3                                                    \
   : ((coding)->type == coding_type_ccl                   \
      ? (coding)->spec.ccl.decoder.buf_magnification      \
      : 2))
3814
3815 /* Return maximum size (bytes) of a buffer enough for decoding
3816    SRC_BYTES of text encoded in CODING.  */
3817
3818 int
3819 decoding_buffer_size (coding, src_bytes)
3820      struct coding_system *coding;
3821      int src_bytes;
3822 {
3823   return (src_bytes * DECODING_BUFFER_MAG (coding)
3824           + CONVERSION_BUFFER_EXTRA_ROOM);
3825 }
3826
3827 /* Return maximum size (bytes) of a buffer enough for encoding
3828    SRC_BYTES of text to CODING.  */
3829
3830 int
3831 encoding_buffer_size (coding, src_bytes)
3832      struct coding_system *coding;
3833      int src_bytes;
3834 {
3835   int magnification;
3836
3837   if (coding->type == coding_type_ccl)
3838     magnification = coding->spec.ccl.encoder.buf_magnification;
3839   else if (CODING_REQUIRE_ENCODING (coding))
3840     magnification = 3;
3841   else
3842     magnification = 1;
3843
3844   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3845 }
3846
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already large enough; otherwise it is replaced by a new one whose
   size is the current size doubled until it reaches SIZE.  The
   previous contents are not copied to the new buffer.

   NOTE(review): the result of xmalloc is not checked here; presumably
   xmalloc does not return on failure -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a size of zero (no buffer allocated yet) would never
         reach SIZE, so start from the minimum size in that case.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3875
3876 int
3877 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3878      struct coding_system *coding;
3879      unsigned char *source, *destination;
3880      int src_bytes, dst_bytes, encodep;
3881 {
3882   struct ccl_program *ccl
3883     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3884   int result;
3885
3886   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3887
3888   coding->produced = ccl_driver (ccl, source, destination,
3889                                  src_bytes, dst_bytes, &(coding->consumed));
3890   if (encodep)
3891     coding->produced_char = coding->produced;
3892   else
3893     {
3894       int bytes
3895         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3896       coding->produced = str_as_multibyte (destination, bytes,
3897                                            coding->produced,
3898                                            &(coding->produced_char));
3899     }
3900
3901   switch (ccl->status)
3902     {
3903     case CCL_STAT_SUSPEND_BY_SRC:
3904       result = CODING_FINISH_INSUFFICIENT_SRC;
3905       break;
3906     case CCL_STAT_SUSPEND_BY_DST:
3907       result = CODING_FINISH_INSUFFICIENT_DST;
3908       break;
3909     case CCL_STAT_QUIT:
3910     case CCL_STAT_INVALID_CMD:
3911       result = CODING_FINISH_INTERRUPT;
3912       break;
3913     default:
3914       result = CODING_FINISH_NORMAL;
3915       break;
3916     }
3917   return result;
3918 }
3919
3920 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3921    decoding, it may detect coding system and format of end-of-line if
3922    those are not yet decided.  The source should be unibyte, the
3923    result is multibyte if CODING->dst_multibyte is nonzero, else
3924    unibyte.  */
3925
3926 int
3927 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3928      struct coding_system *coding;
3929      unsigned char *source, *destination;
3930      int src_bytes, dst_bytes;
3931 {
3932   if (coding->type == coding_type_undecided)
3933     detect_coding (coding, source, src_bytes);
3934
3935   if (coding->eol_type == CODING_EOL_UNDECIDED)
3936     detect_eol (coding, source, src_bytes);
3937
3938   coding->produced = coding->produced_char = 0;
3939   coding->consumed = coding->consumed_char = 0;
3940   coding->errors = 0;
3941   coding->result = CODING_FINISH_NORMAL;
3942
3943   switch (coding->type)
3944     {
3945     case coding_type_sjis:
3946       decode_coding_sjis_big5 (coding, source, destination,
3947                                src_bytes, dst_bytes, 1);
3948       break;
3949
3950     case coding_type_iso2022:
3951       decode_coding_iso2022 (coding, source, destination,
3952                              src_bytes, dst_bytes);
3953       break;
3954
3955     case coding_type_big5:
3956       decode_coding_sjis_big5 (coding, source, destination,
3957                                src_bytes, dst_bytes, 0);
3958       break;
3959
3960     case coding_type_emacs_mule:
3961       decode_coding_emacs_mule (coding, source, destination,
3962                                 src_bytes, dst_bytes);
3963       break;
3964
3965     case coding_type_ccl:
3966       ccl_coding_driver (coding, source, destination,
3967                          src_bytes, dst_bytes, 0);
3968       break;
3969
3970     default:
3971       decode_eol (coding, source, destination, src_bytes, dst_bytes);
3972     }
3973
3974   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3975       && coding->consumed == src_bytes)
3976     coding->result = CODING_FINISH_NORMAL;
3977
3978   if (coding->mode & CODING_MODE_LAST_BLOCK
3979       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3980     {
3981       unsigned char *src = source + coding->consumed;
3982       unsigned char *dst = destination + coding->produced;
3983
3984       src_bytes -= coding->consumed;
3985      coding->errors++;
3986       if (COMPOSING_P (coding))
3987         DECODE_COMPOSITION_END ('1');
3988       while (src_bytes--)
3989         {
3990           int c = *src++;
3991           dst += CHAR_STRING (c, dst);
3992           coding->produced_char++;
3993         }
3994       coding->consumed = coding->consumed_char = src - source;
3995       coding->produced = dst - destination;
3996     }
3997
3998   if (!coding->dst_multibyte)
3999     {
4000       coding->produced = str_as_unibyte (destination, coding->produced);
4001       coding->produced_char = coding->produced;
4002     }
4003
4004   return coding->result;
4005 }
4006
4007 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4008    multibyteness of the source is CODING->src_multibyte, the
4009    multibyteness of the result is always unibyte.  */
4010
4011 int
4012 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4013      struct coding_system *coding;
4014      unsigned char *source, *destination;
4015      int src_bytes, dst_bytes;
4016 {
4017   coding->produced = coding->produced_char = 0;
4018   coding->consumed = coding->consumed_char = 0;
4019   coding->errors = 0;
4020   coding->result = CODING_FINISH_NORMAL;
4021
4022   switch (coding->type)
4023     {
4024     case coding_type_sjis:
4025       encode_coding_sjis_big5 (coding, source, destination,
4026                                src_bytes, dst_bytes, 1);
4027       break;
4028
4029     case coding_type_iso2022:
4030       encode_coding_iso2022 (coding, source, destination,
4031                              src_bytes, dst_bytes);
4032       break;
4033
4034     case coding_type_big5:
4035       encode_coding_sjis_big5 (coding, source, destination,
4036                                src_bytes, dst_bytes, 0);
4037       break;
4038
4039     case coding_type_emacs_mule:
4040       encode_coding_emacs_mule (coding, source, destination,
4041                                 src_bytes, dst_bytes);
4042       break;
4043
4044     case coding_type_ccl:
4045       ccl_coding_driver (coding, source, destination,
4046                          src_bytes, dst_bytes, 1);
4047       break;
4048
4049     default:
4050       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4051     }
4052
4053   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4054       && coding->consumed == src_bytes)
4055     coding->result = CODING_FINISH_NORMAL;
4056
4057   if (coding->mode & CODING_MODE_LAST_BLOCK)
4058     {
4059       unsigned char *src = source + coding->consumed;
4060       unsigned char *src_end = src + src_bytes;
4061       unsigned char *dst = destination + coding->produced;
4062
4063       if (coding->type == coding_type_iso2022)
4064         ENCODE_RESET_PLANE_AND_REGISTER;
4065       if (COMPOSING_P (coding))
4066         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4067       if (coding->consumed < src_bytes)
4068         {
4069           int len = src_bytes - coding->consumed;
4070
4071           BCOPY_SHORT (source + coding->consumed, dst, len);
4072           if (coding->src_multibyte)
4073             len = str_as_unibyte (dst, len);
4074           dst += len;
4075           coding->consumed = src_bytes;
4076         }
4077       coding->produced = coding->produced_char = dst - destination;
4078     }
4079
4080   return coding->result;
4081 }
4082
4083 /* Scan text in the region between *BEG and *END (byte positions),
4084    skip characters which we don't have to decode by coding system
4085    CODING at the head and tail, then set *BEG and *END to the region
4086    of the text we actually have to convert.  The caller should move
4087    the gap out of the region in advance if the region is from a
4088    buffer.
4089
4090    If STR is not NULL, *BEG and *END are indices into STR.  */
4091
4092 static void
4093 shrink_decoding_region (beg, end, coding, str)
4094      int *beg, *end;
4095      struct coding_system *coding;
4096      unsigned char *str;
4097 {
4098   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4099   int eol_conversion;
4100   Lisp_Object translation_table;
4101
4102   if (coding->type == coding_type_ccl
4103       || coding->type == coding_type_undecided
4104       || coding->eol_type != CODING_EOL_LF
4105       || !NILP (coding->post_read_conversion)
4106       || coding->composing != COMPOSITION_DISABLED)
4107     {
4108       /* We can't skip any data.  */
4109       return;
4110     }
4111   if (coding->type == coding_type_no_conversion
4112       || coding->type == coding_type_raw_text
4113       || coding->type == coding_type_emacs_mule)
4114     {
4115       /* We need no conversion, but don't have to skip any data here.
4116          Decoding routine handles them effectively anyway.  */
4117       return;
4118     }
4119
4120   translation_table = coding->translation_table_for_decode;
4121   if (NILP (translation_table) && !NILP (Venable_character_translation))
4122     translation_table = Vstandard_translation_table_for_decode;
4123   if (CHAR_TABLE_P (translation_table))
4124     {
4125       int i;
4126       for (i = 0; i < 128; i++)
4127         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4128           break;
4129       if (i < 128)
4130         /* Some ASCII character should be translated.  We give up
4131            shrinking.  */
4132         return;
4133     }
4134
4135   if (coding->heading_ascii >= 0)
4136     /* Detection routine has already found how much we can skip at the
4137        head.  */
4138     *beg += coding->heading_ascii;
4139
4140   if (str)
4141     {
4142       begp_orig = begp = str + *beg;
4143       endp_orig = endp = str + *end;
4144     }
4145   else
4146     {
4147       begp_orig = begp = BYTE_POS_ADDR (*beg);
4148       endp_orig = endp = begp + *end - *beg;
4149     }
4150
4151   eol_conversion = (coding->eol_type == CODING_EOL_CR
4152                     || coding->eol_type == CODING_EOL_CRLF);
4153
4154   switch (coding->type)
4155     {
4156     case coding_type_sjis:
4157     case coding_type_big5:
4158       /* We can skip all ASCII characters at the head.  */
4159       if (coding->heading_ascii < 0)
4160         {
4161           if (eol_conversion)
4162             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4163           else
4164             while (begp < endp && *begp < 0x80) begp++;
4165         }
4166       /* We can skip all ASCII characters at the tail except for the
4167          second byte of SJIS or BIG5 code.  */
4168       if (eol_conversion)
4169         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4170       else
4171         while (begp < endp && endp[-1] < 0x80) endp--;
4172       /* Do not consider LF as ascii if preceded by CR, since that
4173          confuses eol decoding. */
4174       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4175         endp++;
4176       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4177         endp++;
4178       break;
4179
4180     case coding_type_iso2022:
4181       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4182         /* We can't skip any data.  */
4183         break;
4184       if (coding->heading_ascii < 0)
4185         {
4186           /* We can skip all ASCII characters at the head except for a
4187              few control codes.  */
4188           while (begp < endp && (c = *begp) < 0x80
4189                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4190                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4191                  && (!eol_conversion || c != ISO_CODE_LF))
4192             begp++;
4193         }
4194       switch (coding->category_idx)
4195         {
4196         case CODING_CATEGORY_IDX_ISO_8_1:
4197         case CODING_CATEGORY_IDX_ISO_8_2:
4198           /* We can skip all ASCII characters at the tail.  */
4199           if (eol_conversion)
4200             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4201           else
4202             while (begp < endp && endp[-1] < 0x80) endp--;
4203           /* Do not consider LF as ascii if preceded by CR, since that
4204              confuses eol decoding. */
4205           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4206             endp++;
4207           break;
4208
4209         case CODING_CATEGORY_IDX_ISO_7:
4210         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4211           {
4212             /* We can skip all charactes at the tail except for 8-bit
4213                codes and ESC and the following 2-byte at the tail.  */
4214             unsigned char *eight_bit = NULL;
4215
4216             if (eol_conversion)
4217               while (begp < endp
4218                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4219                 {
4220                   if (!eight_bit && c & 0x80) eight_bit = endp;
4221                   endp--;
4222                 }
4223             else
4224               while (begp < endp
4225                      && (c = endp[-1]) != ISO_CODE_ESC)
4226                 {
4227                   if (!eight_bit && c & 0x80) eight_bit = endp;
4228                   endp--;
4229                 }
4230             /* Do not consider LF as ascii if preceded by CR, since that
4231                confuses eol decoding. */
4232             if (begp < endp && endp < endp_orig
4233                 && endp[-1] == '\r' && endp[0] == '\n')
4234               endp++;
4235             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4236               {
4237                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4238                   /* This is an ASCII designation sequence.  We can
4239                      surely skip the tail.  But, if we have
4240                      encountered an 8-bit code, skip only the codes
4241                      after that.  */
4242                   endp = eight_bit ? eight_bit : endp + 2;
4243                 else
4244                   /* Hmmm, we can't skip the tail.  */
4245                   endp = endp_orig;
4246               }
4247             else if (eight_bit)
4248               endp = eight_bit;
4249           }
4250         }
4251       break;
4252
4253     default:
4254       abort ();
4255     }
4256   *beg += begp - begp_orig;
4257   *end += endp - endp_orig;
4258   return;
4259 }
4260
4261 /* Like shrink_decoding_region but for encoding.  */
4262
4263 static void
4264 shrink_encoding_region (beg, end, coding, str)
4265      int *beg, *end;
4266      struct coding_system *coding;
4267      unsigned char *str;
4268 {
4269   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4270   int eol_conversion;
4271   Lisp_Object translation_table;
4272
4273   if (coding->type == coding_type_ccl
4274       || coding->eol_type == CODING_EOL_CRLF
4275       || coding->eol_type == CODING_EOL_CR
4276       || coding->cmp_data && coding->cmp_data->used > 0)
4277     {
4278       /* We can't skip any data.  */
4279       return;
4280     }
4281   if (coding->type == coding_type_no_conversion
4282       || coding->type == coding_type_raw_text
4283       || coding->type == coding_type_emacs_mule
4284       || coding->type == coding_type_undecided)
4285     {
4286       /* We need no conversion, but don't have to skip any data here.
4287          Encoding routine handles them effectively anyway.  */
4288       return;
4289     }
4290
4291   translation_table = coding->translation_table_for_encode;
4292   if (NILP (translation_table) && !NILP (Venable_character_translation))
4293     translation_table = Vstandard_translation_table_for_encode;
4294   if (CHAR_TABLE_P (translation_table))
4295     {
4296       int i;
4297       for (i = 0; i < 128; i++)
4298         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4299           break;
4300       if (i < 128)
4301         /* Some ASCII character should be tranlsated.  We give up
4302            shrinking.  */
4303         return;
4304     }
4305
4306   if (str)
4307     {
4308       begp_orig = begp = str + *beg;
4309       endp_orig = endp = str + *end;
4310     }
4311   else
4312     {
4313       begp_orig = begp = BYTE_POS_ADDR (*beg);
4314       endp_orig = endp = begp + *end - *beg;
4315     }
4316
4317   eol_conversion = (coding->eol_type == CODING_EOL_CR
4318                     || coding->eol_type == CODING_EOL_CRLF);
4319
4320   /* Here, we don't have to check coding->pre_write_conversion because
4321      the caller is expected to have handled it already.  */
4322   switch (coding->type)
4323     {
4324     case coding_type_iso2022:
4325       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4326         /* We can't skip any data.  */
4327         break;
4328       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4329         {
4330           unsigned char *bol = begp;
4331           while (begp < endp && *begp < 0x80)
4332             {
4333               begp++;
4334               if (begp[-1] == '\n')
4335                 bol = begp;
4336             }
4337           begp = bol;
4338           goto label_skip_tail;
4339         }
4340       /* fall down ... */
4341
4342     case coding_type_sjis:
4343     case coding_type_big5:
4344       /* We can skip all ASCII characters at the head and tail.  */
4345       if (eol_conversion)
4346         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4347       else
4348         while (begp < endp && *begp < 0x80) begp++;
4349     label_skip_tail:
4350       if (eol_conversion)
4351         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4352       else
4353         while (begp < endp && *(endp - 1) < 0x80) endp--;
4354       break;
4355
4356     default:
4357       abort ();
4358     }
4359
4360   *beg += begp - begp_orig;
4361   *end += endp - endp_orig;
4362   return;
4363 }
4364
/* Shrinking the conversion region has some overhead of its own, so it
   is not attempted unless the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END with
   shrink_encoding_region if ENCODEP is nonzero, else with
   shrink_decoding_region, provided the region is longer than
   shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4378
4379 static Lisp_Object
4380 code_convert_region_unwind (dummy)
4381      Lisp_Object dummy;
4382 {
4383   inhibit_pre_post_conversion = 0;
4384   return Qnil;
4385 }
4386
4387 /* Store information about all compositions in the range FROM and TO
4388    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4389    buffer or a string, defaults to the current buffer.  */
4390
4391 void
4392 coding_save_composition (coding, from, to, obj)
4393      struct coding_system *coding;
4394      int from, to;
4395      Lisp_Object obj;
4396 {
4397   Lisp_Object prop;
4398   int start, end;
4399
4400   if (coding->composing == COMPOSITION_DISABLED)
4401     return;
4402   if (!coding->cmp_data)
4403     coding_allocate_composition_data (coding, from);
4404   if (!find_composition (from, to, &start, &end, &prop, obj)
4405       || end > to)
4406     return;
4407   if (start < from
4408       && (!find_composition (end, to, &start, &end, &prop, obj)
4409           || end > to))
4410     return;
4411   coding->composing = COMPOSITION_NO;
4412   do
4413     {
4414       if (COMPOSITION_VALID_P (start, end, prop))
4415         {
4416           enum composition_method method = COMPOSITION_METHOD (prop);
4417           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4418               >= COMPOSITION_DATA_SIZE)
4419             coding_allocate_composition_data (coding, from);
4420           /* For relative composition, we remember start and end
4421              positions, for the other compositions, we also remember
4422              components.  */
4423           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4424           if (method != COMPOSITION_RELATIVE)
4425             {
4426               /* We must store a*/
4427               Lisp_Object val, ch;
4428
4429               val = COMPOSITION_COMPONENTS (prop);
4430               if (CONSP (val))
4431                 while (CONSP (val))
4432                   {
4433                     ch = XCAR (val), val = XCDR (val);
4434                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4435                   }
4436               else if (VECTORP (val) || STRINGP (val))
4437                 {
4438                   int len = (VECTORP (val)
4439                              ? XVECTOR (val)->size : XSTRING (val)->size);
4440                   int i;
4441                   for (i = 0; i < len; i++)
4442                     {
4443                       ch = (STRINGP (val)
4444                             ? Faref (val, make_number (i))
4445                             : XVECTOR (val)->contents[i]);
4446                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4447                     }
4448                 }
4449               else              /* INTEGERP (val) */
4450                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4451             }
4452           CODING_ADD_COMPOSITION_END (coding, end - from);
4453         }
4454       start = end;
4455     }
4456   while (start < to
4457          && find_composition (start, to, &start, &end, &prop, obj)
4458          && end <= to);
4459
4460   /* Make coding->cmp_data point to the first memory block.  */
4461   while (coding->cmp_data->prev)
4462     coding->cmp_data = coding->cmp_data->prev;
4463   coding->cmp_data_start = 0;
4464 }
4465
4466 /* Reflect the saved information about compositions to OBJ.
4467    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4468    is a buffer or a string, defaults to the current buffer.  */
4469
4470 void
4471 coding_restore_composition (coding, obj)
4472      struct coding_system *coding;
4473      Lisp_Object obj;
4474 {
4475   struct composition_data *cmp_data = coding->cmp_data;
4476
4477   if (!cmp_data)
4478     return;
4479
4480   while (cmp_data->prev)
4481     cmp_data = cmp_data->prev;
4482
4483   while (cmp_data)
4484     {
4485       int i;
4486
4487       for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4488         {
4489           int *data = cmp_data->data + i;
4490           enum composition_method method = (enum composition_method) data[3];
4491           Lisp_Object components;
4492
4493           if (method == COMPOSITION_RELATIVE)
4494             components = Qnil;
4495           else
4496             {
4497               int len = data[0] - 4, j;
4498               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4499
4500               for (j = 0; j < len; j++)
4501                 args[j] = make_number (data[4 + j]);
4502               components = (method == COMPOSITION_WITH_ALTCHARS
4503                             ? Fstring (len, args) : Fvector (len, args));
4504             }
4505           compose_text (data[1], data[2], components, Qnil, obj);
4506         }
4507       cmp_data = cmp_data->next;
4508     }
4509 }
4510
/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
   text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
   coding system CODING, and return the status code of code conversion
   (currently, this value has no meaning; the function always returns
   0).

   How many characters (and bytes) are converted to how many
   characters (and bytes) are recorded in members of the structure
   CODING (consumed, consumed_char, produced, produced_char).

   If REPLACE is nonzero, we do various things as if the original text
   is deleted and a new text is inserted.  See the comments in
   replace_range (insdel.c) to know what we are doing.

   If REPLACE is zero, it is assumed that the source text is unibyte.
   Otherwise, the source text is taken to be multibyte when the
   current buffer has multibyte characters enabled.

   If point is strictly inside the region, it is moved to FROM.  */

int
code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
     int from, from_byte, to, to_byte, encodep, replace;
     struct coding_system *coding;
{
  int len = to - from, len_byte = to_byte - from_byte;
  int require, inserted, inserted_byte;
  int head_skip, tail_skip, total_skip = 0;
  Lisp_Object saved_coding_symbol;
  /* Nonzero until the required gap size has been re-estimated once
     (see the handling of an insufficient destination below).  */
  int first = 1;
  unsigned char *src, *dst;
  Lisp_Object deletion;
  int orig_point = PT, orig_len = len;
  int prev_Z;
  int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);

  coding->src_multibyte = replace && multibyte_p;
  coding->dst_multibyte = multibyte_p;

  deletion = Qnil;
  saved_coding_symbol = Qnil;

  /* Point is strictly inside the region: park it at FROM.  */
  if (from < PT && PT < to)
    {
      TEMP_SET_PT_BOTH (from, from_byte);
      orig_point = from;
    }

  if (replace)
    {
      int saved_from = from;

      /* FROM is passed by address and may come back changed; in that
         case recompute every position that depends on it.  */
      prepare_to_modify_buffer (from, to, &from);
      if (saved_from != from)
        {
          to = from + len;
          from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
          len_byte = to_byte - from_byte;
        }
    }

  if (! encodep && CODING_REQUIRE_DETECTION (coding))
    {
      /* We must detect encoding of text and eol format.  */

      /* The detectors scan a contiguous byte range, so the gap must
         not lie inside the region.  */
      if (from < GPT && to > GPT)
        move_gap_both (from, from_byte);
      if (coding->type == coding_type_undecided)
        {
          detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
          if (coding->type == coding_type_undecided)
            /* It seems that the text contains only ASCII, but we
               should not leave it undecided because the deeper
               decoding routine (decode_coding) tries to detect the
               encodings again in vain.  */
            coding->type = coding_type_emacs_mule;
        }
      if (coding->eol_type == CODING_EOL_UNDECIDED)
        {
          saved_coding_symbol = coding->symbol;
          detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
          if (coding->eol_type == CODING_EOL_UNDECIDED)
            coding->eol_type = CODING_EOL_LF;
          /* We had better recover the original eol format if we
             encounter an inconsistent eol format while decoding.  */
          coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
        }
    }

  /* Now we convert the text.  */

  /* For encoding, we must process pre-write-conversion in advance.  */
  if (! inhibit_pre_post_conversion
      && encodep
      && SYMBOLP (coding->pre_write_conversion)
      && ! NILP (Ffboundp (coding->pre_write_conversion)))
    {
      /* The function in pre-write-conversion may put a new text in a
         new buffer.  */
      struct buffer *prev = current_buffer;
      Lisp_Object new;
      /* NOTE(review): COUNT is computed but not used in this block;
         the unwind protect is discarded by decrementing specpdl_ptr
         directly.  */
      int count = specpdl_ptr - specpdl;

      record_unwind_protect (code_convert_region_unwind, Qnil);
      /* We should not call any more pre-write/post-read-conversion
         functions while this pre-write-conversion is running.  */
      inhibit_pre_post_conversion = 1;
      call2 (coding->pre_write_conversion,
             make_number (from), make_number (to));
      inhibit_pre_post_conversion = 0;
      /* Discard the unwind protect.  */
      specpdl_ptr--;

      if (current_buffer != prev)
        {
          /* The function left us in another buffer: replace the
             region of the original buffer with the accessible text of
             that buffer, kill it, and recompute the region.  */
          len = ZV - BEGV;
          new = Fcurrent_buffer ();
          set_buffer_internal_1 (prev);
          del_range_2 (from, from_byte, to, to_byte, 0);
          TEMP_SET_PT_BOTH (from, from_byte);
          insert_from_buffer (XBUFFER (new), 1, len, 0);
          Fkill_buffer (new);
          if (orig_point >= to)
            orig_point += len - orig_len;
          else if (orig_point > from)
            orig_point = from;
          orig_len = len;
          to = from + len;
          from_byte = CHAR_TO_BYTE (from);
          to_byte = CHAR_TO_BYTE (to);
          len_byte = to_byte - from_byte;
          TEMP_SET_PT_BOTH (from, from_byte);
        }
    }

  /* Keep a copy of the text being replaced; it is handed to
     adjust_after_replace once the conversion is done.  */
  if (replace)
    deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);

  if (coding->composing != COMPOSITION_DISABLED)
    {
      if (encodep)
        coding_save_composition (coding, from, to, Fcurrent_buffer ());
      else
        coding_allocate_composition_data (coding, from);
    }

  /* Try to skip the heading and tailing ASCIIs.  */
  {
    int from_byte_orig = from_byte, to_byte_orig = to_byte;

    if (from < GPT && GPT < to)
      move_gap_both (from, from_byte);
    SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
    /* Nothing is left to convert, no post-read-conversion has to run,
       and the coding system needs no flushing: we are done.
       NOTE(review): composition data set up just above is not
       released on this early return -- confirm whether callers are
       expected to free it.  */
    if (from_byte == to_byte
        && (encodep || NILP (coding->post_read_conversion))
        && ! CODING_REQUIRE_FLUSHING (coding))
      {
        coding->produced = len_byte;
        coding->produced_char = len;
        if (!replace)
          /* We must record and adjust for this new text now.  */
          adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
        return 0;
      }

    /* The skipped text is ASCII, so the same counts are applied to
       the character positions and to the byte positions.  */
    head_skip = from_byte - from_byte_orig;
    tail_skip = to_byte_orig - to_byte;
    total_skip = head_skip + tail_skip;
    from += head_skip;
    to -= tail_skip;
    len -= total_skip; len_byte -= total_skip;
  }

  /* The code conversion routine can not preserve text properties for
     now.  So, we must remove all text properties in the region.
     Here, we must suppress all modification hooks.  */
  if (replace)
    {
      int saved_inhibit_modification_hooks = inhibit_modification_hooks;
      inhibit_modification_hooks = 1;
      Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
      inhibit_modification_hooks = saved_inhibit_modification_hooks;
    }

  /* For conversion, we must put the gap before the text in addition to
     making the gap larger for efficient decoding.  The required gap
     size starts from 2000 which is the magic number used in make_gap.
     But, after one batch of conversion, it will be incremented if we
     find that it is not enough.  */
  require = 2000;

  if (GAP_SIZE  < require)
    make_gap (require - GAP_SIZE);
  move_gap_both (from, from_byte);

  inserted = inserted_byte = 0;

  /* From here on the source text is accounted for as the tail of the
     gap instead of as buffer text; the converted text is written at
     the head of the gap (see the diagrams in the loop below).  */
  GAP_SIZE += len_byte;
  ZV -= len;
  Z -= len;
  ZV_BYTE -= len_byte;
  Z_BYTE -= len_byte;

  if (GPT - BEG < BEG_UNCHANGED)
    BEG_UNCHANGED = GPT - BEG;
  if (Z - GPT < END_UNCHANGED)
    END_UNCHANGED = Z - GPT;

  if (!encodep && coding->src_multibyte)
    {
      /* Decoding routines expects that the source text is unibyte.
         We must convert 8-bit characters of multibyte form to
         unibyte.  */
      int len_byte_orig = len_byte;
      len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
      /* The text became shorter: slide it up so that it still ends at
         the end of the gap.  */
      if (len_byte < len_byte_orig)
        safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
                    len_byte);
      coding->src_multibyte = 0;
    }

  for (;;)
    {
      int result;

      /* The buffer memory is now:
         +--------+converted-text+---------+-------original-text-------+---+
         |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
                  |<---------------------- GAP ----------------------->|  */
      src = GAP_END_ADDR - len_byte;
      dst = GPT_ADDR + inserted_byte;

      if (encodep)
        result = encode_coding (coding, src, dst, len_byte, 0);
      else
        result = decode_coding (coding, src, dst, len_byte, 0);

      /* The buffer memory is now:
         +--------+-------converted-text----+--+------original-text----+---+
         |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
                  |<---------------------- GAP ----------------------->|  */

      inserted += coding->produced_char;
      inserted_byte += coding->produced;
      len_byte -= coding->consumed;

      if (result == CODING_FINISH_INSUFFICIENT_CMP)
        {
          /* Get more room for composition data and go on with the
             remaining text.  */
          coding_allocate_composition_data (coding, from + inserted);
          continue;
        }

      src += coding->consumed;
      dst += coding->produced;

      if (result == CODING_FINISH_NORMAL)
        {
          src += len_byte;
          break;
        }
      if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
        {
          /* P .. PEND covers all the text converted so far.  */
          unsigned char *pend = dst, *p = pend - inserted_byte;
          Lisp_Object eol_type;

          /* Encode LFs back to the original eol format (CR or CRLF).  */
          if (coding->eol_type == CODING_EOL_CR)
            {
              while (p < pend) if (*p++ == '\n') p[-1] = '\r';
            }
          else
            {
              /* Number of LFs, i.e. number of CRs to be inserted.  */
              int count = 0;

              while (p < pend) if (*p++ == '\n') count++;
              if (src - dst < count)
                {
                  /* We don't have sufficient room for encoding LFs
                     back to CRLF.  We must record converted and
                     not-yet-converted text back to the buffer
                     content, enlarge the gap, then record them out of
                     the buffer contents again.  */
                  int add = len_byte + inserted_byte;

                  GAP_SIZE -= add;
                  ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
                  GPT += inserted_byte; GPT_BYTE += inserted_byte;
                  make_gap (count - GAP_SIZE);
                  GAP_SIZE += add;
                  ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
                  GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
                  /* Don't forget to update SRC, DST, and PEND.  */
                  src = GAP_END_ADDR - len_byte;
                  dst = GPT_ADDR + inserted_byte;
                  pend = dst;
                }
              inserted += count;
              inserted_byte += count;
              coding->produced += count;
              /* Shift the text towards its new end, copying backwards
                 and putting a CR in front of each LF.  */
              p = dst = pend + count;
              while (count)
                {
                  *--p = *--pend;
                  if (*p == '\n') count--, *--p = '\r';
                }
            }

          /* Suppress eol-format conversion in the further conversion.  */
          coding->eol_type = CODING_EOL_LF;

          /* Set the coding system symbol to that for Unix-like EOL.  */
          eol_type = Fget (saved_coding_symbol, Qeol_type);
          if (VECTORP (eol_type)
              && XVECTOR (eol_type)->size == 3
              && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
            coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
          else
            coding->symbol = saved_coding_symbol;

          continue;
        }
      if (len_byte <= 0)
        {
          /* All the source text was consumed.  Only a CCL coding
             system gets one more round, with the last-block flag
             set.  */
          if (coding->type != coding_type_ccl
              || coding->mode & CODING_MODE_LAST_BLOCK)
            break;
          coding->mode |= CODING_MODE_LAST_BLOCK;
          continue;
        }
      if (result == CODING_FINISH_INSUFFICIENT_SRC)
        {
          /* The source text ends in invalid codes.  Let's just
             make them valid buffer contents, and finish conversion.  */
          inserted += len_byte;
          inserted_byte += len_byte;
          while (len_byte--)
            *dst++ = *src++;
          break;
        }
      if (result == CODING_FINISH_INTERRUPT)
        {
          /* The conversion procedure was interrupted by a user.  */
          break;
        }
      /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
      if (coding->consumed < 1)
        {
          /* It's quite strange to require more memory without
             consuming any bytes.  Perhaps CCL program bug.  */
          break;
        }
      if (first)
        {
          /* We have just done the first batch of conversion which was
             stopped because of insufficient gap.  Let's reconsider the
             required gap size (i.e. SRC - DST) now.

             We have converted ORIG bytes (== coding->consumed) into
             NEW bytes (coding->produced).  To convert the remaining
             LEN bytes, we may need REQUIRE bytes of gap, where:
                REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
                REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
             Here, we are sure that NEW >= ORIG.  */
          float ratio = coding->produced - coding->consumed;
          ratio /= coding->consumed;
          require = len_byte * ratio;
          first = 0;
        }
      if ((src - dst) < (require + 2000))
        {
          /* See the comment above the previous call of make_gap.  */
          int add = len_byte + inserted_byte;

          GAP_SIZE -= add;
          ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
          GPT += inserted_byte; GPT_BYTE += inserted_byte;
          make_gap (require + 2000);
          GAP_SIZE += add;
          ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
          GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
        }
    }
  if (src - dst > 0) *dst = 0; /* Put an anchor.  */

  if (encodep && coding->dst_multibyte)
    {
      /* The output is unibyte.  We must convert 8-bit characters to
         multibyte form.  */
      if (inserted_byte * 2 > GAP_SIZE)
        {
          /* Same trick as above: temporarily count the converted text
             as buffer text while the gap is enlarged.  */
          GAP_SIZE -= inserted_byte;
          ZV += inserted_byte; Z += inserted_byte;
          ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
          GPT += inserted_byte; GPT_BYTE += inserted_byte;
          make_gap (inserted_byte - GAP_SIZE);
          GAP_SIZE += inserted_byte;
          ZV -= inserted_byte; Z -= inserted_byte;
          ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
          GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
        }
      inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
    }

  /* If we have shrunk the conversion area, adjust it now.  */
  if (total_skip > 0)
    {
      /* Bring the skipped tail next to the converted text, then take
         the skipped head and tail back into the text handled below.  */
      if (tail_skip > 0)
        safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
      inserted += total_skip; inserted_byte += total_skip;
      GAP_SIZE += total_skip;
      GPT -= head_skip; GPT_BYTE -= head_skip;
      ZV -= total_skip; ZV_BYTE -= total_skip;
      Z -= total_skip; Z_BYTE -= total_skip;
      from -= head_skip; from_byte -= head_skip;
      to += tail_skip; to_byte += tail_skip;
    }

  /* Take the character count actually added from the change of Z
     rather than from our own bookkeeping.  */
  prev_Z = Z;
  adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
  inserted = Z - prev_Z;

  if (!encodep && coding->cmp_data && coding->cmp_data->used)
    coding_restore_composition (coding, Fcurrent_buffer ());
  coding_free_composition_data (coding);

  if (! inhibit_pre_post_conversion
      && ! encodep && ! NILP (coding->post_read_conversion))
    {
      Lisp_Object val;
      /* NOTE(review): COUNT is computed but not used in this block.  */
      int count = specpdl_ptr - specpdl;

      if (from != PT)
        TEMP_SET_PT_BOTH (from, from_byte);
      prev_Z = Z;
      record_unwind_protect (code_convert_region_unwind, Qnil);
      /* We should not call any more pre-write/post-read-conversion
         functions while this post-read-conversion is running.  */
      inhibit_pre_post_conversion = 1;
      val = call1 (coding->post_read_conversion, make_number (inserted));
      inhibit_pre_post_conversion = 0;
      /* Discard the unwind protect.  */
      specpdl_ptr--;
      CHECK_NUMBER (val, 0);
      /* Account for whatever the function added or removed.  Only the
         character count is updated here; INSERTED_BYTE is left as it
         was.  */
      inserted += Z - prev_Z;
    }

  /* Restore point: shift it when it was at or after the end of the
     original text, otherwise leave it at FROM.  */
  if (orig_point >= from)
    {
      if (orig_point >= from + orig_len)
        orig_point += inserted - orig_len;
      else
        orig_point = from;
      TEMP_SET_PT (orig_point);
    }

  if (replace)
    {
      signal_after_change (from, to - from, inserted);
      update_compositions (from, from + inserted, CHECK_BORDER);
    }

  /* Report the totals for the whole region to the caller.  */
  {
    coding->consumed = to_byte - from_byte;
    coding->consumed_char = to - from;
    coding->produced = inserted_byte;
    coding->produced_char = inserted;
  }

  return 0;
}
4977
4978 Lisp_Object
4979 run_pre_post_conversion_on_str (str, coding, encodep)
4980      Lisp_Object str;
4981      struct coding_system *coding;
4982      int encodep;
4983 {
4984   int count = specpdl_ptr - specpdl;
4985   struct gcpro gcpro1;
4986   struct buffer *prev = current_buffer;
4987   int multibyte = STRING_MULTIBYTE (str);
4988
4989   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4990   record_unwind_protect (code_convert_region_unwind, Qnil);
4991   GCPRO1 (str);
4992   temp_output_buffer_setup (" *code-converting-work*");
4993   set_buffer_internal (XBUFFER (Vstandard_output));
4994   /* We must insert the contents of STR as is without
4995      unibyte<->multibyte conversion.  For that, we adjust the
4996      multibyteness of the working buffer to that of STR.  */
4997   Ferase_buffer ();
4998   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
4999   insert_from_string (str, 0, 0,
5000                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5001   UNGCPRO;
5002   inhibit_pre_post_conversion = 1;
5003   if (encodep)
5004     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5005   else
5006     {
5007       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5008       call1 (coding->post_read_conversion, make_number (Z - BEG));
5009     }
5010   inhibit_pre_post_conversion = 0;
5011   str = make_buffer_string (BEG, Z, 0);
5012   return unbind_to (count, str);
5013 }
5014
5015 Lisp_Object
5016 decode_coding_string (str, coding, nocopy)
5017      Lisp_Object str;
5018      struct coding_system *coding;
5019      int nocopy;
5020 {
5021   int len;
5022   char *buf;
5023   int from, to, to_byte;
5024   struct gcpro gcpro1;
5025   Lisp_Object saved_coding_symbol;
5026   int result;
5027
5028   from = 0;
5029   to = XSTRING (str)->size;
5030   to_byte = STRING_BYTES (XSTRING (str));
5031
5032   saved_coding_symbol = Qnil;
5033   if (CODING_REQUIRE_DETECTION (coding))
5034     {
5035       /* See the comments in code_convert_region.  */
5036       if (coding->type == coding_type_undecided)
5037         {
5038           detect_coding (coding, XSTRING (str)->data, to_byte);
5039           if (coding->type == coding_type_undecided)
5040             coding->type = coding_type_emacs_mule;
5041         }
5042       if (coding->eol_type == CODING_EOL_UNDECIDED)
5043         {
5044           saved_coding_symbol = coding->symbol;
5045           detect_eol (coding, XSTRING (str)->data, to_byte);
5046           if (coding->eol_type == CODING_EOL_UNDECIDED)
5047             coding->eol_type = CODING_EOL_LF;
5048           /* We had better recover the original eol format if we
5049              encounter an inconsitent eol format while decoding.  */
5050           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5051         }
5052     }
5053
5054   if (! CODING_REQUIRE_DECODING (coding))
5055     {
5056       if (!STRING_MULTIBYTE (str))
5057         {
5058           str = Fstring_as_multibyte (str);
5059           nocopy = 1;
5060         }
5061       return (nocopy ? str : Fcopy_sequence (str));
5062     }
5063
5064   if (STRING_MULTIBYTE (str))
5065     {
5066       /* Decoding routines expect the source text to be unibyte.  */
5067       str = Fstring_as_unibyte (str);
5068       nocopy = 1;
5069       coding->src_multibyte = 0;
5070     }
5071   coding->dst_multibyte = 1;
5072
5073   if (coding->composing != COMPOSITION_DISABLED)
5074     coding_allocate_composition_data (coding, from);
5075
5076   /* Try to skip the heading and tailing ASCIIs.  */
5077   {
5078     int from_orig = from;
5079
5080     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5081                               0);
5082     if (from == to_byte)
5083       return (nocopy ? str : Fcopy_sequence (str));
5084   }
5085
5086   len = decoding_buffer_size (coding, to_byte - from);
5087   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5088   GCPRO1 (str);
5089   buf = get_conversion_buffer (len);
5090   UNGCPRO;
5091
5092   if (from > 0)
5093     bcopy (XSTRING (str)->data, buf, from);
5094   result = decode_coding (coding, XSTRING (str)->data + from,
5095                          buf + from, to_byte - from, len);
5096   if (result == CODING_FINISH_INCONSISTENT_EOL)
5097     {
5098       /* We simply try to decode the whole string again but without
5099          eol-conversion this time.  */
5100       coding->eol_type = CODING_EOL_LF;
5101       coding->symbol = saved_coding_symbol;
5102       coding_free_composition_data (coding);
5103       return decode_coding_string (str, coding, nocopy);
5104     }
5105
5106   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5107          STRING_BYTES (XSTRING (str)) - to_byte);
5108
5109   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5110   str = make_multibyte_string (buf, len + coding->produced_char,
5111                                len + coding->produced);
5112
5113   if (coding->cmp_data && coding->cmp_data->used)
5114     coding_restore_composition (coding, str);
5115   coding_free_composition_data (coding);
5116
5117   if (SYMBOLP (coding->post_read_conversion)
5118       && !NILP (Ffboundp (coding->post_read_conversion)))
5119     str = run_pre_post_conversion_on_str (str, coding, 0);
5120
5121   return str;
5122 }
5123
5124 Lisp_Object
5125 encode_coding_string (str, coding, nocopy)
5126      Lisp_Object str;
5127      struct coding_system *coding;
5128      int nocopy;
5129 {
5130   int len;
5131   char *buf;
5132   int from, to, to_byte;
5133   struct gcpro gcpro1;
5134   Lisp_Object saved_coding_symbol;
5135   int result;
5136
5137   if (SYMBOLP (coding->pre_write_conversion)
5138       && !NILP (Ffboundp (coding->pre_write_conversion)))
5139     str = run_pre_post_conversion_on_str (str, coding, 1);
5140
5141   from = 0;
5142   to = XSTRING (str)->size;
5143   to_byte = STRING_BYTES (XSTRING (str));
5144
5145   saved_coding_symbol = Qnil;
5146   if (! CODING_REQUIRE_ENCODING (coding))
5147     {
5148       if (STRING_MULTIBYTE (str))
5149         {
5150           str = Fstring_as_unibyte (str);
5151           nocopy = 1;
5152         }
5153       return (nocopy ? str : Fcopy_sequence (str));
5154     }
5155
5156   /* Encoding routines determine the multibyteness of the source text
5157      by coding->src_multibyte.  */
5158   coding->src_multibyte = STRING_MULTIBYTE (str);
5159   coding->dst_multibyte = 0;
5160
5161   if (coding->composing != COMPOSITION_DISABLED)
5162     coding_save_composition (coding, from, to, str);
5163
5164   /* Try to skip the heading and tailing ASCIIs.  */
5165   {
5166     int from_orig = from;
5167
5168     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5169                               1);
5170     if (from == to_byte)
5171       return (nocopy ? str : Fcopy_sequence (str));
5172   }
5173
5174   len = encoding_buffer_size (coding, to_byte - from);
5175   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5176   GCPRO1 (str);
5177   buf = get_conversion_buffer (len);
5178   UNGCPRO;
5179
5180   if (from > 0)
5181     bcopy (XSTRING (str)->data, buf, from);
5182   result = encode_coding (coding, XSTRING (str)->data + from,
5183                           buf + from, to_byte - from, len);
5184   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5185          STRING_BYTES (XSTRING (str)) - to_byte);
5186
5187   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5188   str = make_unibyte_string (buf, len + coding->produced);
5189   coding_free_composition_data (coding);
5190
5191   return str;
5192 }
5193
5194 \f
5195 #ifdef emacs
5196 /*** 8. Emacs Lisp library functions ***/
5197
5198 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5199   "Return t if OBJECT is nil or a coding-system.\n\
5200 See the documentation of `make-coding-system' for information\n\
5201 about coding-system objects.")
5202   (obj)
5203      Lisp_Object obj;
5204 {
5205   if (NILP (obj))
5206     return Qt;
5207   if (!SYMBOLP (obj))
5208     return Qnil;
5209   /* Get coding-spec vector for OBJ.  */
5210   obj = Fget (obj, Qcoding_system);
5211   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5212           ? Qt : Qnil);
5213 }
5214
5215 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5216        Sread_non_nil_coding_system, 1, 1, 0,
5217   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5218   (prompt)
5219      Lisp_Object prompt;
5220 {
5221   Lisp_Object val;
5222   do
5223     {
5224       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5225                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5226     }
5227   while (XSTRING (val)->size == 0);
5228   return (Fintern (val, Qnil));
5229 }
5230
5231 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5232   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5233 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5234   (prompt, default_coding_system)
5235      Lisp_Object prompt, default_coding_system;
5236 {
5237   Lisp_Object val;
5238   if (SYMBOLP (default_coding_system))
5239     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5240   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5241                           Qt, Qnil, Qcoding_system_history,
5242                           default_coding_system, Qnil);
5243   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5244 }
5245
5246 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5247        1, 1, 0,
5248   "Check validity of CODING-SYSTEM.\n\
5249 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5250 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5251 The value of property should be a vector of length 5.")
5252   (coding_system)
5253      Lisp_Object coding_system;
5254 {
5255   CHECK_SYMBOL (coding_system, 0);
5256   if (!NILP (Fcoding_system_p (coding_system)))
5257     return coding_system;
5258   while (1)
5259     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5260 }
5261 \f
5262 Lisp_Object
5263 detect_coding_system (src, src_bytes, highest)
5264      unsigned char *src;
5265      int src_bytes, highest;
5266 {
5267   int coding_mask, eol_type;
5268   Lisp_Object val, tmp;
5269   int dummy;
5270
5271   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5272   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5273   if (eol_type == CODING_EOL_INCONSISTENT)
5274     eol_type = CODING_EOL_UNDECIDED;
5275
5276   if (!coding_mask)
5277     {
5278       val = Qundecided;
5279       if (eol_type != CODING_EOL_UNDECIDED)
5280         {
5281           Lisp_Object val2;
5282           val2 = Fget (Qundecided, Qeol_type);
5283           if (VECTORP (val2))
5284             val = XVECTOR (val2)->contents[eol_type];
5285         }
5286       return (highest ? val : Fcons (val, Qnil));
5287     }
5288
5289   /* At first, gather possible coding systems in VAL.  */
5290   val = Qnil;
5291   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5292     {
5293       Lisp_Object category_val, category_index;
5294
5295       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5296       category_val = Fsymbol_value (XCAR (tmp));
5297       if (!NILP (category_val)
5298           && NATNUMP (category_index)
5299           && (coding_mask & (1 << XFASTINT (category_index))))
5300         {
5301           val = Fcons (category_val, val);
5302           if (highest)
5303             break;
5304         }
5305     }
5306   if (!highest)
5307     val = Fnreverse (val);
5308
5309   /* Then, replace the elements with subsidiary coding systems.  */
5310   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5311     {
5312       if (eol_type != CODING_EOL_UNDECIDED
5313           && eol_type != CODING_EOL_INCONSISTENT)
5314         {
5315           Lisp_Object eol;
5316           eol = Fget (XCAR (tmp), Qeol_type);
5317           if (VECTORP (eol))
5318             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5319         }
5320     }
5321   return (highest ? XCAR (val) : val);
5322 }
5323
5324 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5325        2, 3, 0,
5326   "Detect coding system of the text in the region between START and END.\n\
5327 Return a list of possible coding systems ordered by priority.\n\
5328 \n\
5329 If only ASCII characters are found, it returns a list of single element\n\
5330 `undecided' or its subsidiary coding system according to a detected\n\
5331 end-of-line format.\n\
5332 \n\
5333 If optional argument HIGHEST is non-nil, return the coding system of\n\
5334 highest priority.")
5335   (start, end, highest)
5336      Lisp_Object start, end, highest;
5337 {
5338   int from, to;
5339   int from_byte, to_byte;
5340
5341   CHECK_NUMBER_COERCE_MARKER (start, 0);
5342   CHECK_NUMBER_COERCE_MARKER (end, 1);
5343
5344   validate_region (&start, &end);
5345   from = XINT (start), to = XINT (end);
5346   from_byte = CHAR_TO_BYTE (from);
5347   to_byte = CHAR_TO_BYTE (to);
5348
5349   if (from < GPT && to >= GPT)
5350     move_gap_both (to, to_byte);
5351
5352   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5353                                to_byte - from_byte,
5354                                !NILP (highest));
5355 }
5356
5357 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5358        1, 2, 0,
5359   "Detect coding system of the text in STRING.\n\
5360 Return a list of possible coding systems ordered by priority.\n\
5361 \n\
5362 If only ASCII characters are found, it returns a list of single element\n\
5363 `undecided' or its subsidiary coding system according to a detected\n\
5364 end-of-line format.\n\
5365 \n\
5366 If optional argument HIGHEST is non-nil, return the coding system of\n\
5367 highest priority.")
5368   (string, highest)
5369      Lisp_Object string, highest;
5370 {
5371   CHECK_STRING (string, 0);
5372
5373   return detect_coding_system (XSTRING (string)->data,
5374                                STRING_BYTES (XSTRING (string)),
5375                                !NILP (highest));
5376 }
5377
5378 Lisp_Object
5379 code_convert_region1 (start, end, coding_system, encodep)
5380      Lisp_Object start, end, coding_system;
5381      int encodep;
5382 {
5383   struct coding_system coding;
5384   int from, to, len;
5385
5386   CHECK_NUMBER_COERCE_MARKER (start, 0);
5387   CHECK_NUMBER_COERCE_MARKER (end, 1);
5388   CHECK_SYMBOL (coding_system, 2);
5389
5390   validate_region (&start, &end);
5391   from = XFASTINT (start);
5392   to = XFASTINT (end);
5393
5394   if (NILP (coding_system))
5395     return make_number (to - from);
5396
5397   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5398     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5399
5400   coding.mode |= CODING_MODE_LAST_BLOCK;
5401   coding.src_multibyte = coding.dst_multibyte
5402     = !NILP (current_buffer->enable_multibyte_characters);
5403   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5404                        &coding, encodep, 1);
5405   Vlast_coding_system_used = coding.symbol;
5406   return make_number (coding.produced_char);
5407 }
5408
5409 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5410        3, 3, "r\nzCoding system: ",
5411   "Decode the current region by specified coding system.\n\
5412 When called from a program, takes three arguments:\n\
5413 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5414 This function sets `last-coding-system-used' to the precise coding system\n\
5415 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5416 not fully specified.)\n\
5417 It returns the length of the decoded text.")
5418   (start, end, coding_system)
5419      Lisp_Object start, end, coding_system;
5420 {
5421   return code_convert_region1 (start, end, coding_system, 0);
5422 }
5423
5424 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5425        3, 3, "r\nzCoding system: ",
5426   "Encode the current region by specified coding system.\n\
5427 When called from a program, takes three arguments:\n\
5428 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5429 This function sets `last-coding-system-used' to the precise coding system\n\
5430 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5431 not fully specified.)\n\
5432 It returns the length of the encoded text.")
5433   (start, end, coding_system)
5434      Lisp_Object start, end, coding_system;
5435 {
5436   return code_convert_region1 (start, end, coding_system, 1);
5437 }
5438
5439 Lisp_Object
5440 code_convert_string1 (string, coding_system, nocopy, encodep)
5441      Lisp_Object string, coding_system, nocopy;
5442      int encodep;
5443 {
5444   struct coding_system coding;
5445
5446   CHECK_STRING (string, 0);
5447   CHECK_SYMBOL (coding_system, 1);
5448
5449   if (NILP (coding_system))
5450     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5451
5452   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5453     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5454
5455   coding.mode |= CODING_MODE_LAST_BLOCK;
5456   string = (encodep
5457             ? encode_coding_string (string, &coding, !NILP (nocopy))
5458             : decode_coding_string (string, &coding, !NILP (nocopy)));
5459   Vlast_coding_system_used = coding.symbol;
5460
5461   return string;
5462 }
5463
5464 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5465        2, 3, 0,
5466   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5467 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5468 if the decoding operation is trivial.\n\
5469 This function sets `last-coding-system-used' to the precise coding system\n\
5470 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5471 not fully specified.)")
5472   (string, coding_system, nocopy)
5473      Lisp_Object string, coding_system, nocopy;
5474 {
5475   return code_convert_string1 (string, coding_system, nocopy, 0);
5476 }
5477
5478 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5479        2, 3, 0,
5480   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5481 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5482 if the encoding operation is trivial.\n\
5483 This function sets `last-coding-system-used' to the precise coding system\n\
5484 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5485 not fully specified.)")
5486   (string, coding_system, nocopy)
5487      Lisp_Object string, coding_system, nocopy;
5488 {
5489   return code_convert_string1 (string, coding_system, nocopy, 1);
5490 }
5491
5492 /* Encode or decode STRING according to CODING_SYSTEM.
5493    Do not set Vlast_coding_system_used.
5494
5495    This function is called only from macros DECODE_FILE and
5496    ENCODE_FILE, thus we ignore character composition.  */
5497
5498 Lisp_Object
5499 code_convert_string_norecord (string, coding_system, encodep)
5500      Lisp_Object string, coding_system;
5501      int encodep;
5502 {
5503   struct coding_system coding;
5504
5505   CHECK_STRING (string, 0);
5506   CHECK_SYMBOL (coding_system, 1);
5507
5508   if (NILP (coding_system))
5509     return string;
5510
5511   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5512     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5513
5514   coding.composing = COMPOSITION_DISABLED;
5515   coding.mode |= CODING_MODE_LAST_BLOCK;
5516   return (encodep
5517           ? encode_coding_string (string, &coding, 1)
5518           : decode_coding_string (string, &coding, 1));
5519 }
5520 \f
5521 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5522   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5523 Return the corresponding character.")
5524   (code)
5525      Lisp_Object code;
5526 {
5527   unsigned char c1, c2, s1, s2;
5528   Lisp_Object val;
5529
5530   CHECK_NUMBER (code, 0);
5531   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5532   if (s1 == 0)
5533     {
5534       if (s2 < 0x80)
5535         XSETFASTINT (val, s2);
5536       else if (s2 >= 0xA0 || s2 <= 0xDF)
5537         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5538       else
5539         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5540     }
5541   else
5542     {
5543       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5544           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5545         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5546       DECODE_SJIS (s1, s2, c1, c2);
5547       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5548     }
5549   return val;
5550 }
5551
5552 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5553   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5554 Return the corresponding code in SJIS.")
5555   (ch)
5556      Lisp_Object ch;
5557 {
5558   int charset, c1, c2, s1, s2;
5559   Lisp_Object val;
5560
5561   CHECK_NUMBER (ch, 0);
5562   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5563   if (charset == CHARSET_ASCII)
5564     {
5565       val = ch;
5566     }
5567   else if (charset == charset_jisx0208
5568            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5569     {
5570       ENCODE_SJIS (c1, c2, s1, s2);
5571       XSETFASTINT (val, (s1 << 8) | s2);
5572     }
5573   else if (charset == charset_katakana_jisx0201
5574            && c1 > 0x20 && c2 < 0xE0)
5575     {
5576       XSETFASTINT (val, c1 | 0x80);
5577     }
5578   else
5579     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5580   return val;
5581 }
5582
5583 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5584   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5585 Return the corresponding character.")
5586   (code)
5587      Lisp_Object code;
5588 {
5589   int charset;
5590   unsigned char b1, b2, c1, c2;
5591   Lisp_Object val;
5592
5593   CHECK_NUMBER (code, 0);
5594   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5595   if (b1 == 0)
5596     {
5597       if (b2 >= 0x80)
5598         error ("Invalid BIG5 code: %x", XFASTINT (code));
5599       val = code;
5600     }
5601   else
5602     {
5603       if ((b1 < 0xA1 || b1 > 0xFE)
5604           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5605         error ("Invalid BIG5 code: %x", XFASTINT (code));
5606       DECODE_BIG5 (b1, b2, charset, c1, c2);
5607       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5608     }
5609   return val;
5610 }
5611
5612 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5613   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5614 Return the corresponding character code in Big5.")
5615   (ch)
5616      Lisp_Object ch;
5617 {
5618   int charset, c1, c2, b1, b2;
5619   Lisp_Object val;
5620
5621   CHECK_NUMBER (ch, 0);
5622   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5623   if (charset == CHARSET_ASCII)
5624     {
5625       val = ch;
5626     }
5627   else if ((charset == charset_big5_1
5628             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5629            || (charset == charset_big5_2
5630                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5631     {
5632       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5633       XSETFASTINT (val, (b1 << 8) | b2);
5634     }
5635   else
5636     error ("Can't encode to Big5: %d", XFASTINT (ch));
5637   return val;
5638 }
5639 \f
5640 DEFUN ("set-terminal-coding-system-internal",
5641        Fset_terminal_coding_system_internal,
5642        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5643   (coding_system)
5644      Lisp_Object coding_system;
5645 {
5646   CHECK_SYMBOL (coding_system, 0);
5647   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5648   /* We had better not send unsafe characters to terminal.  */
5649   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5650   /* Characer composition should be disabled.  */
5651   terminal_coding.composing = COMPOSITION_DISABLED;
5652   terminal_coding.src_multibyte = 1;
5653   terminal_coding.dst_multibyte = 0;
5654   return Qnil;
5655 }
5656
5657 DEFUN ("set-safe-terminal-coding-system-internal",
5658        Fset_safe_terminal_coding_system_internal,
5659        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5660   (coding_system)
5661      Lisp_Object coding_system;
5662 {
5663   CHECK_SYMBOL (coding_system, 0);
5664   setup_coding_system (Fcheck_coding_system (coding_system),
5665                        &safe_terminal_coding);
5666   /* Characer composition should be disabled.  */
5667   safe_terminal_coding.composing = COMPOSITION_DISABLED;
5668   safe_terminal_coding.src_multibyte = 1;
5669   safe_terminal_coding.dst_multibyte = 0;
5670   return Qnil;
5671 }
5672
5673 DEFUN ("terminal-coding-system",
5674        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5675   "Return coding system specified for terminal output.")
5676   ()
5677 {
5678   return terminal_coding.symbol;
5679 }
5680
5681 DEFUN ("set-keyboard-coding-system-internal",
5682        Fset_keyboard_coding_system_internal,
5683        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5684   (coding_system)
5685      Lisp_Object coding_system;
5686 {
5687   CHECK_SYMBOL (coding_system, 0);
5688   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5689   /* Characer composition should be disabled.  */
5690   keyboard_coding.composing = COMPOSITION_DISABLED;
5691   return Qnil;
5692 }
5693
5694 DEFUN ("keyboard-coding-system",
5695        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5696   "Return coding system specified for decoding keyboard input.")
5697   ()
5698 {
5699   return keyboard_coding.symbol;
5700 }
5701
5702 \f
5703 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5704        Sfind_operation_coding_system,  1, MANY, 0,
5705   "Choose a coding system for an operation based on the target name.\n\
5706 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5707 DECODING-SYSTEM is the coding system to use for decoding\n\
5708 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5709 for encoding (in case OPERATION does encoding).\n\
5710 \n\
5711 The first argument OPERATION specifies an I/O primitive:\n\
5712   For file I/O, `insert-file-contents' or `write-region'.\n\
5713   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5714   For network I/O, `open-network-stream'.\n\
5715 \n\
5716 The remaining arguments should be the same arguments that were passed\n\
5717 to the primitive.  Depending on which primitive, one of those arguments\n\
5718 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5719 whichever argument specifies the file name is TARGET.\n\
5720 \n\
5721 TARGET has a meaning which depends on OPERATION:\n\
5722   For file I/O, TARGET is a file name.\n\
5723   For process I/O, TARGET is a process name.\n\
5724   For network I/O, TARGET is a service name or a port number\n\
5725 \n\
5726 This function looks up what specified for TARGET in,\n\
5727 `file-coding-system-alist', `process-coding-system-alist',\n\
5728 or `network-coding-system-alist' depending on OPERATION.\n\
5729 They may specify a coding system, a cons of coding systems,\n\
5730 or a function symbol to call.\n\
5731 In the last case, we call the function with one argument,\n\
5732 which is a list of all the arguments given to this function.")
5733   (nargs, args)
5734      int nargs;
5735      Lisp_Object *args;
5736 {
5737   Lisp_Object operation, target_idx, target, val;
5738   register Lisp_Object chain;
5739
5740   if (nargs < 2)
5741     error ("Too few arguments");
5742   operation = args[0];
5743   if (!SYMBOLP (operation)
5744       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5745     error ("Invalid first arguement");
5746   if (nargs < 1 + XINT (target_idx))
5747     error ("Too few arguments for operation: %s",
5748            XSYMBOL (operation)->name->data);
5749   target = args[XINT (target_idx) + 1];
5750   if (!(STRINGP (target)
5751         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5752     error ("Invalid %dth argument", XINT (target_idx) + 1);
5753
5754   chain = ((EQ (operation, Qinsert_file_contents)
5755             || EQ (operation, Qwrite_region))
5756            ? Vfile_coding_system_alist
5757            : (EQ (operation, Qopen_network_stream)
5758               ? Vnetwork_coding_system_alist
5759               : Vprocess_coding_system_alist));
5760   if (NILP (chain))
5761     return Qnil;
5762
5763   for (; CONSP (chain); chain = XCDR (chain))
5764     {
5765       Lisp_Object elt;
5766       elt = XCAR (chain);
5767
5768       if (CONSP (elt)
5769           && ((STRINGP (target)
5770                && STRINGP (XCAR (elt))
5771                && fast_string_match (XCAR (elt), target) >= 0)
5772               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5773         {
5774           val = XCDR (elt);
5775           /* Here, if VAL is both a valid coding system and a valid
5776              function symbol, we return VAL as a coding system.  */
5777           if (CONSP (val))
5778             return val;
5779           if (! SYMBOLP (val))
5780             return Qnil;
5781           if (! NILP (Fcoding_system_p (val)))
5782             return Fcons (val, val);
5783           if (! NILP (Ffboundp (val)))
5784             {
5785               val = call1 (val, Flist (nargs, args));
5786               if (CONSP (val))
5787                 return val;
5788               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5789                 return Fcons (val, val);
5790             }
5791           return Qnil;
5792         }
5793     }
5794   return Qnil;
5795 }
5796
5797 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5798        Supdate_coding_systems_internal, 0, 0, 0,
5799   "Update internal database for ISO2022 and CCL based coding systems.\n\
5800 When values of any coding categories are changed, you must\n\
5801 call this function")
5802   ()
5803 {
5804   int i;
5805
5806   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5807     {
5808       Lisp_Object val;
5809
5810       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5811       if (!NILP (val))
5812         {
5813           if (! coding_system_table[i])
5814             coding_system_table[i] = ((struct coding_system *)
5815                                       xmalloc (sizeof (struct coding_system)));
5816           setup_coding_system (val, coding_system_table[i]);
5817         }
5818       else if (coding_system_table[i])
5819         {
5820           xfree (coding_system_table[i]);
5821           coding_system_table[i] = NULL;
5822         }
5823     }
5824
5825   return Qnil;
5826 }
5827
5828 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5829        Sset_coding_priority_internal, 0, 0, 0,
5830   "Update internal database for the current value of `coding-category-list'.\n\
5831 This function is internal use only.")
5832   ()
5833 {
5834   int i = 0, idx;
5835   Lisp_Object val;
5836
5837   val = Vcoding_category_list;
5838
5839   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5840     {
5841       if (! SYMBOLP (XCAR (val)))
5842         break;
5843       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5844       if (idx >= CODING_CATEGORY_IDX_MAX)
5845         break;
5846       coding_priorities[i++] = (1 << idx);
5847       val = XCDR (val);
5848     }
5849   /* If coding-category-list is valid and contains all coding
5850      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5851      the following code saves Emacs from crashing.  */
5852   while (i < CODING_CATEGORY_IDX_MAX)
5853     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5854
5855   return Qnil;
5856 }
5857
5858 #endif /* emacs */
5859
5860 \f
5861 /*** 9. Post-amble ***/
5862
5863 void
5864 init_coding ()
5865 {
5866   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5867 }
5868
5869 void
5870 init_coding_once ()
5871 {
5872   int i;
5873
5874   /* Emacs' internal format specific initialize routine.  */
5875   for (i = 0; i <= 0x20; i++)
5876     emacs_code_class[i] = EMACS_control_code;
5877   emacs_code_class[0x0A] = EMACS_linefeed_code;
5878   emacs_code_class[0x0D] = EMACS_carriage_return_code;
5879   for (i = 0x21 ; i < 0x7F; i++)
5880     emacs_code_class[i] = EMACS_ascii_code;
5881   emacs_code_class[0x7F] = EMACS_control_code;
5882   for (i = 0x80; i < 0xFF; i++)
5883     emacs_code_class[i] = EMACS_invalid_code;
5884   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5885   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5886   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5887   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5888
5889   /* ISO2022 specific initialize routine.  */
5890   for (i = 0; i < 0x20; i++)
5891     iso_code_class[i] = ISO_control_0;
5892   for (i = 0x21; i < 0x7F; i++)
5893     iso_code_class[i] = ISO_graphic_plane_0;
5894   for (i = 0x80; i < 0xA0; i++)
5895     iso_code_class[i] = ISO_control_1;
5896   for (i = 0xA1; i < 0xFF; i++)
5897     iso_code_class[i] = ISO_graphic_plane_1;
5898   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5899   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5900   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5901   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5902   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5903   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5904   iso_code_class[ISO_CODE_ESC] = ISO_escape;
5905   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5906   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5907   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5908
5909   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5910
5911   setup_coding_system (Qnil, &keyboard_coding);
5912   setup_coding_system (Qnil, &terminal_coding);
5913   setup_coding_system (Qnil, &safe_terminal_coding);
5914   setup_coding_system (Qnil, &default_buffer_file_coding);
5915
5916   bzero (coding_system_table, sizeof coding_system_table);
5917
5918   bzero (ascii_skip_code, sizeof ascii_skip_code);
5919   for (i = 0; i < 128; i++)
5920     ascii_skip_code[i] = 1;
5921
5922 #if defined (MSDOS) || defined (WINDOWSNT)
5923   system_eol_type = CODING_EOL_CRLF;
5924 #else
5925   system_eol_type = CODING_EOL_LF;
5926 #endif
5927
5928   inhibit_pre_post_conversion = 0;
5929 }
5930
5931 #ifdef emacs
5932
5933 void
5934 syms_of_coding ()
5935 {
5936   Qtarget_idx = intern ("target-idx");
5937   staticpro (&Qtarget_idx);
5938
5939   Qcoding_system_history = intern ("coding-system-history");
5940   staticpro (&Qcoding_system_history);
5941   Fset (Qcoding_system_history, Qnil);
5942
5943   /* Target FILENAME is the first argument.  */
5944   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5945   /* Target FILENAME is the third argument.  */
5946   Fput (Qwrite_region, Qtarget_idx, make_number (2));
5947
5948   Qcall_process = intern ("call-process");
5949   staticpro (&Qcall_process);
5950   /* Target PROGRAM is the first argument.  */
5951   Fput (Qcall_process, Qtarget_idx, make_number (0));
5952
5953   Qcall_process_region = intern ("call-process-region");
5954   staticpro (&Qcall_process_region);
5955   /* Target PROGRAM is the third argument.  */
5956   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5957
5958   Qstart_process = intern ("start-process");
5959   staticpro (&Qstart_process);
5960   /* Target PROGRAM is the third argument.  */
5961   Fput (Qstart_process, Qtarget_idx, make_number (2));
5962
5963   Qopen_network_stream = intern ("open-network-stream");
5964   staticpro (&Qopen_network_stream);
5965   /* Target SERVICE is the fourth argument.  */
5966   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5967
5968   Qcoding_system = intern ("coding-system");
5969   staticpro (&Qcoding_system);
5970
5971   Qeol_type = intern ("eol-type");
5972   staticpro (&Qeol_type);
5973
5974   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5975   staticpro (&Qbuffer_file_coding_system);
5976
5977   Qpost_read_conversion = intern ("post-read-conversion");
5978   staticpro (&Qpost_read_conversion);
5979
5980   Qpre_write_conversion = intern ("pre-write-conversion");
5981   staticpro (&Qpre_write_conversion);
5982
5983   Qno_conversion = intern ("no-conversion");
5984   staticpro (&Qno_conversion);
5985
5986   Qundecided = intern ("undecided");
5987   staticpro (&Qundecided);
5988
5989   Qcoding_system_p = intern ("coding-system-p");
5990   staticpro (&Qcoding_system_p);
5991
5992   Qcoding_system_error = intern ("coding-system-error");
5993   staticpro (&Qcoding_system_error);
5994
5995   Fput (Qcoding_system_error, Qerror_conditions,
5996         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5997   Fput (Qcoding_system_error, Qerror_message,
5998         build_string ("Invalid coding system"));
5999
6000   Qcoding_category = intern ("coding-category");
6001   staticpro (&Qcoding_category);
6002   Qcoding_category_index = intern ("coding-category-index");
6003   staticpro (&Qcoding_category_index);
6004
6005   Vcoding_category_table
6006     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6007   staticpro (&Vcoding_category_table);
6008   {
6009     int i;
6010     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6011       {
6012         XVECTOR (Vcoding_category_table)->contents[i]
6013           = intern (coding_category_name[i]);
6014         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6015               Qcoding_category_index, make_number (i));
6016       }
6017   }
6018
6019   Qtranslation_table = intern ("translation-table");
6020   staticpro (&Qtranslation_table);
6021   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6022
6023   Qtranslation_table_id = intern ("translation-table-id");
6024   staticpro (&Qtranslation_table_id);
6025
6026   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6027   staticpro (&Qtranslation_table_for_decode);
6028
6029   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6030   staticpro (&Qtranslation_table_for_encode);
6031
6032   Qsafe_charsets = intern ("safe-charsets");
6033   staticpro (&Qsafe_charsets);
6034
6035   Qvalid_codes = intern ("valid-codes");
6036   staticpro (&Qvalid_codes);
6037
6038   Qemacs_mule = intern ("emacs-mule");
6039   staticpro (&Qemacs_mule);
6040
6041   Qraw_text = intern ("raw-text");
6042   staticpro (&Qraw_text);
6043
6044   defsubr (&Scoding_system_p);
6045   defsubr (&Sread_coding_system);
6046   defsubr (&Sread_non_nil_coding_system);
6047   defsubr (&Scheck_coding_system);
6048   defsubr (&Sdetect_coding_region);
6049   defsubr (&Sdetect_coding_string);
6050   defsubr (&Sdecode_coding_region);
6051   defsubr (&Sencode_coding_region);
6052   defsubr (&Sdecode_coding_string);
6053   defsubr (&Sencode_coding_string);
6054   defsubr (&Sdecode_sjis_char);
6055   defsubr (&Sencode_sjis_char);
6056   defsubr (&Sdecode_big5_char);
6057   defsubr (&Sencode_big5_char);
6058   defsubr (&Sset_terminal_coding_system_internal);
6059   defsubr (&Sset_safe_terminal_coding_system_internal);
6060   defsubr (&Sterminal_coding_system);
6061   defsubr (&Sset_keyboard_coding_system_internal);
6062   defsubr (&Skeyboard_coding_system);
6063   defsubr (&Sfind_operation_coding_system);
6064   defsubr (&Supdate_coding_systems_internal);
6065   defsubr (&Sset_coding_priority_internal);
6066
6067   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6068     "List of coding systems.\n\
6069 \n\
6070 Do not alter the value of this variable manually.  This variable should be\n\
6071 updated by the functions `make-coding-system' and\n\
6072 `define-coding-system-alias'.");
6073   Vcoding_system_list = Qnil;
6074
6075   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6076     "Alist of coding system names.\n\
6077 Each element is one element list of coding system name.\n\
6078 This variable is given to `completing-read' as TABLE argument.\n\
6079 \n\
6080 Do not alter the value of this variable manually.  This variable should be\n\
6081 updated by the functions `make-coding-system' and\n\
6082 `define-coding-system-alias'.");
6083   Vcoding_system_alist = Qnil;
6084
6085   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6086     "List of coding-categories (symbols) ordered by priority.");
6087   {
6088     int i;
6089
6090     Vcoding_category_list = Qnil;
6091     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6092       Vcoding_category_list
6093         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6094                  Vcoding_category_list);
6095   }
6096
6097   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6098     "Specify the coding system for read operations.\n\
6099 It is useful to bind this variable with `let', but do not set it globally.\n\
6100 If the value is a coding system, it is used for decoding on read operation.\n\
6101 If not, an appropriate element is used from one of the coding system alists:\n\
6102 There are three such tables, `file-coding-system-alist',\n\
6103 `process-coding-system-alist', and `network-coding-system-alist'.");
6104   Vcoding_system_for_read = Qnil;
6105
6106   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6107     "Specify the coding system for write operations.\n\
6108 Programs bind this variable with `let', but you should not set it globally.\n\
6109 If the value is a coding system, it is used for encoding of output,\n\
6110 when writing it to a file and when sending it to a file or subprocess.\n\
6111 \n\
6112 If this does not specify a coding system, an appropriate element\n\
6113 is used from one of the coding system alists:\n\
6114 There are three such tables, `file-coding-system-alist',\n\
6115 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6116 For output to files, if the above procedure does not specify a coding system,\n\
6117 the value of `buffer-file-coding-system' is used.");
6118   Vcoding_system_for_write = Qnil;
6119
6120   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6121     "Coding system used in the latest file or process I/O.");
6122   Vlast_coding_system_used = Qnil;
6123
6124   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6125     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6126 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6127 such conversion.");
6128   inhibit_eol_conversion = 0;
6129
6130   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6131     "Non-nil means process buffer inherits coding system of process output.\n\
6132 Bind it to t if the process output is to be treated as if it were a file\n\
6133 read from some filesystem.");
6134   inherit_process_coding_system = 0;
6135
6136   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6137     "Alist to decide a coding system to use for a file I/O operation.\n\
6138 The format is ((PATTERN . VAL) ...),\n\
6139 where PATTERN is a regular expression matching a file name,\n\
6140 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6141 If VAL is a coding system, it is used for both decoding and encoding\n\
6142 the file contents.\n\
6143 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6144 and the cdr part is used for encoding.\n\
6145 If VAL is a function symbol, the function must return a coding system\n\
6146 or a cons of coding systems which are used as above.\n\
6147 \n\
6148 See also the function `find-operation-coding-system'\n\
6149 and the variable `auto-coding-alist'.");
6150   Vfile_coding_system_alist = Qnil;
6151
6152   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6153     "Alist to decide a coding system to use for a process I/O operation.\n\
6154 The format is ((PATTERN . VAL) ...),\n\
6155 where PATTERN is a regular expression matching a program name,\n\
6156 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6157 If VAL is a coding system, it is used for both decoding what received\n\
6158 from the program and encoding what sent to the program.\n\
6159 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6160 and the cdr part is used for encoding.\n\
6161 If VAL is a function symbol, the function must return a coding system\n\
6162 or a cons of coding systems which are used as above.\n\
6163 \n\
6164 See also the function `find-operation-coding-system'.");
6165   Vprocess_coding_system_alist = Qnil;
6166
6167   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6168     "Alist to decide a coding system to use for a network I/O operation.\n\
6169 The format is ((PATTERN . VAL) ...),\n\
6170 where PATTERN is a regular expression matching a network service name\n\
6171 or is a port number to connect to,\n\
6172 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6173 If VAL is a coding system, it is used for both decoding what received\n\
6174 from the network stream and encoding what sent to the network stream.\n\
6175 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6176 and the cdr part is used for encoding.\n\
6177 If VAL is a function symbol, the function must return a coding system\n\
6178 or a cons of coding systems which are used as above.\n\
6179 \n\
6180 See also the function `find-operation-coding-system'.");
6181   Vnetwork_coding_system_alist = Qnil;
6182
6183   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6184     "Coding system to use with system messages.");
6185   Vlocale_coding_system = Qnil;
6186
6187   /* The eol mnemonics are reset in startup.el system-dependently.  */
6188   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6189     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6190   eol_mnemonic_unix = build_string (":");
6191
6192   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6193     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6194   eol_mnemonic_dos = build_string ("\\");
6195
6196   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6197     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6198   eol_mnemonic_mac = build_string ("/");
6199
6200   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6201     "*String displayed in mode line when end-of-line format is not yet determined.");
6202   eol_mnemonic_undecided = build_string (":");
6203
6204   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6205     "*Non-nil enables character translation while encoding and decoding.");
6206   Venable_character_translation = Qt;
6207
6208   DEFVAR_LISP ("standard-translation-table-for-decode",
6209     &Vstandard_translation_table_for_decode,
6210     "Table for translating characters while decoding.");
6211   Vstandard_translation_table_for_decode = Qnil;
6212
6213   DEFVAR_LISP ("standard-translation-table-for-encode",
6214     &Vstandard_translation_table_for_encode,
6215     "Table for translationg characters while encoding.");
6216   Vstandard_translation_table_for_encode = Qnil;
6217
6218   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6219     "Alist of charsets vs revision numbers.\n\
6220 While encoding, if a charset (car part of an element) is found,\n\
6221 designate it with the escape sequence identifing revision (cdr part of the element).");
6222   Vcharset_revision_alist = Qnil;
6223
6224   DEFVAR_LISP ("default-process-coding-system",
6225                &Vdefault_process_coding_system,
6226     "Cons of coding systems used for process I/O by default.\n\
6227 The car part is used for decoding a process output,\n\
6228 the cdr part is used for encoding a text to be sent to a process.");
6229   Vdefault_process_coding_system = Qnil;
6230
6231   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6232     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6233 This is a vector of length 256.\n\
6234 If Nth element is non-nil, the existence of code N in a file\n\
6235 \(or output of subprocess) doesn't prevent it to be detected as\n\
6236 a coding system of ISO 2022 variant which has a flag\n\
6237 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6238 or reading output of a subprocess.\n\
6239 Only 128th through 159th elements has a meaning.");
6240   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6241
6242   DEFVAR_LISP ("select-safe-coding-system-function",
6243                &Vselect_safe_coding_system_function,
6244     "Function to call to select safe coding system for encoding a text.\n\
6245 \n\
6246 If set, this function is called to force a user to select a proper\n\
6247 coding system which can encode the text in the case that a default\n\
6248 coding system used in each operation can't encode the text.\n\
6249 \n\
6250 The default value is `select-safe-coding-system' (which see).");
6251   Vselect_safe_coding_system_function = Qnil;
6252
6253 }
6254
6255 char *
6256 emacs_strerror (error_number)
6257      int error_number;
6258 {
6259   char *str;
6260
6261   synchronize_system_messages_locale ();
6262   str = strerror (error_number);
6263
6264   if (! NILP (Vlocale_coding_system))
6265     {
6266       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6267                                                       Vlocale_coding_system,
6268                                                       0);
6269       str = (char *) XSTRING (dec)->data;
6270     }
6271
6272   return str;
6273 }
6274
6275 #endif /* emacs */
6276