code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
   45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
  113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  */
 116 #if 0
 117 int
 118 detect_coding_emacs_mule (src, src_end)
 119      unsigned char *src, *src_end;
 120 {
 121   ...
 122 }
 123 #endif
 124
 125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 126
 127   These functions decode SRC_BYTES length of unibyte text at SOURCE
 128   encoded in CODING to Emacs' internal format.  The resulting
 129   multibyte text goes to a place pointed to by DESTINATION, the length
 130   of which should not exceed DST_BYTES.
 131
 132   These functions set the information of original and decoded texts in
 133   the members produced, produced_char, consumed, and consumed_char of
 134   the structure *CODING.  They also set the member result to one of
 135   CODING_FINISH_XXX indicating how the decoding finished.
 136
 137   DST_BYTES zero means that source area and destination area are
 138   overlapped, which means that we can produce a decoded text until it
 139   reaches at the head of not-yet-decoded source text.
 140
 141   Below is a template of these functions.  */
 142 #if 0
 143 static void
 144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148 {
 149   ...
 150 }
 151 #endif
 152
 153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 154
 155   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 156   internal multibyte format to CODING.  The resulting unibyte text
 157   goes to a place pointed to by DESTINATION, the length of which
 158   should not exceed DST_BYTES.
 159
 160   These functions set the information of original and encoded texts in
 161   the members produced, produced_char, consumed, and consumed_char of
 162   the structure *CODING.  They also set the member result to one of
 163   CODING_FINISH_XXX indicating how the encoding finished.
 164
 165   DST_BYTES zero means that source area and destination area are
  166   overlapped, which means that we can produce an encoded text until it
 167   reaches at the head of not-yet-encoded source text.
 168
 169   Below is a template of these functions.  */
 170 #if 0
 171 static void
 172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 173      struct coding_system *coding;
 174      unsigned char *source, *destination;
 175      int src_bytes, dst_bytes;
 176 {
 177   ...
 178 }
 179 #endif
 180
 181 /*** COMMONLY USED MACROS ***/
 182
  183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  184    get one and two bytes from the source text respectively.
  185    If there are not enough bytes in the source, they set
         `coding->result' to CODING_FINISH_INSUFFICIENT_SRC and jump to
  186    `label_end_of_loop'.  The caller should set variables `coding',
  187    `src' and `src_end' to appropriate pointers in advance.  These
  188    macros are called from decoding routines `decode_coding_XXX', thus
  189    it is assumed that the source text is unibyte.  */
  190
  191 #define ONE_MORE_BYTE(c1)                                       \
  192   do {                                                          \
  193     if (src >= src_end)                                         \
  194       {                                                         \
  195         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  196         goto label_end_of_loop;                                 \
  197       }                                                         \
  198     c1 = *src++;                                                \
  199   } while (0)
  200
      /* Fetch the next two source bytes into C1 and C2 (in that order)
         and advance `src' by two.  If fewer than two bytes remain,
         nothing is consumed: `coding->result' is set to
         CODING_FINISH_INSUFFICIENT_SRC and control jumps to
         `label_end_of_loop'.  */
  201 #define TWO_MORE_BYTES(c1, c2)                                  \
  202   do {                                                          \
  203     if (src + 1 >= src_end)                                     \
  204       {                                                         \
  205         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  206         goto label_end_of_loop;                                 \
  207       }                                                         \
  208     c1 = *src++;                                                \
  209     c2 = *src++;                                                \
  210   } while (0)
 211
 212
  213 /* Set C to the next character at the source text pointed by `src'.
  214    If there are not enough characters in the source, jump to
  215    `label_end_of_loop'.  The caller should set variables `coding',
  216    `src', `src_end', and `translation_table' to appropriate values
  217    in advance.  This macro is used in encoding routines
  218    `encode_coding_XXX', thus it assumes that the source text is in
  219    multibyte form except for 8-bit characters.  8-bit characters are
  220    in multibyte form if coding->src_multibyte is nonzero, else they
  221    are represented by a single byte.
         When the source is exhausted, `coding->result' is set to
         CODING_FINISH_INSUFFICIENT_SRC before jumping.  Otherwise, if
         `translation_table' is non-nil, C is passed through
         translate_char, and `src' is advanced past the bytes that were
         read.  */
  222
  223 #define ONE_MORE_CHAR(c)                                        \
  224   do {                                                          \
  225     int len = src_end - src;                                    \
  226     int bytes;                                                  \
  227     if (len <= 0)                                               \
  228       {                                                         \
  229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  230         goto label_end_of_loop;                                 \
  231       }                                                         \
  232     if (coding->src_multibyte                                   \
  233         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
  234       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
  235     else                                                        \
  236       c = *src, bytes = 1;                                      \
  237     if (!NILP (translation_table))                              \
  238       c = translate_char (translation_table, c, 0, 0, 0);       \
  239     src += bytes;                                               \
  240   } while (0)
 241
 242
  243 /* Produce a multibyte form of character C to `dst'.  Jump to
  244    `label_end_of_loop' if there's not enough space at `dst'.
  245
  246    If we are now in the middle of composition sequence, the decoded
  247    character may be ALTCHAR (for the current composition).  In that
  248    case, the character goes to coding->cmp_data->data instead of
  249    `dst'.
         Concretely: C is written to `dst' (and counted in
         coding->produced_char) unless a composition other than
         COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE is in progress.
         While any composition other than COMPOSITION_RELATIVE is in
         progress, C is also recorded with
         CODING_ADD_COMPOSITION_COMPONENT, and
         coding->composition_rule_follows becomes nonzero unless the
         composition is COMPOSITION_WITH_ALTCHARS.
         The space limit is `dst_end' when `dst_bytes' is nonzero, else
         `src'; when it would be exceeded, `coding->result' is set to
         CODING_FINISH_INSUFFICIENT_DST before jumping.
  250
  251    This macro is used in decoding routines.  */
  252
  253 #define EMIT_CHAR(c)                                                    \
  254   do {                                                                  \
  255     if (! COMPOSING_P (coding)                                          \
  256         || coding->composing == COMPOSITION_RELATIVE                    \
  257         || coding->composing == COMPOSITION_WITH_RULE)                  \
  258       {                                                                 \
  259         int bytes = CHAR_BYTES (c);                                     \
  260         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  261           {                                                             \
  262             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  263             goto label_end_of_loop;                                     \
  264           }                                                             \
  265         dst += CHAR_STRING (c, dst);                                    \
  266         coding->produced_char++;                                        \
  267       }                                                                 \
  268                                                                         \
  269     if (COMPOSING_P (coding)                                            \
  270         && coding->composing != COMPOSITION_RELATIVE)                   \
  271       {                                                                 \
  272         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
  273         coding->composition_rule_follows                                \
  274           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  275       }                                                                 \
  276   } while (0)
 277
  278
      /* Store the single byte C at `dst' and advance `dst'.  The space
         limit is `dst_end' when `dst_bytes' is nonzero, else `src'.  If
         `dst' has already reached that limit, nothing is stored:
         `coding->result' is set to CODING_FINISH_INSUFFICIENT_DST and
         control jumps to `label_end_of_loop'.  The caller must have
         `coding', `src', `dst', `dst_end' and `dst_bytes' in scope.  */
  279 #define EMIT_ONE_BYTE(c)                                        \
  280   do {                                                          \
  281     if (dst >= (dst_bytes ? dst_end : src))                     \
  282       {                                                         \
  283         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  284         goto label_end_of_loop;                                 \
  285       }                                                         \
  286     *dst++ = c;                                                 \
  287   } while (0)
  288
      /* Store the bytes C1 and C2, in that order, at `dst' and advance
         `dst' by two.  The space limit is `dst_end' when `dst_bytes' is
         nonzero, else `src'.  If two bytes do not fit below that limit,
         neither is stored: `coding->result' is set to
         CODING_FINISH_INSUFFICIENT_DST and control jumps to
         `label_end_of_loop'.  */
  289 #define EMIT_TWO_BYTES(c1, c2)                                  \
  290   do {                                                          \
  291     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
  292       {                                                         \
  293         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  294         goto label_end_of_loop;                                 \
  295       }                                                         \
  296     *dst++ = c1, *dst++ = c2;                                   \
  297   } while (0)
  298
      /* Copy the bytes from FROM up to (not including) TO to `dst',
         advancing `dst'.  FROM itself is incremented until it equals
         TO, so it must be a modifiable pointer, and both arguments are
         evaluated more than once.  The space limit is `dst_end' when
         `dst_bytes' is nonzero, else `src'.  If the whole range does
         not fit below that limit, nothing is copied: `coding->result'
         is set to CODING_FINISH_INSUFFICIENT_DST and control jumps to
         `label_end_of_loop'.  */
  299 #define EMIT_BYTES(from, to)                                    \
  300   do {                                                          \
  301     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
  302       {                                                         \
  303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  304         goto label_end_of_loop;                                 \
  305       }                                                         \
  306     while (from < to)                                           \
  307       *dst++ = *from++;                                         \
  308   } while (0)
 309
 310 \f
 311 /*** 1. Preamble ***/
 312
 313 #ifdef emacs
 314 #include <config.h>
 315 #endif
 316
 317 #include <stdio.h>
 318
 319 #ifdef emacs
 320
 321 #include "lisp.h"
 322 #include "buffer.h"
 323 #include "charset.h"
 324 #include "composite.h"
 325 #include "ccl.h"
 326 #include "coding.h"
 327 #include "window.h"
 328
 329 #else  /* not emacs */
 330
 331 #include "mulelib.h"
 332
 333 #endif /* not emacs */
  334
      /* NOTE(review): the objects below follow this file's naming
         pattern of a `Q' or `V' prefix; presumably the `Q' ones hold
         Lisp symbols and the `V' ones hold values of Lisp variables.
         Nothing in this part of the file initializes them -- confirm
         against the initialization code.  */
  335 Lisp_Object Qcoding_system, Qeol_type;
  336 Lisp_Object Qbuffer_file_coding_system;
  337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  338 Lisp_Object Qno_conversion, Qundecided;
  339 Lisp_Object Qcoding_system_history;
  340 Lisp_Object Qsafe_charsets;
  341 Lisp_Object Qvalid_codes;
  342
      /* These two are only declared here (`extern'); their definitions
         are not in this part of the file.  */
  343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
  345 Lisp_Object Qstart_process, Qopen_network_stream;
  346 Lisp_Object Qtarget_idx;
  347
  348 Lisp_Object Vselect_safe_coding_system_function;
  349
  350 /* Mnemonic string for each format of end-of-line.  */
  351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  352 /* Mnemonic string to indicate format of end-of-line is not yet
  353    decided.  */
  354 Lisp_Object eol_mnemonic_undecided;
  355
  356 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
  357    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
  358 int system_eol_type;
 359
  360 #ifdef emacs
  361
      /* Everything from here to the matching `#endif' is compiled only
         when `emacs' is defined.
         NOTE(review): the four objects declared next carry no
         description here; judging by their names they hold the known
         coding systems (as a list and as an alist) and the symbols for
         the `coding-system-p' predicate and the `coding-system-error'
         condition -- confirm where they are initialized.  */
  362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  363
  364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  365
  366 /* Coding system emacs-mule and raw-text are for converting only
  367    end-of-line format.  */
  368 Lisp_Object Qemacs_mule, Qraw_text;
  369
  370 /* Coding-systems are handed between Emacs Lisp programs and C internal
  371    routines by the following three variables.  */
  372 /* Coding-system for reading files and receiving data from process.  */
  373 Lisp_Object Vcoding_system_for_read;
  374 /* Coding-system for writing files and sending data to process.  */
  375 Lisp_Object Vcoding_system_for_write;
  376 /* Coding-system actually used in the latest I/O.  */
  377 Lisp_Object Vlast_coding_system_used;
  378
  379 /* A vector of length 256 which contains information about special
  380    Latin codes (especially for dealing with Microsoft codes).  */
  381 Lisp_Object Vlatin_extra_code_table;
  382
  383 /* Flag to inhibit code conversion of end-of-line format.  */
  384 int inhibit_eol_conversion;
  385
  386 /* Flag to inhibit ISO2022 escape sequence detection.  */
  387 int inhibit_iso_escape_detection;
  388
  389 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  390 int inherit_process_coding_system;
  391
  392 /* Coding system to be used to encode text for terminal display.  */
  393 struct coding_system terminal_coding;
  394
  395 /* Coding system to be used to encode text for terminal display when
  396    terminal coding system is nil.  */
  397 struct coding_system safe_terminal_coding;
  398
  399 /* Coding system of what is sent from terminal keyboard.  */
  400 struct coding_system keyboard_coding;
  401
  402 /* Default coding system to be used to write a file.  */
  403 struct coding_system default_buffer_file_coding;
  404
      /* NOTE(review): presumably alists consulted to pick a coding
         system for file, process and network I/O respectively; their
         format is not shown in this part of the file -- confirm.  */
  405 Lisp_Object Vfile_coding_system_alist;
  406 Lisp_Object Vprocess_coding_system_alist;
  407 Lisp_Object Vnetwork_coding_system_alist;
  408
      /* NOTE(review): undocumented here; by its name, the coding system
         associated with the current locale -- confirm.  */
  409 Lisp_Object Vlocale_coding_system;
  410
  411 #endif /* emacs */
 412
  413 Lisp_Object Qcoding_category, Qcoding_category_index;
  414
  415 /* List of symbols `coding-category-xxx' ordered by priority.  */
  416 Lisp_Object Vcoding_category_list;
  417
  418 /* Table of coding categories (Lisp symbols).  */
  419 Lisp_Object Vcoding_category_table;
  420
  421 /* Table of names of symbol for each coding-category.
         The initializer below supplies 15 names, so
         CODING_CATEGORY_IDX_MAX (defined outside this part of the file)
         has to be at least 15.  NOTE(review): the order presumably has
         to match the numbering of the coding category indices --
         confirm against their definition.  */
  422 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  423   "coding-category-emacs-mule",
  424   "coding-category-sjis",
  425   "coding-category-iso-7",
  426   "coding-category-iso-7-tight",
  427   "coding-category-iso-8-1",
  428   "coding-category-iso-8-2",
  429   "coding-category-iso-7-else",
  430   "coding-category-iso-8-else",
  431   "coding-category-ccl",
  432   "coding-category-big5",
  433   "coding-category-utf-8",
  434   "coding-category-utf-16-be",
  435   "coding-category-utf-16-le",
  436   "coding-category-raw-text",
  437   "coding-category-binary"
  438 };
  439
  440 /* Table of pointers to coding systems corresponding to each coding
  441    categories.  */
  442 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
  443
  444 /* Table of coding category masks.  Nth element is a mask for a coding
  445    category of which priority is Nth.  This table is private to this
         file (`static').  */
  446 static
  447 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 448
  449 /* Flag to tell if we look up translation table on character code
  450    conversion.  */
  451 Lisp_Object Venable_character_translation;
  452 /* Standard translation table to look up on decoding (reading).  */
  453 Lisp_Object Vstandard_translation_table_for_decode;
  454 /* Standard translation table to look up on encoding (writing).  */
  455 Lisp_Object Vstandard_translation_table_for_encode;
  456
      /* NOTE(review): undocumented here; by their names these are
         presumably the Lisp symbols `translation-table',
         `translation-table-id', `translation-table-for-decode' and
         `translation-table-for-encode' -- confirm where they are
         initialized.  */
  457 Lisp_Object Qtranslation_table;
  458 Lisp_Object Qtranslation_table_id;
  459 Lisp_Object Qtranslation_table_for_decode;
  460 Lisp_Object Qtranslation_table_for_encode;
  461
  462 /* Alist of charsets vs revision number.  */
  463 Lisp_Object Vcharset_revision_alist;
  464
  465 /* Default coding systems used for process I/O.  */
  466 Lisp_Object Vdefault_process_coding_system;
  467
  468 /* Global flag to tell that we can't call post-read-conversion and
  469    pre-write-conversion functions.  Usually the value is zero, but it
  470    is set to 1 temporarily while such functions are running.  This is
  471    to avoid infinite recursive call.  The flag is private to this
         file (`static').  */
  472 static int inhibit_pre_post_conversion;
 473
 474 \f
 475 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 476
 477 /* Emacs' internal format for encoding multiple character sets is a
 478    kind of multi-byte encoding, i.e. characters are encoded by
 479    variable-length sequences of one-byte codes.
 480
 481    ASCII characters and control characters (e.g. `tab', `newline') are
 482    represented by one-byte sequences which are their ASCII codes, in
 483    the range 0x00 through 0x7F.
 484
 485    8-bit characters of the range 0x80..0x9F are represented by
 486    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 487    code + 0x20).
 488
 489    8-bit characters of the range 0xA0..0xFF are represented by
 490    one-byte sequences which are their 8-bit code.
 491
 492    The other characters are represented by a sequence of `base
 493    leading-code', optional `extended leading-code', and one or two
 494    `position-code's.  The length of the sequence is determined by the
 495    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 496    whereas extended leading-code and position-code take the range 0xA0
 497    through 0xFF.  See `charset.h' for more details about leading-code
 498    and position-code.
 499
 500    --- CODE RANGE of Emacs' internal format ---
 501    character set        range
 502    -------------        -----
 503    ascii                0x00..0x7F
 504    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  505    eight-bit-graphic    0xA0..0xFF
 506    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 507    ---------------------------------------------
 508
 509   */
  510
      /* Table with one entry per possible byte value (256 entries).  The
         element type, enum emacs_code_class_type, is declared outside
         this part of the file, and nothing visible here fills the table
         in.  */
  511 enum emacs_code_class_type emacs_code_class[256];
 512
  513 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  514    Check if a text is encoded in Emacs' internal format.  If it is,
  515    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
         The text between SRC and SRC_END is scanned byte by byte.  It
         is rejected (0 is returned) as soon as an ESC, SI or SO code is
         seen, or a byte that (after the composition adjustments below)
         lies in 0x81..0x9F is not accepted by
         UNIBYTE_STR_AS_MULTIBYTE_P.  Running out of source bytes in
         ONE_MORE_BYTE without such a rejection counts as a match.  */
  516
  517 int
  518 detect_coding_emacs_mule (src, src_end)
  519       unsigned char *src, *src_end;
  520 {
  521   unsigned char c;
  522   int composing = 0;
  523   /* Dummy for ONE_MORE_BYTE, which stores into `coding->result' when
           the source is exhausted; the stored value is never read here.  */
  524   struct coding_system dummy_coding;
  525   struct coding_system *coding = &dummy_coding;
  526
  527   while (1)
  528     {
  529       ONE_MORE_BYTE (c);
  530
            /* Inside an old-style composite sequence (started by the 0x80
               byte handled further down): a byte below 0xA0 ends the
               sequence; 0xA0 is followed by a byte that is taken with
               its high bit cleared; any other byte is lowered by 0x20
               before the checks that follow.  */
  531       if (composing)
  532         {
  533           if (c < 0xA0)
  534             composing = 0;
  535           else if (c == 0xA0)
  536             {
  537               ONE_MORE_BYTE (c);
  538               c &= 0x7F;
  539             }
  540           else
  541             c -= 0x20;
  542         }
  543
  544       if (c < 0x20)
  545         {
                /* These control codes are taken as evidence against
                   emacs-mule text.  */
  546           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  547             return 0;
  548         }
  549       else if (c >= 0x80 && c < 0xA0)
  550         {
  551           if (c == 0x80)
  552             /* Old leading code for a composite character.  */
  553             composing = 1;
  554           else
  555             {
                    /* Validate the whole sequence that starts at the byte
                       just read, then skip over it.
                       NOTE(review): C may have been lowered by 0x20
                       above, but this check still looks at the
                       unmodified source bytes starting at SRC_BASE --
                       confirm that this is intended.  */
  556               unsigned char *src_base = src - 1;
  557               int bytes;
  558
  559               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  560                                                bytes))
  561                 return 0;
  562               src = src_base + bytes;
  563             }
  564         }
  565     }
        /* Reached only from ONE_MORE_BYTE, when the source is exhausted.  */
  566  label_end_of_loop:
  567   return CODING_CATEGORY_MASK_EMACS_MULE;
  568 }
 569
 570
  571 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
         Source sequences accepted by UNIBYTE_STR_AS_MULTIBYTE_P are
         copied to the destination unchanged; any other byte is copied
         as the multibyte form that CHAR_STRING produces for it.
         Decoding stops before the first character for which the space
         check below fails, with `coding->result' set to
         CODING_FINISH_INSUFFICIENT_DST; `coding->result' is not written
         on any other path.  */
  572
  573 static void
  574 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
  575      struct coding_system *coding;
  576      unsigned char *source, *destination;
  577      int src_bytes, dst_bytes;
  578 {
  579   unsigned char *src = source;
  580   unsigned char *src_end = source + src_bytes;
  581   unsigned char *dst = destination;
  582   unsigned char *dst_end = destination + dst_bytes;
  583   /* SRC_BASE remembers the start position in source in each loop.
  584      The loop will be exited when there's not enough source code, or
  585      when there's not enough destination area to produce a
  586      character.  */
  587   unsigned char *src_base;
  588
  589   coding->produced_char = 0;
        /* SRC_BASE is refreshed on every test of the condition, so after
           a normal exit it equals SRC, and after the `break' below it
           still marks the character that was not produced.  */
  590   while ((src_base = src) < src_end)
  591     {
  592       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
  593       int bytes;
  594
  595       if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
  596         {
                /* Already a valid multibyte sequence: copy it as is.  */
  597           p = src;
  598           src += bytes;
  599         }
  600       else
  601         {
                /* A lone byte: build its multibyte form in TMP.  */
  602           bytes = CHAR_STRING (*src, tmp);
  603           p = tmp;
  604           src++;
  605         }
            /* The limit is DST_END when DST_BYTES is nonzero, else SRC
               (which already points past the current character).
               NOTE(review): this test uses `>=', whereas EMIT_CHAR,
               EMIT_TWO_BYTES and EMIT_BYTES in this file use `>', so a
               character that would exactly fill the remaining space is
               refused here -- confirm whether that spare byte is
               intentional.  */
  606       if (dst + bytes >= (dst_bytes ? dst_end : src))
  607         {
  608           coding->result = CODING_FINISH_INSUFFICIENT_DST;
  609           break;
  610         }
  611       while (bytes--) *dst++ = *p++;
  612       coding->produced_char++;
  613     }
        /* Both counts receive the same value, i.e. every consumed source
           byte is counted as one character.  */
  614   coding->consumed = coding->consumed_char = src_base - source;
  615   coding->produced = dst - destination;
  616 }
  617
      /* Encoding to emacs-mule is handled entirely by encode_eol: the
         call is forwarded with the same arguments.  */
  618 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
  619   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 620
 621
 622 \f
 623 /*** 3. ISO2022 handlers ***/
 624
 625 /* The following note describes the coding system ISO2022 briefly.
 626    Since the intention of this note is to help understand the
 627    functions in this file, some parts are NOT ACCURATE or OVERLY
 628    SIMPLIFIED.  For thorough understanding, please refer to the
 629    original document of ISO2022.
 630
 631    ISO2022 provides many mechanisms to encode several character sets
  632    in 7-bit and 8-bit environments.  For 7-bit environments, all text
  633    is encoded using bytes less than 128.  This may make the encoded
  634    text a little bit longer, but the text passes more easily through
  635    several gateways, some of which strip off MSB (Most Significant Bit).
 636
 637    There are two kinds of character sets: control character set and
 638    graphic character set.  The former contains control characters such
 639    as `newline' and `escape' to provide control functions (control
 640    functions are also provided by escape sequences).  The latter
 641    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 642    two control character sets and many graphic character sets.
 643
 644    Graphic character sets are classified into one of the following
 645    four classes, according to the number of bytes (DIMENSION) and
 646    number of characters in one dimension (CHARS) of the set:
 647    - DIMENSION1_CHARS94
 648    - DIMENSION1_CHARS96
 649    - DIMENSION2_CHARS94
 650    - DIMENSION2_CHARS96
 651
 652    In addition, each character set is assigned an identification tag,
 653    unique for each set, called "final character" (denoted as <F>
 654    hereafter).  The <F> of each character set is decided by ECMA(*)
 655    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 656    (0x30..0x3F are for private use only).
 657
 658    Note (*): ECMA = European Computer Manufacturers Association
 659
 660    Here are examples of graphic character set [NAME(<F>)]:
 661         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 662         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 663         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 664         o DIMENSION2_CHARS96 -- none for the moment
 665
 666    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 667         C0 [0x00..0x1F] -- control character plane 0
 668         GL [0x20..0x7F] -- graphic character plane 0
 669         C1 [0x80..0x9F] -- control character plane 1
 670         GR [0xA0..0xFF] -- graphic character plane 1
 671
 672    A control character set is directly designated and invoked to C0 or
 673    C1 by an escape sequence.  The most common case is that:
 674    - ISO646's  control character set is designated/invoked to C0, and
 675    - ISO6429's control character set is designated/invoked to C1,
 676    and usually these designations/invocations are omitted in encoded
 677    text.  In a 7-bit environment, only C0 can be used, and a control
 678    character for C1 is encoded by an appropriate escape sequence to
 679    fit into the environment.  All control characters for C1 are
 680    defined to have corresponding escape sequences.
 681
 682    A graphic character set is at first designated to one of four
 683    graphic registers (G0 through G3), then these graphic registers are
 684    invoked to GL or GR.  These designations and invocations can be
 685    done independently.  The most common case is that G0 is invoked to
 686    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 687    these invocations and designations are omitted in encoded text.
 688    In a 7-bit environment, only GL can be used.
 689
 690    When a graphic character set of CHARS94 is invoked to GL, codes
 691    0x20 and 0x7F of the GL area work as control characters SPACE and
 692    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 693    be used.
 694
 695    There are two ways of invocation: locking-shift and single-shift.
 696    With locking-shift, the invocation lasts until the next different
 697    invocation, whereas with single-shift, the invocation affects the
 698    following character only and doesn't affect the locking-shift
 699    state.  Invocations are done by the following control characters or
 700    escape sequences:
 701
 702    ----------------------------------------------------------------------
 703    abbrev  function                  cntrl escape seq   description
 704    ----------------------------------------------------------------------
 705    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 706    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 707    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 708    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 709    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 710    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 711    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 712    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 713    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 714    ----------------------------------------------------------------------
 715    (*) These are not used by any known coding system.
 716
 717    Control characters for these functions are defined by macros
 718    ISO_CODE_XXX in `coding.h'.
 719
 720    Designations are done by the following escape sequences:
 721    ----------------------------------------------------------------------
 722    escape sequence      description
 723    ----------------------------------------------------------------------
 724    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 725    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 726    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 727    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 728    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 729    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 730    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 731    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 732    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 733    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 734    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 735    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 736    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 737    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 738    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 739    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 740    ----------------------------------------------------------------------
 741
 742    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 743    of dimension 1, chars 94, and final character <F>, etc...
 744
 745    Note (*): Although these designations are not allowed in ISO2022,
 746    Emacs accepts them on decoding, and produces them on encoding
 747    CHARS96 character sets in a coding system which is characterized as
 748    7-bit environment, non-locking-shift, and non-single-shift.
 749
 750    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 751    '(' can be omitted.  We refer to this as "short-form" hereafter.
 752
 753    Now you may notice that there are a lot of ways for encoding the
 754    same multilingual text in ISO2022.  Actually, there exist many
 755    coding systems such as Compound Text (used in X11's inter-client
 756    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 757    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 758    localized platforms), and all of these are variants of ISO2022.
 759
 760    In addition to the above, Emacs handles two more kinds of escape
 761    sequences: ISO6429's direction specification and Emacs' private
 762    sequence for specifying character composition.
 763
 764    ISO6429's direction specification takes the following form:
 765         o CSI ']'      -- end of the current direction
 766         o CSI '0' ']'  -- end of the current direction
 767         o CSI '1' ']'  -- start of left-to-right text
 768         o CSI '2' ']'  -- start of right-to-left text
 769    The control character CSI (0x9B: control sequence introducer) is
 770    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 771
 772    Character composition specification takes the following form:
 773         o ESC '0' -- start relative composition
 774         o ESC '1' -- end composition
 775         o ESC '2' -- start rule-base composition (*)
 776         o ESC '3' -- start relative composition with alternate chars  (**)
 777         o ESC '4' -- start rule-base composition with alternate chars  (**)
 778   Since these are not standard escape sequences of any ISO standard,
 779   their use with these meanings is restricted to Emacs only.
 780
 781   (*) This form is used only in Emacs 20.5 and the older versions,
 782   but the newer versions can safely decode it.
 783   (**) This form is used only in Emacs 21.1 and the newer versions,
 784   and the older versions can't decode it.
 785
 786   Here's a list of example usages of these composition escape
 787   sequences (categorized by `enum composition_method').
 788
 789   COMPOSITION_RELATIVE:
 790         ESC 0 CHAR [ CHAR ] ESC 1
 791   COMPOSITION_WITH_RULE:
 792         ESC 2 CHAR [ RULE CHAR ] ESC 1
 793   COMPOSITION_WITH_ALTCHARS:
 794         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 795   COMPOSITION_WITH_RULE_ALTCHARS:
 796         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 797
 798 enum iso_code_class_type iso_code_class[256];
 799
 800 #define CHARSET_OK(idx, charset)                                \
 801   (coding_system_table[idx]                                     \
 802    && (coding_system_table[idx]->safe_charsets[charset]         \
 803        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 804             (coding_system_table[idx], charset)                 \
 805            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 806
 807 #define SHIFT_OUT_OK(idx) \
 808   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 809
 810 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 811    Check if a text is encoded in ISO2022.  If it is, returns an
 812    integer in which appropriate flag bits any of:
 813         CODING_CATEGORY_MASK_ISO_7
 814         CODING_CATEGORY_MASK_ISO_7_TIGHT
 815         CODING_CATEGORY_MASK_ISO_8_1
 816         CODING_CATEGORY_MASK_ISO_8_2
 817         CODING_CATEGORY_MASK_ISO_7_ELSE
 818         CODING_CATEGORY_MASK_ISO_8_ELSE
 819    are set.  If a code which should never appear in ISO2022 is found,
 820    returns 0.  */
 821
 822 int
 823 detect_coding_iso2022 (src, src_end)
 824      unsigned char *src, *src_end;
 825 {
 826   int mask = CODING_CATEGORY_MASK_ISO;
 827   int mask_found = 0;
 828   int reg[4], shift_out = 0, single_shifting = 0;
 829   int c, c1, i, charset;
 830   /* Dummy for ONE_MORE_BYTE.  */
 831   struct coding_system dummy_coding;
 832   struct coding_system *coding = &dummy_coding;
 833
 834   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 835   while (mask && src < src_end)
 836     {
 837       ONE_MORE_BYTE (c);
 838       switch (c)
 839         {
 840         case ISO_CODE_ESC:
 841           if (inhibit_iso_escape_detection)
 842             break;
 843           single_shifting = 0;
 844           ONE_MORE_BYTE (c);
 845           if (c >= '(' && c <= '/')
 846             {
 847               /* Designation sequence for a charset of dimension 1.  */
 848               ONE_MORE_BYTE (c1);
 849               if (c1 < ' ' || c1 >= 0x80
 850                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 851                 /* Invalid designation sequence.  Just ignore.  */
 852                 break;
 853               reg[(c - '(') % 4] = charset;
 854             }
 855           else if (c == '$')
 856             {
 857               /* Designation sequence for a charset of dimension 2.  */
 858               ONE_MORE_BYTE (c);
 859               if (c >= '@' && c <= 'B')
 860                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 861                 reg[0] = charset = iso_charset_table[1][0][c];
 862               else if (c >= '(' && c <= '/')
 863                 {
 864                   ONE_MORE_BYTE (c1);
 865                   if (c1 < ' ' || c1 >= 0x80
 866                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 867                     /* Invalid designation sequence.  Just ignore.  */
 868                     break;
 869                   reg[(c - '(') % 4] = charset;
 870                 }
 871               else
 872                 /* Invalid designation sequence.  Just ignore.  */
 873                 break;
 874             }
 875           else if (c == 'N' || c == 'O')
 876             {
 877               /* ESC <Fe> for SS2 or SS3.  */
 878               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 879               break;
 880             }
 881           else if (c >= '0' && c <= '4')
 882             {
 883               /* ESC <Fp> for start/end composition.  */
 884               mask_found |= CODING_CATEGORY_MASK_ISO;
 885               break;
 886             }
 887           else
 888             /* Invalid escape sequence.  Just ignore.  */
 889             break;
 890
 891           /* We found a valid designation sequence for CHARSET.  */
 892           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 893           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 894             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 895           else
 896             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 897           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 898             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 899           else
 900             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 901           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 902             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 903           else
 904             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 905           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 906             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 907           else
 908             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 909           break;
 910
 911         case ISO_CODE_SO:
 912           if (inhibit_iso_escape_detection)
 913             break;
 914           single_shifting = 0;
 915           if (shift_out == 0
 916               && (reg[1] >= 0
 917                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 918                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 919             {
 920               /* Locking shift out.  */
 921               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 922               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 923             }
 924           break;
 925
 926         case ISO_CODE_SI:
 927           if (inhibit_iso_escape_detection)
 928             break;
 929           single_shifting = 0;
 930           if (shift_out == 1)
 931             {
 932               /* Locking shift in.  */
 933               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 934               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 935             }
 936           break;
 937
 938         case ISO_CODE_CSI:
 939           single_shifting = 0;
 940         case ISO_CODE_SS2:
 941         case ISO_CODE_SS3:
 942           {
 943             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 944
 945             if (inhibit_iso_escape_detection)
 946               break;
 947             if (c != ISO_CODE_CSI)
 948               {
 949                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 950                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 951                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 952                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 953                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 954                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 955                 single_shifting = 1;
 956               }
 957             if (VECTORP (Vlatin_extra_code_table)
 958                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 959               {
 960                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 961                     & CODING_FLAG_ISO_LATIN_EXTRA)
 962                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 963                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 964                     & CODING_FLAG_ISO_LATIN_EXTRA)
 965                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 966               }
 967             mask &= newmask;
 968             mask_found |= newmask;
 969           }
 970           break;
 971
 972         default:
 973           if (c < 0x80)
 974             {
 975               single_shifting = 0;
 976               break;
 977             }
 978           else if (c < 0xA0)
 979             {
 980               single_shifting = 0;
 981               if (VECTORP (Vlatin_extra_code_table)
 982                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 983                 {
 984                   int newmask = 0;
 985
 986                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 987                       & CODING_FLAG_ISO_LATIN_EXTRA)
 988                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 989                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 990                       & CODING_FLAG_ISO_LATIN_EXTRA)
 991                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 992                   mask &= newmask;
 993                   mask_found |= newmask;
 994                 }
 995               else
 996                 return 0;
 997             }
 998           else
 999             {
1000               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1001                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1002               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1003               /* Check the length of succeeding codes of the range
1004                  0xA0..0FF.  If the byte length is odd, we exclude
1005                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1006                  when we are not single shifting.  */
1007               if (!single_shifting
1008                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1009                 {
1010                   int i = 1;
1011                   while (src < src_end)
1012                     {
1013                       ONE_MORE_BYTE (c);
1014                       if (c < 0xA0)
1015                         break;
1016                       i++;
1017                     }
1018
1019                   if (i & 1 && src < src_end)
1020                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1021                   else
1022                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1023                 }
1024             }
1025           break;
1026         }
1027     }
1028  label_end_of_loop:
1029   return (mask & mask_found);
1030 }
1031
/* Yield the character code of the character in charset CHARSET whose
   1st position code is C1 and whose 2nd position code is C2.  When
   the variable `translation_table' (taken from the enclosing scope)
   is non-nil, the character is passed through that table and the
   translated code is yielded instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1041
/* Set designation state into CODING.

   Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   and, if CODING accepts it (CODING requests its designation to
   register REG, or it is one of CODING's safe charsets), record it
   as designated to graphic register REG.  Otherwise remember REG as
   the last invalid designation register and jump to
   `label_invalid_code'.

   The macro uses the variable `coding' and the label
   `label_invalid_code' of the enclosing function.  REG and
   FINAL_CHAR are evaluated more than once, so they must be free of
   side effects.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)                \
  do {                                                                       \
    int charset;                                                             \
                                                                             \
    if ((final_char) < '0' || (final_char) >= 128)                           \
      goto label_invalid_code;                                               \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                    \
                                 make_number (chars),                        \
                                 make_number (final_char));                  \
    if (charset >= 0                                                         \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || coding->safe_charsets[charset]))                              \
      {                                                                      \
        if (coding->spec.iso2022.last_invalid_designation_register == 0      \
            && (reg) == 0                                                    \
            && charset == CHARSET_ASCII)                                     \
          {                                                                  \
            /* We should insert this designation sequence as is so           \
               that it is surely written back to a file.  */                 \
            coding->spec.iso2022.last_invalid_designation_register = -1;     \
            goto label_invalid_code;                                         \
          }                                                                  \
        coding->spec.iso2022.last_invalid_designation_register = -1;         \
        /* Substitute the reverse charset, if there is one, when the         \
           direction bit of the coding mode is set.  */                      \
        if ((coding->mode & CODING_MODE_DIRECTION)                           \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                       \
          charset = CHARSET_REVERSE_CHARSET (charset);                       \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;               \
      }                                                                      \
    else                                                                     \
      {                                                                      \
        coding->spec.iso2022.last_invalid_designation_register = (reg);      \
        goto label_invalid_code;                                             \
      }                                                                      \
  } while (0)
1077
1078 /* Allocate a memory block for storing information about compositions.
1079    The block is chained to the already allocated blocks.  */
1080
1081 void
1082 coding_allocate_composition_data (coding, char_offset)
1083      struct coding_system *coding;
1084      int char_offset;
1085 {
1086   struct composition_data *cmp_data
1087     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1088
1089   cmp_data->char_offset = char_offset;
1090   cmp_data->used = 0;
1091   cmp_data->prev = coding->cmp_data;
1092   cmp_data->next = NULL;
1093   if (coding->cmp_data)
1094     coding->cmp_data->next = cmp_data;
1095   coding->cmp_data = cmp_data;
1096   coding->cmp_data_start = 0;
1097 }
1098
/* Record the starting position START and METHOD of one composition.

   Four slots are reserved at the current end of CODING's composition
   data: slot 0 is set to -1 for now (CODING_ADD_COMPOSITION_END
   stores the length of the record there), slot 1 gets the block's
   character offset plus START, slot 2 is left for the ending
   position, and slot 3 gets METHOD.  The index of slot 0 is saved in
   CODING->cmp_data_start.  CODING is evaluated more than once.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cmp_data = (coding)->cmp_data;     \
    int *data = cmp_data->data + cmp_data->used;                \
    (coding)->cmp_data_start = cmp_data->used;                  \
    data[0] = -1;                                               \
    data[1] = cmp_data->char_offset + (start);                  \
    data[3] = (int) (method);                                   \
    cmp_data->used += 4;                                        \
  } while (0)
1111
/* Record the ending position END of the current composition.

   The record that starts at CODING->cmp_data_start is completed:
   slot 0 receives the number of slots used since the record was
   started, and slot 2 receives the block's character offset plus
   END.  CODING is evaluated more than once.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *cmp_data = (coding)->cmp_data;     \
    int *data = cmp_data->data + (coding)->cmp_data_start;      \
    data[0] = cmp_data->used - (coding)->cmp_data_start;        \
    data[2] = cmp_data->char_offset + (end);                    \
  } while (0)
1121
/* Record one COMPONENT (alternate character or composition rule) by
   appending it to CODING's current composition data block.  No check
   is made here that the block has room left.  CODING is evaluated
   twice.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1126
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the character following ESC.

   The macro uses the variables `coding' and `dst' and the label
   `label_end_of_loop' of the enclosing function.  C1 is evaluated
   more than once.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition is disabled: copy the escape sequence to the        \
           destination as is.  */                                          \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1174
/* Handle composition end sequence ESC 1.  C1 is the character
   following ESC.

   If composition is disabled, the escape sequence is copied to the
   destination as is.  Otherwise the ending position of the current
   composition is recorded and CODING leaves the composing state.
   The macro uses the variables `coding' and `dst' of the enclosing
   function.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing == COMPOSITION_DISABLED)                      \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = (c1);                                                  \
        coding->produced_char += 2;                                     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
1191
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be an lvalue; it is modified in place (32 is subtracted
   from it).  The macro also uses the variables `coding' and `c2' of
   the enclosing function, and ONE_MORE_BYTE to read the second byte
   of the new format.  If the adjusted C1 is 93 or more, neither
   format applies and a rule value of 0 is stored.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    else if ((c1) < 93)         /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);            \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1216
1217
1218 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1219
1220 static void
1221 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1222      struct coding_system *coding;
1223      unsigned char *source, *destination;
1224      int src_bytes, dst_bytes;
1225 {
1226   unsigned char *src = source;
1227   unsigned char *src_end = source + src_bytes;
1228   unsigned char *dst = destination;
1229   unsigned char *dst_end = destination + dst_bytes;
1230   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1231   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1232   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1233   /* SRC_BASE remembers the start position in source in each loop.
1234      The loop will be exited when there's not enough source code
1235      (within macro ONE_MORE_BYTE), or when there's not enough
1236      destination area to produce a character (within macro
1237      EMIT_CHAR).  */
1238   unsigned char *src_base;
1239   int c, charset;
1240   Lisp_Object translation_table;
1241
1242   if (NILP (Venable_character_translation))
1243     translation_table = Qnil;
1244   else
1245     {
1246       translation_table = coding->translation_table_for_decode;
1247       if (NILP (translation_table))
1248         translation_table = Vstandard_translation_table_for_decode;
1249     }
1250
1251   coding->result = CODING_FINISH_NORMAL;
1252
1253   while (1)
1254     {
1255       int c1, c2;
1256
1257       src_base = src;
1258       ONE_MORE_BYTE (c1);
1259
1260       /* We produce no character or one character.  */
1261       switch (iso_code_class [c1])
1262         {
1263         case ISO_0x20_or_0x7F:
1264           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1265             {
1266               DECODE_COMPOSITION_RULE (c1);
1267               continue;
1268             }
1269           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1270             {
1271               /* This is SPACE or DEL.  */
1272               charset = CHARSET_ASCII;
1273               break;
1274             }
1275           /* This is a graphic character, we fall down ...  */
1276
1277         case ISO_graphic_plane_0:
1278           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1279             {
1280               DECODE_COMPOSITION_RULE (c1);
1281               continue;
1282             }
1283           charset = charset0;
1284           break;
1285
1286         case ISO_0xA0_or_0xFF:
1287           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1288               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1289             goto label_invalid_code;
1290           /* This is a graphic character, we fall down ... */
1291
1292         case ISO_graphic_plane_1:
1293           if (charset1 < 0)
1294             goto label_invalid_code;
1295           charset = charset1;
1296           break;
1297
1298         case ISO_control_0:
1299           if (COMPOSING_P (coding))
1300             DECODE_COMPOSITION_END ('1');
1301
1302           /* All ISO2022 control characters in this class have the
1303              same representation in Emacs internal format.  */
1304           if (c1 == '\n'
1305               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1306               && (coding->eol_type == CODING_EOL_CR
1307                   || coding->eol_type == CODING_EOL_CRLF))
1308             {
1309               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1310               goto label_end_of_loop;
1311             }
1312           charset = CHARSET_ASCII;
1313           break;
1314
1315         case ISO_control_1:
1316           if (COMPOSING_P (coding))
1317             DECODE_COMPOSITION_END ('1');
1318           goto label_invalid_code;
1319
1320         case ISO_carriage_return:
1321           if (COMPOSING_P (coding))
1322             DECODE_COMPOSITION_END ('1');
1323
1324           if (coding->eol_type == CODING_EOL_CR)
1325             c1 = '\n';
1326           else if (coding->eol_type == CODING_EOL_CRLF)
1327             {
1328               ONE_MORE_BYTE (c1);
1329               if (c1 != ISO_CODE_LF)
1330                 {
1331                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1332                     {
1333                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1334                       goto label_end_of_loop;
1335                     }
1336                   src--;
1337                   c1 = '\r';
1338                 }
1339             }
1340           charset = CHARSET_ASCII;
1341           break;
1342
1343         case ISO_shift_out:
1344           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1345               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1346             goto label_invalid_code;
1347           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1348           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1349           continue;
1350
1351         case ISO_shift_in:
1352           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1353             goto label_invalid_code;
1354           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1355           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1356           continue;
1357
1358         case ISO_single_shift_2_7:
1359         case ISO_single_shift_2:
1360           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1361             goto label_invalid_code;
1362           /* SS2 is handled as an escape sequence of ESC 'N' */
1363           c1 = 'N';
1364           goto label_escape_sequence;
1365
1366         case ISO_single_shift_3:
1367           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1368             goto label_invalid_code;
1369           /* SS3 is handled as an escape sequence of ESC 'O' */
1370           c1 = 'O';
1371           goto label_escape_sequence;
1372
1373         case ISO_control_sequence_introducer:
1374           /* CSI is handled as an escape sequence of ESC '[' ...  */
1375           c1 = '[';
1376           goto label_escape_sequence;
1377
1378         case ISO_escape:
1379           ONE_MORE_BYTE (c1);
1380         label_escape_sequence:
1381           /* Escape sequences handled by Emacs are invocation,
1382              designation, direction specification, and character
1383              composition specification.  */
1384           switch (c1)
1385             {
1386             case '&':           /* revision of following character set */
1387               ONE_MORE_BYTE (c1);
1388               if (!(c1 >= '@' && c1 <= '~'))
1389                 goto label_invalid_code;
1390               ONE_MORE_BYTE (c1);
1391               if (c1 != ISO_CODE_ESC)
1392                 goto label_invalid_code;
1393               ONE_MORE_BYTE (c1);
1394               goto label_escape_sequence;
1395
1396             case '$':           /* designation of 2-byte character set */
1397               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1398                 goto label_invalid_code;
1399               ONE_MORE_BYTE (c1);
1400               if (c1 >= '@' && c1 <= 'B')
1401                 {       /* designation of JISX0208.1978, GB2312.1980,
1402                            or JISX0208.1980 */
1403                   DECODE_DESIGNATION (0, 2, 94, c1);
1404                 }
1405               else if (c1 >= 0x28 && c1 <= 0x2B)
1406                 {       /* designation of DIMENSION2_CHARS94 character set */
1407                   ONE_MORE_BYTE (c2);
1408                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1409                 }
1410               else if (c1 >= 0x2C && c1 <= 0x2F)
1411                 {       /* designation of DIMENSION2_CHARS96 character set */
1412                   ONE_MORE_BYTE (c2);
1413                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1414                 }
1415               else
1416                 goto label_invalid_code;
1417               /* We must update these variables now.  */
1418               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1419               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1420               continue;
1421
1422             case 'n':           /* invocation of locking-shift-2 */
1423               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1424                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1425                 goto label_invalid_code;
1426               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1427               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1428               continue;
1429
1430             case 'o':           /* invocation of locking-shift-3 */
1431               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1432                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1433                 goto label_invalid_code;
1434               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1435               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1436               continue;
1437
1438             case 'N':           /* invocation of single-shift-2 */
1439               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1440                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1441                 goto label_invalid_code;
1442               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1443               ONE_MORE_BYTE (c1);
1444               break;
1445
1446             case 'O':           /* invocation of single-shift-3 */
1447               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1448                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1449                 goto label_invalid_code;
1450               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1451               ONE_MORE_BYTE (c1);
1452               break;
1453
1454             case '0': case '2': case '3': case '4': /* start composition */
1455               DECODE_COMPOSITION_START (c1);
1456               continue;
1457
1458             case '1':           /* end composition */
1459               DECODE_COMPOSITION_END (c1);
1460               continue;
1461
1462             case '[':           /* specification of direction */
1463               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1464                 goto label_invalid_code;
1465               /* For the moment, nested direction is not supported.
1466                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1467                  left-to-right, and nonzero means right-to-left.  */
1468               ONE_MORE_BYTE (c1);
1469               switch (c1)
1470                 {
1471                 case ']':       /* end of the current direction */
1472                   coding->mode &= ~CODING_MODE_DIRECTION;
1473
1474                 case '0':       /* end of the current direction */
1475                 case '1':       /* start of left-to-right direction */
1476                   ONE_MORE_BYTE (c1);
1477                   if (c1 == ']')
1478                     coding->mode &= ~CODING_MODE_DIRECTION;
1479                   else
1480                     goto label_invalid_code;
1481                   break;
1482
1483                 case '2':       /* start of right-to-left direction */
1484                   ONE_MORE_BYTE (c1);
1485                   if (c1 == ']')
1486                     coding->mode |= CODING_MODE_DIRECTION;
1487                   else
1488                     goto label_invalid_code;
1489                   break;
1490
1491                 default:
1492                   goto label_invalid_code;
1493                 }
1494               continue;
1495
1496             default:
1497               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1498                 goto label_invalid_code;
1499               if (c1 >= 0x28 && c1 <= 0x2B)
1500                 {       /* designation of DIMENSION1_CHARS94 character set */
1501                   ONE_MORE_BYTE (c2);
1502                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1503                 }
1504               else if (c1 >= 0x2C && c1 <= 0x2F)
1505                 {       /* designation of DIMENSION1_CHARS96 character set */
1506                   ONE_MORE_BYTE (c2);
1507                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1508                 }
1509               else
1510                 goto label_invalid_code;
1511               /* We must update these variables now.  */
1512               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1513               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1514               continue;
1515             }
1516         }
1517
1518       /* Now we know CHARSET and 1st position code C1 of a character.
1519          Produce a multibyte sequence for that character while getting
1520          2nd position code C2 if necessary.  */
1521       if (CHARSET_DIMENSION (charset) == 2)
1522         {
1523           ONE_MORE_BYTE (c2);
1524           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1525             /* C2 is not in a valid range.  */
1526             goto label_invalid_code;
1527         }
1528       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1529       EMIT_CHAR (c);
1530       continue;
1531
1532     label_invalid_code:
1533       coding->errors++;
1534       if (COMPOSING_P (coding))
1535         DECODE_COMPOSITION_END ('1');
1536       src = src_base;
1537       c = *src++;
1538       EMIT_CHAR (c);
1539     }
1540
1541  label_end_of_loop:
1542   coding->consumed = coding->consumed_char = src_base - source;
1543   coding->produced = dst - destination;
1544   return;
1545 }
1546
1547
1548 /* ISO2022 encoding stuff.  */
1549
1550 /*
1551    It is not enough to say just "ISO2022" on encoding, we have to
1552    specify more details.  In Emacs, each coding system of ISO2022
1553    variant has the following specifications:
1554         1. Initial designation to G0 thru G3.
1555         2. Allows short-form designation?
1556         3. ASCII should be designated to G0 before control characters?
1557         4. ASCII should be designated to G0 at end of line?
1558         5. 7-bit environment or 8-bit environment?
1559         6. Use locking-shift?
1560         7. Use Single-shift?
1561    And the following two are only for Japanese:
1562         8. Use ASCII in place of JIS0201-1976-Roman?
1563         9. Use JISX0208-1983 in place of JISX0208-1978?
1564    These specifications are encoded in `coding->flags' as flag bits
1565    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1566    details.
1567 */
1568
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If <final-char> of CHARSET is
   '@', 'A', or 'B' and the coding system CODING allows, produce
   designation sequence of short-form.

   The sequence produced is:
     - ESC '&' <'@' + revision>, only when the revision number that
       CODING records for CHARSET is less than 255;
     - ESC;
     - for a DIMENSION1 charset, the intermediate character selecting
       REG (one of "()*+" for 94-char sets, ",-./" for 96-char sets);
     - for a DIMENSION2 charset, '$' followed by the intermediate
       character.  For a 94-char set the intermediate character is
       omitted when CODING has CODING_FLAG_ISO_SHORT_FORM, REG is 0,
       and <final-char> is in the range '@'..'B';
     - <final-char> of CHARSET.

   Finally CHARSET is recorded in CODING as the charset designated to
   REG.  The macro uses the variable `dst' of the calling scope.  Every
   argument is parenthesized at each use, so arbitrary expressions
   (e.g. `&coding_struct') may be passed.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);           \
    const char *intermediate_char_94 = "()*+";                             \
    const char *intermediate_char_96 = ",-./";                             \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER ((coding), (charset));  \
                                                                           \
    if (revision < 255)                                                    \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + revision;                                           \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (CHARSET_DIMENSION (charset) == 1)                                  \
      {                                                                    \
        if (CHARSET_CHARS (charset) == 94)                                 \
          *dst++ = (unsigned char) (intermediate_char_94[(reg)]);          \
        else                                                               \
          *dst++ = (unsigned char) (intermediate_char_96[(reg)]);          \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        *dst++ = '$';                                                      \
        if (CHARSET_CHARS (charset) == 94)                                 \
          {                                                                \
            /* Short form drops the intermediate character.  */           \
            if (! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)           \
                || (reg) != 0                                              \
                || final_char < '@' || final_char > 'B')                   \
              *dst++ = (unsigned char) (intermediate_char_94[(reg)]);      \
          }                                                                \
        else                                                               \
          *dst++ = (unsigned char) (intermediate_char_96[(reg)]);          \
      }                                                                    \
    *dst++ = final_char;                                                   \
    CODING_SPEC_ISO_DESIGNATION ((coding), (reg)) = (charset);             \
  } while (0)
1611
/* ISO2022 single-shift functions (single-shift-2 and single-shift-3).

   Each macro writes the shift code at `dst' and advances it: the
   escape sequence ESC 'N' (resp. ESC 'O') when `coding' has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single control
   character ISO_CODE_SS2 (resp. ISO_CODE_SS3).  Afterwards `coding' is
   marked as single-shifting.  Both `coding' and `dst' are taken from
   the calling scope.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1633
/* ISO2022 locking-shift functions (shift-in, shift-out,
   locking-shift-2, and locking-shift-3).

   Each macro writes its control character or escape sequence at `dst',
   advances `dst', and records in `coding' which graphic register (0,
   1, 2, or 3 respectively) is now invoked to graphic plane 0.  Both
   `coding' and `dst' are taken from the calling scope.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    *dst = ISO_CODE_SI;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    *dst = ISO_CODE_SO;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
1661
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The cases are tried in this order, repeating until one of them
   emits the character:
     1. a single-shift is pending in `coding': emit C1 with the high
        bit cleared (CODING_FLAG_ISO_SEVEN_BITS) or set (otherwise) and
        clear the single-shift state;
     2. CHARSET is invoked to graphic plane 0: emit C1 & 0x7F;
     3. CHARSET is invoked to graphic plane 1: emit C1 | 0x80;
     4. `coding' has CODING_FLAG_ISO_SAFE and CHARSET is not one of its
        safe charsets: emit CODING_INHIBIT_CHARACTER_SUBSTITUTION once,
        or twice when CHARSET_WIDTH (CHARSET) is 2;
     5. otherwise call encode_invocation_designation and try again.

   `coding' and `dst' are taken from the calling scope.  CHARSET and C1
   are parenthesized at each use so that expression arguments such as
   `a | b' combine correctly with the masking operators.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F;                                         \
        else                                                            \
          *dst++ = (c1) | 0x80;                                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
             && !coding->safe_charsets[(charset)])                      \
      {                                                                 \
        /* We should not encode this character, instead produce one or  \
           two `?'s.  */                                                \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
1704
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The cases are tried in this order, repeating until one of them
   emits the character:
     1. a single-shift is pending in `coding': emit C1 and C2 with the
        high bit cleared (CODING_FLAG_ISO_SEVEN_BITS) or set
        (otherwise) and clear the single-shift state;
     2. CHARSET is invoked to graphic plane 0: emit both codes & 0x7F;
     3. CHARSET is invoked to graphic plane 1: emit both codes | 0x80;
     4. `coding' has CODING_FLAG_ISO_SAFE and CHARSET is not one of its
        safe charsets: emit CODING_INHIBIT_CHARACTER_SUBSTITUTION once,
        or twice when CHARSET_WIDTH (CHARSET) is 2;
     5. otherwise call encode_invocation_designation and try again.

   `coding' and `dst' are taken from the calling scope.  CHARSET, C1,
   and C2 are parenthesized at each use so that expression arguments
   combine correctly with the masking operators.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                   \
        else                                                            \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                   \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                     \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                     \
        break;                                                          \
      }                                                                 \
    else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
             && !coding->safe_charsets[(charset)])                      \
      {                                                                 \
        /* We should not encode this character, instead produce one or  \
           two `?'s.  */                                                \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
1747
/* Produce codes at `dst' for a character of CHARSET whose position
   codes are C1 and C2, advancing `dst'.

   When CHARSET is a defined charset the work is delegated to
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to CHARSET_DIMENSION, after these substitutions:
     - ASCII is replaced by charset_latin_jisx0201 when `coding' has
       CODING_FLAG_ISO_USE_ROMAN;
     - charset_jisx0208 is replaced by charset_jisx0208_1978 when
       `coding' has CODING_FLAG_ISO_USE_OLDJIS.
   When CHARSET is not defined, C1 is written as is, followed by C2
   unless C2 is negative.

   CHARSET is evaluated exactly once; C1 and C2 are parenthesized at
   each use.  `coding' and `dst' are taken from the calling scope.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int alt_charset = (charset);                                        \
                                                                        \
    if (CHARSET_DEFINED_P (alt_charset))                                \
      {                                                                 \
        if (CHARSET_DIMENSION (alt_charset) == 1)                       \
          {                                                             \
            if (alt_charset == CHARSET_ASCII                            \
                && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))         \
              alt_charset = charset_latin_jisx0201;                     \
            ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, (c1));        \
          }                                                             \
        else                                                            \
          {                                                             \
            if (alt_charset == charset_jisx0208                         \
                && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))        \
              alt_charset = charset_jisx0208_1978;                      \
            ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, (c1), (c2));  \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = (c1);                                                  \
        if ((c2) >= 0)                                                  \
          *dst++ = (c2);                                                \
      }                                                                 \
  } while (0)
1776
1777 /* Produce designation and invocation codes at a place pointed by DST
1778    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1779    Return new DST.  */
1780
1781 unsigned char *
1782 encode_invocation_designation (charset, coding, dst)
1783      int charset;
1784      struct coding_system *coding;
1785      unsigned char *dst;
1786 {
1787   int reg;                      /* graphic register number */
1788
1789   /* At first, check designations.  */
1790   for (reg = 0; reg < 4; reg++)
1791     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1792       break;
1793
1794   if (reg >= 4)
1795     {
1796       /* CHARSET is not yet designated to any graphic registers.  */
1797       /* At first check the requested designation.  */
1798       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1799       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1800         /* Since CHARSET requests no special designation, designate it
1801            to graphic register 0.  */
1802         reg = 0;
1803
1804       ENCODE_DESIGNATION (charset, reg, coding);
1805     }
1806
1807   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1808       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1809     {
1810       /* Since the graphic register REG is not invoked to any graphic
1811          planes, invoke it to graphic plane 0.  */
1812       switch (reg)
1813         {
1814         case 0:                 /* graphic register 0 */
1815           ENCODE_SHIFT_IN;
1816           break;
1817
1818         case 1:                 /* graphic register 1 */
1819           ENCODE_SHIFT_OUT;
1820           break;
1821
1822         case 2:                 /* graphic register 2 */
1823           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1824             ENCODE_SINGLE_SHIFT_2;
1825           else
1826             ENCODE_LOCKING_SHIFT_2;
1827           break;
1828
1829         case 3:                 /* graphic register 3 */
1830           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1831             ENCODE_SINGLE_SHIFT_3;
1832           else
1833             ENCODE_LOCKING_SHIFT_3;
1834           break;
1835         }
1836     }
1837
1838   return dst;
1839 }
1840
/* Produce 2-byte codes for encoded composition rule RULE.

   RULE is split by COMPOSITION_DECODE_RULE into GREF and NREF; the
   first byte written is 32 + 81 + GREF and the second is 32 + NREF.
   `dst' of the calling scope is advanced by two.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
1850
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition.  See the comment
   in coding.h for the format of DATA.

   DATA[3] becomes the current composition method of CODING.  ESC '0'
   is produced for COMPOSITION_RELATIVE, ESC '3' for
   COMPOSITION_WITH_ALTCHARS, and ESC '4' for any other method.  In
   the latter two cases `cmp_data_index' of CODING is set to four
   entries past `cmp_data_start' and `composition_rule_follows' is
   cleared.

   `dst' is taken from the calling scope.  CODING and DATA are
   parenthesized at each use, so expressions such as `&cs' work.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    (coding)->composing = (data)[3];                                    \
    *dst++ = ISO_CODE_ESC;                                              \
    if ((coding)->composing == COMPOSITION_RELATIVE)                    \
      *dst++ = '0';                                                     \
    else                                                                \
      {                                                                 \
        *dst++ = ((coding)->composing == COMPOSITION_WITH_ALTCHARS      \
                  ? '3' : '4');                                         \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4;        \
        (coding)->composition_rule_follows = 0;                         \
      }                                                                 \
  } while (0)
1870
/* Produce codes for indicating the end of the current composition.

   ESC '1' is written at `dst' (taken from the calling scope).
   `cmp_data_start' of CODING is advanced by DATA[0] and the
   composition method is reset to COMPOSITION_NO.  If that exhausts
   the current composition data block (`cmp_data_start' equals its
   `used' count) and a next block exists, CODING switches to the next
   block and restarts at offset 0.

   CODING and DATA are parenthesized at each use, so expressions such
   as `&cs' work.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = '1';                                               \
    (coding)->cmp_data_start += (data)[0];                      \
    (coding)->composing = COMPOSITION_NO;                       \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used    \
        && (coding)->cmp_data->next)                            \
      {                                                         \
        (coding)->cmp_data = (coding)->cmp_data->next;          \
        (coding)->cmp_data_start = 0;                           \
      }                                                         \
  } while (0)
1886
/* Produce composition start sequence ESC 0.  Here, this sequence
   doesn't mean the start of a new composition but means that we have
   just produced components (alternate chars and composition rules) of
   the composition and the actual text follows in SRC.

   The composition method of CODING becomes COMPOSITION_RELATIVE.
   `dst' is taken from the calling scope; CODING is parenthesized so
   that an expression such as `&cs' may be passed.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = '0';                                       \
    (coding)->composing = COMPOSITION_RELATIVE;         \
  } while (0)
1898
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the control sequence
   introducer at `dst': the escape sequence ESC '[' when `coding' has
   the CODING_FLAG_ISO_SEVEN_BITS bit set, otherwise the single
   control character ISO_CODE_CSI.  The flag is tested as a bit (with
   `&'), like the single-shift macros do, because `coding->flags'
   normally carries other flag bits as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  All three are statement
   macros using `coding' and `dst' of the calling scope.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1914
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.

   A shift-in is produced unless graphic register 0 is already invoked
   to graphic plane 0.  Then every graphic register whose initial
   designation is non-negative and differs from its current
   designation is designated back to the initial charset.  `coding'
   and `dst' are taken from the calling scope.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      {                                                                     \
        ENCODE_SHIFT_IN;                                                    \
      }                                                                     \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))              \
          {                                                                 \
            ENCODE_DESIGNATION                                              \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),           \
               reg, coding);                                                \
          }                                                                 \
        reg++;                                                              \
      }                                                                     \
  } while (0)
1929
1930 /* Produce designation sequences of charsets in the line started from
1931    SRC to a place pointed by DST, and return updated DST.
1932
1933    If the current block ends before any end-of-line, we may fail to
1934    find all the necessary designations.  */
1935
1936 static unsigned char *
1937 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1938      struct coding_system *coding;
1939      Lisp_Object translation_table;
1940      unsigned char *src, *src_end, *dst;
1941 {
1942   int charset, c, found = 0, reg;
1943   /* Table of charsets to be designated to each graphic register.  */
1944   int r[4];
1945
1946   for (reg = 0; reg < 4; reg++)
1947     r[reg] = -1;
1948
1949   while (found < 4)
1950     {
1951       ONE_MORE_CHAR (c);
1952       if (c == '\n')
1953         break;
1954
1955       charset = CHAR_CHARSET (c);
1956       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1957       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1958         {
1959           found++;
1960           r[reg] = charset;
1961         }
1962     }
1963
1964  label_end_of_loop:
1965   if (found)
1966     {
1967       for (reg = 0; reg < 4; reg++)
1968         if (r[reg] >= 0
1969             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1970           ENCODE_DESIGNATION (r[reg], reg, coding);
1971     }
1972
1973   return dst;
1974 }
1975
1976 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the SRC_BYTES bytes at SOURCE into DESTINATION following
        the ISO 2022 settings held in CODING.

        Before returning, CODING->consumed is set to the offset in SOURCE
        at which the last loop iteration started, and CODING->produced
        and CODING->produced_char are both set to the number of bytes
        stored at DESTINATION.  CODING->consumed_char is incremented once
        for every iteration that reaches the bottom of the main loop.
        CODING->errors is incremented for each single-byte character that
        is neither a control code nor an ASCII byte; such a character is
        copied to DESTINATION unchanged.  */
1977
1978 static void
1979 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1980      struct coding_system *coding;
1981      unsigned char *source, *destination;
1982      int src_bytes, dst_bytes;
1983 {
1984   unsigned char *src = source;
1985   unsigned char *src_end = source + src_bytes;
1986   unsigned char *dst = destination;
1987   unsigned char *dst_end = destination + dst_bytes;
1988   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1989      from DST_END to assure overflow checking is necessary only at the
1990      head of loop.  */
1991   unsigned char *adjusted_dst_end = dst_end - 19;
1992   /* SRC_BASE remembers the start position in source in each loop.
1993      The loop will be exited when there's not enough source text to
1994      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1995      there's not enough destination area to produce encoded codes
1996      (within macro EMIT_BYTES).  */
1997   unsigned char *src_base;
1998   int c;
1999   Lisp_Object translation_table;
2000
       /* Pick the translation table: none when character translation is
          disabled, otherwise the one attached to CODING, falling back to
          the standard table for encoding.  */
2001   if (NILP (Venable_character_translation))
2002     translation_table = Qnil;
2003   else
2004     {
2005       translation_table = coding->translation_table_for_encode;
2006       if (NILP (translation_table))
2007         translation_table = Vstandard_translation_table_for_encode;
2008     }
2009
2010   coding->consumed_char = 0;
2011   coding->errors = 0;
2012   while (1)
2013     {
2014       int charset, c1, c2;
2015
2016       src_base = src;
2017
           /* NOTE(review): when DST_BYTES is zero the limit is derived
              from SRC instead of DST_END -- presumably the case where
              the destination precedes the source in one buffer; confirm
              against the callers.  */
2018       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2019         {
2020           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2021           break;
2022         }
2023
2024       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2025           && CODING_SPEC_ISO_BOL (coding))
2026         {
2027           /* We have to produce designation sequences if any now.  */
2028           dst = encode_designation_at_bol (coding, translation_table,
2029                                            src, src_end, dst);
2030           CODING_SPEC_ISO_BOL (coding) = 0;
2031         }
2032
2033       /* Check composition start and end.  */
2034       if (coding->composing != COMPOSITION_DISABLED
2035           && coding->cmp_data_start < coding->cmp_data->used)
2036         {
2037           struct composition_data *cmp_data = coding->cmp_data;
2038           int *data = cmp_data->data + coding->cmp_data_start;
2039           int this_pos = cmp_data->char_offset + coding->consumed_char;
2040
2041           if (coding->composing == COMPOSITION_RELATIVE)
2042             {
2043               if (this_pos == data[2])
2044                 {
2045                   ENCODE_COMPOSITION_END (coding, data);
                       /* Reload CMP_DATA and DATA from CODING after the
                          macro above.  */
2046                   cmp_data = coding->cmp_data;
2047                   data = cmp_data->data + coding->cmp_data_start;
2048                 }
2049             }
2050           else if (COMPOSING_P (coding))
2051             {
2052               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2053               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2054                 /* We have consumed components of the composition.
2055                    What follows in SRC is the compositions's base
2056                    text.  */
2057                 ENCODE_COMPOSITION_FAKE_START (coding);
2058               else
2059                 {
                       /* This C shadows the function-level C until the
                          end of this block.  */
2060                   int c = cmp_data->data[coding->cmp_data_index++];
2061                   if (coding->composition_rule_follows)
2062                     {
2063                       ENCODE_COMPOSITION_RULE (c);
2064                       coding->composition_rule_follows = 0;
2065                     }
2066                   else
2067                     {
2068                       SPLIT_CHAR (c, charset, c1, c2);
2069                       ENCODE_ISO_CHARACTER (charset, c1, c2);
2070                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2071                         coding->composition_rule_follows = 1;
2072                     }
                       /* The component came from the composition data,
                          not from SRC, so go round again without reading
                          a source character.  */
2073                   continue;
2074                 }
2075             }
2076           if (!COMPOSING_P (coding))
2077             {
2078               if (this_pos == data[1])
2079                 {
2080                   ENCODE_COMPOSITION_START (coding, data);
2081                   continue;
2082                 }
2083             }
2084         }
2085
2086       ONE_MORE_CHAR (c);
2087
2088       /* Now encode the character C.  */
2089       if (c < 0x20 || c == 0x7F)
2090         {
2091           if (c == '\r')
2092             {
2093               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2094                 {
2095                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2096                     ENCODE_RESET_PLANE_AND_REGISTER;
2097                   *dst++ = c;
                       /* NOTE(review): this `continue' bypasses the
                          coding->consumed_char++ at the bottom of the
                          loop; confirm that a bare CR is meant not to
                          be counted.  */
2098                   continue;
2099                 }
2100               /* fall down to treat '\r' as '\n' ...  */
2101               c = '\n';
2102             }
2103           if (c == '\n')
2104             {
2105               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2106                 ENCODE_RESET_PLANE_AND_REGISTER;
2107               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2108                 bcopy (coding->spec.iso2022.initial_designation,
2109                        coding->spec.iso2022.current_designation,
2110                        sizeof coding->spec.iso2022.initial_designation);
                   /* Write the end-of-line in the format selected by
                      CODING->eol_type; an undecided type is written as
                      LF.  */
2111               if (coding->eol_type == CODING_EOL_LF
2112                   || coding->eol_type == CODING_EOL_UNDECIDED)
2113                 *dst++ = ISO_CODE_LF;
2114               else if (coding->eol_type == CODING_EOL_CRLF)
2115                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2116               else
2117                 *dst++ = ISO_CODE_CR;
2118               CODING_SPEC_ISO_BOL (coding) = 1;
2119             }
2120           else
2121             {
2122               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2123                 ENCODE_RESET_PLANE_AND_REGISTER;
2124               *dst++ = c;
2125             }
2126         }
2127       else if (ASCII_BYTE_P (c))
2128         ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2129       else if (SINGLE_BYTE_CHAR_P (c))
2130         {
               /* A single-byte character that is neither a control code
                  nor ASCII: copy it through as is and record an error.  */
2131           *dst++ = c;
2132           coding->errors++;
2133         }
2134       else
2135         {
2136           SPLIT_CHAR (c, charset, c1, c2);
2137           ENCODE_ISO_CHARACTER (charset, c1, c2);
2138         }
2139
2140       coding->consumed_char++;
2141     }
2142
2143  label_end_of_loop:
2144   coding->consumed = src_base - source;
2145   coding->produced = coding->produced_char = dst - destination;
2146 }
2147
2148 \f
2149 /*** 4. SJIS and BIG5 handlers ***/
2150
2151 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2152    quite widely.  So, for the moment, Emacs supports them in the bare
2153    C code.  But, in the future, they may be supported only by CCL.  */
2154
2155 /* SJIS is a coding system encoding three character sets: ASCII, right
2156    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2157    as is.  A character of charset katakana-jisx0201 is encoded by
2158    "position-code + 0x80".  A character of charset japanese-jisx0208
2159    is encoded in two bytes, with its two position-codes divided and
2160    shifted so that they fit in the ranges below.
2161
2162    --- CODE RANGE of SJIS ---
2163    (character set)      (range)
2164    ASCII                0x00 .. 0x7F
2165    KATAKANA-JISX0201    0xA0 .. 0xDF
2166    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2167             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2168    -------------------------------
2169
2170 */
2171
2172 /* BIG5 is a coding system encoding two character sets: ASCII and
2173    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2174    character set and is encoded in two-byte.
2175
2176    --- CODE RANGE of BIG5 ---
2177    (character set)      (range)
2178    ASCII                0x00 .. 0x7F
2179    Big5 (1st byte)      0xA1 .. 0xFE
2180         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2181    --------------------------
2182
2183    Since the number of characters in Big5 is larger than maximum
2184    characters in Emacs' charset (96x96), it can't be handled as one
2185    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2186    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2187    contains frequently used characters and the latter contains less
2188    frequently used characters.  */
2189
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so callers may
   pass arbitrary expressions (e.g. `x & 0xFF') for the input
   arguments.  Arguments are still evaluated more than once and must
   therefore be free of side effects.  The output arguments must be
   lvalues; they may name the same variables as the input position
   codes, because the inputs are folded into TEMP before any output is
   stored.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2222
2223 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2224    Check if a text is encoded in SJIS.  If it is, return
2225    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2226
2227 int
2228 detect_coding_sjis (src, src_end)
2229      unsigned char *src, *src_end;
2230 {
2231   int c;
2232   /* Dummy for ONE_MORE_BYTE.  */
2233   struct coding_system dummy_coding;
2234   struct coding_system *coding = &dummy_coding;
2235
2236   while (1)
2237     {
2238       ONE_MORE_BYTE (c);
2239       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2240         {
2241           ONE_MORE_BYTE (c);
2242           if (c < 0x40)
2243             return 0;
2244         }
2245     }
2246  label_end_of_loop:
2247   return CODING_CATEGORY_MASK_SJIS;
2248 }
2249
2250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2251    Check if a text is encoded in BIG5.  If it is, return
2252    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2253
2254 int
2255 detect_coding_big5 (src, src_end)
2256      unsigned char *src, *src_end;
2257 {
2258   int c;
2259   /* Dummy for ONE_MORE_BYTE.  */
2260   struct coding_system dummy_coding;
2261   struct coding_system *coding = &dummy_coding;
2262
2263   while (1)
2264     {
2265       ONE_MORE_BYTE (c);
2266       if (c >= 0xA1)
2267         {
2268           ONE_MORE_BYTE (c);
2269           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2270             return 0;
2271         }
2272     }
2273  label_end_of_loop:
2274   return CODING_CATEGORY_MASK_BIG5;
2275 }
2276
2277 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2278    Check if a text is encoded in UTF-8.  If it is, return
2279    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2280
2281 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2282 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2283 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2284 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2285 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2286 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2287 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2288
2289 int
2290 detect_coding_utf_8 (src, src_end)
2291      unsigned char *src, *src_end;
2292 {
2293   unsigned char c;
2294   int seq_maybe_bytes;
2295   /* Dummy for ONE_MORE_BYTE.  */
2296   struct coding_system dummy_coding;
2297   struct coding_system *coding = &dummy_coding;
2298
2299   while (1)
2300     {
2301       ONE_MORE_BYTE (c);
2302       if (UTF_8_1_OCTET_P (c))
2303         continue;
2304       else if (UTF_8_2_OCTET_LEADING_P (c))
2305         seq_maybe_bytes = 1;
2306       else if (UTF_8_3_OCTET_LEADING_P (c))
2307         seq_maybe_bytes = 2;
2308       else if (UTF_8_4_OCTET_LEADING_P (c))
2309         seq_maybe_bytes = 3;
2310       else if (UTF_8_5_OCTET_LEADING_P (c))
2311         seq_maybe_bytes = 4;
2312       else if (UTF_8_6_OCTET_LEADING_P (c))
2313         seq_maybe_bytes = 5;
2314       else
2315         return 0;
2316
2317       do
2318         {
2319           ONE_MORE_BYTE (c);
2320           if (!UTF_8_EXTRA_OCTET_P (c))
2321             return 0;
2322           seq_maybe_bytes--;
2323         }
2324       while (seq_maybe_bytes > 0);
2325     }
2326
2327  label_end_of_loop:
2328   return CODING_CATEGORY_MASK_UTF_8;
2329 }
2330
2331 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2332    Check if a text is encoded in UTF-16 by examining the byte order
2333    mark in its first two bytes.  If they are FF FE, return
2334    CODING_CATEGORY_MASK_UTF_16_LE; if they are FE FF, return
2335    CODING_CATEGORY_MASK_UTF_16_BE; else return 0.  */
2336
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. lies in 0xD800..0xDBFF.  The top six bits identify the
   surrogate block, so all six must take part in the comparison;
   masking with 0xD800 alone would also accept low surrogates and
   values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2346
2347 int
2348 detect_coding_utf_16 (src, src_end)
2349      unsigned char *src, *src_end;
2350 {
2351   unsigned char c1, c2;
2352   /* Dummy for TWO_MORE_BYTES.  */
2353   struct coding_system dummy_coding;
2354   struct coding_system *coding = &dummy_coding;
2355
2356   TWO_MORE_BYTES (c1, c2);
2357
2358   if ((c1 == 0xFF) && (c2 == 0xFE))
2359     return CODING_CATEGORY_MASK_UTF_16_LE;
2360   else if ((c1 == 0xFE) && (c2 == 0xFF))
2361     return CODING_CATEGORY_MASK_UTF_16_BE;
2362
2363  label_end_of_loop:
2364   return 0;
2365 }
2366
2367 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2368    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.

        A byte sequence that is not a valid code is not dropped: its
        first byte is emitted unchanged, CODING->errors is incremented,
        and decoding resumes with the byte that follows.

        Before returning, CODING->consumed and CODING->consumed_char are
        both set to the offset in SOURCE at which the last loop
        iteration started, and CODING->produced to the number of bytes
        stored at DESTINATION.  CODING->produced_char is reset to zero
        on entry.  */
2369
2370 static void
2371 decode_coding_sjis_big5 (coding, source, destination,
2372                          src_bytes, dst_bytes, sjis_p)
2373      struct coding_system *coding;
2374      unsigned char *source, *destination;
2375      int src_bytes, dst_bytes;
2376      int sjis_p;
2377 {
2378   unsigned char *src = source;
2379   unsigned char *src_end = source + src_bytes;
2380   unsigned char *dst = destination;
2381   unsigned char *dst_end = destination + dst_bytes;
2382   /* SRC_BASE remembers the start position in source in each loop.
2383      The loop will be exited when there's not enough source code
2384      (within macro ONE_MORE_BYTE), or when there's not enough
2385      destination area to produce a character (within macro
2386      EMIT_CHAR).  */
2387   unsigned char *src_base;
2388   Lisp_Object translation_table;
2389
       /* Pick the translation table: none when character translation is
          disabled, otherwise the one attached to CODING, falling back to
          the standard table for decoding.  */
2390   if (NILP (Venable_character_translation))
2391     translation_table = Qnil;
2392   else
2393     {
2394       translation_table = coding->translation_table_for_decode;
2395       if (NILP (translation_table))
2396         translation_table = Vstandard_translation_table_for_decode;
2397     }
2398
2399   coding->produced_char = 0;
2400   while (1)
2401     {
2402       int c, charset, c1, c2;
2403
2404       src_base = src;
2405       ONE_MORE_BYTE (c1);
2406
2407       if (c1 < 0x80)
2408         {
               /* NOTE(review): C2 is not assigned on this path (nor on
                  the JISX0201-Kana path below) before it is handed to
                  DECODE_ISO_CHARACTER; presumably the macro ignores it
                  for one-byte charsets -- confirm.  */
2409           charset = CHARSET_ASCII;
2410           if (c1 < 0x20)
2411             {
2412               if (c1 == '\r')
2413                 {
2414                   if (coding->eol_type == CODING_EOL_CRLF)
2415                     {
2416                       ONE_MORE_BYTE (c2);
2417                       if (c2 == '\n')
2418                         c1 = c2;
2419                       else if (coding->mode
2420                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2421                         {
2422                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2423                           goto label_end_of_loop;
2424                         }
2425                       else
2426                         /* To process C2 again, SRC is subtracted by 1.  */
2427                         src--;
2428                     }
2429                   else if (coding->eol_type == CODING_EOL_CR)
2430                     c1 = '\n';
2431                 }
                   /* A bare LF in text whose end-of-line is CR or CRLF
                      is inconsistent; stop if the caller asked for
                      that.  */
2432               else if (c1 == '\n'
2433                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2434                        && (coding->eol_type == CODING_EOL_CR
2435                            || coding->eol_type == CODING_EOL_CRLF))
2436                 {
2437                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2438                   goto label_end_of_loop;
2439                 }
2440             }
2441         }
2442       else
2443         {
2444           if (sjis_p)
2445             {
                   /* 0xF0 and above is never accepted as a first byte.  */
2446               if (c1 >= 0xF0)
2447                 goto label_invalid_code;
2448               if (c1 < 0xA0 || c1 >= 0xE0)
2449                 {
2450                   /* SJIS -> JISX0208 */
2451                   ONE_MORE_BYTE (c2);
2452                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2453                     goto label_invalid_code;
2454                   DECODE_SJIS (c1, c2, c1, c2);
2455                   charset = charset_jisx0208;
2456                 }
2457               else
2458                 /* SJIS -> JISX0201-Kana */
2459                 charset = charset_katakana_jisx0201;
2460             }
2461           else
2462             {
2463               /* BIG5 -> Big5 */
2464               if (c1 < 0xA1 || c1 > 0xFE)
2465                 goto label_invalid_code;
2466               ONE_MORE_BYTE (c2);
2467               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2468                 goto label_invalid_code;
2469               DECODE_BIG5 (c1, c2, charset, c1, c2);
2470             }
2471         }
2472
2473       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2474       EMIT_CHAR (c);
2475       continue;
2476
2477     label_invalid_code:
           /* Go back to the first byte of the offending sequence and
              emit just that byte as is; the bytes after it are examined
              afresh on the next iteration.  */
2478       coding->errors++;
2479       src = src_base;
2480       c = *src++;
2481       EMIT_CHAR (c);
2482     }
2483
2484  label_end_of_loop:
2485   coding->consumed = coding->consumed_char = src_base - source;
2486   coding->produced = dst - destination;
2487   return;
2488 }
2489
2490 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2491    This function can encode charsets `ascii', `katakana-jisx0201',
2492    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2493    are sure that all these charsets are registered as official charset
2494    (i.e. do not have extended leading-codes).  Characters of other
2495    charsets are produced without any encoding.  If SJIS_P is 1, encode
2496    SJIS text, else encode BIG5 text.  */
2497
2498 static void
2499 encode_coding_sjis_big5 (coding, source, destination,
2500                          src_bytes, dst_bytes, sjis_p)
2501      struct coding_system *coding;
2502      unsigned char *source, *destination;
2503      int src_bytes, dst_bytes;
2504      int sjis_p;
2505 {
2506   unsigned char *src = source;
2507   unsigned char *src_end = source + src_bytes;
2508   unsigned char *dst = destination;
2509   unsigned char *dst_end = destination + dst_bytes;
2510   /* SRC_BASE remembers the start position in source in each loop.
2511      The loop will be exited when there's not enough source text to
2512      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2513      there's not enough destination area to produce encoded codes
2514      (within macro EMIT_BYTES).  */
2515   unsigned char *src_base;
2516   Lisp_Object translation_table;
2517
2518   if (NILP (Venable_character_translation))
2519     translation_table = Qnil;
2520   else
2521     {
2522       translation_table = coding->translation_table_for_decode;
2523       if (NILP (translation_table))
2524         translation_table = Vstandard_translation_table_for_decode;
2525     }
2526
2527   while (1)
2528     {
2529       int c, charset, c1, c2;
2530
2531       src_base = src;
2532       ONE_MORE_CHAR (c);
2533
2534       /* Now encode the character C.  */
2535       if (SINGLE_BYTE_CHAR_P (c))
2536         {
2537           switch (c)
2538             {
2539             case '\r':
2540               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2541                 {
2542                   EMIT_ONE_BYTE (c);
2543                   break;
2544                 }
2545               c = '\n';
2546             case '\n':
2547               if (coding->eol_type == CODING_EOL_CRLF)
2548                 {
2549                   EMIT_TWO_BYTES ('\r', c);
2550                   break;
2551                 }
2552               else if (coding->eol_type == CODING_EOL_CR)
2553                 c = '\r';
2554             default:
2555               EMIT_ONE_BYTE (c);
2556             }
2557         }
2558       else
2559         {
2560           SPLIT_CHAR (c, charset, c1, c2);
2561           if (sjis_p)
2562             {
2563               if (charset == charset_jisx0208
2564                   || charset == charset_jisx0208_1978)
2565                 {
2566                   ENCODE_SJIS (c1, c2, c1, c2);
2567                   EMIT_TWO_BYTES (c1, c2);
2568                 }
2569               else if (charset == charset_latin_jisx0201)
2570                 EMIT_ONE_BYTE (c1);
2571               else
2572                 /* There's no way other than producing the internal
2573                    codes as is.  */
2574                 EMIT_BYTES (src_base, src);
2575             }
2576           else
2577             {
2578               if (charset == charset_big5_1 || charset == charset_big5_2)
2579                 {
2580                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2581                   EMIT_TWO_BYTES (c1, c2);
2582                 }
2583               else
2584                 /* There's no way other than producing the internal
2585                    codes as is.  */
2586                 EMIT_BYTES (src_base, src);
2587             }
2588         }
2589       coding->consumed_char++;
2590     }
2591
2592  label_end_of_loop:
2593   coding->consumed = src_base - source;
2594   coding->produced = coding->produced_char = dst - destination;
2595 }
2596
2597 \f
2598 /*** 5. CCL handlers ***/
2599
2600 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2601    Check if a text is encoded in a coding system of which
2602    encoder/decoder are written in CCL program.  If it is, return
2603    CODING_CATEGORY_MASK_CCL, else return 0.  */
2604
2605 int
2606 detect_coding_ccl (src, src_end)
2607      unsigned char *src, *src_end;
2608 {
2609   unsigned char *valid;
2610   int c;
2611   /* Dummy for ONE_MORE_BYTE.  */
2612   struct coding_system dummy_coding;
2613   struct coding_system *coding = &dummy_coding;
2614
2615   /* No coding system is assigned to coding-category-ccl.  */
2616   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2617     return 0;
2618
2619   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2620   while (1)
2621     {
2622       ONE_MORE_BYTE (c);
2623       if (! valid[c])
2624         return 0;
2625     }
2626  label_end_of_loop:
2627   return CODING_CATEGORY_MASK_CCL;
2628 }
2629
2630 \f
2631 /*** 6. End-of-line handlers ***/
2632
2633 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2634
2635 static void
2636 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2637      struct coding_system *coding;
2638      unsigned char *source, *destination;
2639      int src_bytes, dst_bytes;
2640 {
2641   unsigned char *src = source;
2642   unsigned char *dst = destination;
2643   unsigned char *src_end = src + src_bytes;
2644   unsigned char *dst_end = dst + dst_bytes;
2645   Lisp_Object translation_table;
2646   /* SRC_BASE remembers the start position in source in each loop.
2647      The loop will be exited when there's not enough source code
2648      (within macro ONE_MORE_BYTE), or when there's not enough
2649      destination area to produce a character (within macro
2650      EMIT_CHAR).  */
2651   unsigned char *src_base;
2652   int c;
2653
2654   translation_table = Qnil;
2655   switch (coding->eol_type)
2656     {
2657     case CODING_EOL_CRLF:
2658       while (1)
2659         {
2660           src_base = src;
2661           ONE_MORE_BYTE (c);
2662           if (c == '\r')
2663             {
2664               ONE_MORE_BYTE (c);
2665               if (c != '\n')
2666                 {
2667                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2668                     {
2669                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2670                       goto label_end_of_loop;
2671                     }
2672                   src--;
2673                   c = '\r';
2674                 }
2675             }
2676           else if (c == '\n'
2677                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2678             {
2679               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2680               goto label_end_of_loop;
2681             }
2682           EMIT_CHAR (c);
2683         }
2684       break;
2685
2686     case CODING_EOL_CR:
2687       while (1)
2688         {
2689           src_base = src;
2690           ONE_MORE_BYTE (c);
2691           if (c == '\n')
2692             {
2693               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2694                 {
2695                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2696                   goto label_end_of_loop;
2697                 }
2698             }
2699           else if (c == '\r')
2700             c = '\n';
2701           EMIT_CHAR (c);
2702         }
2703       break;
2704
2705     default:                    /* no need for EOL handling */
2706       while (1)
2707         {
2708           src_base = src;
2709           ONE_MORE_BYTE (c);
2710           EMIT_CHAR (c);
2711         }
2712     }
2713
2714  label_end_of_loop:
2715   coding->consumed = coding->consumed_char = src_base - source;
2716   coding->produced = dst - destination;
2717   return;
2718 }
2719
2720 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2721    format of end-of-line according to `coding->eol_type'.  It also
2722    convert multibyte form 8-bit characers to unibyte if
2723    CODING->src_multibyte is nonzero.  If `coding->mode &
2724    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2725    also means end-of-line.  */
2726
2727 static void
2728 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2729      struct coding_system *coding;
2730      unsigned char *source, *destination;
2731      int src_bytes, dst_bytes;
2732 {
2733   unsigned char *src = source;
2734   unsigned char *dst = destination;
2735   unsigned char *src_end = src + src_bytes;
2736   unsigned char *dst_end = dst + dst_bytes;
2737   Lisp_Object translation_table;
2738   /* SRC_BASE remembers the start position in source in each loop.
2739      The loop will be exited when there's not enough source text to
2740      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2741      there's not enough destination area to produce encoded codes
2742      (within macro EMIT_BYTES).  */
2743   unsigned char *src_base;
2744   int c;
2745   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2746
2747   translation_table = Qnil;
2748   if (coding->src_multibyte
2749       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2750     {
2751       src_end--;
2752       src_bytes--;
2753       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2754     }
2755
2756   if (coding->eol_type == CODING_EOL_CRLF)
2757     {
2758       while (src < src_end)
2759         {
2760           src_base = src;
2761           c = *src++;
2762           if (c >= 0x20)
2763             EMIT_ONE_BYTE (c);
2764           else if (c == '\n' || (c == '\r' && selective_display))
2765             EMIT_TWO_BYTES ('\r', '\n');
2766           else
2767             EMIT_ONE_BYTE (c);
2768         }
2769       src_base = src;
2770     label_end_of_loop:
2771       ;
2772     }
2773   else
2774     {
2775       if (src_bytes <= dst_bytes)
2776         {
2777           safe_bcopy (src, dst, src_bytes);
2778           src_base = src_end;
2779           dst += src_bytes;
2780         }
2781       else
2782         {
2783           if (coding->src_multibyte
2784               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2785             dst_bytes--;
2786           safe_bcopy (src, dst, dst_bytes);
2787           src_base = src + dst_bytes;
2788           dst = destination + dst_bytes;
2789           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2790         }
2791       if (coding->eol_type == CODING_EOL_CR)
2792         {
2793           for (src = destination; src < dst; src++)
2794             if (*src == '\n') *src = '\r';
2795         }
2796       else if (selective_display)
2797         {
2798           for (src = destination; src < dst; src++)
2799             if (*src == '\r') *src = '\n';
2800         }
2801     }
2802   if (coding->src_multibyte)
2803     dst = destination + str_as_unibyte (destination, dst - destination);
2804
2805   coding->consumed = src_base - source;
2806   coding->produced = dst - destination;
2807 }
2808
2809 \f
2810 /*** 7. C library functions ***/
2811
2812 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2813    has a property `coding-system'.  The value of this property is a
2814    vector of length 5 (called as coding-vector).  Among elements of
2815    this vector, the first (element[0]) and the fifth (element[4])
2816    carry important information for decoding/encoding.  Before
2817    decoding/encoding, this information should be set in fields of a
2818    structure of type `coding_system'.
2819
2820    A value of property `coding-system' can be a symbol of another
2821    subsidiary coding-system.  In that case, Emacs gets coding-vector
2822    from that symbol.
2823
2824    `element[0]' contains information to be set in `coding->type'.  The
2825    value and its meaning is as follows:
2826
2827    0 -- coding_type_emacs_mule
2828    1 -- coding_type_sjis
2829    2 -- coding_type_iso2022
2830    3 -- coding_type_big5
2831    4 -- coding_type_ccl encoder/decoder written in CCL
2832    nil -- coding_type_no_conversion
2833    t -- coding_type_undecided (automatic conversion on decoding,
2834                                no-conversion on encoding)
2835
2836    `element[4]' contains information to be set in `coding->flags' and
2837    `coding->spec'.  The meaning varies by `coding->type'.
2838
2839    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2840    of length 32 (of which the first 13 sub-elements are used now).
2841    Meanings of these sub-elements are:
2842
2843    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2844         If the value is an integer of valid charset, the charset is
2845         assumed to be designated to graphic register N initially.
2846
        If the value is negative, its absolute value is a charset
        which reserves graphic register N; that is, the charset is
        not designated initially but should be designated to graphic
        register N just before encoding a character in that charset.
2851
2852         If the value is nil, graphic register N is never used on
2853         encoding.
2854
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
        Each value is t or nil.  See the section ISO2022 of
        `coding.h' for more information.
2858
2859    If `coding->type' is `coding_type_big5', element[4] is t to denote
2860    BIG5-ETen or nil to denote BIG5-HKU.
2861
2862    If `coding->type' takes the other value, element[4] is ignored.
2863
2864    Emacs Lisp's coding system also carries information about format of
2865    end-of-line in a value of property `eol-type'.  If the value is
2866    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2867    means CODING_EOL_CR.  If it is not integer, it should be a vector
2868    of subsidiary coding systems of which property `eol-type' has one
2869    of above values.
2870
2871 */
2872
2873 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2874    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2875    is setup so that no conversion is necessary and return -1, else
2876    return 0.  */
2877
2878 int
2879 setup_coding_system (coding_system, coding)
2880      Lisp_Object coding_system;
2881      struct coding_system *coding;
2882 {
2883   Lisp_Object coding_spec, coding_type, eol_type, plist;
2884   Lisp_Object val;
2885   int i;
2886
2887   /* Initialize some fields required for all kinds of coding systems.  */
2888   coding->symbol = coding_system;
2889   coding->common_flags = 0;
2890   coding->mode = 0;
2891   coding->heading_ascii = -1;
2892   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2893   coding->composing = COMPOSITION_DISABLED;
2894   coding->cmp_data = NULL;
2895
2896   if (NILP (coding_system))
2897     goto label_invalid_coding_system;
2898
2899   coding_spec = Fget (coding_system, Qcoding_system);
2900
2901   if (!VECTORP (coding_spec)
2902       || XVECTOR (coding_spec)->size != 5
2903       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2904     goto label_invalid_coding_system;
2905
2906   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2907   if (VECTORP (eol_type))
2908     {
2909       coding->eol_type = CODING_EOL_UNDECIDED;
2910       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2911     }
2912   else if (XFASTINT (eol_type) == 1)
2913     {
2914       coding->eol_type = CODING_EOL_CRLF;
2915       coding->common_flags
2916         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2917     }
2918   else if (XFASTINT (eol_type) == 2)
2919     {
2920       coding->eol_type = CODING_EOL_CR;
2921       coding->common_flags
2922         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2923     }
2924   else
2925     coding->eol_type = CODING_EOL_LF;
2926
2927   coding_type = XVECTOR (coding_spec)->contents[0];
2928   /* Try short cut.  */
2929   if (SYMBOLP (coding_type))
2930     {
2931       if (EQ (coding_type, Qt))
2932         {
2933           coding->type = coding_type_undecided;
2934           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2935         }
2936       else
2937         coding->type = coding_type_no_conversion;
2938       return 0;
2939     }
2940
2941   /* Get values of coding system properties:
2942      `post-read-conversion', `pre-write-conversion',
2943      `translation-table-for-decode', `translation-table-for-encode'.  */
2944   plist = XVECTOR (coding_spec)->contents[3];
2945   /* Pre & post conversion functions should be disabled if
2946      inhibit_eol_conversion is nozero.  This is the case that a code
2947      conversion function is called while those functions are running.  */
2948   if (! inhibit_pre_post_conversion)
2949     {
2950       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2951       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2952     }
2953   val = Fplist_get (plist, Qtranslation_table_for_decode);
2954   if (SYMBOLP (val))
2955     val = Fget (val, Qtranslation_table_for_decode);
2956   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2957   val = Fplist_get (plist, Qtranslation_table_for_encode);
2958   if (SYMBOLP (val))
2959     val = Fget (val, Qtranslation_table_for_encode);
2960   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2961   val = Fplist_get (plist, Qcoding_category);
2962   if (!NILP (val))
2963     {
2964       val = Fget (val, Qcoding_category_index);
2965       if (INTEGERP (val))
2966         coding->category_idx = XINT (val);
2967       else
2968         goto label_invalid_coding_system;
2969     }
2970   else
2971     goto label_invalid_coding_system;
2972
2973   val = Fplist_get (plist, Qsafe_charsets);
2974   if (EQ (val, Qt))
2975     {
2976       for (i = 0; i <= MAX_CHARSET; i++)
2977         coding->safe_charsets[i] = 1;
2978     }
2979   else
2980     {
2981       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2982       while (CONSP (val))
2983         {
2984           if ((i = get_charset_id (XCAR (val))) >= 0)
2985             coding->safe_charsets[i] = 1;
2986           val = XCDR (val);
2987         }
2988     }
2989
2990   /* If the coding system has non-nil `composition' property, enable
2991      composition handling.  */
2992   val = Fplist_get (plist, Qcomposition);
2993   if (!NILP (val))
2994     coding->composing = COMPOSITION_NO;
2995
2996   switch (XFASTINT (coding_type))
2997     {
2998     case 0:
2999       coding->type = coding_type_emacs_mule;
3000       if (!NILP (coding->post_read_conversion))
3001         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3002       if (!NILP (coding->pre_write_conversion))
3003         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3004       break;
3005
3006     case 1:
3007       coding->type = coding_type_sjis;
3008       coding->common_flags
3009         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3010       break;
3011
3012     case 2:
3013       coding->type = coding_type_iso2022;
3014       coding->common_flags
3015         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3016       {
3017         Lisp_Object val, temp;
3018         Lisp_Object *flags;
3019         int i, charset, reg_bits = 0;
3020
3021         val = XVECTOR (coding_spec)->contents[4];
3022
3023         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3024           goto label_invalid_coding_system;
3025
3026         flags = XVECTOR (val)->contents;
3027         coding->flags
3028           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3029              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3030              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3031              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3032              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3033              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3034              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3035              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3036              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3037              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3038              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3039              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3040              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3041              );
3042
3043         /* Invoke graphic register 0 to plane 0.  */
3044         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3045         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3046         CODING_SPEC_ISO_INVOCATION (coding, 1)
3047           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3048         /* Not single shifting at first.  */
3049         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3050         /* Beginning of buffer should also be regarded as bol. */
3051         CODING_SPEC_ISO_BOL (coding) = 1;
3052
3053         for (charset = 0; charset <= MAX_CHARSET; charset++)
3054           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3055         val = Vcharset_revision_alist;
3056         while (CONSP (val))
3057           {
3058             charset = get_charset_id (Fcar_safe (XCAR (val)));
3059             if (charset >= 0
3060                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3061                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3062               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3063             val = XCDR (val);
3064           }
3065
3066         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3067            FLAGS[REG] can be one of below:
3068                 integer CHARSET: CHARSET occupies register I,
3069                 t: designate nothing to REG initially, but can be used
3070                   by any charsets,
3071                 list of integer, nil, or t: designate the first
3072                   element (if integer) to REG initially, the remaining
3073                   elements (if integer) is designated to REG on request,
3074                   if an element is t, REG can be used by any charsets,
3075                 nil: REG is never used.  */
3076         for (charset = 0; charset <= MAX_CHARSET; charset++)
3077           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3078             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3079         for (i = 0; i < 4; i++)
3080           {
3081             if (INTEGERP (flags[i])
3082                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3083                 || (charset = get_charset_id (flags[i])) >= 0)
3084               {
3085                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3086                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3087               }
3088             else if (EQ (flags[i], Qt))
3089               {
3090                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3091                 reg_bits |= 1 << i;
3092                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3093               }
3094             else if (CONSP (flags[i]))
3095               {
3096                 Lisp_Object tail;
3097                 tail = flags[i];
3098
3099                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3100                 if (INTEGERP (XCAR (tail))
3101                     && (charset = XINT (XCAR (tail)),
3102                         CHARSET_VALID_P (charset))
3103                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3104                   {
3105                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3106                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3107                   }
3108                 else
3109                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3110                 tail = XCDR (tail);
3111                 while (CONSP (tail))
3112                   {
3113                     if (INTEGERP (XCAR (tail))
3114                         && (charset = XINT (XCAR (tail)),
3115                             CHARSET_VALID_P (charset))
3116                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3117                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3118                         = i;
3119                     else if (EQ (XCAR (tail), Qt))
3120                       reg_bits |= 1 << i;
3121                     tail = XCDR (tail);
3122                   }
3123               }
3124             else
3125               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3126
3127             CODING_SPEC_ISO_DESIGNATION (coding, i)
3128               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3129           }
3130
3131         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3132           {
3133             /* REG 1 can be used only by locking shift in 7-bit env.  */
3134             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3135               reg_bits &= ~2;
3136             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3137               /* Without any shifting, only REG 0 and 1 can be used.  */
3138               reg_bits &= 3;
3139           }
3140
3141         if (reg_bits)
3142           for (charset = 0; charset <= MAX_CHARSET; charset++)
3143             {
3144               if (CHARSET_VALID_P (charset))
3145                 {
3146                   /* There exist some default graphic registers to be
3147                      used CHARSET.  */
3148
3149                   /* We had better avoid designating a charset of
3150                      CHARS96 to REG 0 as far as possible.  */
3151                   if (CHARSET_CHARS (charset) == 96)
3152                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3153                       = (reg_bits & 2
3154                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3155                   else
3156                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3157                       = (reg_bits & 1
3158                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3159                 }
3160             }
3161       }
3162       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3163       coding->spec.iso2022.last_invalid_designation_register = -1;
3164       break;
3165
3166     case 3:
3167       coding->type = coding_type_big5;
3168       coding->common_flags
3169         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3170       coding->flags
3171         = (NILP (XVECTOR (coding_spec)->contents[4])
3172            ? CODING_FLAG_BIG5_HKU
3173            : CODING_FLAG_BIG5_ETEN);
3174       break;
3175
3176     case 4:
3177       coding->type = coding_type_ccl;
3178       coding->common_flags
3179         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3180       {
3181         val = XVECTOR (coding_spec)->contents[4];
3182         if (! CONSP (val)
3183             || setup_ccl_program (&(coding->spec.ccl.decoder),
3184                                   XCAR (val)) < 0
3185             || setup_ccl_program (&(coding->spec.ccl.encoder),
3186                                   XCDR (val)) < 0)
3187           goto label_invalid_coding_system;
3188
3189         bzero (coding->spec.ccl.valid_codes, 256);
3190         val = Fplist_get (plist, Qvalid_codes);
3191         if (CONSP (val))
3192           {
3193             Lisp_Object this;
3194
3195             for (; CONSP (val); val = XCDR (val))
3196               {
3197                 this = XCAR (val);
3198                 if (INTEGERP (this)
3199                     && XINT (this) >= 0 && XINT (this) < 256)
3200                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3201                 else if (CONSP (this)
3202                          && INTEGERP (XCAR (this))
3203                          && INTEGERP (XCDR (this)))
3204                   {
3205                     int start = XINT (XCAR (this));
3206                     int end = XINT (XCDR (this));
3207
3208                     if (start >= 0 && start <= end && end < 256)
3209                       while (start <= end)
3210                         coding->spec.ccl.valid_codes[start++] = 1;
3211                   }
3212               }
3213           }
3214       }
3215       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3216       coding->spec.ccl.cr_carryover = 0;
3217       break;
3218
3219     case 5:
3220       coding->type = coding_type_raw_text;
3221       break;
3222
3223     default:
3224       goto label_invalid_coding_system;
3225     }
3226   return 0;
3227
3228  label_invalid_coding_system:
3229   coding->type = coding_type_no_conversion;
3230   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3231   coding->common_flags = 0;
3232   coding->eol_type = CODING_EOL_LF;
3233   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3234   return -1;
3235 }
3236
3237 /* Free memory blocks allocated for storing composition information.  */
3238
3239 void
3240 coding_free_composition_data (coding)
3241      struct coding_system *coding;
3242 {
3243   struct composition_data *cmp_data = coding->cmp_data, *next;
3244
3245   if (!cmp_data)
3246     return;
3247   /* Memory blocks are chained.  At first, rewind to the first, then,
3248      free blocks one by one.  */
3249   while (cmp_data->prev)
3250     cmp_data = cmp_data->prev;
3251   while (cmp_data)
3252     {
3253       next = cmp_data->next;
3254       xfree (cmp_data);
3255       cmp_data = next;
3256     }
3257   coding->cmp_data = NULL;
3258 }
3259
3260 /* Set `char_offset' member of all memory blocks pointed by
3261    coding->cmp_data to POS.  */
3262
3263 void
3264 coding_adjust_composition_offset (coding, pos)
3265      struct coding_system *coding;
3266      int pos;
3267 {
3268   struct composition_data *cmp_data;
3269
3270   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3271     cmp_data->char_offset = pos;
3272 }
3273
3274 /* Setup raw-text or one of its subsidiaries in the structure
3275    coding_system CODING according to the already setup value eol_type
3276    in CODING.  CODING should be setup for some coding system in
3277    advance.  */
3278
3279 void
3280 setup_raw_text_coding_system (coding)
3281      struct coding_system *coding;
3282 {
3283   if (coding->type != coding_type_raw_text)
3284     {
3285       coding->symbol = Qraw_text;
3286       coding->type = coding_type_raw_text;
3287       if (coding->eol_type != CODING_EOL_UNDECIDED)
3288         {
3289           Lisp_Object subsidiaries;
3290           subsidiaries = Fget (Qraw_text, Qeol_type);
3291
3292           if (VECTORP (subsidiaries)
3293               && XVECTOR (subsidiaries)->size == 3)
3294             coding->symbol
3295               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3296         }
3297       setup_coding_system (coding->symbol, coding);
3298     }
3299   return;
3300 }
3301
3302 /* Emacs has a mechanism to automatically detect a coding system if it
3303    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3304    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
3307
3308    o coding-category-emacs-mule
3309
3310         The category for a coding system which has the same code range
3311         as Emacs' internal format.  Assigned the coding-system (Lisp
3312         symbol) `emacs-mule' by default.
3313
3314    o coding-category-sjis
3315
3316         The category for a coding system which has the same code range
3317         as SJIS.  Assigned the coding-system (Lisp
3318         symbol) `japanese-shift-jis' by default.
3319
3320    o coding-category-iso-7
3321
3322         The category for a coding system which has the same code range
3323         as ISO2022 of 7-bit environment.  This doesn't use any locking
3324         shift and single shift functions.  This can encode/decode all
3325         charsets.  Assigned the coding-system (Lisp symbol)
3326         `iso-2022-7bit' by default.
3327
3328    o coding-category-iso-7-tight
3329
3330         Same as coding-category-iso-7 except that this can
3331         encode/decode only the specified charsets.
3332
3333    o coding-category-iso-8-1
3334
3335         The category for a coding system which has the same code range
3336         as ISO2022 of 8-bit environment and graphic plane 1 used only
3337         for DIMENSION1 charset.  This doesn't use any locking shift
3338         and single shift functions.  Assigned the coding-system (Lisp
3339         symbol) `iso-latin-1' by default.
3340
3341    o coding-category-iso-8-2
3342
3343         The category for a coding system which has the same code range
3344         as ISO2022 of 8-bit environment and graphic plane 1 used only
3345         for DIMENSION2 charset.  This doesn't use any locking shift
3346         and single shift functions.  Assigned the coding-system (Lisp
3347         symbol) `japanese-iso-8bit' by default.
3348
3349    o coding-category-iso-7-else
3350
        The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-7bit-lock' by default.
3355
3356    o coding-category-iso-8-else
3357
        The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-8bit-ss2' by default.
3362
3363    o coding-category-big5
3364
3365         The category for a coding system which has the same code range
3366         as BIG5.  Assigned the coding-system (Lisp symbol)
3367         `cn-big5' by default.
3368
3369    o coding-category-utf-8
3370
3371         The category for a coding system which has the same code range
3372         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3373         symbol) `utf-8' by default.
3374
3375    o coding-category-utf-16-be
3376
3377         The category for a coding system in which a text has an
3378         Unicode signature (cf. Unicode Standard) in the order of BIG
3379         endian at the head.  Assigned the coding-system (Lisp symbol)
3380         `utf-16-be' by default.
3381
3382    o coding-category-utf-16-le
3383
3384         The category for a coding system in which a text has an
3385         Unicode signature (cf. Unicode Standard) in the order of
3386         LITTLE endian at the head.  Assigned the coding-system (Lisp
3387         symbol) `utf-16-le' by default.
3388
3389    o coding-category-ccl
3390
3391         The category for a coding system of which encoder/decoder is
3392         written in CCL programs.  The default value is nil, i.e., no
3393         coding system is assigned.
3394
3395    o coding-category-binary
3396
3397         The category for a coding system not categorized in any of the
3398         above.  Assigned the coding-system (Lisp symbol)
3399         `no-conversion' by default.
3400
   Each of them is a Lisp symbol whose value is the actual
   `coding-system' (also a Lisp symbol) assigned to it by the user.
   What Emacs actually does is detect a category of coding system;
   it then uses the `coding-system' assigned to that category.  If
   Emacs can't narrow the possibilities down to a single category, it
   selects the category of the highest priority.  Priorities of
   categories are also specified by the user, in the Lisp variable
   `coding-category-list'.
3408
3409 */
3410
3411 static
3412 int ascii_skip_code[256];
3413
3414 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3415    If it detects possible coding systems, return an integer in which
3416    appropriate flag bits are set.  Flag bits are defined by macros
3417    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3418    it should point the table `coding_priorities'.  In that case, only
3419    the flag bit for a coding system of the highest priority is set in
3420    the returned value.
3421
3422    How many ASCII characters are at the head is returned as *SKIP.  */
3423
3424 static int
3425 detect_coding_mask (source, src_bytes, priorities, skip)
3426      unsigned char *source;
3427      int src_bytes, *priorities, *skip;
3428 {
3429   register unsigned char c;
3430   unsigned char *src = source, *src_end = source + src_bytes;
3431   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3432   int i, idx;
3433
3434   /* At first, skip all ASCII characters and control characters except
3435      for three ISO2022 specific control characters.  */
3436   ascii_skip_code[ISO_CODE_SO] = 0;
3437   ascii_skip_code[ISO_CODE_SI] = 0;
3438   ascii_skip_code[ISO_CODE_ESC] = 0;
3439
3440  label_loop_detect_coding:
3441   while (src < src_end && ascii_skip_code[*src]) src++;
3442   *skip = src - source;
3443
3444   if (src >= src_end)
3445     /* We found nothing other than ASCII.  There's nothing to do.  */
3446     return 0;
3447
3448   c = *src;
3449   /* The text seems to be encoded in some multilingual coding system.
3450      Now, try to find in which coding system the text is encoded.  */
3451   if (c < 0x80)
3452     {
3453       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3454       /* C is an ISO2022 specific control code of C0.  */
3455       mask = detect_coding_iso2022 (src, src_end);
3456       if (mask == 0)
3457         {
3458           /* No valid ISO2022 code follows C.  Try again.  */
3459           src++;
3460           if (c == ISO_CODE_ESC)
3461             ascii_skip_code[ISO_CODE_ESC] = 1;
3462           else
3463             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3464           goto label_loop_detect_coding;
3465         }
3466       if (priorities)
3467         {
3468           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3469             {
3470               if (mask & priorities[i])
3471                 return priorities[i];
3472             }
3473           return CODING_CATEGORY_MASK_RAW_TEXT;
3474         }
3475     }
3476   else
3477     {
3478       int try;
3479
3480       if (c < 0xA0)
3481         {
3482           /* C is the first byte of SJIS character code,
3483              or a leading-code of Emacs' internal format (emacs-mule),
3484              or the first byte of UTF-16.  */
3485           try = (CODING_CATEGORY_MASK_SJIS
3486                   | CODING_CATEGORY_MASK_EMACS_MULE
3487                   | CODING_CATEGORY_MASK_UTF_16_BE
3488                   | CODING_CATEGORY_MASK_UTF_16_LE);
3489
3490           /* Or, if C is a special latin extra code,
3491              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3492              or is an ISO2022 control-sequence-introducer (CSI),
3493              we should also consider the possibility of ISO2022 codings.  */
3494           if ((VECTORP (Vlatin_extra_code_table)
3495                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3496               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3497               || (c == ISO_CODE_CSI
3498                   && (src < src_end
3499                       && (*src == ']'
3500                           || ((*src == '0' || *src == '1' || *src == '2')
3501                               && src + 1 < src_end
3502                               && src[1] == ']')))))
3503             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3504                      | CODING_CATEGORY_MASK_ISO_8BIT);
3505         }
3506       else
3507         /* C is a character of ISO2022 in graphic plane right,
3508            or a SJIS's 1-byte character code (i.e. JISX0201),
3509            or the first byte of BIG5's 2-byte code,
3510            or the first byte of UTF-8/16.  */
3511         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3512                 | CODING_CATEGORY_MASK_ISO_8BIT
3513                 | CODING_CATEGORY_MASK_SJIS
3514                 | CODING_CATEGORY_MASK_BIG5
3515                 | CODING_CATEGORY_MASK_UTF_8
3516                 | CODING_CATEGORY_MASK_UTF_16_BE
3517                 | CODING_CATEGORY_MASK_UTF_16_LE);
3518
3519       /* Or, we may have to consider the possibility of CCL.  */
3520       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3521           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3522               ->spec.ccl.valid_codes)[c])
3523         try |= CODING_CATEGORY_MASK_CCL;
3524
3525       mask = 0;
3526       utf16_examined_p = iso2022_examined_p = 0;
3527       if (priorities)
3528         {
3529           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3530             {
3531               if (!iso2022_examined_p
3532                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3533                 {
3534                   mask |= detect_coding_iso2022 (src, src_end);
3535                   iso2022_examined_p = 1;
3536                 }
3537               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3538                 mask |= detect_coding_sjis (src, src_end);
3539               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3540                 mask |= detect_coding_utf_8 (src, src_end);
3541               else if (!utf16_examined_p
3542                        && (priorities[i] & try &
3543                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3544                 {
3545                   mask |= detect_coding_utf_16 (src, src_end);
3546                   utf16_examined_p = 1;
3547                 }
3548               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3549                 mask |= detect_coding_big5 (src, src_end);
3550               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3551                 mask |= detect_coding_emacs_mule (src, src_end);
3552               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3553                 mask |= detect_coding_ccl (src, src_end);
3554               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3555                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3556               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3557                 mask |= CODING_CATEGORY_MASK_BINARY;
3558               if (mask & priorities[i])
3559                 return priorities[i];
3560             }
3561           return CODING_CATEGORY_MASK_RAW_TEXT;
3562         }
3563       if (try & CODING_CATEGORY_MASK_ISO)
3564         mask |= detect_coding_iso2022 (src, src_end);
3565       if (try & CODING_CATEGORY_MASK_SJIS)
3566         mask |= detect_coding_sjis (src, src_end);
3567       if (try & CODING_CATEGORY_MASK_BIG5)
3568         mask |= detect_coding_big5 (src, src_end);
3569       if (try & CODING_CATEGORY_MASK_UTF_8)
3570         mask |= detect_coding_utf_8 (src, src_end);
3571       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3572         mask |= detect_coding_utf_16 (src, src_end);
3573       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3574         mask |= detect_coding_emacs_mule (src, src_end);
3575       if (try & CODING_CATEGORY_MASK_CCL)
3576         mask |= detect_coding_ccl (src, src_end);
3577     }
3578   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3579 }
3580
3581 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3582    The information of the detected coding system is set in CODING.  */
3583
3584 void
3585 detect_coding (coding, src, src_bytes)
3586      struct coding_system *coding;
3587      unsigned char *src;
3588      int src_bytes;
3589 {
3590   unsigned int idx;
3591   int skip, mask, i;
3592   Lisp_Object val;
3593
3594   val = Vcoding_category_list;
3595   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3596   coding->heading_ascii = skip;
3597
3598   if (!mask) return;
3599
3600   /* We found a single coding system of the highest priority in MASK.  */
3601   idx = 0;
3602   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3603   if (! mask)
3604     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3605
3606   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3607
3608   if (coding->eol_type != CODING_EOL_UNDECIDED)
3609     {
3610       Lisp_Object tmp;
3611
3612       tmp = Fget (val, Qeol_type);
3613       if (VECTORP (tmp))
3614         val = XVECTOR (tmp)->contents[coding->eol_type];
3615     }
3616
3617   /* Setup this new coding system while preserving some slots.  */
3618   {
3619     int src_multibyte = coding->src_multibyte;
3620     int dst_multibyte = coding->dst_multibyte;
3621
3622     setup_coding_system (val, coding);
3623     coding->src_multibyte = src_multibyte;
3624     coding->dst_multibyte = dst_multibyte;
3625     coding->heading_ascii = skip;
3626   }
3627 }
3628
3629 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3630    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3631    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3632
3633    How many non-eol characters are at the head is returned as *SKIP.  */
3634
3635 #define MAX_EOL_CHECK_COUNT 3
3636
3637 static int
3638 detect_eol_type (source, src_bytes, skip)
3639      unsigned char *source;
3640      int src_bytes, *skip;
3641 {
3642   unsigned char *src = source, *src_end = src + src_bytes;
3643   unsigned char c;
3644   int total = 0;                /* How many end-of-lines are found so far.  */
3645   int eol_type = CODING_EOL_UNDECIDED;
3646   int this_eol_type;
3647
3648   *skip = 0;
3649
3650   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3651     {
3652       c = *src++;
3653       if (c == '\n' || c == '\r')
3654         {
3655           if (*skip == 0)
3656             *skip = src - 1 - source;
3657           total++;
3658           if (c == '\n')
3659             this_eol_type = CODING_EOL_LF;
3660           else if (src >= src_end || *src != '\n')
3661             this_eol_type = CODING_EOL_CR;
3662           else
3663             this_eol_type = CODING_EOL_CRLF, src++;
3664
3665           if (eol_type == CODING_EOL_UNDECIDED)
3666             /* This is the first end-of-line.  */
3667             eol_type = this_eol_type;
3668           else if (eol_type != this_eol_type)
3669             {
3670               /* The found type is different from what found before.  */
3671               eol_type = CODING_EOL_INCONSISTENT;
3672               break;
3673             }
3674         }
3675     }
3676
3677   if (*skip == 0)
3678     *skip = src_end - source;
3679   return eol_type;
3680 }
3681
3682 /* Like detect_eol_type, but detect EOL type in 2-octet
3683    big-endian/little-endian format for coding systems utf-16-be and
3684    utf-16-le.  */
3685
3686 static int
3687 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3688      unsigned char *source;
3689      int src_bytes, *skip;
3690 {
3691   unsigned char *src = source, *src_end = src + src_bytes;
3692   unsigned int c1, c2;
3693   int total = 0;                /* How many end-of-lines are found so far.  */
3694   int eol_type = CODING_EOL_UNDECIDED;
3695   int this_eol_type;
3696   int msb, lsb;
3697
3698   if (big_endian_p)
3699     msb = 0, lsb = 1;
3700   else
3701     msb = 1, lsb = 0;
3702
3703   *skip = 0;
3704
3705   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3706     {
3707       c1 = (src[msb] << 8) | (src[lsb]);
3708       src += 2;
3709
3710       if (c1 == '\n' || c1 == '\r')
3711         {
3712           if (*skip == 0)
3713             *skip = src - 2 - source;
3714           total++;
3715           if (c1 == '\n')
3716             {
3717               this_eol_type = CODING_EOL_LF;
3718             }
3719           else
3720             {
3721               if ((src + 1) >= src_end)
3722                 {
3723                   this_eol_type = CODING_EOL_CR;
3724                 }
3725               else
3726                 {
3727                   c2 = (src[msb] << 8) | (src[lsb]);
3728                   if (c2 == '\n')
3729                     this_eol_type = CODING_EOL_CRLF, src += 2;
3730                   else
3731                     this_eol_type = CODING_EOL_CR;
3732                 }
3733             }
3734
3735           if (eol_type == CODING_EOL_UNDECIDED)
3736             /* This is the first end-of-line.  */
3737             eol_type = this_eol_type;
3738           else if (eol_type != this_eol_type)
3739             {
3740               /* The found type is different from what found before.  */
3741               eol_type = CODING_EOL_INCONSISTENT;
3742               break;
3743             }
3744         }
3745     }
3746
3747   if (*skip == 0)
3748     *skip = src_end - source;
3749   return eol_type;
3750 }
3751
3752 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3753    is encoded.  If it detects an appropriate format of end-of-line, it
3754    sets the information in *CODING.  */
3755
3756 void
3757 detect_eol (coding, src, src_bytes)
3758      struct coding_system *coding;
3759      unsigned char *src;
3760      int src_bytes;
3761 {
3762   Lisp_Object val;
3763   int skip;
3764   int eol_type;
3765
3766   switch (coding->category_idx)
3767     {
3768     case CODING_CATEGORY_IDX_UTF_16_BE:
3769       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3770       break;
3771     case CODING_CATEGORY_IDX_UTF_16_LE:
3772       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3773       break;
3774     default:
3775       eol_type = detect_eol_type (src, src_bytes, &skip);
3776       break;
3777     }
3778
3779   if (coding->heading_ascii > skip)
3780     coding->heading_ascii = skip;
3781   else
3782     skip = coding->heading_ascii;
3783
3784   if (eol_type == CODING_EOL_UNDECIDED)
3785     return;
3786   if (eol_type == CODING_EOL_INCONSISTENT)
3787     {
3788 #if 0
3789       /* This code is suppressed until we find a better way to
3790          distinguish raw text file and binary file.  */
3791
3792       /* If we have already detected that the coding is raw-text, the
3793          coding should actually be no-conversion.  */
3794       if (coding->type == coding_type_raw_text)
3795         {
3796           setup_coding_system (Qno_conversion, coding);
3797           return;
3798         }
3799       /* Else, let's decode only text code anyway.  */
3800 #endif /* 0 */
3801       eol_type = CODING_EOL_LF;
3802     }
3803
3804   val = Fget (coding->symbol, Qeol_type);
3805   if (VECTORP (val) && XVECTOR (val)->size == 3)
3806     {
3807       int src_multibyte = coding->src_multibyte;
3808       int dst_multibyte = coding->dst_multibyte;
3809
3810       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3811       coding->src_multibyte = src_multibyte;
3812       coding->dst_multibyte = dst_multibyte;
3813       coding->heading_ascii = skip;
3814     }
3815 }
3816
/* Number of bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of a text may grow when decoded by CODING
   (a pointer to struct coding_system): 3 for ISO2022, the decoder's
   buf_magnification for CCL, 2 otherwise.  CODING is parenthesized so
   that any pointer expression (e.g. `&cs') may be passed.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
3825
3826 /* Return maximum size (bytes) of a buffer enough for decoding
3827    SRC_BYTES of text encoded in CODING.  */
3828
3829 int
3830 decoding_buffer_size (coding, src_bytes)
3831      struct coding_system *coding;
3832      int src_bytes;
3833 {
3834   return (src_bytes * DECODING_BUFFER_MAG (coding)
3835           + CONVERSION_BUFFER_EXTRA_ROOM);
3836 }
3837
3838 /* Return maximum size (bytes) of a buffer enough for encoding
3839    SRC_BYTES of text to CODING.  */
3840
3841 int
3842 encoding_buffer_size (coding, src_bytes)
3843      struct coding_system *coding;
3844      int src_bytes;
3845 {
3846   int magnification;
3847
3848   if (coding->type == coding_type_ccl)
3849     magnification = coding->spec.ccl.encoder.buf_magnification;
3850   else if (CODING_REQUIRE_ENCODING (coding))
3851     magnification = 3;
3852   else
3853     magnification = 1;
3854
3855   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3856 }
3857
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The work buffer handed out by get_conversion_buffer, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is `conversion_buffer'; it is
   replaced by a larger one first if it is too small, in which case
   its old contents are not preserved.

   Memory is obtained from xmalloc, so the behavior on memory
   exhaustion is whatever xmalloc does.  The new buffer is allocated
   before the old one is freed.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      /* The largest int value, computed without <limits.h>.  */
      int max_size = (int) (((unsigned int) ~0) >> 1);
      int real_size = conversion_buffer_size;
      char *buf;

      /* Grow by doubling.  A recorded size below the minimum (in
         particular 0, which doubling would never get away from) is
         replaced by the minimum, and a size too large to double is
         replaced by SIZE itself, which fits in an int.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      else if (real_size > max_size / 2)
        real_size = size;
      else
        real_size *= 2;

      while (real_size < size)
        {
          if (real_size > max_size / 2)
            {
              real_size = size;
              break;
            }
          real_size *= 2;
        }

      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3886
3887 int
3888 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3889      struct coding_system *coding;
3890      unsigned char *source, *destination;
3891      int src_bytes, dst_bytes, encodep;
3892 {
3893   struct ccl_program *ccl
3894     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3895   int result;
3896
3897   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3898   if (encodep)
3899     ccl->eol_type = coding->eol_type;
3900   coding->produced = ccl_driver (ccl, source, destination,
3901                                  src_bytes, dst_bytes, &(coding->consumed));
3902   if (encodep)
3903     coding->produced_char = coding->produced;
3904   else
3905     {
3906       int bytes
3907         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3908       coding->produced = str_as_multibyte (destination, bytes,
3909                                            coding->produced,
3910                                            &(coding->produced_char));
3911     }
3912
3913   switch (ccl->status)
3914     {
3915     case CCL_STAT_SUSPEND_BY_SRC:
3916       result = CODING_FINISH_INSUFFICIENT_SRC;
3917       break;
3918     case CCL_STAT_SUSPEND_BY_DST:
3919       result = CODING_FINISH_INSUFFICIENT_DST;
3920       break;
3921     case CCL_STAT_QUIT:
3922     case CCL_STAT_INVALID_CMD:
3923       result = CODING_FINISH_INTERRUPT;
3924       break;
3925     default:
3926       result = CODING_FINISH_NORMAL;
3927       break;
3928     }
3929   return result;
3930 }
3931
3932 /* Decode EOL format of the text at PTR of BYTES length destructively
3933    according to CODING->eol_type.  This is called after the CCL
3934    program produced a decoded text at PTR.  If we do CRLF->LF
3935    conversion, update CODING->produced and CODING->produced_char.  */
3936
3937 static void
3938 decode_eol_post_ccl (coding, ptr, bytes)
3939      struct coding_system *coding;
3940      unsigned char *ptr;
3941      int bytes;
3942 {
3943   Lisp_Object val, saved_coding_symbol;
3944   unsigned char *pend = ptr + bytes;
3945   int dummy;
3946
3947   /* Remember the current coding system symbol.  We set it back when
3948      an inconsistent EOL is found so that `last-coding-system-used' is
3949      set to the coding system that doesn't specify EOL conversion.  */
3950   saved_coding_symbol = coding->symbol;
3951
3952   coding->spec.ccl.cr_carryover = 0;
3953   if (coding->eol_type == CODING_EOL_UNDECIDED)
3954     {
3955       /* Here, to avoid the call of setup_coding_system, we directly
3956          call detect_eol_type.  */
3957       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
3958       if (coding->eol_type == CODING_EOL_INCONSISTENT)
3959         coding->eol_type = CODING_EOL_LF;
3960       if (coding->eol_type != CODING_EOL_UNDECIDED)
3961         {
3962           val = Fget (coding->symbol, Qeol_type);
3963           if (VECTORP (val) && XVECTOR (val)->size == 3)
3964             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
3965         }
3966       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
3967     }
3968
3969   if (coding->eol_type == CODING_EOL_LF
3970       || coding->eol_type == CODING_EOL_UNDECIDED)
3971     {
3972       /* We have nothing to do.  */
3973       ptr = pend;
3974     }
3975   else if (coding->eol_type == CODING_EOL_CRLF)
3976     {
3977       unsigned char *pstart = ptr, *p = ptr;
3978
3979       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
3980           && *(pend - 1) == '\r')
3981         {
3982           /* If the last character is CR, we can't handle it here
3983              because LF will be in the not-yet-decoded source text.
3984              Recorded that the CR is not yet processed.  */
3985           coding->spec.ccl.cr_carryover = 1;
3986           coding->produced--;
3987           coding->produced_char--;
3988           pend--;
3989         }
3990       while (ptr < pend)
3991         {
3992           if (*ptr == '\r')
3993             {
3994               if (ptr + 1 < pend && *(ptr + 1) == '\n')
3995                 {
3996                   *p++ = '\n';
3997                   ptr += 2;
3998                 }
3999               else
4000                 {
4001                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4002                     goto undo_eol_conversion;
4003                   *p++ = *ptr++;
4004                 }
4005             }
4006           else if (*ptr == '\n'
4007                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4008             goto undo_eol_conversion;
4009           else
4010             *p++ = *ptr++;
4011           continue;
4012
4013         undo_eol_conversion:
4014           /* We have faced with inconsistent EOL format at PTR.
4015              Convert all LFs before PTR back to CRLFs.  */
4016           for (p--, ptr--; p >= pstart; p--)
4017             {
4018               if (*p == '\n')
4019                 *ptr-- = '\n', *ptr-- = '\r';
4020               else
4021                 *ptr-- = *p;
4022             }
4023           /*  If carryover is recorded, cancel it because we don't
4024               convert CRLF anymore.  */
4025           if (coding->spec.ccl.cr_carryover)
4026             {
4027               coding->spec.ccl.cr_carryover = 0;
4028               coding->produced++;
4029               coding->produced_char++;
4030               pend++;
4031             }
4032           p = ptr = pend;
4033           coding->eol_type = CODING_EOL_LF;
4034           coding->symbol = saved_coding_symbol;
4035         }
4036       if (p < pend)
4037         {
4038           /* As each two-byte sequence CRLF was converted to LF, (PEND
4039              - P) is the number of deleted characters.  */
4040           coding->produced -= pend - p;
4041           coding->produced_char -= pend - p;
4042         }
4043     }
4044   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4045     {
4046       unsigned char *p = ptr;
4047
4048       for (; ptr < pend; ptr++)
4049         {
4050           if (*ptr == '\r')
4051             *ptr = '\n';
4052           else if (*ptr == '\n'
4053                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4054             {
4055               for (; p < ptr; p++)
4056                 {
4057                   if (*p == '\n')
4058                     *p = '\r';
4059                 }
4060               ptr = pend;
4061               coding->eol_type = CODING_EOL_LF;
4062               coding->symbol = saved_coding_symbol;
4063             }
4064         }
4065     }
4066 }
4067
4068 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4069    decoding, it may detect coding system and format of end-of-line if
4070    those are not yet decided.  The source should be unibyte, the
4071    result is multibyte if CODING->dst_multibyte is nonzero, else
4072    unibyte.  */
4073
4074 int
4075 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4076      struct coding_system *coding;
4077      unsigned char *source, *destination;
4078      int src_bytes, dst_bytes;
4079 {
4080   if (coding->type == coding_type_undecided)
4081     detect_coding (coding, source, src_bytes);
4082
4083   if (coding->eol_type == CODING_EOL_UNDECIDED
4084       && coding->type != coding_type_ccl)
4085     detect_eol (coding, source, src_bytes);
4086
4087   coding->produced = coding->produced_char = 0;
4088   coding->consumed = coding->consumed_char = 0;
4089   coding->errors = 0;
4090   coding->result = CODING_FINISH_NORMAL;
4091
4092   switch (coding->type)
4093     {
4094     case coding_type_sjis:
4095       decode_coding_sjis_big5 (coding, source, destination,
4096                                src_bytes, dst_bytes, 1);
4097       break;
4098
4099     case coding_type_iso2022:
4100       decode_coding_iso2022 (coding, source, destination,
4101                              src_bytes, dst_bytes);
4102       break;
4103
4104     case coding_type_big5:
4105       decode_coding_sjis_big5 (coding, source, destination,
4106                                src_bytes, dst_bytes, 0);
4107       break;
4108
4109     case coding_type_emacs_mule:
4110       decode_coding_emacs_mule (coding, source, destination,
4111                                 src_bytes, dst_bytes);
4112       break;
4113
4114     case coding_type_ccl:
4115       if (coding->spec.ccl.cr_carryover)
4116         {
4117           /* Set the CR which is not processed by the previous call of
4118              decode_eol_post_ccl in DESTINATION.  */
4119           *destination = '\r';
4120           coding->produced++;
4121           coding->produced_char++;
4122           dst_bytes--;
4123         }
4124       ccl_coding_driver (coding, source,
4125                          destination + coding->spec.ccl.cr_carryover,
4126                          src_bytes, dst_bytes, 0);
4127       if (coding->eol_type != CODING_EOL_LF)
4128         decode_eol_post_ccl (coding, destination, coding->produced);
4129       break;
4130
4131     default:
4132       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4133     }
4134
4135   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4136       && coding->consumed == src_bytes)
4137     coding->result = CODING_FINISH_NORMAL;
4138
4139   if (coding->mode & CODING_MODE_LAST_BLOCK
4140       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4141     {
4142       unsigned char *src = source + coding->consumed;
4143       unsigned char *dst = destination + coding->produced;
4144
4145       src_bytes -= coding->consumed;
4146      coding->errors++;
4147       if (COMPOSING_P (coding))
4148         DECODE_COMPOSITION_END ('1');
4149       while (src_bytes--)
4150         {
4151           int c = *src++;
4152           dst += CHAR_STRING (c, dst);
4153           coding->produced_char++;
4154         }
4155       coding->consumed = coding->consumed_char = src - source;
4156       coding->produced = dst - destination;
4157     }
4158
4159   if (!coding->dst_multibyte)
4160     {
4161       coding->produced = str_as_unibyte (destination, coding->produced);
4162       coding->produced_char = coding->produced;
4163     }
4164
4165   return coding->result;
4166 }
4167
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
   multibyteness of the source is CODING->src_multibyte, the
   multibyteness of the result is always unibyte.

   The counters and the result slot of CODING are reset before
   encoding.  The value returned is CODING->result; the values
   returned by the encoding routines called here are not used.  */

int
encode_coding (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
{
  coding->produced = coding->produced_char = 0;
  coding->consumed = coding->consumed_char = 0;
  coding->errors = 0;
  coding->result = CODING_FINISH_NORMAL;

  /* Dispatch on the type of the coding system.  The last argument of
     encode_coding_sjis_big5 is 1 for SJIS and 0 for BIG5.  */
  switch (coding->type)
    {
    case coding_type_sjis:
      encode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 1);
      break;

    case coding_type_iso2022:
      encode_coding_iso2022 (coding, source, destination,
                             src_bytes, dst_bytes);
      break;

    case coding_type_big5:
      encode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 0);
      break;

    case coding_type_emacs_mule:
      encode_coding_emacs_mule (coding, source, destination,
                                src_bytes, dst_bytes);
      break;

    case coding_type_ccl:
      ccl_coding_driver (coding, source, destination,
                         src_bytes, dst_bytes, 1);
      break;

    default:
      encode_eol (coding, source, destination, src_bytes, dst_bytes);
    }

  /* Running out of source is not an error if everything was
     consumed.  */
  if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
      && coding->consumed == src_bytes)
    coding->result = CODING_FINISH_NORMAL;

  if (coding->mode & CODING_MODE_LAST_BLOCK)
    {
      /* This is the last block: finish the output and flush the
         source bytes that are not yet consumed.
         NOTE(review): ENCODE_RESET_PLANE_AND_REGISTER is a macro
         defined elsewhere; it presumably refers to the locals below
         (DST at least) -- confirm before renaming or removing any of
         them, including the apparently unused SRC and SRC_END.  */
      unsigned char *src = source + coding->consumed;
      unsigned char *src_end = src + src_bytes;
      unsigned char *dst = destination + coding->produced;

      if (coding->type == coding_type_iso2022)
        ENCODE_RESET_PLANE_AND_REGISTER;
      /* A composition is still open; terminate it with ESC 1.  */
      if (COMPOSING_P (coding))
        *dst++ = ISO_CODE_ESC, *dst++ = '1';
      if (coding->consumed < src_bytes)
        {
          /* Copy the unconsumed source bytes as they are, converting
             them with str_as_unibyte if the source is multibyte.  */
          int len = src_bytes - coding->consumed;

          BCOPY_SHORT (source + coding->consumed, dst, len);
          if (coding->src_multibyte)
            len = str_as_unibyte (dst, len);
          dst += len;
          coding->consumed = src_bytes;
        }
      coding->produced = coding->produced_char = dst - destination;
    }

  return coding->result;
}
4243
4244 /* Scan text in the region between *BEG and *END (byte positions),
4245    skip characters which we don't have to decode by coding system
4246    CODING at the head and tail, then set *BEG and *END to the region
4247    of the text we actually have to convert.  The caller should move
4248    the gap out of the region in advance if the region is from a
4249    buffer.
4250
4251    If STR is not NULL, *BEG and *END are indices into STR.  */
4252
4253 static void
4254 shrink_decoding_region (beg, end, coding, str)
4255      int *beg, *end;
4256      struct coding_system *coding;
4257      unsigned char *str;
4258 {
4259   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4260   int eol_conversion;
4261   Lisp_Object translation_table;
4262
4263   if (coding->type == coding_type_ccl
4264       || coding->type == coding_type_undecided
4265       || coding->eol_type != CODING_EOL_LF
4266       || !NILP (coding->post_read_conversion)
4267       || coding->composing != COMPOSITION_DISABLED)
4268     {
4269       /* We can't skip any data.  */
4270       return;
4271     }
4272   if (coding->type == coding_type_no_conversion
4273       || coding->type == coding_type_raw_text
4274       || coding->type == coding_type_emacs_mule)
4275     {
4276       /* We need no conversion, but don't have to skip any data here.
4277          Decoding routine handles them effectively anyway.  */
4278       return;
4279     }
4280
4281   translation_table = coding->translation_table_for_decode;
4282   if (NILP (translation_table) && !NILP (Venable_character_translation))
4283     translation_table = Vstandard_translation_table_for_decode;
4284   if (CHAR_TABLE_P (translation_table))
4285     {
4286       int i;
4287       for (i = 0; i < 128; i++)
4288         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4289           break;
4290       if (i < 128)
4291         /* Some ASCII character should be translated.  We give up
4292            shrinking.  */
4293         return;
4294     }
4295
4296   if (coding->heading_ascii >= 0)
4297     /* Detection routine has already found how much we can skip at the
4298        head.  */
4299     *beg += coding->heading_ascii;
4300
4301   if (str)
4302     {
4303       begp_orig = begp = str + *beg;
4304       endp_orig = endp = str + *end;
4305     }
4306   else
4307     {
4308       begp_orig = begp = BYTE_POS_ADDR (*beg);
4309       endp_orig = endp = begp + *end - *beg;
4310     }
4311
4312   eol_conversion = (coding->eol_type == CODING_EOL_CR
4313                     || coding->eol_type == CODING_EOL_CRLF);
4314
4315   switch (coding->type)
4316     {
4317     case coding_type_sjis:
4318     case coding_type_big5:
4319       /* We can skip all ASCII characters at the head.  */
4320       if (coding->heading_ascii < 0)
4321         {
4322           if (eol_conversion)
4323             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4324           else
4325             while (begp < endp && *begp < 0x80) begp++;
4326         }
4327       /* We can skip all ASCII characters at the tail except for the
4328          second byte of SJIS or BIG5 code.  */
4329       if (eol_conversion)
4330         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4331       else
4332         while (begp < endp && endp[-1] < 0x80) endp--;
4333       /* Do not consider LF as ascii if preceded by CR, since that
4334          confuses eol decoding. */
4335       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4336         endp++;
4337       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4338         endp++;
4339       break;
4340
4341     case coding_type_iso2022:
4342       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4343         /* We can't skip any data.  */
4344         break;
4345       if (coding->heading_ascii < 0)
4346         {
4347           /* We can skip all ASCII characters at the head except for a
4348              few control codes.  */
4349           while (begp < endp && (c = *begp) < 0x80
4350                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4351                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4352                  && (!eol_conversion || c != ISO_CODE_LF))
4353             begp++;
4354         }
4355       switch (coding->category_idx)
4356         {
4357         case CODING_CATEGORY_IDX_ISO_8_1:
4358         case CODING_CATEGORY_IDX_ISO_8_2:
4359           /* We can skip all ASCII characters at the tail.  */
4360           if (eol_conversion)
4361             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4362           else
4363             while (begp < endp && endp[-1] < 0x80) endp--;
4364           /* Do not consider LF as ascii if preceded by CR, since that
4365              confuses eol decoding. */
4366           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4367             endp++;
4368           break;
4369
4370         case CODING_CATEGORY_IDX_ISO_7:
4371         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4372           {
4373             /* We can skip all charactes at the tail except for 8-bit
4374                codes and ESC and the following 2-byte at the tail.  */
4375             unsigned char *eight_bit = NULL;
4376
4377             if (eol_conversion)
4378               while (begp < endp
4379                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4380                 {
4381                   if (!eight_bit && c & 0x80) eight_bit = endp;
4382                   endp--;
4383                 }
4384             else
4385               while (begp < endp
4386                      && (c = endp[-1]) != ISO_CODE_ESC)
4387                 {
4388                   if (!eight_bit && c & 0x80) eight_bit = endp;
4389                   endp--;
4390                 }
4391             /* Do not consider LF as ascii if preceded by CR, since that
4392                confuses eol decoding. */
4393             if (begp < endp && endp < endp_orig
4394                 && endp[-1] == '\r' && endp[0] == '\n')
4395               endp++;
4396             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4397               {
4398                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4399                   /* This is an ASCII designation sequence.  We can
4400                      surely skip the tail.  But, if we have
4401                      encountered an 8-bit code, skip only the codes
4402                      after that.  */
4403                   endp = eight_bit ? eight_bit : endp + 2;
4404                 else
4405                   /* Hmmm, we can't skip the tail.  */
4406                   endp = endp_orig;
4407               }
4408             else if (eight_bit)
4409               endp = eight_bit;
4410           }
4411         }
4412       break;
4413
4414     default:
4415       abort ();
4416     }
4417   *beg += begp - begp_orig;
4418   *end += endp - endp_orig;
4419   return;
4420 }
4421
4422 /* Like shrink_decoding_region but for encoding.  */
4423
4424 static void
4425 shrink_encoding_region (beg, end, coding, str)
4426      int *beg, *end;
4427      struct coding_system *coding;
4428      unsigned char *str;
4429 {
4430   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4431   int eol_conversion;
4432   Lisp_Object translation_table;
4433
4434   if (coding->type == coding_type_ccl
4435       || coding->eol_type == CODING_EOL_CRLF
4436       || coding->eol_type == CODING_EOL_CR
4437       || coding->cmp_data && coding->cmp_data->used > 0)
4438     {
4439       /* We can't skip any data.  */
4440       return;
4441     }
4442   if (coding->type == coding_type_no_conversion
4443       || coding->type == coding_type_raw_text
4444       || coding->type == coding_type_emacs_mule
4445       || coding->type == coding_type_undecided)
4446     {
4447       /* We need no conversion, but don't have to skip any data here.
4448          Encoding routine handles them effectively anyway.  */
4449       return;
4450     }
4451
4452   translation_table = coding->translation_table_for_encode;
4453   if (NILP (translation_table) && !NILP (Venable_character_translation))
4454     translation_table = Vstandard_translation_table_for_encode;
4455   if (CHAR_TABLE_P (translation_table))
4456     {
4457       int i;
4458       for (i = 0; i < 128; i++)
4459         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4460           break;
4461       if (i < 128)
4462         /* Some ASCII character should be tranlsated.  We give up
4463            shrinking.  */
4464         return;
4465     }
4466
4467   if (str)
4468     {
4469       begp_orig = begp = str + *beg;
4470       endp_orig = endp = str + *end;
4471     }
4472   else
4473     {
4474       begp_orig = begp = BYTE_POS_ADDR (*beg);
4475       endp_orig = endp = begp + *end - *beg;
4476     }
4477
4478   eol_conversion = (coding->eol_type == CODING_EOL_CR
4479                     || coding->eol_type == CODING_EOL_CRLF);
4480
4481   /* Here, we don't have to check coding->pre_write_conversion because
4482      the caller is expected to have handled it already.  */
4483   switch (coding->type)
4484     {
4485     case coding_type_iso2022:
4486       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4487         /* We can't skip any data.  */
4488         break;
4489       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4490         {
4491           unsigned char *bol = begp;
4492           while (begp < endp && *begp < 0x80)
4493             {
4494               begp++;
4495               if (begp[-1] == '\n')
4496                 bol = begp;
4497             }
4498           begp = bol;
4499           goto label_skip_tail;
4500         }
4501       /* fall down ... */
4502
4503     case coding_type_sjis:
4504     case coding_type_big5:
4505       /* We can skip all ASCII characters at the head and tail.  */
4506       if (eol_conversion)
4507         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4508       else
4509         while (begp < endp && *begp < 0x80) begp++;
4510     label_skip_tail:
4511       if (eol_conversion)
4512         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4513       else
4514         while (begp < endp && *(endp - 1) < 0x80) endp--;
4515       break;
4516
4517     default:
4518       abort ();
4519     }
4520
4521   *beg += begp - begp_orig;
4522   *end += endp - endp_orig;
4523   return;
4524 }
4525
/* Trying to shrink a conversion region costs something by itself, so
   it is attempted only when the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the range *BEG..*END is longer than
   shrink_conversion_region_threshhold, hand BEG, END, CODING and STR
   to shrink_encoding_region (when ENCODEP is nonzero) or to
   shrink_decoding_region (otherwise) so that they can adjust *BEG and
   *END.  Shorter ranges are left untouched.  BEG and END are pointers
   and are evaluated more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4539
4540 static Lisp_Object
4541 code_convert_region_unwind (dummy)
4542      Lisp_Object dummy;
4543 {
4544   inhibit_pre_post_conversion = 0;
4545   return Qnil;
4546 }
4547
4548 /* Store information about all compositions in the range FROM and TO
4549    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4550    buffer or a string, defaults to the current buffer.  */
4551
4552 void
4553 coding_save_composition (coding, from, to, obj)
4554      struct coding_system *coding;
4555      int from, to;
4556      Lisp_Object obj;
4557 {
4558   Lisp_Object prop;
4559   int start, end;
4560
4561   if (coding->composing == COMPOSITION_DISABLED)
4562     return;
4563   if (!coding->cmp_data)
4564     coding_allocate_composition_data (coding, from);
4565   if (!find_composition (from, to, &start, &end, &prop, obj)
4566       || end > to)
4567     return;
4568   if (start < from
4569       && (!find_composition (end, to, &start, &end, &prop, obj)
4570           || end > to))
4571     return;
4572   coding->composing = COMPOSITION_NO;
4573   do
4574     {
4575       if (COMPOSITION_VALID_P (start, end, prop))
4576         {
4577           enum composition_method method = COMPOSITION_METHOD (prop);
4578           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4579               >= COMPOSITION_DATA_SIZE)
4580             coding_allocate_composition_data (coding, from);
4581           /* For relative composition, we remember start and end
4582              positions, for the other compositions, we also remember
4583              components.  */
4584           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4585           if (method != COMPOSITION_RELATIVE)
4586             {
4587               /* We must store a*/
4588               Lisp_Object val, ch;
4589
4590               val = COMPOSITION_COMPONENTS (prop);
4591               if (CONSP (val))
4592                 while (CONSP (val))
4593                   {
4594                     ch = XCAR (val), val = XCDR (val);
4595                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4596                   }
4597               else if (VECTORP (val) || STRINGP (val))
4598                 {
4599                   int len = (VECTORP (val)
4600                              ? XVECTOR (val)->size : XSTRING (val)->size);
4601                   int i;
4602                   for (i = 0; i < len; i++)
4603                     {
4604                       ch = (STRINGP (val)
4605                             ? Faref (val, make_number (i))
4606                             : XVECTOR (val)->contents[i]);
4607                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4608                     }
4609                 }
4610               else              /* INTEGERP (val) */
4611                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4612             }
4613           CODING_ADD_COMPOSITION_END (coding, end - from);
4614         }
4615       start = end;
4616     }
4617   while (start < to
4618          && find_composition (start, to, &start, &end, &prop, obj)
4619          && end <= to);
4620
4621   /* Make coding->cmp_data point to the first memory block.  */
4622   while (coding->cmp_data->prev)
4623     coding->cmp_data = coding->cmp_data->prev;
4624   coding->cmp_data_start = 0;
4625 }
4626
4627 /* Reflect the saved information about compositions to OBJ.
4628    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4629    is a buffer or a string, defaults to the current buffer.  */
4630
4631 void
4632 coding_restore_composition (coding, obj)
4633      struct coding_system *coding;
4634      Lisp_Object obj;
4635 {
4636   struct composition_data *cmp_data = coding->cmp_data;
4637
4638   if (!cmp_data)
4639     return;
4640
4641   while (cmp_data->prev)
4642     cmp_data = cmp_data->prev;
4643
4644   while (cmp_data)
4645     {
4646       int i;
4647
4648       for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4649         {
4650           int *data = cmp_data->data + i;
4651           enum composition_method method = (enum composition_method) data[3];
4652           Lisp_Object components;
4653
4654           if (method == COMPOSITION_RELATIVE)
4655             components = Qnil;
4656           else
4657             {
4658               int len = data[0] - 4, j;
4659               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4660
4661               for (j = 0; j < len; j++)
4662                 args[j] = make_number (data[4 + j]);
4663               components = (method == COMPOSITION_WITH_ALTCHARS
4664                             ? Fstring (len, args) : Fvector (len, args));
4665             }
4666           compose_text (data[1], data[2], components, Qnil, obj);
4667         }
4668       cmp_data = cmp_data->next;
4669     }
4670 }
4671
4672 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4673    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4674    coding system CODING, and return the status code of code conversion
4675    (currently, this value has no meaning).
4676
4677    How many characters (and bytes) are converted to how many
4678    characters (and bytes) are recorded in members of the structure
4679    CODING.
4680
4681    If REPLACE is nonzero, we do various things as if the original text
4682    is deleted and a new text is inserted.  See the comments in
4683    replace_range (insdel.c) to know what we are doing.
4684
4685    If REPLACE is zero, it is assumed that the source text is unibyte.
4686    Otherwize, it is assumed that the source text is multibyte.  */
4687
4688 int
4689 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4690      int from, from_byte, to, to_byte, encodep, replace;
4691      struct coding_system *coding;
4692 {
4693   int len = to - from, len_byte = to_byte - from_byte;
4694   int require, inserted, inserted_byte;
4695   int head_skip, tail_skip, total_skip = 0;
4696   Lisp_Object saved_coding_symbol;
4697   int first = 1;
4698   unsigned char *src, *dst;
4699   Lisp_Object deletion;
4700   int orig_point = PT, orig_len = len;
4701   int prev_Z;
4702   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4703
4704   coding->src_multibyte = replace && multibyte_p;
4705   coding->dst_multibyte = multibyte_p;
4706
4707   deletion = Qnil;
4708   saved_coding_symbol = Qnil;
4709
4710   if (from < PT && PT < to)
4711     {
4712       TEMP_SET_PT_BOTH (from, from_byte);
4713       orig_point = from;
4714     }
4715
4716   if (replace)
4717     {
4718       int saved_from = from;
4719
4720       prepare_to_modify_buffer (from, to, &from);
4721       if (saved_from != from)
4722         {
4723           to = from + len;
4724           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4725           len_byte = to_byte - from_byte;
4726         }
4727     }
4728
4729   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4730     {
4731       /* We must detect encoding of text and eol format.  */
4732
4733       if (from < GPT && to > GPT)
4734         move_gap_both (from, from_byte);
4735       if (coding->type == coding_type_undecided)
4736         {
4737           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4738           if (coding->type == coding_type_undecided)
4739             /* It seems that the text contains only ASCII, but we
4740                should not left it undecided because the deeper
4741                decoding routine (decode_coding) tries to detect the
4742                encodings again in vain.  */
4743             coding->type = coding_type_emacs_mule;
4744         }
4745       if (coding->eol_type == CODING_EOL_UNDECIDED
4746           && coding->type != coding_type_ccl)
4747         {
4748           saved_coding_symbol = coding->symbol;
4749           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4750           if (coding->eol_type == CODING_EOL_UNDECIDED)
4751             coding->eol_type = CODING_EOL_LF;
4752           /* We had better recover the original eol format if we
4753              encounter an inconsitent eol format while decoding.  */
4754           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4755         }
4756     }
4757
4758   /* Now we convert the text.  */
4759
4760   /* For encoding, we must process pre-write-conversion in advance.  */
4761   if (! inhibit_pre_post_conversion
4762       && encodep
4763       && SYMBOLP (coding->pre_write_conversion)
4764       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4765     {
4766       /* The function in pre-write-conversion may put a new text in a
4767          new buffer.  */
4768       struct buffer *prev = current_buffer;
4769       Lisp_Object new;
4770       int count = specpdl_ptr - specpdl;
4771
4772       record_unwind_protect (code_convert_region_unwind, Qnil);
4773       /* We should not call any more pre-write/post-read-conversion
4774          functions while this pre-write-conversion is running.  */
4775       inhibit_pre_post_conversion = 1;
4776       call2 (coding->pre_write_conversion,
4777              make_number (from), make_number (to));
4778       inhibit_pre_post_conversion = 0;
4779       /* Discard the unwind protect.  */
4780       specpdl_ptr--;
4781
4782       if (current_buffer != prev)
4783         {
4784           len = ZV - BEGV;
4785           new = Fcurrent_buffer ();
4786           set_buffer_internal_1 (prev);
4787           del_range_2 (from, from_byte, to, to_byte, 0);
4788           TEMP_SET_PT_BOTH (from, from_byte);
4789           insert_from_buffer (XBUFFER (new), 1, len, 0);
4790           Fkill_buffer (new);
4791           if (orig_point >= to)
4792             orig_point += len - orig_len;
4793           else if (orig_point > from)
4794             orig_point = from;
4795           orig_len = len;
4796           to = from + len;
4797           from_byte = CHAR_TO_BYTE (from);
4798           to_byte = CHAR_TO_BYTE (to);
4799           len_byte = to_byte - from_byte;
4800           TEMP_SET_PT_BOTH (from, from_byte);
4801         }
4802     }
4803
4804   if (replace)
4805     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4806
4807   if (coding->composing != COMPOSITION_DISABLED)
4808     {
4809       if (encodep)
4810         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4811       else
4812         coding_allocate_composition_data (coding, from);
4813     }
4814
4815   /* Try to skip the heading and tailing ASCIIs.  */
4816   if (coding->type != coding_type_ccl)
4817     {
4818       int from_byte_orig = from_byte, to_byte_orig = to_byte;
4819
4820       if (from < GPT && GPT < to)
4821         move_gap_both (from, from_byte);
4822       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4823       if (from_byte == to_byte
4824           && (encodep || NILP (coding->post_read_conversion))
4825           && ! CODING_REQUIRE_FLUSHING (coding))
4826         {
4827           coding->produced = len_byte;
4828           coding->produced_char = len;
4829           if (!replace)
4830             /* We must record and adjust for this new text now.  */
4831             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4832           return 0;
4833         }
4834
4835       head_skip = from_byte - from_byte_orig;
4836       tail_skip = to_byte_orig - to_byte;
4837       total_skip = head_skip + tail_skip;
4838       from += head_skip;
4839       to -= tail_skip;
4840       len -= total_skip; len_byte -= total_skip;
4841     }
4842
4843   /* The code conversion routine can not preserve text properties for
4844      now.  So, we must remove all text properties in the region.
4845      Here, we must suppress all modification hooks.  */
4846   if (replace)
4847     {
4848       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4849       inhibit_modification_hooks = 1;
4850       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4851       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4852     }
4853
4854   /* For converion, we must put the gap before the text in addition to
4855      making the gap larger for efficient decoding.  The required gap
4856      size starts from 2000 which is the magic number used in make_gap.
4857      But, after one batch of conversion, it will be incremented if we
4858      find that it is not enough .  */
4859   require = 2000;
4860
4861   if (GAP_SIZE  < require)
4862     make_gap (require - GAP_SIZE);
4863   move_gap_both (from, from_byte);
4864
4865   inserted = inserted_byte = 0;
4866
4867   GAP_SIZE += len_byte;
4868   ZV -= len;
4869   Z -= len;
4870   ZV_BYTE -= len_byte;
4871   Z_BYTE -= len_byte;
4872
4873   if (GPT - BEG < BEG_UNCHANGED)
4874     BEG_UNCHANGED = GPT - BEG;
4875   if (Z - GPT < END_UNCHANGED)
4876     END_UNCHANGED = Z - GPT;
4877
4878   if (!encodep && coding->src_multibyte)
4879     {
4880       /* Decoding routines expects that the source text is unibyte.
4881          We must convert 8-bit characters of multibyte form to
4882          unibyte.  */
4883       int len_byte_orig = len_byte;
4884       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4885       if (len_byte < len_byte_orig)
4886         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4887                     len_byte);
4888       coding->src_multibyte = 0;
4889     }
4890
4891   for (;;)
4892     {
4893       int result;
4894
4895       /* The buffer memory is now:
4896          +--------+converted-text+---------+-------original-text-------+---+
4897          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4898                   |<---------------------- GAP ----------------------->|  */
4899       src = GAP_END_ADDR - len_byte;
4900       dst = GPT_ADDR + inserted_byte;
4901
4902       if (encodep)
4903         result = encode_coding (coding, src, dst, len_byte, 0);
4904       else
4905         result = decode_coding (coding, src, dst, len_byte, 0);
4906
4907       /* The buffer memory is now:
4908          +--------+-------converted-text----+--+------original-text----+---+
4909          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4910                   |<---------------------- GAP ----------------------->|  */
4911
4912       inserted += coding->produced_char;
4913       inserted_byte += coding->produced;
4914       len_byte -= coding->consumed;
4915
4916       if (result == CODING_FINISH_INSUFFICIENT_CMP)
4917         {
4918           coding_allocate_composition_data (coding, from + inserted);
4919           continue;
4920         }
4921
4922       src += coding->consumed;
4923       dst += coding->produced;
4924
4925       if (result == CODING_FINISH_NORMAL)
4926         {
4927           src += len_byte;
4928           break;
4929         }
4930       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4931         {
4932           unsigned char *pend = dst, *p = pend - inserted_byte;
4933           Lisp_Object eol_type;
4934
4935           /* Encode LFs back to the original eol format (CR or CRLF).  */
4936           if (coding->eol_type == CODING_EOL_CR)
4937             {
4938               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4939             }
4940           else
4941             {
4942               int count = 0;
4943
4944               while (p < pend) if (*p++ == '\n') count++;
4945               if (src - dst < count)
4946                 {
4947                   /* We don't have sufficient room for encoding LFs
4948                      back to CRLF.  We must record converted and
4949                      not-yet-converted text back to the buffer
4950                      content, enlarge the gap, then record them out of
4951                      the buffer contents again.  */
4952                   int add = len_byte + inserted_byte;
4953
4954                   GAP_SIZE -= add;
4955                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4956                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4957                   make_gap (count - GAP_SIZE);
4958                   GAP_SIZE += add;
4959                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4960                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4961                   /* Don't forget to update SRC, DST, and PEND.  */
4962                   src = GAP_END_ADDR - len_byte;
4963                   dst = GPT_ADDR + inserted_byte;
4964                   pend = dst;
4965                 }
4966               inserted += count;
4967               inserted_byte += count;
4968               coding->produced += count;
4969               p = dst = pend + count;
4970               while (count)
4971                 {
4972                   *--p = *--pend;
4973                   if (*p == '\n') count--, *--p = '\r';
4974                 }
4975             }
4976
4977           /* Suppress eol-format conversion in the further conversion.  */
4978           coding->eol_type = CODING_EOL_LF;
4979
4980           /* Set the coding system symbol to that for Unix-like EOL.  */
4981           eol_type = Fget (saved_coding_symbol, Qeol_type);
4982           if (VECTORP (eol_type)
4983               && XVECTOR (eol_type)->size == 3
4984               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4985             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4986           else
4987             coding->symbol = saved_coding_symbol;
4988
4989           continue;
4990         }
4991       if (len_byte <= 0)
4992         {
4993           if (coding->type != coding_type_ccl
4994               || coding->mode & CODING_MODE_LAST_BLOCK)
4995             break;
4996           coding->mode |= CODING_MODE_LAST_BLOCK;
4997           continue;
4998         }
4999       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5000         {
5001           /* The source text ends in invalid codes.  Let's just
5002              make them valid buffer contents, and finish conversion.  */
5003           inserted += len_byte;
5004           inserted_byte += len_byte;
5005           while (len_byte--)
5006             *dst++ = *src++;
5007           break;
5008         }
5009       if (result == CODING_FINISH_INTERRUPT)
5010         {
5011           /* The conversion procedure was interrupted by a user.  */
5012           break;
5013         }
5014       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5015       if (coding->consumed < 1)
5016         {
5017           /* It's quite strange to require more memory without
5018              consuming any bytes.  Perhaps CCL program bug.  */
5019           break;
5020         }
5021       if (first)
5022         {
5023           /* We have just done the first batch of conversion which was
5024              stoped because of insufficient gap.  Let's reconsider the
5025              required gap size (i.e. SRT - DST) now.
5026
5027              We have converted ORIG bytes (== coding->consumed) into
5028              NEW bytes (coding->produced).  To convert the remaining
5029              LEN bytes, we may need REQUIRE bytes of gap, where:
5030                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5031                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5032              Here, we are sure that NEW >= ORIG.  */
5033           float ratio = coding->produced - coding->consumed;
5034           ratio /= coding->consumed;
5035           require = len_byte * ratio;
5036           first = 0;
5037         }
5038       if ((src - dst) < (require + 2000))
5039         {
5040           /* See the comment above the previous call of make_gap.  */
5041           int add = len_byte + inserted_byte;
5042
5043           GAP_SIZE -= add;
5044           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5045           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5046           make_gap (require + 2000);
5047           GAP_SIZE += add;
5048           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5049           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5050         }
5051     }
5052   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5053
5054   if (encodep && coding->dst_multibyte)
5055     {
5056       /* The output is unibyte.  We must convert 8-bit characters to
5057          multibyte form.  */
5058       if (inserted_byte * 2 > GAP_SIZE)
5059         {
5060           GAP_SIZE -= inserted_byte;
5061           ZV += inserted_byte; Z += inserted_byte;
5062           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5063           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5064           make_gap (inserted_byte - GAP_SIZE);
5065           GAP_SIZE += inserted_byte;
5066           ZV -= inserted_byte; Z -= inserted_byte;
5067           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5068           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5069         }
5070       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5071     }
5072
5073   /* If we have shrinked the conversion area, adjust it now.  */
5074   if (total_skip > 0)
5075     {
5076       if (tail_skip > 0)
5077         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5078       inserted += total_skip; inserted_byte += total_skip;
5079       GAP_SIZE += total_skip;
5080       GPT -= head_skip; GPT_BYTE -= head_skip;
5081       ZV -= total_skip; ZV_BYTE -= total_skip;
5082       Z -= total_skip; Z_BYTE -= total_skip;
5083       from -= head_skip; from_byte -= head_skip;
5084       to += tail_skip; to_byte += tail_skip;
5085     }
5086
5087   prev_Z = Z;
5088   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5089   inserted = Z - prev_Z;
5090
5091   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5092     coding_restore_composition (coding, Fcurrent_buffer ());
5093   coding_free_composition_data (coding);
5094
5095   if (! inhibit_pre_post_conversion
5096       && ! encodep && ! NILP (coding->post_read_conversion))
5097     {
5098       Lisp_Object val;
5099       int count = specpdl_ptr - specpdl;
5100
5101       if (from != PT)
5102         TEMP_SET_PT_BOTH (from, from_byte);
5103       prev_Z = Z;
5104       record_unwind_protect (code_convert_region_unwind, Qnil);
5105       /* We should not call any more pre-write/post-read-conversion
5106          functions while this post-read-conversion is running.  */
5107       inhibit_pre_post_conversion = 1;
5108       val = call1 (coding->post_read_conversion, make_number (inserted));
5109       inhibit_pre_post_conversion = 0;
5110       /* Discard the unwind protect.  */
5111       specpdl_ptr--;
5112       CHECK_NUMBER (val, 0);
5113       inserted += Z - prev_Z;
5114     }
5115
5116   if (orig_point >= from)
5117     {
5118       if (orig_point >= from + orig_len)
5119         orig_point += inserted - orig_len;
5120       else
5121         orig_point = from;
5122       TEMP_SET_PT (orig_point);
5123     }
5124
5125   if (replace)
5126     {
5127       signal_after_change (from, to - from, inserted);
5128       update_compositions (from, from + inserted, CHECK_BORDER);
5129     }
5130
5131   {
5132     coding->consumed = to_byte - from_byte;
5133     coding->consumed_char = to - from;
5134     coding->produced = inserted_byte;
5135     coding->produced_char = inserted;
5136   }
5137
5138   return 0;
5139 }
5140
5141 Lisp_Object
5142 run_pre_post_conversion_on_str (str, coding, encodep)
5143      Lisp_Object str;
5144      struct coding_system *coding;
5145      int encodep;
5146 {
5147   int count = specpdl_ptr - specpdl;
5148   struct gcpro gcpro1;
5149   struct buffer *prev = current_buffer;
5150   int multibyte = STRING_MULTIBYTE (str);
5151
5152   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5153   record_unwind_protect (code_convert_region_unwind, Qnil);
5154   GCPRO1 (str);
5155   temp_output_buffer_setup (" *code-converting-work*");
5156   set_buffer_internal (XBUFFER (Vstandard_output));
5157   /* We must insert the contents of STR as is without
5158      unibyte<->multibyte conversion.  For that, we adjust the
5159      multibyteness of the working buffer to that of STR.  */
5160   Ferase_buffer ();
5161   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5162   insert_from_string (str, 0, 0,
5163                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5164   UNGCPRO;
5165   inhibit_pre_post_conversion = 1;
5166   if (encodep)
5167     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5168   else
5169     {
5170       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5171       call1 (coding->post_read_conversion, make_number (Z - BEG));
5172     }
5173   inhibit_pre_post_conversion = 0;
5174   str = make_buffer_string (BEG, Z, 0);
5175   return unbind_to (count, str);
5176 }
5177
5178 Lisp_Object
5179 decode_coding_string (str, coding, nocopy)
5180      Lisp_Object str;
5181      struct coding_system *coding;
5182      int nocopy;
5183 {
5184   int len;
5185   char *buf;
5186   int from, to, to_byte;
5187   struct gcpro gcpro1;
5188   Lisp_Object saved_coding_symbol;
5189   int result;
5190
5191   from = 0;
5192   to = XSTRING (str)->size;
5193   to_byte = STRING_BYTES (XSTRING (str));
5194
5195   saved_coding_symbol = Qnil;
5196   if (CODING_REQUIRE_DETECTION (coding))
5197     {
5198       /* See the comments in code_convert_region.  */
5199       if (coding->type == coding_type_undecided)
5200         {
5201           detect_coding (coding, XSTRING (str)->data, to_byte);
5202           if (coding->type == coding_type_undecided)
5203             coding->type = coding_type_emacs_mule;
5204         }
5205       if (coding->eol_type == CODING_EOL_UNDECIDED
5206           && coding->type != coding_type_ccl)
5207         {
5208           saved_coding_symbol = coding->symbol;
5209           detect_eol (coding, XSTRING (str)->data, to_byte);
5210           if (coding->eol_type == CODING_EOL_UNDECIDED)
5211             coding->eol_type = CODING_EOL_LF;
5212           /* We had better recover the original eol format if we
5213              encounter an inconsitent eol format while decoding.  */
5214           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5215         }
5216     }
5217
5218   if (! CODING_REQUIRE_DECODING (coding))
5219     {
5220       if (!STRING_MULTIBYTE (str))
5221         {
5222           str = Fstring_as_multibyte (str);
5223           nocopy = 1;
5224         }
5225       return (nocopy ? str : Fcopy_sequence (str));
5226     }
5227
5228   if (STRING_MULTIBYTE (str))
5229     {
5230       /* Decoding routines expect the source text to be unibyte.  */
5231       str = Fstring_as_unibyte (str);
5232       to_byte = STRING_BYTES (XSTRING (str));
5233       nocopy = 1;
5234       coding->src_multibyte = 0;
5235     }
5236   coding->dst_multibyte = 1;
5237
5238   if (coding->composing != COMPOSITION_DISABLED)
5239     coding_allocate_composition_data (coding, from);
5240
5241   /* Try to skip the heading and tailing ASCIIs.  */
5242   if (coding->type != coding_type_ccl)
5243     {
5244       int from_orig = from;
5245
5246       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5247                                 0);
5248       if (from == to_byte)
5249         return (nocopy ? str : Fcopy_sequence (str));
5250     }
5251
5252   len = decoding_buffer_size (coding, to_byte - from);
5253   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5254   GCPRO1 (str);
5255   buf = get_conversion_buffer (len);
5256   UNGCPRO;
5257
5258   if (from > 0)
5259     bcopy (XSTRING (str)->data, buf, from);
5260   result = decode_coding (coding, XSTRING (str)->data + from,
5261                          buf + from, to_byte - from, len);
5262   if (result == CODING_FINISH_INCONSISTENT_EOL)
5263     {
5264       /* We simply try to decode the whole string again but without
5265          eol-conversion this time.  */
5266       coding->eol_type = CODING_EOL_LF;
5267       coding->symbol = saved_coding_symbol;
5268       coding_free_composition_data (coding);
5269       return decode_coding_string (str, coding, nocopy);
5270     }
5271
5272   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5273          STRING_BYTES (XSTRING (str)) - to_byte);
5274
5275   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5276   str = make_multibyte_string (buf, len + coding->produced_char,
5277                                len + coding->produced);
5278
5279   if (coding->cmp_data && coding->cmp_data->used)
5280     coding_restore_composition (coding, str);
5281   coding_free_composition_data (coding);
5282
5283   if (SYMBOLP (coding->post_read_conversion)
5284       && !NILP (Ffboundp (coding->post_read_conversion)))
5285     str = run_pre_post_conversion_on_str (str, coding, 0);
5286
5287   return str;
5288 }
5289
5290 Lisp_Object
5291 encode_coding_string (str, coding, nocopy)
5292      Lisp_Object str;
5293      struct coding_system *coding;
5294      int nocopy;
5295 {
5296   int len;
5297   char *buf;
5298   int from, to, to_byte;
5299   struct gcpro gcpro1;
5300   Lisp_Object saved_coding_symbol;
5301   int result;
5302
5303   if (SYMBOLP (coding->pre_write_conversion)
5304       && !NILP (Ffboundp (coding->pre_write_conversion)))
5305     str = run_pre_post_conversion_on_str (str, coding, 1);
5306
5307   from = 0;
5308   to = XSTRING (str)->size;
5309   to_byte = STRING_BYTES (XSTRING (str));
5310
5311   saved_coding_symbol = Qnil;
5312   if (! CODING_REQUIRE_ENCODING (coding))
5313     {
5314       if (STRING_MULTIBYTE (str))
5315         {
5316           str = Fstring_as_unibyte (str);
5317           nocopy = 1;
5318         }
5319       return (nocopy ? str : Fcopy_sequence (str));
5320     }
5321
5322   /* Encoding routines determine the multibyteness of the source text
5323      by coding->src_multibyte.  */
5324   coding->src_multibyte = STRING_MULTIBYTE (str);
5325   coding->dst_multibyte = 0;
5326
5327   if (coding->composing != COMPOSITION_DISABLED)
5328     coding_save_composition (coding, from, to, str);
5329
5330   /* Try to skip the heading and tailing ASCIIs.  */
5331   if (coding->type != coding_type_ccl)
5332     {
5333       int from_orig = from;
5334
5335       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5336                                 1);
5337       if (from == to_byte)
5338         return (nocopy ? str : Fcopy_sequence (str));
5339     }
5340
5341   len = encoding_buffer_size (coding, to_byte - from);
5342   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5343   GCPRO1 (str);
5344   buf = get_conversion_buffer (len);
5345   UNGCPRO;
5346
5347   if (from > 0)
5348     bcopy (XSTRING (str)->data, buf, from);
5349   result = encode_coding (coding, XSTRING (str)->data + from,
5350                           buf + from, to_byte - from, len);
5351   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5352          STRING_BYTES (XSTRING (str)) - to_byte);
5353
5354   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5355   str = make_unibyte_string (buf, len + coding->produced);
5356   coding_free_composition_data (coding);
5357
5358   return str;
5359 }
5360
5361 \f
5362 #ifdef emacs
5363 /*** 8. Emacs Lisp library functions ***/
5364
5365 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5366   "Return t if OBJECT is nil or a coding-system.\n\
5367 See the documentation of `make-coding-system' for information\n\
5368 about coding-system objects.")
5369   (obj)
5370      Lisp_Object obj;
5371 {
5372   if (NILP (obj))
5373     return Qt;
5374   if (!SYMBOLP (obj))
5375     return Qnil;
5376   /* Get coding-spec vector for OBJ.  */
5377   obj = Fget (obj, Qcoding_system);
5378   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5379           ? Qt : Qnil);
5380 }
5381
5382 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5383        Sread_non_nil_coding_system, 1, 1, 0,
5384   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5385   (prompt)
5386      Lisp_Object prompt;
5387 {
5388   Lisp_Object val;
5389   do
5390     {
5391       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5392                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5393     }
5394   while (XSTRING (val)->size == 0);
5395   return (Fintern (val, Qnil));
5396 }
5397
5398 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5399   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5400 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5401   (prompt, default_coding_system)
5402      Lisp_Object prompt, default_coding_system;
5403 {
5404   Lisp_Object val;
5405   if (SYMBOLP (default_coding_system))
5406     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5407   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5408                           Qt, Qnil, Qcoding_system_history,
5409                           default_coding_system, Qnil);
5410   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5411 }
5412
5413 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5414        1, 1, 0,
5415   "Check validity of CODING-SYSTEM.\n\
5416 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5417 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5418 The value of property should be a vector of length 5.")
5419   (coding_system)
5420      Lisp_Object coding_system;
5421 {
5422   CHECK_SYMBOL (coding_system, 0);
5423   if (!NILP (Fcoding_system_p (coding_system)))
5424     return coding_system;
5425   while (1)
5426     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5427 }
5428 \f
5429 Lisp_Object
5430 detect_coding_system (src, src_bytes, highest)
5431      unsigned char *src;
5432      int src_bytes, highest;
5433 {
5434   int coding_mask, eol_type;
5435   Lisp_Object val, tmp;
5436   int dummy;
5437
5438   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5439   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5440   if (eol_type == CODING_EOL_INCONSISTENT)
5441     eol_type = CODING_EOL_UNDECIDED;
5442
5443   if (!coding_mask)
5444     {
5445       val = Qundecided;
5446       if (eol_type != CODING_EOL_UNDECIDED)
5447         {
5448           Lisp_Object val2;
5449           val2 = Fget (Qundecided, Qeol_type);
5450           if (VECTORP (val2))
5451             val = XVECTOR (val2)->contents[eol_type];
5452         }
5453       return (highest ? val : Fcons (val, Qnil));
5454     }
5455
5456   /* At first, gather possible coding systems in VAL.  */
5457   val = Qnil;
5458   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5459     {
5460       Lisp_Object category_val, category_index;
5461
5462       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5463       category_val = Fsymbol_value (XCAR (tmp));
5464       if (!NILP (category_val)
5465           && NATNUMP (category_index)
5466           && (coding_mask & (1 << XFASTINT (category_index))))
5467         {
5468           val = Fcons (category_val, val);
5469           if (highest)
5470             break;
5471         }
5472     }
5473   if (!highest)
5474     val = Fnreverse (val);
5475
5476   /* Then, replace the elements with subsidiary coding systems.  */
5477   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5478     {
5479       if (eol_type != CODING_EOL_UNDECIDED
5480           && eol_type != CODING_EOL_INCONSISTENT)
5481         {
5482           Lisp_Object eol;
5483           eol = Fget (XCAR (tmp), Qeol_type);
5484           if (VECTORP (eol))
5485             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5486         }
5487     }
5488   return (highest ? XCAR (val) : val);
5489 }
5490
5491 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5492        2, 3, 0,
5493   "Detect coding system of the text in the region between START and END.\n\
5494 Return a list of possible coding systems ordered by priority.\n\
5495 \n\
5496 If only ASCII characters are found, it returns a list of single element\n\
5497 `undecided' or its subsidiary coding system according to a detected\n\
5498 end-of-line format.\n\
5499 \n\
5500 If optional argument HIGHEST is non-nil, return the coding system of\n\
5501 highest priority.")
5502   (start, end, highest)
5503      Lisp_Object start, end, highest;
5504 {
5505   int from, to;
5506   int from_byte, to_byte;
5507
5508   CHECK_NUMBER_COERCE_MARKER (start, 0);
5509   CHECK_NUMBER_COERCE_MARKER (end, 1);
5510
5511   validate_region (&start, &end);
5512   from = XINT (start), to = XINT (end);
5513   from_byte = CHAR_TO_BYTE (from);
5514   to_byte = CHAR_TO_BYTE (to);
5515
5516   if (from < GPT && to >= GPT)
5517     move_gap_both (to, to_byte);
5518
5519   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5520                                to_byte - from_byte,
5521                                !NILP (highest));
5522 }
5523
5524 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5525        1, 2, 0,
5526   "Detect coding system of the text in STRING.\n\
5527 Return a list of possible coding systems ordered by priority.\n\
5528 \n\
5529 If only ASCII characters are found, it returns a list of single element\n\
5530 `undecided' or its subsidiary coding system according to a detected\n\
5531 end-of-line format.\n\
5532 \n\
5533 If optional argument HIGHEST is non-nil, return the coding system of\n\
5534 highest priority.")
5535   (string, highest)
5536      Lisp_Object string, highest;
5537 {
5538   CHECK_STRING (string, 0);
5539
5540   return detect_coding_system (XSTRING (string)->data,
5541                                STRING_BYTES (XSTRING (string)),
5542                                !NILP (highest));
5543 }
5544
5545 Lisp_Object
5546 code_convert_region1 (start, end, coding_system, encodep)
5547      Lisp_Object start, end, coding_system;
5548      int encodep;
5549 {
5550   struct coding_system coding;
5551   int from, to, len;
5552
5553   CHECK_NUMBER_COERCE_MARKER (start, 0);
5554   CHECK_NUMBER_COERCE_MARKER (end, 1);
5555   CHECK_SYMBOL (coding_system, 2);
5556
5557   validate_region (&start, &end);
5558   from = XFASTINT (start);
5559   to = XFASTINT (end);
5560
5561   if (NILP (coding_system))
5562     return make_number (to - from);
5563
5564   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5565     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5566
5567   coding.mode |= CODING_MODE_LAST_BLOCK;
5568   coding.src_multibyte = coding.dst_multibyte
5569     = !NILP (current_buffer->enable_multibyte_characters);
5570   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5571                        &coding, encodep, 1);
5572   Vlast_coding_system_used = coding.symbol;
5573   return make_number (coding.produced_char);
5574 }
5575
5576 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5577        3, 3, "r\nzCoding system: ",
5578   "Decode the current region by specified coding system.\n\
5579 When called from a program, takes three arguments:\n\
5580 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5581 This function sets `last-coding-system-used' to the precise coding system\n\
5582 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5583 not fully specified.)\n\
5584 It returns the length of the decoded text.")
5585   (start, end, coding_system)
5586      Lisp_Object start, end, coding_system;
5587 {
5588   return code_convert_region1 (start, end, coding_system, 0);
5589 }
5590
5591 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5592        3, 3, "r\nzCoding system: ",
5593   "Encode the current region by specified coding system.\n\
5594 When called from a program, takes three arguments:\n\
5595 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5596 This function sets `last-coding-system-used' to the precise coding system\n\
5597 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5598 not fully specified.)\n\
5599 It returns the length of the encoded text.")
5600   (start, end, coding_system)
5601      Lisp_Object start, end, coding_system;
5602 {
5603   return code_convert_region1 (start, end, coding_system, 1);
5604 }
5605
5606 Lisp_Object
5607 code_convert_string1 (string, coding_system, nocopy, encodep)
5608      Lisp_Object string, coding_system, nocopy;
5609      int encodep;
5610 {
5611   struct coding_system coding;
5612
5613   CHECK_STRING (string, 0);
5614   CHECK_SYMBOL (coding_system, 1);
5615
5616   if (NILP (coding_system))
5617     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5618
5619   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5620     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5621
5622   coding.mode |= CODING_MODE_LAST_BLOCK;
5623   string = (encodep
5624             ? encode_coding_string (string, &coding, !NILP (nocopy))
5625             : decode_coding_string (string, &coding, !NILP (nocopy)));
5626   Vlast_coding_system_used = coding.symbol;
5627
5628   return string;
5629 }
5630
5631 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5632        2, 3, 0,
5633   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5634 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5635 if the decoding operation is trivial.\n\
5636 This function sets `last-coding-system-used' to the precise coding system\n\
5637 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5638 not fully specified.)")
5639   (string, coding_system, nocopy)
5640      Lisp_Object string, coding_system, nocopy;
5641 {
5642   return code_convert_string1 (string, coding_system, nocopy, 0);
5643 }
5644
5645 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5646        2, 3, 0,
5647   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5648 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5649 if the encoding operation is trivial.\n\
5650 This function sets `last-coding-system-used' to the precise coding system\n\
5651 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5652 not fully specified.)")
5653   (string, coding_system, nocopy)
5654      Lisp_Object string, coding_system, nocopy;
5655 {
5656   return code_convert_string1 (string, coding_system, nocopy, 1);
5657 }
5658
5659 /* Encode or decode STRING according to CODING_SYSTEM.
5660    Do not set Vlast_coding_system_used.
5661
5662    This function is called only from macros DECODE_FILE and
5663    ENCODE_FILE, thus we ignore character composition.  */
5664
5665 Lisp_Object
5666 code_convert_string_norecord (string, coding_system, encodep)
5667      Lisp_Object string, coding_system;
5668      int encodep;
5669 {
5670   struct coding_system coding;
5671
5672   CHECK_STRING (string, 0);
5673   CHECK_SYMBOL (coding_system, 1);
5674
5675   if (NILP (coding_system))
5676     return string;
5677
5678   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5679     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5680
5681   coding.composing = COMPOSITION_DISABLED;
5682   coding.mode |= CODING_MODE_LAST_BLOCK;
5683   return (encodep
5684           ? encode_coding_string (string, &coding, 1)
5685           : decode_coding_string (string, &coding, 1));
5686 }
5687 \f
5688 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5689   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5690 Return the corresponding character.")
5691   (code)
5692      Lisp_Object code;
5693 {
5694   unsigned char c1, c2, s1, s2;
5695   Lisp_Object val;
5696
5697   CHECK_NUMBER (code, 0);
5698   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5699   if (s1 == 0)
5700     {
5701       if (s2 < 0x80)
5702         XSETFASTINT (val, s2);
5703       else if (s2 >= 0xA0 || s2 <= 0xDF)
5704         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5705       else
5706         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5707     }
5708   else
5709     {
5710       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5711           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5712         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5713       DECODE_SJIS (s1, s2, c1, c2);
5714       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5715     }
5716   return val;
5717 }
5718
5719 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5720   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5721 Return the corresponding code in SJIS.")
5722   (ch)
5723      Lisp_Object ch;
5724 {
5725   int charset, c1, c2, s1, s2;
5726   Lisp_Object val;
5727
5728   CHECK_NUMBER (ch, 0);
5729   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5730   if (charset == CHARSET_ASCII)
5731     {
5732       val = ch;
5733     }
5734   else if (charset == charset_jisx0208
5735            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5736     {
5737       ENCODE_SJIS (c1, c2, s1, s2);
5738       XSETFASTINT (val, (s1 << 8) | s2);
5739     }
5740   else if (charset == charset_katakana_jisx0201
5741            && c1 > 0x20 && c2 < 0xE0)
5742     {
5743       XSETFASTINT (val, c1 | 0x80);
5744     }
5745   else
5746     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5747   return val;
5748 }
5749
5750 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5751   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5752 Return the corresponding character.")
5753   (code)
5754      Lisp_Object code;
5755 {
5756   int charset;
5757   unsigned char b1, b2, c1, c2;
5758   Lisp_Object val;
5759
5760   CHECK_NUMBER (code, 0);
5761   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5762   if (b1 == 0)
5763     {
5764       if (b2 >= 0x80)
5765         error ("Invalid BIG5 code: %x", XFASTINT (code));
5766       val = code;
5767     }
5768   else
5769     {
5770       if ((b1 < 0xA1 || b1 > 0xFE)
5771           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5772         error ("Invalid BIG5 code: %x", XFASTINT (code));
5773       DECODE_BIG5 (b1, b2, charset, c1, c2);
5774       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5775     }
5776   return val;
5777 }
5778
5779 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5780   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5781 Return the corresponding character code in Big5.")
5782   (ch)
5783      Lisp_Object ch;
5784 {
5785   int charset, c1, c2, b1, b2;
5786   Lisp_Object val;
5787
5788   CHECK_NUMBER (ch, 0);
5789   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5790   if (charset == CHARSET_ASCII)
5791     {
5792       val = ch;
5793     }
5794   else if ((charset == charset_big5_1
5795             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5796            || (charset == charset_big5_2
5797                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5798     {
5799       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5800       XSETFASTINT (val, (b1 << 8) | b2);
5801     }
5802   else
5803     error ("Can't encode to Big5: %d", XFASTINT (ch));
5804   return val;
5805 }
5806 \f
5807 DEFUN ("set-terminal-coding-system-internal",
5808        Fset_terminal_coding_system_internal,
5809        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5810   (coding_system)
5811      Lisp_Object coding_system;
5812 {
5813   CHECK_SYMBOL (coding_system, 0);
5814   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5815   /* We had better not send unsafe characters to terminal.  */
5816   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5817   /* Characer composition should be disabled.  */
5818   terminal_coding.composing = COMPOSITION_DISABLED;
5819   terminal_coding.src_multibyte = 1;
5820   terminal_coding.dst_multibyte = 0;
5821   return Qnil;
5822 }
5823
5824 DEFUN ("set-safe-terminal-coding-system-internal",
5825        Fset_safe_terminal_coding_system_internal,
5826        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5827   (coding_system)
5828      Lisp_Object coding_system;
5829 {
5830   CHECK_SYMBOL (coding_system, 0);
5831   setup_coding_system (Fcheck_coding_system (coding_system),
5832                        &safe_terminal_coding);
5833   /* Characer composition should be disabled.  */
5834   safe_terminal_coding.composing = COMPOSITION_DISABLED;
5835   safe_terminal_coding.src_multibyte = 1;
5836   safe_terminal_coding.dst_multibyte = 0;
5837   return Qnil;
5838 }
5839
5840 DEFUN ("terminal-coding-system",
5841        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5842   "Return coding system specified for terminal output.")
5843   ()
5844 {
5845   return terminal_coding.symbol;
5846 }
5847
5848 DEFUN ("set-keyboard-coding-system-internal",
5849        Fset_keyboard_coding_system_internal,
5850        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5851   (coding_system)
5852      Lisp_Object coding_system;
5853 {
5854   CHECK_SYMBOL (coding_system, 0);
5855   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5856   /* Characer composition should be disabled.  */
5857   keyboard_coding.composing = COMPOSITION_DISABLED;
5858   return Qnil;
5859 }
5860
5861 DEFUN ("keyboard-coding-system",
5862        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5863   "Return coding system specified for decoding keyboard input.")
5864   ()
5865 {
5866   return keyboard_coding.symbol;
5867 }
5868
5869 \f
5870 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5871        Sfind_operation_coding_system,  1, MANY, 0,
5872   "Choose a coding system for an operation based on the target name.\n\
5873 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5874 DECODING-SYSTEM is the coding system to use for decoding\n\
5875 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5876 for encoding (in case OPERATION does encoding).\n\
5877 \n\
5878 The first argument OPERATION specifies an I/O primitive:\n\
5879   For file I/O, `insert-file-contents' or `write-region'.\n\
5880   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5881   For network I/O, `open-network-stream'.\n\
5882 \n\
5883 The remaining arguments should be the same arguments that were passed\n\
5884 to the primitive.  Depending on which primitive, one of those arguments\n\
5885 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5886 whichever argument specifies the file name is TARGET.\n\
5887 \n\
5888 TARGET has a meaning which depends on OPERATION:\n\
5889   For file I/O, TARGET is a file name.\n\
5890   For process I/O, TARGET is a process name.\n\
5891   For network I/O, TARGET is a service name or a port number\n\
5892 \n\
5893 This function looks up what specified for TARGET in,\n\
5894 `file-coding-system-alist', `process-coding-system-alist',\n\
5895 or `network-coding-system-alist' depending on OPERATION.\n\
5896 They may specify a coding system, a cons of coding systems,\n\
5897 or a function symbol to call.\n\
5898 In the last case, we call the function with one argument,\n\
5899 which is a list of all the arguments given to this function.")
5900   (nargs, args)
5901      int nargs;
5902      Lisp_Object *args;
5903 {
5904   Lisp_Object operation, target_idx, target, val;
5905   register Lisp_Object chain;
5906
5907   if (nargs < 2)
5908     error ("Too few arguments");
5909   operation = args[0];
5910   if (!SYMBOLP (operation)
5911       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5912     error ("Invalid first arguement");
5913   if (nargs < 1 + XINT (target_idx))
5914     error ("Too few arguments for operation: %s",
5915            XSYMBOL (operation)->name->data);
5916   target = args[XINT (target_idx) + 1];
5917   if (!(STRINGP (target)
5918         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5919     error ("Invalid %dth argument", XINT (target_idx) + 1);
5920
5921   chain = ((EQ (operation, Qinsert_file_contents)
5922             || EQ (operation, Qwrite_region))
5923            ? Vfile_coding_system_alist
5924            : (EQ (operation, Qopen_network_stream)
5925               ? Vnetwork_coding_system_alist
5926               : Vprocess_coding_system_alist));
5927   if (NILP (chain))
5928     return Qnil;
5929
5930   for (; CONSP (chain); chain = XCDR (chain))
5931     {
5932       Lisp_Object elt;
5933       elt = XCAR (chain);
5934
5935       if (CONSP (elt)
5936           && ((STRINGP (target)
5937                && STRINGP (XCAR (elt))
5938                && fast_string_match (XCAR (elt), target) >= 0)
5939               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5940         {
5941           val = XCDR (elt);
5942           /* Here, if VAL is both a valid coding system and a valid
5943              function symbol, we return VAL as a coding system.  */
5944           if (CONSP (val))
5945             return val;
5946           if (! SYMBOLP (val))
5947             return Qnil;
5948           if (! NILP (Fcoding_system_p (val)))
5949             return Fcons (val, val);
5950           if (! NILP (Ffboundp (val)))
5951             {
5952               val = call1 (val, Flist (nargs, args));
5953               if (CONSP (val))
5954                 return val;
5955               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5956                 return Fcons (val, val);
5957             }
5958           return Qnil;
5959         }
5960     }
5961   return Qnil;
5962 }
5963
5964 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5965        Supdate_coding_systems_internal, 0, 0, 0,
5966   "Update internal database for ISO2022 and CCL based coding systems.\n\
5967 When values of any coding categories are changed, you must\n\
5968 call this function")
5969   ()
5970 {
5971   int i;
5972
5973   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5974     {
5975       Lisp_Object val;
5976
5977       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5978       if (!NILP (val))
5979         {
5980           if (! coding_system_table[i])
5981             coding_system_table[i] = ((struct coding_system *)
5982                                       xmalloc (sizeof (struct coding_system)));
5983           setup_coding_system (val, coding_system_table[i]);
5984         }
5985       else if (coding_system_table[i])
5986         {
5987           xfree (coding_system_table[i]);
5988           coding_system_table[i] = NULL;
5989         }
5990     }
5991
5992   return Qnil;
5993 }
5994
5995 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5996        Sset_coding_priority_internal, 0, 0, 0,
5997   "Update internal database for the current value of `coding-category-list'.\n\
5998 This function is internal use only.")
5999   ()
6000 {
6001   int i = 0, idx;
6002   Lisp_Object val;
6003
6004   val = Vcoding_category_list;
6005
6006   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
6007     {
6008       if (! SYMBOLP (XCAR (val)))
6009         break;
6010       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6011       if (idx >= CODING_CATEGORY_IDX_MAX)
6012         break;
6013       coding_priorities[i++] = (1 << idx);
6014       val = XCDR (val);
6015     }
6016   /* If coding-category-list is valid and contains all coding
6017      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
6018      the following code saves Emacs from crashing.  */
6019   while (i < CODING_CATEGORY_IDX_MAX)
6020     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6021
6022   return Qnil;
6023 }
6024
6025 #endif /* emacs */
6026
6027 \f
6028 /*** 9. Post-amble ***/
6029
6030 void
6031 init_coding ()
6032 {
6033   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6034 }
6035
6036 void
6037 init_coding_once ()
6038 {
6039   int i;
6040
6041   /* Emacs' internal format specific initialize routine.  */
6042   for (i = 0; i <= 0x20; i++)
6043     emacs_code_class[i] = EMACS_control_code;
6044   emacs_code_class[0x0A] = EMACS_linefeed_code;
6045   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6046   for (i = 0x21 ; i < 0x7F; i++)
6047     emacs_code_class[i] = EMACS_ascii_code;
6048   emacs_code_class[0x7F] = EMACS_control_code;
6049   for (i = 0x80; i < 0xFF; i++)
6050     emacs_code_class[i] = EMACS_invalid_code;
6051   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6052   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6053   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6054   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6055
6056   /* ISO2022 specific initialize routine.  */
6057   for (i = 0; i < 0x20; i++)
6058     iso_code_class[i] = ISO_control_0;
6059   for (i = 0x21; i < 0x7F; i++)
6060     iso_code_class[i] = ISO_graphic_plane_0;
6061   for (i = 0x80; i < 0xA0; i++)
6062     iso_code_class[i] = ISO_control_1;
6063   for (i = 0xA1; i < 0xFF; i++)
6064     iso_code_class[i] = ISO_graphic_plane_1;
6065   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6066   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6067   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6068   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6069   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6070   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6071   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6072   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6073   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6074   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6075
6076   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6077
6078   setup_coding_system (Qnil, &keyboard_coding);
6079   setup_coding_system (Qnil, &terminal_coding);
6080   setup_coding_system (Qnil, &safe_terminal_coding);
6081   setup_coding_system (Qnil, &default_buffer_file_coding);
6082
6083   bzero (coding_system_table, sizeof coding_system_table);
6084
6085   bzero (ascii_skip_code, sizeof ascii_skip_code);
6086   for (i = 0; i < 128; i++)
6087     ascii_skip_code[i] = 1;
6088
6089 #if defined (MSDOS) || defined (WINDOWSNT)
6090   system_eol_type = CODING_EOL_CRLF;
6091 #else
6092   system_eol_type = CODING_EOL_LF;
6093 #endif
6094
6095   inhibit_pre_post_conversion = 0;
6096 }
6097
6098 #ifdef emacs
6099
6100 void
6101 syms_of_coding ()
6102 {
6103   Qtarget_idx = intern ("target-idx");
6104   staticpro (&Qtarget_idx);
6105
6106   Qcoding_system_history = intern ("coding-system-history");
6107   staticpro (&Qcoding_system_history);
6108   Fset (Qcoding_system_history, Qnil);
6109
6110   /* Target FILENAME is the first argument.  */
6111   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6112   /* Target FILENAME is the third argument.  */
6113   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6114
6115   Qcall_process = intern ("call-process");
6116   staticpro (&Qcall_process);
6117   /* Target PROGRAM is the first argument.  */
6118   Fput (Qcall_process, Qtarget_idx, make_number (0));
6119
6120   Qcall_process_region = intern ("call-process-region");
6121   staticpro (&Qcall_process_region);
6122   /* Target PROGRAM is the third argument.  */
6123   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6124
6125   Qstart_process = intern ("start-process");
6126   staticpro (&Qstart_process);
6127   /* Target PROGRAM is the third argument.  */
6128   Fput (Qstart_process, Qtarget_idx, make_number (2));
6129
6130   Qopen_network_stream = intern ("open-network-stream");
6131   staticpro (&Qopen_network_stream);
6132   /* Target SERVICE is the fourth argument.  */
6133   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6134
6135   Qcoding_system = intern ("coding-system");
6136   staticpro (&Qcoding_system);
6137
6138   Qeol_type = intern ("eol-type");
6139   staticpro (&Qeol_type);
6140
6141   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6142   staticpro (&Qbuffer_file_coding_system);
6143
6144   Qpost_read_conversion = intern ("post-read-conversion");
6145   staticpro (&Qpost_read_conversion);
6146
6147   Qpre_write_conversion = intern ("pre-write-conversion");
6148   staticpro (&Qpre_write_conversion);
6149
6150   Qno_conversion = intern ("no-conversion");
6151   staticpro (&Qno_conversion);
6152
6153   Qundecided = intern ("undecided");
6154   staticpro (&Qundecided);
6155
6156   Qcoding_system_p = intern ("coding-system-p");
6157   staticpro (&Qcoding_system_p);
6158
6159   Qcoding_system_error = intern ("coding-system-error");
6160   staticpro (&Qcoding_system_error);
6161
6162   Fput (Qcoding_system_error, Qerror_conditions,
6163         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6164   Fput (Qcoding_system_error, Qerror_message,
6165         build_string ("Invalid coding system"));
6166
6167   Qcoding_category = intern ("coding-category");
6168   staticpro (&Qcoding_category);
6169   Qcoding_category_index = intern ("coding-category-index");
6170   staticpro (&Qcoding_category_index);
6171
6172   Vcoding_category_table
6173     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6174   staticpro (&Vcoding_category_table);
6175   {
6176     int i;
6177     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6178       {
6179         XVECTOR (Vcoding_category_table)->contents[i]
6180           = intern (coding_category_name[i]);
6181         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6182               Qcoding_category_index, make_number (i));
6183       }
6184   }
6185
6186   Qtranslation_table = intern ("translation-table");
6187   staticpro (&Qtranslation_table);
6188   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6189
6190   Qtranslation_table_id = intern ("translation-table-id");
6191   staticpro (&Qtranslation_table_id);
6192
6193   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6194   staticpro (&Qtranslation_table_for_decode);
6195
6196   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6197   staticpro (&Qtranslation_table_for_encode);
6198
6199   Qsafe_charsets = intern ("safe-charsets");
6200   staticpro (&Qsafe_charsets);
6201
6202   Qvalid_codes = intern ("valid-codes");
6203   staticpro (&Qvalid_codes);
6204
6205   Qemacs_mule = intern ("emacs-mule");
6206   staticpro (&Qemacs_mule);
6207
6208   Qraw_text = intern ("raw-text");
6209   staticpro (&Qraw_text);
6210
6211   defsubr (&Scoding_system_p);
6212   defsubr (&Sread_coding_system);
6213   defsubr (&Sread_non_nil_coding_system);
6214   defsubr (&Scheck_coding_system);
6215   defsubr (&Sdetect_coding_region);
6216   defsubr (&Sdetect_coding_string);
6217   defsubr (&Sdecode_coding_region);
6218   defsubr (&Sencode_coding_region);
6219   defsubr (&Sdecode_coding_string);
6220   defsubr (&Sencode_coding_string);
6221   defsubr (&Sdecode_sjis_char);
6222   defsubr (&Sencode_sjis_char);
6223   defsubr (&Sdecode_big5_char);
6224   defsubr (&Sencode_big5_char);
6225   defsubr (&Sset_terminal_coding_system_internal);
6226   defsubr (&Sset_safe_terminal_coding_system_internal);
6227   defsubr (&Sterminal_coding_system);
6228   defsubr (&Sset_keyboard_coding_system_internal);
6229   defsubr (&Skeyboard_coding_system);
6230   defsubr (&Sfind_operation_coding_system);
6231   defsubr (&Supdate_coding_systems_internal);
6232   defsubr (&Sset_coding_priority_internal);
6233
6234   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6235     "List of coding systems.\n\
6236 \n\
6237 Do not alter the value of this variable manually.  This variable should be\n\
6238 updated by the functions `make-coding-system' and\n\
6239 `define-coding-system-alias'.");
6240   Vcoding_system_list = Qnil;
6241
6242   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6243     "Alist of coding system names.\n\
6244 Each element is one element list of coding system name.\n\
6245 This variable is given to `completing-read' as TABLE argument.\n\
6246 \n\
6247 Do not alter the value of this variable manually.  This variable should be\n\
6248 updated by the functions `make-coding-system' and\n\
6249 `define-coding-system-alias'.");
6250   Vcoding_system_alist = Qnil;
6251
6252   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6253     "List of coding-categories (symbols) ordered by priority.");
6254   {
6255     int i;
6256
6257     Vcoding_category_list = Qnil;
6258     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6259       Vcoding_category_list
6260         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6261                  Vcoding_category_list);
6262   }
6263
6264   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6265     "Specify the coding system for read operations.\n\
6266 It is useful to bind this variable with `let', but do not set it globally.\n\
6267 If the value is a coding system, it is used for decoding on read operation.\n\
6268 If not, an appropriate element is used from one of the coding system alists:\n\
6269 There are three such tables, `file-coding-system-alist',\n\
6270 `process-coding-system-alist', and `network-coding-system-alist'.");
6271   Vcoding_system_for_read = Qnil;
6272
6273   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6274     "Specify the coding system for write operations.\n\
6275 Programs bind this variable with `let', but you should not set it globally.\n\
6276 If the value is a coding system, it is used for encoding of output,\n\
6277 when writing it to a file and when sending it to a file or subprocess.\n\
6278 \n\
6279 If this does not specify a coding system, an appropriate element\n\
6280 is used from one of the coding system alists:\n\
6281 There are three such tables, `file-coding-system-alist',\n\
6282 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6283 For output to files, if the above procedure does not specify a coding system,\n\
6284 the value of `buffer-file-coding-system' is used.");
6285   Vcoding_system_for_write = Qnil;
6286
6287   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6288     "Coding system used in the latest file or process I/O.");
6289   Vlast_coding_system_used = Qnil;
6290
6291   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6292     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6293 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6294 such conversion.");
6295   inhibit_eol_conversion = 0;
6296
6297   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6298     "Non-nil means process buffer inherits coding system of process output.\n\
6299 Bind it to t if the process output is to be treated as if it were a file\n\
6300 read from some filesystem.");
6301   inherit_process_coding_system = 0;
6302
6303   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6304     "Alist to decide a coding system to use for a file I/O operation.\n\
6305 The format is ((PATTERN . VAL) ...),\n\
6306 where PATTERN is a regular expression matching a file name,\n\
6307 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6308 If VAL is a coding system, it is used for both decoding and encoding\n\
6309 the file contents.\n\
6310 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6311 and the cdr part is used for encoding.\n\
6312 If VAL is a function symbol, the function must return a coding system\n\
6313 or a cons of coding systems which are used as above.\n\
6314 \n\
6315 See also the function `find-operation-coding-system'\n\
6316 and the variable `auto-coding-alist'.");
6317   Vfile_coding_system_alist = Qnil;
6318
6319   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6320     "Alist to decide a coding system to use for a process I/O operation.\n\
6321 The format is ((PATTERN . VAL) ...),\n\
6322 where PATTERN is a regular expression matching a program name,\n\
6323 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6324 If VAL is a coding system, it is used for both decoding what received\n\
6325 from the program and encoding what sent to the program.\n\
6326 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6327 and the cdr part is used for encoding.\n\
6328 If VAL is a function symbol, the function must return a coding system\n\
6329 or a cons of coding systems which are used as above.\n\
6330 \n\
6331 See also the function `find-operation-coding-system'.");
6332   Vprocess_coding_system_alist = Qnil;
6333
6334   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6335     "Alist to decide a coding system to use for a network I/O operation.\n\
6336 The format is ((PATTERN . VAL) ...),\n\
6337 where PATTERN is a regular expression matching a network service name\n\
6338 or is a port number to connect to,\n\
6339 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6340 If VAL is a coding system, it is used for both decoding what received\n\
6341 from the network stream and encoding what sent to the network stream.\n\
6342 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6343 and the cdr part is used for encoding.\n\
6344 If VAL is a function symbol, the function must return a coding system\n\
6345 or a cons of coding systems which are used as above.\n\
6346 \n\
6347 See also the function `find-operation-coding-system'.");
6348   Vnetwork_coding_system_alist = Qnil;
6349
6350   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6351     "Coding system to use with system messages.");
6352   Vlocale_coding_system = Qnil;
6353
6354   /* The eol mnemonics are reset in startup.el system-dependently.  */
6355   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6356     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6357   eol_mnemonic_unix = build_string (":");
6358
6359   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6360     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6361   eol_mnemonic_dos = build_string ("\\");
6362
6363   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6364     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6365   eol_mnemonic_mac = build_string ("/");
6366
6367   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6368     "*String displayed in mode line when end-of-line format is not yet determined.");
6369   eol_mnemonic_undecided = build_string (":");
6370
6371   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6372     "*Non-nil enables character translation while encoding and decoding.");
6373   Venable_character_translation = Qt;
6374
6375   DEFVAR_LISP ("standard-translation-table-for-decode",
6376     &Vstandard_translation_table_for_decode,
6377     "Table for translating characters while decoding.");
6378   Vstandard_translation_table_for_decode = Qnil;
6379
6380   DEFVAR_LISP ("standard-translation-table-for-encode",
6381     &Vstandard_translation_table_for_encode,
6382     "Table for translationg characters while encoding.");
6383   Vstandard_translation_table_for_encode = Qnil;
6384
6385   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6386     "Alist of charsets vs revision numbers.\n\
6387 While encoding, if a charset (car part of an element) is found,\n\
6388 designate it with the escape sequence identifing revision (cdr part of the element).");
6389   Vcharset_revision_alist = Qnil;
6390
6391   DEFVAR_LISP ("default-process-coding-system",
6392                &Vdefault_process_coding_system,
6393     "Cons of coding systems used for process I/O by default.\n\
6394 The car part is used for decoding a process output,\n\
6395 the cdr part is used for encoding a text to be sent to a process.");
6396   Vdefault_process_coding_system = Qnil;
6397
6398   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6399     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6400 This is a vector of length 256.\n\
6401 If Nth element is non-nil, the existence of code N in a file\n\
6402 \(or output of subprocess) doesn't prevent it to be detected as\n\
6403 a coding system of ISO 2022 variant which has a flag\n\
6404 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6405 or reading output of a subprocess.\n\
6406 Only 128th through 159th elements has a meaning.");
6407   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6408
6409   DEFVAR_LISP ("select-safe-coding-system-function",
6410                &Vselect_safe_coding_system_function,
6411     "Function to call to select safe coding system for encoding a text.\n\
6412 \n\
6413 If set, this function is called to force a user to select a proper\n\
6414 coding system which can encode the text in the case that a default\n\
6415 coding system used in each operation can't encode the text.\n\
6416 \n\
6417 The default value is `select-safe-coding-system' (which see).");
6418   Vselect_safe_coding_system_function = Qnil;
6419
6420   DEFVAR_BOOL ("inhibit-iso-eacape-detection",
6421                &inhibit_iso_escape_detection,
6422     "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
6423 \n\
6424 By default, on reading a file, Emacs tries to detect how the text is\n\
6425 encoded.  This code detection is sensitive to escape sequences.  If\n\
6426 the sequence is valid as ISO2022, the code is detemined as one of\n\
6427 ISO2022 encoding, and the file is decoded by the corresponding coding\n\
6428 system (e.g. `iso-2022-7bit').\n\
6429 \n\
6430 However, there may be a case that you want to read escape sequences in\n\
6431 a file as is.  In such a case, you can set this variable to non-nil.\n\
6432 Then, as the code detection ignores any escape sequences, no file is\n\
6433 detected as some of ISO2022 encoding.  The result is that all escape\n\
6434 sequences become visible in a buffer.\n\
6435 \n\
6436 The default value is nil, and it is strongly recommended not to change\n\
6437 it.  That is because many Emacs Lisp source files that contain\n\
6438 non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
6439 in Emacs's distribution, and they won't be decoded correctly on\n\
6440 reading if you suppress escapse sequence detection.\n\
6441 \n\
6442 The other way to read escape sequences in a file without decoding is\n\
6443 to explicitely specify some coding system that doesn't use ISO2022's\n\
6444 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
6445   inhibit_iso_escape_detection = 0;
6446 }
6447
6448 char *
6449 emacs_strerror (error_number)
6450      int error_number;
6451 {
6452   char *str;
6453
6454   synchronize_system_messages_locale ();
6455   str = strerror (error_number);
6456
6457   if (! NILP (Vlocale_coding_system))
6458     {
6459       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6460                                                       Vlocale_coding_system,
6461                                                       0);
6462       str = (char *) XSTRING (dec)->data;
6463     }
6464
6465   return str;
6466 }
6467
6468 #endif /* emacs */
6469