code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
   45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
   96   How the end-of-line of a text is encoded depends on the system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
  113   which the appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  */
 116 #if 0
 117 int
 118 detect_coding_emacs_mule (src, src_end)
 119      unsigned char *src, *src_end;
 120 {
 121   ...
 122 }
 123 #endif
 124
 125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 126
 127   These functions decode SRC_BYTES length of unibyte text at SOURCE
 128   encoded in CODING to Emacs' internal format.  The resulting
 129   multibyte text goes to a place pointed to by DESTINATION, the length
 130   of which should not exceed DST_BYTES.
 131
 132   These functions set the information of original and decoded texts in
 133   the members produced, produced_char, consumed, and consumed_char of
 134   the structure *CODING.  They also set the member result to one of
 135   CODING_FINISH_XXX indicating how the decoding finished.
 136
  137   DST_BYTES zero means that the source area and the destination area
  138   overlap, which means that we can produce decoded text only until it
  139   reaches the head of the not-yet-decoded source text.
 140
 141   Below is a template of these functions.  */
 142 #if 0
 143 static void
 144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148 {
 149   ...
 150 }
 151 #endif
 152
 153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 154
 155   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 156   internal multibyte format to CODING.  The resulting unibyte text
 157   goes to a place pointed to by DESTINATION, the length of which
 158   should not exceed DST_BYTES.
 159
 160   These functions set the information of original and encoded texts in
 161   the members produced, produced_char, consumed, and consumed_char of
 162   the structure *CODING.  They also set the member result to one of
 163   CODING_FINISH_XXX indicating how the encoding finished.
 164
  165   DST_BYTES zero means that the source area and the destination area
  166   overlap, which means that we can produce encoded text only until it
  167   reaches the head of the not-yet-encoded source text.
 168
 169   Below is a template of these functions.  */
 170 #if 0
 171 static void
 172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 173      struct coding_system *coding;
 174      unsigned char *source, *destination;
 175      int src_bytes, dst_bytes;
 176 {
 177   ...
 178 }
 179 #endif
 180
 181 /*** COMMONLY USED MACROS ***/
 182
  183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  184    get one and two bytes from the source text respectively.  If
  185    there are not enough bytes in the source, they set coding->result
  186    to CODING_FINISH_INSUFFICIENT_SRC and jump to
  187    `label_end_of_loop' without consuming anything.  The caller should
  188    set variables `coding', `src' and `src_end' to appropriate pointers
  189    in advance.  These macros are called from decoding routines
         `decode_coding_XXX', thus it is assumed that the source text is
         unibyte.  */
  190
  191 #define ONE_MORE_BYTE(c1)                                       \
  192   do {                                                          \
  193     if (src >= src_end)         /* source exhausted */          \
  194       {                                                         \
  195         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  196         goto label_end_of_loop;                                 \
  197       }                                                         \
  198     c1 = *src++;                /* fetch, then advance */       \
  199   } while (0)
 200
  201 #define TWO_MORE_BYTES(c1, c2)  /* get two source bytes */      \
  202   do {                                                          \
  203     if (src + 1 >= src_end)     /* fewer than two left */       \
  204       {                                                         \
  205         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  206         goto label_end_of_loop; /* nothing consumed */          \
  207       }                                                         \
  208     c1 = *src++;                /* first byte */                \
  209     c2 = *src++;                /* second byte */               \
  210   } while (0)
      /* TWO_MORE_BYTES checks for both bytes before reading either one, so
         when only a single byte remains `src' is left pointing at it.  */
 211
 212
  213 /* Set C to the next character at the source text pointed by `src'.
  214    If there are not enough characters in the source, jump to
  215    `label_end_of_loop'.  The caller should set variables `coding',
  216    `src', `src_end', and `translation_table' to appropriate pointers
  217    in advance.  This macro is used in encoding routines
  218    `encode_coding_XXX', thus it assumes that the source text is in
  219    multibyte form except for 8-bit characters.  8-bit characters are
  220    in multibyte form if coding->src_multibyte is nonzero, else they
  221    are represented by a single byte.

         When `translation_table' is non-nil, C is passed through
         translate_char before `src' is advanced.  The macro declares
         its own locals `len' and `bytes' inside the do-block, and sets
         coding->result to CODING_FINISH_INSUFFICIENT_SRC before jumping
         out when no source byte is left.  */
  222
  223 #define ONE_MORE_CHAR(c)                                        \
  224   do {                                                          \
  225     int len = src_end - src;    /* bytes still unread */        \
  226     int bytes;                                                  \
  227     if (len <= 0)               /* source exhausted */          \
  228       {                                                         \
  229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  230         goto label_end_of_loop;                                 \
  231       }                                                         \
  232     if (coding->src_multibyte                                   \
  233         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
  234       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
  235     else                                                        \
  236       c = *src, bytes = 1;      /* take one byte as is */       \
  237     if (!NILP (translation_table))                              \
  238       c = translate_char (translation_table, c, 0, 0, 0);       \
  239     src += bytes;               /* consume the character */     \
  240   } while (0)
 241
 242
  243 /* Produce a multibyte form of character C to `dst'.  Jump to
  244    `label_end_of_loop' if there's not enough space at `dst'.
  245
  246    If we are now in the middle of composition sequence, the decoded
  247    character may be ALTCHAR (for the current composition).  In that
  248    case, the character goes to coding->cmp_data->data instead of
  249    `dst'.
  250
  251    This macro is used in decoding routines.

         In detail: C is written to `dst' (and coding->produced_char is
         incremented) unless a composition other than
         COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE is in progress.
         Independently, C is recorded with
         CODING_ADD_COMPOSITION_COMPONENT whenever a composition other
         than COMPOSITION_RELATIVE is in progress.

         The write limit is `dst_end' when `dst_bytes' is nonzero,
         otherwise `src'.  When the limit would be passed,
         coding->result is set to CODING_FINISH_INSUFFICIENT_DST before
         the jump.  Note that C is evaluated more than once.  */
  252
  253 #define EMIT_CHAR(c)                                                    \
  254   do {                                                                  \
  255     if (! COMPOSING_P (coding)                                          \
  256         || coding->composing == COMPOSITION_RELATIVE                    \
  257         || coding->composing == COMPOSITION_WITH_RULE)                  \
  258       {                                                                 \
  259         int bytes = CHAR_BYTES (c);     /* multibyte length of C */     \
  260         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  261           {                                                             \
  262             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  263             goto label_end_of_loop;                                     \
  264           }                                                             \
  265         dst += CHAR_STRING (c, dst);    /* store C, advance dst */      \
  266         coding->produced_char++;                                        \
  267       }                                                                 \
  268                                                                         \
  269     if (COMPOSING_P (coding)                                            \
  270         && coding->composing != COMPOSITION_RELATIVE)                   \
  271       {                                                                 \
  272         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
  273         coding->composition_rule_follows                                \
  274           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  275       }                                                                 \
  276   } while (0)
 277
 278
  279 #define EMIT_ONE_BYTE(c)        /* store byte C at dst */       \
  280   do {                                                          \
  281     if (dst >= (dst_bytes ? dst_end : src))  /* no room */      \
  282       {                                                         \
  283         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  284         goto label_end_of_loop;                                 \
  285       }                                                         \
  286     *dst++ = c;                                                 \
  287   } while (0)
      /* EMIT_ONE_BYTE expects the caller to provide `coding', `src', `dst',
         `dst_end', `dst_bytes' and the label `label_end_of_loop'.  The
         write limit is `dst_end' when `dst_bytes' is nonzero, otherwise
         `src'.  */
 288
  289 #define EMIT_TWO_BYTES(c1, c2)  /* store C1 then C2 at dst */   \
  290   do {                                                          \
  291     if (dst + 2 > (dst_bytes ? dst_end : src))  /* no room */   \
  292       {                                                         \
  293         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  294         goto label_end_of_loop;                                 \
  295       }                                                         \
  296     *dst++ = c1, *dst++ = c2;                                   \
  297   } while (0)
      /* EMIT_TWO_BYTES writes either both bytes or none: the space check
         for two bytes is made before the first store.  The write limit is
         `dst_end' when `dst_bytes' is nonzero, otherwise `src'.  */
 298
  299 #define EMIT_BYTES(from, to)    /* copy [FROM, TO) to dst */    \
  300   do {                                                          \
  301     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
  302       {                                                         \
  303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  304         goto label_end_of_loop;                                 \
  305       }                                                         \
  306     while (from < to)           /* FROM itself is advanced */   \
  307       *dst++ = *from++;                                         \
  308   } while (0)
      /* EMIT_BYTES increments its FROM argument, so FROM must be a
         modifiable pointer variable; after a successful copy it equals TO.
         Nothing is copied when the whole range does not fit below the
         write limit (`dst_end' when `dst_bytes' is nonzero, else `src').  */
 309
 310 \f
 311 /*** 1. Preamble ***/
 312
 313 #ifdef emacs
 314 #include <config.h>
 315 #endif
 316
 317 #include <stdio.h>
 318
 319 #ifdef emacs
 320
 321 #include "lisp.h"
 322 #include "buffer.h"
 323 #include "charset.h"
 324 #include "composite.h"
 325 #include "ccl.h"
 326 #include "coding.h"
 327 #include "window.h"
 328
 329 #else  /* not emacs */
 330
 331 #include "mulelib.h"
 332
 333 #endif /* not emacs */
 334
  335 Lisp_Object Qcoding_system, Qeol_type;
      /* NOTE(review): the Q-prefixed objects declared in this group are
         presumably Lisp symbols named after their suffix (for example
         `coding-system' and `eol-type'); their initialization is not
         visible in this part of the file -- confirm.  */
  336 Lisp_Object Qbuffer_file_coding_system;
  337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  338 Lisp_Object Qno_conversion, Qundecided;
  339 Lisp_Object Qcoding_system_history;
  340 Lisp_Object Qsafe_charsets;
  341 Lisp_Object Qvalid_codes;
  342
      /* These two are only declared here (`extern'); they are defined in
         some other file.  */
  343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
  345 Lisp_Object Qstart_process, Qopen_network_stream;
  346 Lisp_Object Qtarget_idx;
  347
      /* NOTE(review): presumably holds a Lisp function used to choose a
         safe coding system; its use is not visible here -- confirm.  */
  348 Lisp_Object Vselect_safe_coding_system_function;
  349
  350 /* Mnemonic string for each format of end-of-line.  */
  351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  352 /* Mnemonic string to indicate format of end-of-line is not yet
  353    decided.  */
  354 Lisp_Object eol_mnemonic_undecided;
  355
  356 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
  357    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
  358 int system_eol_type;
 359
  360 #ifdef emacs
      /* Everything down to the matching #endif is compiled only when the
         macro `emacs' is defined.  */
  361
      /* NOTE(review): presumably the list and the alist of all defined
         coding systems; their contents are set up outside this part of
         the file -- confirm.  */
  362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  363
      /* NOTE(review): presumably the symbols `coding-system-p' and
         `coding-system-error' -- confirm.  */
  364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  365
  366 /* Coding system emacs-mule and raw-text are for converting only
  367    end-of-line format.  */
  368 Lisp_Object Qemacs_mule, Qraw_text;
  369
  370 /* Coding-systems are handed between Emacs Lisp programs and C internal
  371    routines by the following three variables.  */
  372 /* Coding-system for reading files and receiving data from process.  */
  373 Lisp_Object Vcoding_system_for_read;
  374 /* Coding-system for writing files and sending data to process.  */
  375 Lisp_Object Vcoding_system_for_write;
  376 /* Coding-system actually used in the latest I/O.  */
  377 Lisp_Object Vlast_coding_system_used;
  378
  379 /* A vector of length 256 which contains information about special
  380    Latin codes (especially for dealing with Microsoft codes).  */
  381 Lisp_Object Vlatin_extra_code_table;
  382
  383 /* Flag to inhibit code conversion of end-of-line format.  */
  384 int inhibit_eol_conversion;
  385
  386 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  387 int inherit_process_coding_system;
  388
  389 /* Coding system to be used to encode text for terminal display.  */
  390 struct coding_system terminal_coding;
  391
  392 /* Coding system to be used to encode text for terminal display when
  393    terminal coding system is nil.  */
  394 struct coding_system safe_terminal_coding;
  395
  396 /* Coding system of what is sent from terminal keyboard.  */
  397 struct coding_system keyboard_coding;
  398
  399 /* Default coding system to be used to write a file.  */
  400 struct coding_system default_buffer_file_coding;
  401
      /* NOTE(review): judging by the names, alists that associate file
         names, programs and network services with coding systems; the
         code that consults them is not visible here -- confirm.  */
  402 Lisp_Object Vfile_coding_system_alist;
  403 Lisp_Object Vprocess_coding_system_alist;
  404 Lisp_Object Vnetwork_coding_system_alist;
  405
      /* NOTE(review): presumably the coding system matching the current
         locale -- confirm against the code that sets it.  */
  406 Lisp_Object Vlocale_coding_system;
  407
  408 #endif /* emacs */
 409
  410 Lisp_Object Qcoding_category, Qcoding_category_index;
      /* NOTE(review): presumably the symbols `coding-category' and
         `coding-category-index'; initialization is not visible in this
         part of the file -- confirm.  */
  411
  412 /* List of symbols `coding-category-xxx' ordered by priority.  */
  413 Lisp_Object Vcoding_category_list;
  414
  415 /* Table of coding categories (Lisp symbols).  */
  416 Lisp_Object Vcoding_category_table;
  417
  418 /* Table of the symbol name for each coding category.  Fifteen names
  419    are listed below; CODING_CATEGORY_IDX_MAX is defined elsewhere.
         NOTE(review): the order presumably has to match the
         CODING_CATEGORY_IDX_XXX index values -- confirm in coding.h.  */
  419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  420   "coding-category-emacs-mule",
  421   "coding-category-sjis",
  422   "coding-category-iso-7",
  423   "coding-category-iso-7-tight",
  424   "coding-category-iso-8-1",
  425   "coding-category-iso-8-2",
  426   "coding-category-iso-7-else",
  427   "coding-category-iso-8-else",
  428   "coding-category-ccl",
  429   "coding-category-big5",
  430   "coding-category-utf-8",
  431   "coding-category-utf-16-be",
  432   "coding-category-utf-16-le",
  433   "coding-category-raw-text",
  434   "coding-category-binary"
  435 };
  436
  437 /* Table of pointers to the coding systems corresponding to each
  438    coding category.  */
  439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
  440
  441 /* Table of coding category masks.  Nth element is the mask for the
  442    coding category whose priority is Nth.  */
  443 static
  444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
  445
  446 /* Flag to tell if we look up translation table on character code
  447    conversion.  */
  448 Lisp_Object Venable_character_translation;
  449 /* Standard translation table to look up on decoding (reading).  */
  450 Lisp_Object Vstandard_translation_table_for_decode;
  451 /* Standard translation table to look up on encoding (writing).  */
  452 Lisp_Object Vstandard_translation_table_for_encode;
  453
      /* NOTE(review): presumably symbols related to translation tables,
         named after their suffix -- confirm where they are interned.  */
  454 Lisp_Object Qtranslation_table;
  455 Lisp_Object Qtranslation_table_id;
  456 Lisp_Object Qtranslation_table_for_decode;
  457 Lisp_Object Qtranslation_table_for_encode;
  458
  459 /* Alist of charsets vs revision number.  */
  460 Lisp_Object Vcharset_revision_alist;
  461
  462 /* Default coding systems used for process I/O.  */
  463 Lisp_Object Vdefault_process_coding_system;
  464
  465 /* Global flag to tell that we can't call post-read-conversion and
  466    pre-write-conversion functions.  Usually the value is zero, but it
  467    is set to 1 temporarily while such functions are running.  This is
  468    to avoid infinite recursive call.  */
  469 static int inhibit_pre_post_conversion;
 470
 471 \f
 472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 473
 474 /* Emacs' internal format for encoding multiple character sets is a
 475    kind of multi-byte encoding, i.e. characters are encoded by
 476    variable-length sequences of one-byte codes.
 477
 478    ASCII characters and control characters (e.g. `tab', `newline') are
 479    represented by one-byte sequences which are their ASCII codes, in
 480    the range 0x00 through 0x7F.
 481
 482    8-bit characters of the range 0x80..0x9F are represented by
 483    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 484    code + 0x20).
 485
 486    8-bit characters of the range 0xA0..0xFF are represented by
 487    one-byte sequences which are their 8-bit code.
 488
 489    The other characters are represented by a sequence of `base
 490    leading-code', optional `extended leading-code', and one or two
 491    `position-code's.  The length of the sequence is determined by the
 492    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 493    whereas extended leading-code and position-code take the range 0xA0
 494    through 0xFF.  See `charset.h' for more details about leading-code
 495    and position-code.
 496
 497    --- CODE RANGE of Emacs' internal format ---
 498    character set        range
 499    -------------        -----
 500    ascii                0x00..0x7F
 501    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  502    eight-bit-graphic    0xA0..0xFF
 503    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 504    ---------------------------------------------
 505
 506   */
 507
  508 enum emacs_code_class_type emacs_code_class[256];
      /* One entry for each of the 256 possible byte values.
         NOTE(review): presumably maps a byte to its class in Emacs'
         internal format; the table is filled in outside this part of the
         file -- confirm.  */
 509
  510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  511    Check if a text is encoded in Emacs' internal format.  If it is,
  512    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.

         SRC and SRC_END delimit the bytes to examine.  0 is returned as
         soon as ESC, SI or SO is met, or when a byte in 0x81..0x9F
         starts a sequence that UNIBYTE_STR_AS_MULTIBYTE_P rejects.  The
         mask is returned once the source has been used up without such
         a finding (this includes an empty source).  */
  513
  514 int
  515 detect_coding_emacs_mule (src, src_end)
  516       unsigned char *src, *src_end;
  517 {
  518   unsigned char c;
        /* Nonzero from the moment the byte 0x80 is seen until a byte
           below 0xA0 is read.  */
  519   int composing = 0;
  520   /* Dummy for ONE_MORE_BYTE, which stores into coding->result; the
           stored value is not read in this function.  */
  521   struct coding_system dummy_coding;
  522   struct coding_system *coding = &dummy_coding;
  523
        /* The loop is left only by `return 0' or by ONE_MORE_BYTE jumping
           to `label_end_of_loop' when no source byte remains.  */
  524   while (1)
  525     {
  526       ONE_MORE_BYTE (c);
  527
  528       if (composing)
  529         {
                /* A byte below 0xA0 ends the composition sequence; it is
                   then checked below like any other byte.  */
  530           if (c < 0xA0)
  531             composing = 0;
  532           else if (c == 0xA0)
  533             {
                    /* 0xA0 is followed by one more byte, of which only
                       the low seven bits are kept.  */
  534               ONE_MORE_BYTE (c);
  535               c &= 0x7F;
  536             }
  537           else
                  /* Bytes 0xA1..0xFF are shifted down by 0x20.  */
  538             c -= 0x20;
  539         }
  540
  541       if (c < 0x20)
  542         {
                /* ESC, SI and SO are the control codes ISO2022 uses for
                   escape sequences and locking shifts (see the notes of
                   section 3), so the text is not taken as emacs-mule.  */
  543           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  544             return 0;
  545         }
  546       else if (c >= 0x80 && c < 0xA0)
  547         {
  548           if (c == 0x80)
  549             /* Old leading code for a composite character.  */
  550             composing = 1;
  551           else
  552             {
                    /* C is in 0x81..0x9F.  Test the sequence that starts
                       at this byte (SRC was already advanced past it);
                       on success the macro has set BYTES and the whole
                       sequence is skipped.  */
  553               unsigned char *src_base = src - 1;
  554               int bytes;
  555
  556               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  557                                                bytes))
  558                 return 0;
  559               src = src_base + bytes;
  560             }
  561         }
  562     }
        /* Reached from ONE_MORE_BYTE when the source is exhausted.  */
  563  label_end_of_loop:
  564   return CODING_CATEGORY_MASK_EMACS_MULE;
  565 }
 566
 567
  568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

         Copy the SRC_BYTES bytes at SOURCE to DESTINATION one character
         at a time.  A sequence accepted by UNIBYTE_STR_AS_MULTIBYTE_P is
         copied unchanged; any other byte is replaced by what
         CHAR_STRING produces for it.

         On return coding->consumed, coding->consumed_char,
         coding->produced and coding->produced_char are set.
         coding->result is assigned here only when the destination runs
         out (CODING_FINISH_INSUFFICIENT_DST).  NOTE(review): the caller
         presumably initializes coding->result beforehand -- confirm.  */
  569
  570 static void
  571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
  572      struct coding_system *coding;
  573      unsigned char *source, *destination;
  574      int src_bytes, dst_bytes;
  575 {
  576   unsigned char *src = source;
  577   unsigned char *src_end = source + src_bytes;
  578   unsigned char *dst = destination;
  579   unsigned char *dst_end = destination + dst_bytes;
  580   /* SRC_BASE remembers the start position in source in each loop.
  581      The loop will be exited when there's not enough source code, or
  582      when there's not enough destination area to produce a
  583      character.  */
  584   unsigned char *src_base;
  585
  586   coding->produced_char = 0;
  587   while ((src_base = src) < src_end)
  588     {
  589       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
  590       int bytes;
  591
  592       if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
  593         {
                /* The macro accepted the sequence and set BYTES: those
                   bytes are copied as they are.  */
  594           p = src;
  595           src += bytes;
  596         }
  597       else
  598         {
                /* Otherwise the single byte is converted by CHAR_STRING
                   into TMP, and TMP is copied instead.  */
  599           bytes = CHAR_STRING (*src, tmp);
  600           p = tmp;
  601           src++;
  602         }
            /* SRC has already been advanced past the current character, so
               in the overlapping case (DST_BYTES zero) the limit is the
               first not-yet-read source byte.  NOTE(review): this test
               uses `>=' whereas EMIT_CHAR uses `>', so a character that
               would end exactly at the limit is refused here -- confirm
               whether that is intended.  */
  603       if (dst + bytes >= (dst_bytes ? dst_end : src))
  604         {
  605           coding->result = CODING_FINISH_INSUFFICIENT_DST;
  606           break;
  607         }
  608       while (bytes--) *dst++ = *p++;
  609       coding->produced_char++;
  610     }
        /* SRC_BASE rather than SRC is used, so a character that did not
           fit in the destination is not counted as consumed.  The byte
           count is also stored as the character count.  */
  611   coding->consumed = coding->consumed_char = src_base - source;
  612   coding->produced = dst - destination;
  613 }
 614
  615 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
  616   encode_eol (coding, source, destination, src_bytes, dst_bytes)
      /* Encoding to emacs-mule is handed to encode_eol with the same
         arguments: no separate encoder is defined for emacs-mule here.  */
 617
 618
 619 \f
 620 /*** 3. ISO2022 handlers ***/
 621
 622 /* The following note describes the coding system ISO2022 briefly.
 623    Since the intention of this note is to help understand the
 624    functions in this file, some parts are NOT ACCURATE or OVERLY
 625    SIMPLIFIED.  For thorough understanding, please refer to the
 626    original document of ISO2022.
 627
 628    ISO2022 provides many mechanisms to encode several character sets
  629    in 7-bit and 8-bit environments.  For 7-bit environments, all text
  630    is encoded using bytes less than 128.  This may make the encoded
  631    text a little bit longer, but the text passes more easily through
  632    several gateways, some of which strip off MSB (Most Significant Bit).
 633
 634    There are two kinds of character sets: control character set and
 635    graphic character set.  The former contains control characters such
 636    as `newline' and `escape' to provide control functions (control
 637    functions are also provided by escape sequences).  The latter
 638    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 639    two control character sets and many graphic character sets.
 640
 641    Graphic character sets are classified into one of the following
 642    four classes, according to the number of bytes (DIMENSION) and
 643    number of characters in one dimension (CHARS) of the set:
 644    - DIMENSION1_CHARS94
 645    - DIMENSION1_CHARS96
 646    - DIMENSION2_CHARS94
 647    - DIMENSION2_CHARS96
 648
 649    In addition, each character set is assigned an identification tag,
 650    unique for each set, called "final character" (denoted as <F>
 651    hereafter).  The <F> of each character set is decided by ECMA(*)
 652    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 653    (0x30..0x3F are for private use only).
 654
 655    Note (*): ECMA = European Computer Manufacturers Association
 656
 657    Here are examples of graphic character set [NAME(<F>)]:
 658         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 659         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 660         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 661         o DIMENSION2_CHARS96 -- none for the moment
 662
 663    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 664         C0 [0x00..0x1F] -- control character plane 0
 665         GL [0x20..0x7F] -- graphic character plane 0
 666         C1 [0x80..0x9F] -- control character plane 1
 667         GR [0xA0..0xFF] -- graphic character plane 1
 668
 669    A control character set is directly designated and invoked to C0 or
 670    C1 by an escape sequence.  The most common case is that:
 671    - ISO646's  control character set is designated/invoked to C0, and
 672    - ISO6429's control character set is designated/invoked to C1,
 673    and usually these designations/invocations are omitted in encoded
 674    text.  In a 7-bit environment, only C0 can be used, and a control
 675    character for C1 is encoded by an appropriate escape sequence to
 676    fit into the environment.  All control characters for C1 are
 677    defined to have corresponding escape sequences.
 678
 679    A graphic character set is at first designated to one of four
 680    graphic registers (G0 through G3), then these graphic registers are
 681    invoked to GL or GR.  These designations and invocations can be
 682    done independently.  The most common case is that G0 is invoked to
 683    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 684    these invocations and designations are omitted in encoded text.
 685    In a 7-bit environment, only GL can be used.
 686
 687    When a graphic character set of CHARS94 is invoked to GL, codes
 688    0x20 and 0x7F of the GL area work as control characters SPACE and
 689    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 690    be used.
 691
 692    There are two ways of invocation: locking-shift and single-shift.
 693    With locking-shift, the invocation lasts until the next different
 694    invocation, whereas with single-shift, the invocation affects the
 695    following character only and doesn't affect the locking-shift
 696    state.  Invocations are done by the following control characters or
 697    escape sequences:
 698
 699    ----------------------------------------------------------------------
 700    abbrev  function                  cntrl escape seq   description
 701    ----------------------------------------------------------------------
 702    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 703    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 704    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 705    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 706    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 707    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 708    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 709    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 710    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 711    ----------------------------------------------------------------------
 712    (*) These are not used by any known coding system.
 713
 714    Control characters for these functions are defined by macros
 715    ISO_CODE_XXX in `coding.h'.
 716
 717    Designations are done by the following escape sequences:
 718    ----------------------------------------------------------------------
 719    escape sequence      description
 720    ----------------------------------------------------------------------
 721    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 722    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 723    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 724    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 725    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 726    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 727    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 728    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 729    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 730    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 731    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 732    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 733    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 734    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 735    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 736    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 737    ----------------------------------------------------------------------
 738
 739    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 740    of dimension 1, chars 94, and final character <F>, etc...
 741
 742    Note (*): Although these designations are not allowed in ISO2022,
 743    Emacs accepts them on decoding, and produces them on encoding
 744    CHARS96 character sets in a coding system which is characterized as
 745    7-bit environment, non-locking-shift, and non-single-shift.
 746
 747    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 748    '(' can be omitted.  We refer to this as "short-form" hereafter.
 749
 750    Now you may notice that there are a lot of ways for encoding the
 751    same multilingual text in ISO2022.  Actually, there exist many
 752    coding systems such as Compound Text (used in X11's inter client
 753    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 754    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 755    localized platforms), and all of these are variants of ISO2022.
 756
 757    In addition to the above, Emacs handles two more kinds of escape
 758    sequences: ISO6429's direction specification and Emacs' private
 759    sequence for specifying character composition.
 760
 761    ISO6429's direction specification takes the following form:
 762         o CSI ']'      -- end of the current direction
 763         o CSI '0' ']'  -- end of the current direction
 764         o CSI '1' ']'  -- start of left-to-right text
 765         o CSI '2' ']'  -- start of right-to-left text
 766    The control character CSI (0x9B: control sequence introducer) is
 767    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 768
 769    Character composition specification takes the following form:
 770         o ESC '0' -- start relative composition
 771         o ESC '1' -- end composition
 772         o ESC '2' -- start rule-base composition (*)
 773         o ESC '3' -- start relative composition with alternate chars  (**)
 774         o ESC '4' -- start rule-base composition with alternate chars  (**)
 775   Since these are not standard escape sequences of any ISO standard,
 776   the use of them with these meanings is restricted to Emacs only.
 777
 778   (*) This form is used only in Emacs 20.5 and the older versions,
 779   but the newer versions can safely decode it.
 780   (**) This form is used only in Emacs 21.1 and the newer versions,
 781   and the older versions can't decode it.
 782
 783   Here's a list of example usages of these composition escape
 784   sequences (categorized by `enum composition_method').
 785
 786   COMPOSITION_RELATIVE:
 787         ESC 0 CHAR [ CHAR ] ESC 1
 788   COMPOSITION_WITH_RULE:
 789         ESC 2 CHAR [ RULE CHAR ] ESC 1
 790   COMPOSITION_WITH_ALTCHARS:
 791         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 792   COMPOSITION_WITH_RULE_ALTCHARS:
 793         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 794
/* Table with one entry per possible byte value (256 entries).
   decode_coding_iso2022 switches on iso_code_class[c1] to decide how
   to handle the source byte C1.  */
 795 enum iso_code_class_type iso_code_class[256];
 796
 797 #define CHARSET_OK(idx, charset)                                \
 798   (coding_system_table[idx]                                     \
 799    && (coding_system_table[idx]->safe_charsets[charset]         \
 800        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 801             (coding_system_table[idx], charset)                 \
 802            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 803
 804 #define SHIFT_OUT_OK(idx) \
 805   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 806
 807 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 808    Check if a text is encoded in ISO2022.  If it is, returns an
 809    integer in which appropriate flag bits any of:
 810         CODING_CATEGORY_MASK_ISO_7
 811         CODING_CATEGORY_MASK_ISO_7_TIGHT
 812         CODING_CATEGORY_MASK_ISO_8_1
 813         CODING_CATEGORY_MASK_ISO_8_2
 814         CODING_CATEGORY_MASK_ISO_7_ELSE
 815         CODING_CATEGORY_MASK_ISO_8_ELSE
 816    are set.  If a code which should never appear in ISO2022 is found,
 817    returns 0.  */
 818
 819 int
 820 detect_coding_iso2022 (src, src_end)
 821      unsigned char *src, *src_end;
 822 {
 823   int mask = CODING_CATEGORY_MASK_ISO;
 824   int mask_found = 0;
 825   int reg[4], shift_out = 0, single_shifting = 0;
 826   int c, c1, i, charset;
 827   /* Dummy for ONE_MORE_BYTE.  */
 828   struct coding_system dummy_coding;
 829   struct coding_system *coding = &dummy_coding;
 830
 831   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 832   while (mask && src < src_end)
 833     {
 834       ONE_MORE_BYTE (c);
 835       switch (c)
 836         {
 837         case ISO_CODE_ESC:
 838           single_shifting = 0;
 839           ONE_MORE_BYTE (c);
 840           if (c >= '(' && c <= '/')
 841             {
 842               /* Designation sequence for a charset of dimension 1.  */
 843               ONE_MORE_BYTE (c1);
 844               if (c1 < ' ' || c1 >= 0x80
 845                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 846                 /* Invalid designation sequence.  Just ignore.  */
 847                 break;
 848               reg[(c - '(') % 4] = charset;
 849             }
 850           else if (c == '$')
 851             {
 852               /* Designation sequence for a charset of dimension 2.  */
 853               ONE_MORE_BYTE (c);
 854               if (c >= '@' && c <= 'B')
 855                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 856                 reg[0] = charset = iso_charset_table[1][0][c];
 857               else if (c >= '(' && c <= '/')
 858                 {
 859                   ONE_MORE_BYTE (c1);
 860                   if (c1 < ' ' || c1 >= 0x80
 861                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 862                     /* Invalid designation sequence.  Just ignore.  */
 863                     break;
 864                   reg[(c - '(') % 4] = charset;
 865                 }
 866               else
 867                 /* Invalid designation sequence.  Just ignore.  */
 868                 break;
 869             }
 870           else if (c == 'N' || c == 'O')
 871             {
 872               /* ESC <Fe> for SS2 or SS3.  */
 873               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 874               break;
 875             }
 876           else if (c >= '0' && c <= '4')
 877             {
 878               /* ESC <Fp> for start/end composition.  */
 879               mask_found |= CODING_CATEGORY_MASK_ISO;
 880               break;
 881             }
 882           else
 883             /* Invalid escape sequence.  Just ignore.  */
 884             break;
 885
 886           /* We found a valid designation sequence for CHARSET.  */
 887           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 888           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 889             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 890           else
 891             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 892           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 893             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 894           else
 895             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 896           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 897             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 898           else
 899             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 900           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 901             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 902           else
 903             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 904           break;
 905
 906         case ISO_CODE_SO:
 907           single_shifting = 0;
 908           if (shift_out == 0
 909               && (reg[1] >= 0
 910                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 911                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 912             {
 913               /* Locking shift out.  */
 914               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 915               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 916             }
 917           break;
 918
 919         case ISO_CODE_SI:
 920           single_shifting = 0;
 921           if (shift_out == 1)
 922             {
 923               /* Locking shift in.  */
 924               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 925               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 926             }
 927           break;
 928
 929         case ISO_CODE_CSI:
 930           single_shifting = 0;
 931         case ISO_CODE_SS2:
 932         case ISO_CODE_SS3:
 933           {
 934             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 935
 936             if (c != ISO_CODE_CSI)
 937               {
 938                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 939                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 940                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 941                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 942                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 943                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 944                 single_shifting = 1;
 945               }
 946             if (VECTORP (Vlatin_extra_code_table)
 947                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 948               {
 949                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 950                     & CODING_FLAG_ISO_LATIN_EXTRA)
 951                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 952                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 953                     & CODING_FLAG_ISO_LATIN_EXTRA)
 954                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 955               }
 956             mask &= newmask;
 957             mask_found |= newmask;
 958           }
 959           break;
 960
 961         default:
 962           if (c < 0x80)
 963             {
 964               single_shifting = 0;
 965               break;
 966             }
 967           else if (c < 0xA0)
 968             {
 969               single_shifting = 0;
 970               if (VECTORP (Vlatin_extra_code_table)
 971                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 972                 {
 973                   int newmask = 0;
 974
 975                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 976                       & CODING_FLAG_ISO_LATIN_EXTRA)
 977                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 978                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 979                       & CODING_FLAG_ISO_LATIN_EXTRA)
 980                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 981                   mask &= newmask;
 982                   mask_found |= newmask;
 983                 }
 984               else
 985                 return 0;
 986             }
 987           else
 988             {
 989               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 990                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 991               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 992               /* Check the length of succeeding codes of the range
 993                  0xA0..0FF.  If the byte length is odd, we exclude
 994                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 995                  when we are not single shifting.  */
 996               if (!single_shifting
 997                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
 998                 {
 999                   int i = 1;
1000                   while (src < src_end)
1001                     {
1002                       ONE_MORE_BYTE (c);
1003                       if (c < 0xA0)
1004                         break;
1005                       i++;
1006                     }
1007
1008                   if (i & 1 && src < src_end)
1009                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1010                   else
1011                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1012                 }
1013             }
1014           break;
1015         }
1016     }
1017  label_end_of_loop:
1018   return (mask & mask_found);
1019 }
1020
/* Yield the character code for the character of CHARSET whose 1st
   position code is C1 and whose 2nd position code is C2.  When the
   variable `translation_table' is nil the code is built directly with
   MAKE_CHAR; otherwise the value of translate_char for that table is
   yielded.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (! NILP (translation_table)                                   \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1030
/* Set designation state into CODING: designate to graphic register
   REG the charset identified by DIMENSION, CHARS and FINAL_CHAR.

   Jump to label_invalid_code (which the enclosing function must
   provide) when FINAL_CHAR is outside '0'..127, when no charset
   matches, or when the charset is neither requested for REG nor
   listed in coding->safe_charsets; in the latter two cases REG is
   recorded as the last invalid designation register.  Otherwise the
   charset (or its reverse charset when CODING_MODE_DIRECTION is set
   and a reverse charset exists) is stored as the designation of REG.

   REG and FINAL_CHAR are evaluated exactly once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    /* Copy the arguments before declaring CHARSET so that an argument     \
       expression can neither be captured by that local nor be mis-parsed  \
       next to the comparison operators below.  */                         \
    int designation_reg = (reg);                                           \
    int designation_final = (final_char);                                  \
    int charset;                                                           \
                                                                           \
    if (designation_final < '0' || designation_final >= 128)               \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (designation_final));         \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)        \
            == designation_reg                                             \
            || coding->safe_charsets[charset]))                            \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && designation_reg == 0                                        \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, designation_reg) = charset;   \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register             \
          = designation_reg;                                               \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1066
1067 /* Allocate a memory block for storing information about compositions.
1068    The block is chained to the already allocated blocks.  */
1069
1070 void
1071 coding_allocate_composition_data (coding, char_offset)
1072      struct coding_system *coding;
1073      int char_offset;
1074 {
1075   struct composition_data *cmp_data
1076     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1077
1078   cmp_data->char_offset = char_offset;
1079   cmp_data->used = 0;
1080   cmp_data->prev = coding->cmp_data;
1081   cmp_data->next = NULL;
1082   if (coding->cmp_data)
1083     coding->cmp_data->next = cmp_data;
1084   coding->cmp_data = cmp_data;
1085   coding->cmp_data_start = 0;
1086 }
1087
/* Open a four-slot record for one composition at the current end of
   CODING->cmp_data and remember its index in CODING->cmp_data_start.
   Slot 0 is set to -1, slot 1 to the block's char_offset plus START,
   and slot 3 to METHOD; slot 2 is left for
   CODING_ADD_COMPOSITION_END.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cmp_data = (coding)->cmp_data;     \
    int *data = &cmp_data->data[cmp_data->used];                \
                                                                \
    (coding)->cmp_data_start = cmp_data->used;                  \
    data[0] = -1;                                               \
    data[1] = cmp_data->char_offset + (start);                  \
    data[3] = (int) (method);                                   \
    cmp_data->used += 4;                                        \
  } while (0)
1100
/* Close the composition record that starts at index
   CODING->cmp_data_start: slot 2 receives the block's char_offset
   plus END, and slot 0 receives the number of slots used since the
   record was opened.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_data = (coding)->cmp_data;             \
    int *data = &cmp_data->data[(coding)->cmp_data_start];              \
                                                                        \
    data[2] = cmp_data->char_offset + (end);                            \
    data[0] = cmp_data->used - (coding)->cmp_data_start;                \
  } while (0)
1110
/* Append one COMPONENT (alternate character or composition rule) to
   CODING->cmp_data, advancing its `used' count by one.  The value of
   the expression is the stored component.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1115
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.  Uses the variables `coding' and
   `dst' and the label `label_end_of_loop' of the enclosing
   function.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition is disabled: emit the escape sequence itself.  */   \
        dst[0] = ISO_CODE_ESC;                                             \
        dst[1] = (c1) & 0x7f;                                              \
        dst += 2;                                                          \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (COMPOSING_P (coding))                                         \
      {                                                                    \
        /* A composition is already in progress.  If its method is one     \
           of the two ALTCHARS methods, the codes following the current    \
           escape sequence are actual characters stored in a buffer.  */   \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        switch (c1)                                                        \
          {                                                                \
          case '0':                                                        \
            coding->composing = COMPOSITION_RELATIVE;                      \
            break;                                                         \
          case '2':                                                        \
            coding->composing = COMPOSITION_WITH_RULE;                     \
            break;                                                         \
          case '3':                                                        \
            coding->composing = COMPOSITION_WITH_ALTCHARS;                 \
            break;                                                         \
          default:                                                         \
            coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;            \
            break;                                                         \
          }                                                                \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
1163
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  When composition is disabled the two bytes ESC C1
   are emitted to `dst'; otherwise the current composition record is
   closed at coding->produced_char and the composing state is reset to
   COMPOSITION_NO.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing != COMPOSITION_DISABLED)                      \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = (c1);                                                  \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1180
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read into the enclosing function's `c2') and
   append the encoded rule to coding->cmp_data.  C1 must be an lvalue:
   32 is subtracted from it in place.  A value that fits neither
   format is recorded as rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
                                                                        \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        if ((c1) < 93)          /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);        \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1205
1206
1207 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1208
1209 static void
1210 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1211      struct coding_system *coding;
1212      unsigned char *source, *destination;
1213      int src_bytes, dst_bytes;
1214 {
1215   unsigned char *src = source;
1216   unsigned char *src_end = source + src_bytes;
1217   unsigned char *dst = destination;
1218   unsigned char *dst_end = destination + dst_bytes;
1219   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1220   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1221   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1222   /* SRC_BASE remembers the start position in source in each loop.
1223      The loop will be exited when there's not enough source code
1224      (within macro ONE_MORE_BYTE), or when there's not enough
1225      destination area to produce a character (within macro
1226      EMIT_CHAR).  */
1227   unsigned char *src_base;
1228   int c, charset;
1229   Lisp_Object translation_table;
1230
1231   if (NILP (Venable_character_translation))
1232     translation_table = Qnil;
1233   else
1234     {
1235       translation_table = coding->translation_table_for_decode;
1236       if (NILP (translation_table))
1237         translation_table = Vstandard_translation_table_for_decode;
1238     }
1239
1240   coding->result = CODING_FINISH_NORMAL;
1241
1242   while (1)
1243     {
1244       int c1, c2;
1245
1246       src_base = src;
1247       ONE_MORE_BYTE (c1);
1248
1249       /* We produce no character or one character.  */
1250       switch (iso_code_class [c1])
1251         {
1252         case ISO_0x20_or_0x7F:
1253           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1254             {
1255               DECODE_COMPOSITION_RULE (c1);
1256               continue;
1257             }
1258           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1259             {
1260               /* This is SPACE or DEL.  */
1261               charset = CHARSET_ASCII;
1262               break;
1263             }
1264           /* This is a graphic character, we fall down ...  */
1265
1266         case ISO_graphic_plane_0:
1267           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1268             {
1269               DECODE_COMPOSITION_RULE (c1);
1270               continue;
1271             }
1272           charset = charset0;
1273           break;
1274
1275         case ISO_0xA0_or_0xFF:
1276           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1277               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1278             goto label_invalid_code;
1279           /* This is a graphic character, we fall down ... */
1280
1281         case ISO_graphic_plane_1:
1282           if (charset1 < 0)
1283             goto label_invalid_code;
1284           charset = charset1;
1285           break;
1286
1287         case ISO_control_0:
1288           if (COMPOSING_P (coding))
1289             DECODE_COMPOSITION_END ('1');
1290
1291           /* All ISO2022 control characters in this class have the
1292              same representation in Emacs internal format.  */
1293           if (c1 == '\n'
1294               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1295               && (coding->eol_type == CODING_EOL_CR
1296                   || coding->eol_type == CODING_EOL_CRLF))
1297             {
1298               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1299               goto label_end_of_loop;
1300             }
1301           charset = CHARSET_ASCII;
1302           break;
1303
1304         case ISO_control_1:
1305           if (COMPOSING_P (coding))
1306             DECODE_COMPOSITION_END ('1');
1307           goto label_invalid_code;
1308
1309         case ISO_carriage_return:
1310           if (COMPOSING_P (coding))
1311             DECODE_COMPOSITION_END ('1');
1312
1313           if (coding->eol_type == CODING_EOL_CR)
1314             c1 = '\n';
1315           else if (coding->eol_type == CODING_EOL_CRLF)
1316             {
1317               ONE_MORE_BYTE (c1);
1318               if (c1 != ISO_CODE_LF)
1319                 {
1320                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1321                     {
1322                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1323                       goto label_end_of_loop;
1324                     }
1325                   src--;
1326                   c1 = '\r';
1327                 }
1328             }
1329           charset = CHARSET_ASCII;
1330           break;
1331
1332         case ISO_shift_out:
1333           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1334               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1335             goto label_invalid_code;
1336           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1337           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1338           continue;
1339
1340         case ISO_shift_in:
1341           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1342             goto label_invalid_code;
1343           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1344           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1345           continue;
1346
1347         case ISO_single_shift_2_7:
1348         case ISO_single_shift_2:
1349           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1350             goto label_invalid_code;
1351           /* SS2 is handled as an escape sequence of ESC 'N' */
1352           c1 = 'N';
1353           goto label_escape_sequence;
1354
1355         case ISO_single_shift_3:
1356           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1357             goto label_invalid_code;
1358           /* SS2 is handled as an escape sequence of ESC 'O' */
1359           c1 = 'O';
1360           goto label_escape_sequence;
1361
1362         case ISO_control_sequence_introducer:
1363           /* CSI is handled as an escape sequence of ESC '[' ...  */
1364           c1 = '[';
1365           goto label_escape_sequence;
1366
1367         case ISO_escape:
1368           ONE_MORE_BYTE (c1);
1369         label_escape_sequence:
1370           /* Escape sequences handled by Emacs are invocation,
1371              designation, direction specification, and character
1372              composition specification.  */
1373           switch (c1)
1374             {
1375             case '&':           /* revision of following character set */
1376               ONE_MORE_BYTE (c1);
1377               if (!(c1 >= '@' && c1 <= '~'))
1378                 goto label_invalid_code;
1379               ONE_MORE_BYTE (c1);
1380               if (c1 != ISO_CODE_ESC)
1381                 goto label_invalid_code;
1382               ONE_MORE_BYTE (c1);
1383               goto label_escape_sequence;
1384
1385             case '$':           /* designation of 2-byte character set */
1386               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1387                 goto label_invalid_code;
1388               ONE_MORE_BYTE (c1);
1389               if (c1 >= '@' && c1 <= 'B')
1390                 {       /* designation of JISX0208.1978, GB2312.1980,
1391                            or JISX0208.1980 */
1392                   DECODE_DESIGNATION (0, 2, 94, c1);
1393                 }
1394               else if (c1 >= 0x28 && c1 <= 0x2B)
1395                 {       /* designation of DIMENSION2_CHARS94 character set */
1396                   ONE_MORE_BYTE (c2);
1397                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1398                 }
1399               else if (c1 >= 0x2C && c1 <= 0x2F)
1400                 {       /* designation of DIMENSION2_CHARS96 character set */
1401                   ONE_MORE_BYTE (c2);
1402                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1403                 }
1404               else
1405                 goto label_invalid_code;
1406               /* We must update these variables now.  */
1407               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1408               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1409               continue;
1410
1411             case 'n':           /* invocation of locking-shift-2 */
1412               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1413                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1414                 goto label_invalid_code;
1415               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1416               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1417               continue;
1418
1419             case 'o':           /* invocation of locking-shift-3 */
1420               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1421                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1422                 goto label_invalid_code;
1423               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1424               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1425               continue;
1426
1427             case 'N':           /* invocation of single-shift-2 */
1428               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1429                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1430                 goto label_invalid_code;
1431               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1432               ONE_MORE_BYTE (c1);
1433               break;
1434
1435             case 'O':           /* invocation of single-shift-3 */
1436               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1437                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1438                 goto label_invalid_code;
1439               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1440               ONE_MORE_BYTE (c1);
1441               break;
1442
1443             case '0': case '2': case '3': case '4': /* start composition */
1444               DECODE_COMPOSITION_START (c1);
1445               continue;
1446
1447             case '1':           /* end composition */
1448               DECODE_COMPOSITION_END (c1);
1449               continue;
1450
1451             case '[':           /* specification of direction */
1452               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1453                 goto label_invalid_code;
1454               /* For the moment, nested direction is not supported.
1455                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1456                  left-to-right, and nonzero means right-to-left.  */
1457               ONE_MORE_BYTE (c1);
1458               switch (c1)
1459                 {
1460                 case ']':       /* end of the current direction */
1461                   coding->mode &= ~CODING_MODE_DIRECTION;
1462
1463                 case '0':       /* end of the current direction */
1464                 case '1':       /* start of left-to-right direction */
1465                   ONE_MORE_BYTE (c1);
1466                   if (c1 == ']')
1467                     coding->mode &= ~CODING_MODE_DIRECTION;
1468                   else
1469                     goto label_invalid_code;
1470                   break;
1471
1472                 case '2':       /* start of right-to-left direction */
1473                   ONE_MORE_BYTE (c1);
1474                   if (c1 == ']')
1475                     coding->mode |= CODING_MODE_DIRECTION;
1476                   else
1477                     goto label_invalid_code;
1478                   break;
1479
1480                 default:
1481                   goto label_invalid_code;
1482                 }
1483               continue;
1484
1485             default:
1486               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1487                 goto label_invalid_code;
1488               if (c1 >= 0x28 && c1 <= 0x2B)
1489                 {       /* designation of DIMENSION1_CHARS94 character set */
1490                   ONE_MORE_BYTE (c2);
1491                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1492                 }
1493               else if (c1 >= 0x2C && c1 <= 0x2F)
1494                 {       /* designation of DIMENSION1_CHARS96 character set */
1495                   ONE_MORE_BYTE (c2);
1496                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1497                 }
1498               else
1499                 goto label_invalid_code;
1500               /* We must update these variables now.  */
1501               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1502               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1503               continue;
1504             }
1505         }
1506
1507       /* Now we know CHARSET and 1st position code C1 of a character.
1508          Produce a multibyte sequence for that character while getting
1509          2nd position code C2 if necessary.  */
1510       if (CHARSET_DIMENSION (charset) == 2)
1511         {
1512           ONE_MORE_BYTE (c2);
1513           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1514             /* C2 is not in a valid range.  */
1515             goto label_invalid_code;
1516         }
1517       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1518       EMIT_CHAR (c);
1519       continue;
1520
1521     label_invalid_code:
1522       coding->errors++;
1523       if (COMPOSING_P (coding))
1524         DECODE_COMPOSITION_END ('1');
1525       src = src_base;
1526       c = *src++;
1527       EMIT_CHAR (c);
1528     }
1529
1530  label_end_of_loop:
1531   coding->consumed = coding->consumed_char = src_base - source;
1532   coding->produced = dst - destination;
1533   return;
1534 }
1535
1536
1537 /* ISO2022 encoding stuff.  */
1538
1539 /*
1540    It is not enough to say just "ISO2022" on encoding, we have to
1541    specify more details.  In Emacs, each coding system of ISO2022
1542    variant has the following specifications:
1543         1. Initial designation to G0 thru G3.
1544         2. Allows short-form designation?
1545         3. ASCII should be designated to G0 before control characters?
1546         4. ASCII should be designated to G0 at end of line?
1547         5. 7-bit environment or 8-bit environment?
1548         6. Use locking-shift?
1549         7. Use Single-shift?
1550    And the following two are only for Japanese:
1551         8. Use ASCII in place of JIS0201-1976-Roman?
1552         9. Use JISX0208-1983 in place of JISX0208-1978?
1553    These specifications are encoded in `coding->flags' as flag bits
1554    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1555    details.
1556 */
1557
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If <final-char> of CHARSET is
   '@', 'A', or 'B' and the coding system CODING allows, produce
   designation sequence of short-form.

   The sequence is: an optional revision prefix (ESC '&' '@'+revision,
   emitted only when the revision number is below 255), then ESC, then
   '$' for a two-dimensional charset, then the intermediate character
   selecting REG ("()*+" for 94-charsets, ",-./" for 96-charsets;
   omitted in the short form), and finally the final character.  The
   designation recorded in CODING for REG is updated to CHARSET.

   CHARSET and REG are evaluated exactly once, so expressions with
   side effects are safe to pass.  The variable `dst' must be in scope
   at the point of use.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Capture the arguments once; they are used many times below.  */  \
    int designation_charset = (charset);                                \
    int designation_reg = (reg);                                        \
    unsigned char final_char                                            \
      = CHARSET_ISO_FINAL_CHAR (designation_charset);                   \
    const char *intermediate_char_94 = "()*+";                          \
    const char *intermediate_char_96 = ",-./";                          \
    int revision                                                        \
      = CODING_SPEC_ISO_REVISION_NUMBER (coding, designation_charset);  \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (designation_charset) == 1)                   \
      {                                                                 \
        if (CHARSET_CHARS (designation_charset) == 94)                  \
          *dst++ = (unsigned char)                                      \
            (intermediate_char_94[designation_reg]);                    \
        else                                                            \
          *dst++ = (unsigned char)                                      \
            (intermediate_char_96[designation_reg]);                    \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = '$';                                                   \
        if (CHARSET_CHARS (designation_charset) == 94)                  \
          {                                                             \
            /* The intermediate character may be dropped only for the   \
               short form: flag set, register 0, final char '@'..'B'.  */ \
            if (! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)        \
                || designation_reg != 0                                 \
                || final_char < '@' || final_char > 'B')                \
              *dst++ = (unsigned char)                                  \
                (intermediate_char_94[designation_reg]);                \
          }                                                             \
        else                                                            \
          *dst++ = (unsigned char)                                      \
            (intermediate_char_96[designation_reg]);                    \
      }                                                                 \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, designation_reg)               \
      = designation_charset;                                            \
  } while (0)
1600
1601 /* The following two macros produce codes (control character or escape
1602    sequence) for ISO2022 single-shift functions (single-shift-2 and
1603    single-shift-3).  */
1604
/* Emit single-shift-2 at DST: the one-byte control ISO_CODE_SS2, or
   ESC 'N' when the coding system is restricted to seven bits.  The
   single-shifting state of `coding' is set afterwards.  */
#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS2;                                  \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1613
/* Emit single-shift-3 at DST: the one-byte control ISO_CODE_SS3, or
   ESC 'O' when the coding system is restricted to seven bits.  The
   single-shifting state of `coding' is set afterwards.  */
#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS3;                                  \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1622
1623 /* The following four macros produce codes (control character or
1624    escape sequence) for ISO2022 locking-shift functions (shift-in,
1625    shift-out, locking-shift-2, and locking-shift-3).  */
1626
/* Shift-in: emit ISO_CODE_SI and record that graphic register 0 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    dst[0] = ISO_CODE_SI;                               \
    dst += 1;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)
1632
/* Shift-out: emit ISO_CODE_SO and record that graphic register 1 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    dst[0] = ISO_CODE_SO;                               \
    dst += 1;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)
1638
/* Locking-shift-2: emit ESC 'n' and record that graphic register 2 is
   now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'n';                                       \
    dst += 2;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)
1644
/* Locking-shift-3: emit ESC 'o' and record that graphic register 3 is
   now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'o';                                       \
    dst += 2;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
1650
/* Emit at DST the position code C1 of a one-dimensional character of
   CHARSET.  The checks below are retried in a loop: when CHARSET is
   not reachable through the current invocation state,
   encode_invocation_designation is called to produce the needed
   designation and/or invocation codes, and the next pass emits the
   character itself.  `coding' and `dst' must be in scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift is pending; it covers just this character.  */ \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? c1 & 0x7F : c1 | 0x80);                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* Invoked to graphic plane 0: high bit cleared.  */            \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* Invoked to graphic plane 1: high bit set.  */                \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* CHARSET must not be encoded here; emit one substitution      \
           character per column of the character's width instead.  */   \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Produce     \
       the designation/invocation codes, then go around again.  */      \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1693
/* Emit at DST the two position codes C1 and C2 of a two-dimensional
   character of CHARSET.  The checks below are retried in a loop: when
   CHARSET is not reachable through the current invocation state,
   encode_invocation_designation is called to produce the needed
   designation and/or invocation codes, and the next pass emits the
   character itself.  `coding' and `dst' must be in scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift is pending; it covers just this character.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* Invoked to graphic plane 0: high bits cleared.  */           \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* Invoked to graphic plane 1: high bits set.  */               \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* CHARSET must not be encoded here; emit one substitution      \
           character per column of the character's width instead.  */   \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Produce     \
       the designation/invocation codes, then go around again.  */      \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1736
/* Emit at DST the codes for one character of CHARSET with position
   codes C1 and C2.

   If CHARSET is not a defined charset, C1 is written as is, followed
   by C2 when C2 is non-negative.  Otherwise the character goes through
   the dimension-specific macro, after an optional charset
   substitution selected by the coding flags:
     ASCII    -> charset_latin_jisx0201 under CODING_FLAG_ISO_USE_ROMAN
     JISX0208 -> charset_jisx0208_1978  under CODING_FLAG_ISO_USE_OLDJIS  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        int alt_charset                                                 \
          = ((charset == CHARSET_ASCII                                  \
              && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))           \
             ? charset_latin_jisx0201 : charset);                       \
        ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int alt_charset                                                 \
          = ((charset == charset_jisx0208                               \
              && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))          \
             ? charset_jisx0208_1978 : charset);                        \
        ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2);          \
      }                                                                 \
  } while (0)
1765
1766 /* Produce designation and invocation codes at a place pointed by DST
1767    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1768    Return new DST.  */
1769
1770 unsigned char *
1771 encode_invocation_designation (charset, coding, dst)
1772      int charset;
1773      struct coding_system *coding;
1774      unsigned char *dst;
1775 {
1776   int reg;                      /* graphic register number */
1777
1778   /* At first, check designations.  */
1779   for (reg = 0; reg < 4; reg++)
1780     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1781       break;
1782
1783   if (reg >= 4)
1784     {
1785       /* CHARSET is not yet designated to any graphic registers.  */
1786       /* At first check the requested designation.  */
1787       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1788       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1789         /* Since CHARSET requests no special designation, designate it
1790            to graphic register 0.  */
1791         reg = 0;
1792
1793       ENCODE_DESIGNATION (charset, reg, coding);
1794     }
1795
1796   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1797       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1798     {
1799       /* Since the graphic register REG is not invoked to any graphic
1800          planes, invoke it to graphic plane 0.  */
1801       switch (reg)
1802         {
1803         case 0:                 /* graphic register 0 */
1804           ENCODE_SHIFT_IN;
1805           break;
1806
1807         case 1:                 /* graphic register 1 */
1808           ENCODE_SHIFT_OUT;
1809           break;
1810
1811         case 2:                 /* graphic register 2 */
1812           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1813             ENCODE_SINGLE_SHIFT_2;
1814           else
1815             ENCODE_LOCKING_SHIFT_2;
1816           break;
1817
1818         case 3:                 /* graphic register 3 */
1819           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1820             ENCODE_SINGLE_SHIFT_3;
1821           else
1822             ENCODE_LOCKING_SHIFT_3;
1823           break;
1824         }
1825     }
1826
1827   return dst;
1828 }
1829
/* Emit at DST the two bytes that encode composition rule RULE.
   COMPOSITION_DECODE_RULE splits RULE into GREF and NREF; the first
   byte is 32 + 81 + GREF and the second is 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
1839
/* Emit at DST the escape sequence that opens a composition and record
   the composition method, taken from DATA[3], in CODING->composing.
   DATA points to an array of integers describing the composition; see
   the comment in coding.h for its format.

   The sequence is ESC '0' for COMPOSITION_RELATIVE, ESC '3' for
   COMPOSITION_WITH_ALTCHARS, and ESC '4' for any other method.  For
   the latter two, CODING->cmp_data_index is set to
   CODING->cmp_data_start + 4 and CODING->composition_rule_follows is
   cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
1859
/* Emit ESC '1' at DST to close the current composition, advance
   CODING->cmp_data_start by DATA[0], and mark CODING as no longer
   composing.  When that advance reaches the used length of the
   current composition data block and a following block exists,
   switch to the following block and restart at offset 0.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->cmp_data_start += data[0];                          \
    coding->composing = COMPOSITION_NO;                         \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data = coding->cmp_data->next;          \
            coding->cmp_data_start = 0;                         \
          }                                                     \
      }                                                         \
  } while (0)
1875
/* Emit ESC '0' at DST and set CODING->composing to
   COMPOSITION_RELATIVE.  The sequence does not open a new composition
   here: it marks that the components (alternate chars and composition
   rules) of the composition have just been produced and that the
   actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
    coding->composing = COMPOSITION_RELATIVE;           \
  } while (0)
1887
1888 /* The following three macros produce codes for indicating direction
1889    of text.  */
/* Emit a control sequence introducer at DST: ESC '[' when the coding
   system is restricted to seven bits, the single byte ISO_CODE_CSI
   otherwise.  The seven-bit flag is tested with a bit mask, as in the
   single-shift macros; comparing the whole flag word for equality
   would miss every coding system that has any other flag set as
   well.  `coding' and `dst' must be in scope.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
1897
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' and ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like statement
   macro, so it is used as a plain statement here; following it with
   an argument list and a comma expression does not compile.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)
1900
/* Emit the sequence that returns to left-to-right text: a control
   sequence introducer followed by '0' and ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like statement
   macro, so it is used as a plain statement here; following it with
   an argument list and a comma expression does not compile.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1903
/* Emit at DST the codes that bring the graphic planes and registers
   back to their initial state: a shift-in when graphic plane 0 does
   not currently hold register 0, then one designation sequence for
   every register whose initial designation is set (>= 0) and differs
   from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        int initial_charset                                             \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);          \
                                                                        \
        if (initial_charset >= 0                                        \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg)               \
                != initial_charset))                                    \
          ENCODE_DESIGNATION (initial_charset, reg, coding);            \
        reg++;                                                          \
      }                                                                 \
  } while (0)
1918
1919 /* Produce designation sequences of charsets in the line started from
1920    SRC to a place pointed by DST, and return updated DST.
1921
1922    If the current block ends before any end-of-line, we may fail to
1923    find all the necessary designations.  */
1924
1925 static unsigned char *
1926 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1927      struct coding_system *coding;
1928      Lisp_Object translation_table;
1929      unsigned char *src, *src_end, *dst;
1930 {
1931   int charset, c, found = 0, reg;
1932   /* Table of charsets to be designated to each graphic register.  */
1933   int r[4];
1934
1935   for (reg = 0; reg < 4; reg++)
1936     r[reg] = -1;
1937
1938   while (found < 4)
1939     {
1940       ONE_MORE_CHAR (c);
1941       if (c == '\n')
1942         break;
1943
1944       charset = CHAR_CHARSET (c);
1945       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1946       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1947         {
1948           found++;
1949           r[reg] = charset;
1950         }
1951     }
1952
1953  label_end_of_loop:
1954   if (found)
1955     {
1956       for (reg = 0; reg < 4; reg++)
1957         if (r[reg] >= 0
1958             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1959           ENCODE_DESIGNATION (r[reg], reg, coding);
1960     }
1961
1962   return dst;
1963 }
1964
1965 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

     Encode the text at SOURCE (SRC_BYTES bytes) into DESTINATION
     (DST_BYTES bytes) using the ISO2022 settings held in CODING.

     On return CODING->consumed is the offset in SOURCE of the start of
     the last loop iteration that was begun, and CODING->produced and
     CODING->produced_char are both the number of bytes written to
     DESTINATION.  CODING->consumed_char is reset to 0 on entry and
     incremented each time the bottom of the loop is reached.
     CODING->errors is reset to 0 on entry and counts the characters
     that satisfy SINGLE_BYTE_CHAR_P but are neither control codes nor
     ASCII; those are copied to the destination as is.  CODING->result
     is set to CODING_FINISH_INSUFFICIENT_DST when the loop stops for
     lack of destination space.  */
1966
1967 static void
1968 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1969      struct coding_system *coding;
1970      unsigned char *source, *destination;
1971      int src_bytes, dst_bytes;
1972 {
1973   unsigned char *src = source;
1974   unsigned char *src_end = source + src_bytes;
1975   unsigned char *dst = destination;
1976   unsigned char *dst_end = destination + dst_bytes;
1977   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1978      from DST_END to assure overflow checking is necessary only at the
1979      head of loop.  */
1980   unsigned char *adjusted_dst_end = dst_end - 19;
1981   /* SRC_BASE remembers the start position in source in each loop.
1982      The loop will be exited when there's not enough source text to
1983      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1984      there's not enough destination area to produce encoded codes
1985      (within macro EMIT_BYTES).  */
1986   unsigned char *src_base;
1987   int c;
1988   Lisp_Object translation_table;
1989
       /* Pick the translation table: none when character translation is
          disabled, otherwise the one of CODING, falling back to the
          standard table for encoding.  */
1990   if (NILP (Venable_character_translation))
1991     translation_table = Qnil;
1992   else
1993     {
1994       translation_table = coding->translation_table_for_encode;
1995       if (NILP (translation_table))
1996         translation_table = Vstandard_translation_table_for_encode;
1997     }
1998
1999   coding->consumed_char = 0;
2000   coding->errors = 0;
2001   while (1)
2002     {
2003       int charset, c1, c2;
2004
2005       src_base = src;
2006
           /* Stop before producing anything if fewer than the worst-case
              number of bytes for one iteration remain.  When DST_BYTES
              is zero the limit is computed from SRC instead of DST_END;
              presumably that is an in-place conversion -- TODO confirm
              with the callers.  */
2007       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2008         {
2009           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2010           break;
2011         }
2012
2013       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2014           && CODING_SPEC_ISO_BOL (coding))
2015         {
2016           /* We have to produce designation sequences if any now.  */
2017           dst = encode_designation_at_bol (coding, translation_table,
2018                                            src, src_end, dst);
2019           CODING_SPEC_ISO_BOL (coding) = 0;
2020         }
2021
2022       /* Check composition start and end.  */
2023       if (coding->composing != COMPOSITION_DISABLED
2024           && coding->cmp_data_start < coding->cmp_data->used)
2025         {
2026           struct composition_data *cmp_data = coding->cmp_data;
2027           int *data = cmp_data->data + coding->cmp_data_start;
2028           int this_pos = cmp_data->char_offset + coding->consumed_char;
2029
2030           if (coding->composing == COMPOSITION_RELATIVE)
2031             {
                   /* THIS_POS has reached data[2] (presumably the end
                      position of the current composition -- confirm
                      against the layout of the composition data).  */
2032               if (this_pos == data[2])
2033                 {
2034                   ENCODE_COMPOSITION_END (coding, data);
                       /* Reload both pointers; the macro above presumably
                          changes CODING's composition fields (it is not
                          visible here).  */
2035                   cmp_data = coding->cmp_data;
2036                   data = cmp_data->data + coding->cmp_data_start;
2037                 }
2038             }
2039           else if (COMPOSING_P (coding))
2040             {
2041               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2042               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2043                 /* We have consumed components of the composition.
2044                    What follows in SRC is the compositions's base
2045                    text.  */
2046                 ENCODE_COMPOSITION_FAKE_START (coding);
2047               else
2048                 {
                       /* Note: this C shadows the function-level C.  */
2049                   int c = cmp_data->data[coding->cmp_data_index++];
2050                   if (coding->composition_rule_follows)
2051                     {
2052                       ENCODE_COMPOSITION_RULE (c);
2053                       coding->composition_rule_follows = 0;
2054                     }
2055                   else
2056                     {
2057                       SPLIT_CHAR (c, charset, c1, c2);
2058                       ENCODE_ISO_CHARACTER (charset, c1, c2);
2059                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2060                         coding->composition_rule_follows = 1;
2061                     }
                       /* The component (or rule) came from CMP_DATA, not
                          from SRC, so loop again without reading a source
                          character.  */
2062                   continue;
2063                 }
2064             }
2065           if (!COMPOSING_P (coding))
2066             {
                   /* Not inside a composition: start one when THIS_POS
                      equals data[1] (presumably the start position of
                      the next composition -- confirm).  */
2067               if (this_pos == data[1])
2068                 {
2069                   ENCODE_COMPOSITION_START (coding, data);
2070                   continue;
2071                 }
2072             }
2073         }
2074
2075       ONE_MORE_CHAR (c);
2076
2077       /* Now encode the character C.  */
2078       if (c < 0x20 || c == 0x7F)
2079         {
2080           if (c == '\r')
2081             {
2082               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2083                 {
2084                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2085                     ENCODE_RESET_PLANE_AND_REGISTER;
2086                   *dst++ = c;
                       /* NOTE(review): this `continue' bypasses the
                          coding->consumed_char++ at the bottom of the
                          loop, unlike the other control codes -- confirm
                          that this is intended.  */
2087                   continue;
2088                 }
2089               /* fall down to treat '\r' as '\n' ...  */
2090               c = '\n';
2091             }
2092           if (c == '\n')
2093             {
2094               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2095                 ENCODE_RESET_PLANE_AND_REGISTER;
                   /* Copy the initial designations over the current
                      ones.  */
2096               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2097                 bcopy (coding->spec.iso2022.initial_designation,
2098                        coding->spec.iso2022.current_designation,
2099                        sizeof coding->spec.iso2022.initial_designation);
                   /* Emit the end-of-line sequence selected by
                      CODING->eol_type: LF (also when undecided), CR LF,
                      or CR.  */
2100               if (coding->eol_type == CODING_EOL_LF
2101                   || coding->eol_type == CODING_EOL_UNDECIDED)
2102                 *dst++ = ISO_CODE_LF;
2103               else if (coding->eol_type == CODING_EOL_CRLF)
2104                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2105               else
2106                 *dst++ = ISO_CODE_CR;
                   /* Flag the beginning of a line; it is tested at the
                      top of the loop to produce designations at BOL.  */
2107               CODING_SPEC_ISO_BOL (coding) = 1;
2108             }
2109           else
2110             {
2111               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2112                 ENCODE_RESET_PLANE_AND_REGISTER;
2113               *dst++ = c;
2114             }
2115         }
2116       else if (ASCII_BYTE_P (c))
2117         ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2118       else if (SINGLE_BYTE_CHAR_P (c))
2119         {
               /* A single-byte character that is neither a control code
                  nor ASCII: copy it as is and count it as an error.  */
2120           *dst++ = c;
2121           coding->errors++;
2122         }
2123       else
2124         {
2125           SPLIT_CHAR (c, charset, c1, c2);
2126           ENCODE_ISO_CHARACTER (charset, c1, c2);
2127         }
2128
2129       coding->consumed_char++;
2130     }
2131
       /* Reached after the `break' above; no goto to this label is
          visible in this function, so presumably the macros used in
          the loop jump here as the SRC_BASE comment describes.  */
2132  label_end_of_loop:
2133   coding->consumed = src_base - source;
2134   coding->produced = coding->produced_char = dst - destination;
2135 }
2136
2137 \f
2138 /*** 4. SJIS and BIG5 handlers ***/
2139
2140 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2141    quite widely.  So, for the moment, Emacs supports them in the bare
2142    C code.  But, in the future, they may be supported only by CCL.  */
2143
2144 /* SJIS is a coding system encoding three character sets: ASCII, right
2145    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2146    as is.  A character of charset katakana-jisx0201 is encoded by
2147    "position-code + 0x80".  A character of charset japanese-jisx0208
2148    is encoded in two bytes; the two position-codes are divided and shifted
2149    so that they fit in the ranges below.
2150
2151    --- CODE RANGE of SJIS ---
2152    (character set)      (range)
2153    ASCII                0x00 .. 0x7F
2154    KATAKANA-JISX0201    0xA0 .. 0xDF
2155    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2156             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2157    -------------------------------
2158
2159 */
2160
2161 /* BIG5 is a coding system encoding two character sets: ASCII and
2162    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2163    character set and is encoded in two-byte.
2164
2165    --- CODE RANGE of BIG5 ---
2166    (character set)      (range)
2167    ASCII                0x00 .. 0x7F
2168    Big5 (1st byte)      0xA1 .. 0xFE
2169         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2170    --------------------------
2171
2172    Since the number of characters in Big5 is larger than maximum
2173    characters in Emacs' charset (96x96), it can't be handled as one
2174    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2175    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2176    contains frequently used characters and the latter contains less
2177    frequently used characters.  */
2178
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every macro argument is parenthesized in the expansion, so
   arbitrary expressions may be passed.  Arguments are still evaluated
   more than once, so they must not have side effects.  The input
   arguments are read into TEMP before any output argument is
   assigned, so an output may name the same variable as an input
   (e.g. DECODE_BIG5 (c1, c2, charset, c1, c2)).  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 byte pair B1, B2.  Lead bytes
   below 0xC9 map to `charset_big5_1', the others to
   `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set the BIG5 byte pair B1, B2 from CHARSET and the position-codes
   C1, C2.  This is the inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2211
2212 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2213    Check if a text is encoded in SJIS.  If it is, return
2214    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2215
2216 int
2217 detect_coding_sjis (src, src_end)
2218      unsigned char *src, *src_end;
2219 {
2220   int c;
2221   /* Dummy for ONE_MORE_BYTE.  */
2222   struct coding_system dummy_coding;
2223   struct coding_system *coding = &dummy_coding;
2224
2225   while (1)
2226     {
2227       ONE_MORE_BYTE (c);
2228       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2229         {
2230           ONE_MORE_BYTE (c);
2231           if (c < 0x40)
2232             return 0;
2233         }
2234     }
2235  label_end_of_loop:
2236   return CODING_CATEGORY_MASK_SJIS;
2237 }
2238
2239 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2240    Check if a text is encoded in BIG5.  If it is, return
2241    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2242
2243 int
2244 detect_coding_big5 (src, src_end)
2245      unsigned char *src, *src_end;
2246 {
2247   int c;
2248   /* Dummy for ONE_MORE_BYTE.  */
2249   struct coding_system dummy_coding;
2250   struct coding_system *coding = &dummy_coding;
2251
2252   while (1)
2253     {
2254       ONE_MORE_BYTE (c);
2255       if (c >= 0xA1)
2256         {
2257           ONE_MORE_BYTE (c);
2258           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2259             return 0;
2260         }
2261     }
2262  label_end_of_loop:
2263   return CODING_CATEGORY_MASK_BIG5;
2264 }
2265
2266 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2267    Check if a text is encoded in UTF-8.  If it is, return
2268    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2269
2270 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)	/* value below 0x80 */
2271 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)	/* bits 10xxxxxx */
2272 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)	/* bits 110xxxxx */
2273 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)	/* bits 1110xxxx */
2274 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)	/* bits 11110xxx */
2275 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)	/* bits 111110xx */
2276 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)	/* bits 1111110x */
2277
2278 int
2279 detect_coding_utf_8 (src, src_end)
2280      unsigned char *src, *src_end;
2281 {
2282   unsigned char c;
2283   int seq_maybe_bytes;
2284   /* Dummy for ONE_MORE_BYTE.  */
2285   struct coding_system dummy_coding;
2286   struct coding_system *coding = &dummy_coding;
2287
2288   while (1)
2289     {
2290       ONE_MORE_BYTE (c);
2291       if (UTF_8_1_OCTET_P (c))
2292         continue;
2293       else if (UTF_8_2_OCTET_LEADING_P (c))
2294         seq_maybe_bytes = 1;
2295       else if (UTF_8_3_OCTET_LEADING_P (c))
2296         seq_maybe_bytes = 2;
2297       else if (UTF_8_4_OCTET_LEADING_P (c))
2298         seq_maybe_bytes = 3;
2299       else if (UTF_8_5_OCTET_LEADING_P (c))
2300         seq_maybe_bytes = 4;
2301       else if (UTF_8_6_OCTET_LEADING_P (c))
2302         seq_maybe_bytes = 5;
2303       else
2304         return 0;
2305
2306       do
2307         {
2308           ONE_MORE_BYTE (c);
2309           if (!UTF_8_EXTRA_OCTET_P (c))
2310             return 0;
2311           seq_maybe_bytes--;
2312         }
2313       while (seq_maybe_bytes > 0);
2314     }
2315
2316  label_end_of_loop:
2317   return CODING_CATEGORY_MASK_UTF_8;
2318 }
2319
2320 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2321    Check the first two bytes of a text for a UTF-16 byte order mark.
2322    If they are 0xFE 0xFF, return CODING_CATEGORY_MASK_UTF_16_BE; if
2323    they are 0xFF 0xFE, return CODING_CATEGORY_MASK_UTF_16_LE;
2324    else return 0.  */
2325
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits must be exactly 110110, hence the 0xFC00 mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.
   The top six bits must be exactly 110111, hence the 0xFC00 mask.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2335
2336 int
2337 detect_coding_utf_16 (src, src_end)
2338      unsigned char *src, *src_end;
2339 {
2340   unsigned char c1, c2;
2341   /* Dummy for TWO_MORE_BYTES.  */
2342   struct coding_system dummy_coding;
2343   struct coding_system *coding = &dummy_coding;
2344
2345   TWO_MORE_BYTES (c1, c2);
2346
2347   if ((c1 == 0xFF) && (c2 == 0xFE))
2348     return CODING_CATEGORY_MASK_UTF_16_LE;
2349   else if ((c1 == 0xFE) && (c2 == 0xFF))
2350     return CODING_CATEGORY_MASK_UTF_16_BE;
2351
2352  label_end_of_loop:
2353   return 0;
2354 }
2355
2356 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2357    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.

        Decode the SRC_BYTES bytes at SOURCE into DESTINATION (DST_BYTES
        bytes).  A byte sequence that is not a valid code is counted in
        CODING->errors and its first byte is emitted as is.  On return
        CODING->consumed and CODING->consumed_char are both the offset in
        SOURCE of the start of the last loop iteration that was begun,
        and CODING->produced is the number of bytes written to
        DESTINATION.  CODING->result is set to
        CODING_FINISH_INCONSISTENT_EOL when an end-of-line sequence
        disagrees with CODING->eol_type and CODING->mode has
        CODING_MODE_INHIBIT_INCONSISTENT_EOL set.  */
2358
2359 static void
2360 decode_coding_sjis_big5 (coding, source, destination,
2361                          src_bytes, dst_bytes, sjis_p)
2362      struct coding_system *coding;
2363      unsigned char *source, *destination;
2364      int src_bytes, dst_bytes;
2365      int sjis_p;
2366 {
2367   unsigned char *src = source;
2368   unsigned char *src_end = source + src_bytes;
2369   unsigned char *dst = destination;
2370   unsigned char *dst_end = destination + dst_bytes;
2371   /* SRC_BASE remembers the start position in source in each loop.
2372      The loop will be exited when there's not enough source code
2373      (within macro ONE_MORE_BYTE), or when there's not enough
2374      destination area to produce a character (within macro
2375      EMIT_CHAR).  */
2376   unsigned char *src_base;
2377   Lisp_Object translation_table;
2378
       /* Pick the translation table: none when character translation is
          disabled, otherwise the one of CODING, falling back to the
          standard table for decoding.  */
2379   if (NILP (Venable_character_translation))
2380     translation_table = Qnil;
2381   else
2382     {
2383       translation_table = coding->translation_table_for_decode;
2384       if (NILP (translation_table))
2385         translation_table = Vstandard_translation_table_for_decode;
2386     }
2387
2388   coding->produced_char = 0;
2389   while (1)
2390     {
2391       int c, charset, c1, c2;
2392
2393       src_base = src;
2394       ONE_MORE_BYTE (c1);
2395
2396       if (c1 < 0x80)
2397         {
               /* A byte below 0x80 is ASCII; only CR and LF need extra
                  end-of-line handling.  */
2398           charset = CHARSET_ASCII;
2399           if (c1 < 0x20)
2400             {
2401               if (c1 == '\r')
2402                 {
2403                   if (coding->eol_type == CODING_EOL_CRLF)
2404                     {
                           /* CR LF becomes a single '\n'.  A CR followed
                              by anything else is either reported as an
                              inconsistent EOL or kept as is.  */
2405                       ONE_MORE_BYTE (c2);
2406                       if (c2 == '\n')
2407                         c1 = c2;
2408                       else if (coding->mode
2409                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2410                         {
2411                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2412                           goto label_end_of_loop;
2413                         }
2414                       else
2415                         /* To process C2 again, SRC is subtracted by 1.  */
2416                         src--;
2417                     }
2418                   else if (coding->eol_type == CODING_EOL_CR)
2419                     c1 = '\n';
2420                 }
                   /* A bare LF where CR or CR LF is expected.  */
2421               else if (c1 == '\n'
2422                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2423                        && (coding->eol_type == CODING_EOL_CR
2424                            || coding->eol_type == CODING_EOL_CRLF))
2425                 {
2426                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2427                   goto label_end_of_loop;
2428                 }
2429             }
2430         }
2431       else
2432         {
2433           if (sjis_p)
2434             {
                   /* First bytes 0xF0 and above are rejected.  */
2435               if (c1 >= 0xF0)
2436                 goto label_invalid_code;
                   /* 0x80..0x9F and 0xE0..0xEF start a two-byte code;
                      the remaining bytes, 0xA0..0xDF, are single-byte
                      codes.  */
2437               if (c1 < 0xA0 || c1 >= 0xE0)
2438                 {
2439                   /* SJIS -> JISX0208 */
2440                   ONE_MORE_BYTE (c2);
2441                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2442                     goto label_invalid_code;
2443                   DECODE_SJIS (c1, c2, c1, c2);
2444                   charset = charset_jisx0208;
2445                 }
2446               else
2447                 /* SJIS -> JISX0201-Kana */
2448                 charset = charset_katakana_jisx0201;
2449             }
2450           else
2451             {
2452               /* BIG5 -> Big5 */
2453               if (c1 < 0xA1 || c1 > 0xFE)
2454                 goto label_invalid_code;
2455               ONE_MORE_BYTE (c2);
2456               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2457                 goto label_invalid_code;
2458               DECODE_BIG5 (c1, c2, charset, c1, c2);
2459             }
2460         }
2461
2462       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2463       EMIT_CHAR (c);
2464       continue;
2465
           /* Count the error, rewind to the start of this iteration, and
              emit the first byte of the offending sequence as is.  */
2466     label_invalid_code:
2467       coding->errors++;
2468       src = src_base;
2469       c = *src++;
2470       EMIT_CHAR (c);
2471     }
2472
2473  label_end_of_loop:
2474   coding->consumed = coding->consumed_char = src_base - source;
2475   coding->produced = dst - destination;
2476   return;
2477 }
2478
2479 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2480    This function can encode charsets `ascii', `katakana-jisx0201',
2481    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2482    are sure that all these charsets are registered as official charset
2483    (i.e. do not have extended leading-codes).  Characters of other
2484    charsets are produced without any encoding.  If SJIS_P is 1, encode
2485    SJIS text, else encode BIG5 text.  */
2486
2487 static void
2488 encode_coding_sjis_big5 (coding, source, destination,
2489                          src_bytes, dst_bytes, sjis_p)
2490      struct coding_system *coding;
2491      unsigned char *source, *destination;
2492      int src_bytes, dst_bytes;
2493      int sjis_p;
2494 {
2495   unsigned char *src = source;
2496   unsigned char *src_end = source + src_bytes;
2497   unsigned char *dst = destination;
2498   unsigned char *dst_end = destination + dst_bytes;
2499   /* SRC_BASE remembers the start position in source in each loop.
2500      The loop will be exited when there's not enough source text to
2501      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2502      there's not enough destination area to produce encoded codes
2503      (within macro EMIT_BYTES).  */
2504   unsigned char *src_base;
2505   Lisp_Object translation_table;
2506
2507   if (NILP (Venable_character_translation))
2508     translation_table = Qnil;
2509   else
2510     {
2511       translation_table = coding->translation_table_for_decode;
2512       if (NILP (translation_table))
2513         translation_table = Vstandard_translation_table_for_decode;
2514     }
2515
2516   while (1)
2517     {
2518       int c, charset, c1, c2;
2519
2520       src_base = src;
2521       ONE_MORE_CHAR (c);
2522
2523       /* Now encode the character C.  */
2524       if (SINGLE_BYTE_CHAR_P (c))
2525         {
2526           switch (c)
2527             {
2528             case '\r':
2529               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2530                 {
2531                   EMIT_ONE_BYTE (c);
2532                   break;
2533                 }
2534               c = '\n';
2535             case '\n':
2536               if (coding->eol_type == CODING_EOL_CRLF)
2537                 {
2538                   EMIT_TWO_BYTES ('\r', c);
2539                   break;
2540                 }
2541               else if (coding->eol_type == CODING_EOL_CR)
2542                 c = '\r';
2543             default:
2544               EMIT_ONE_BYTE (c);
2545             }
2546         }
2547       else
2548         {
2549           SPLIT_CHAR (c, charset, c1, c2);
2550           if (sjis_p)
2551             {
2552               if (charset == charset_jisx0208
2553                   || charset == charset_jisx0208_1978)
2554                 {
2555                   ENCODE_SJIS (c1, c2, c1, c2);
2556                   EMIT_TWO_BYTES (c1, c2);
2557                 }
2558               else if (charset == charset_latin_jisx0201)
2559                 EMIT_ONE_BYTE (c1);
2560               else
2561                 /* There's no way other than producing the internal
2562                    codes as is.  */
2563                 EMIT_BYTES (src_base, src);
2564             }
2565           else
2566             {
2567               if (charset == charset_big5_1 || charset == charset_big5_2)
2568                 {
2569                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2570                   EMIT_TWO_BYTES (c1, c2);
2571                 }
2572               else
2573                 /* There's no way other than producing the internal
2574                    codes as is.  */
2575                 EMIT_BYTES (src_base, src);
2576             }
2577         }
2578       coding->consumed_char++;
2579     }
2580
2581  label_end_of_loop:
2582   coding->consumed = src_base - source;
2583   coding->produced = coding->produced_char = dst - destination;
2584 }
2585
2586 \f
2587 /*** 5. CCL handlers ***/
2588
2589 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2590    Check if a text is encoded in a coding system of which
2591    encoder/decoder are written in CCL program.  If it is, return
2592    CODING_CATEGORY_MASK_CCL, else return 0.  */
2593
2594 int
2595 detect_coding_ccl (src, src_end)
2596      unsigned char *src, *src_end;
2597 {
2598   unsigned char *valid;
2599   int c;
2600   /* Dummy for ONE_MORE_BYTE.  */
2601   struct coding_system dummy_coding;
2602   struct coding_system *coding = &dummy_coding;
2603
2604   /* No coding system is assigned to coding-category-ccl.  */
2605   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2606     return 0;
2607
2608   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2609   while (1)
2610     {
2611       ONE_MORE_BYTE (c);
2612       if (! valid[c])
2613         return 0;
2614     }
2615  label_end_of_loop:
2616   return CODING_CATEGORY_MASK_CCL;
2617 }
2618
2619 \f
2620 /*** 6. End-of-line handlers ***/
2621
2622 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Copy the SRC_BYTES bytes at SOURCE to DESTINATION (DST_BYTES
        bytes), converting the end-of-line format given by
        CODING->eol_type: for CODING_EOL_CRLF a CR LF pair becomes '\n',
        for CODING_EOL_CR a CR becomes '\n', and for any other value
        bytes are emitted unchanged.  When CODING->mode has
        CODING_MODE_INHIBIT_INCONSISTENT_EOL set, an end-of-line byte
        that does not fit the expected format stops the conversion with
        CODING->result set to CODING_FINISH_INCONSISTENT_EOL.  On return
        CODING->consumed and CODING->consumed_char are both the offset in
        SOURCE of the start of the last loop iteration that was begun,
        and CODING->produced is the number of bytes written.  */
2623
2624 static void
2625 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2626      struct coding_system *coding;
2627      unsigned char *source, *destination;
2628      int src_bytes, dst_bytes;
2629 {
2630   unsigned char *src = source;
2631   unsigned char *dst = destination;
2632   unsigned char *src_end = src + src_bytes;
2633   unsigned char *dst_end = dst + dst_bytes;
2634   Lisp_Object translation_table;
2635   /* SRC_BASE remembers the start position in source in each loop.
2636      The loop will be exited when there's not enough source code
2637      (within macro ONE_MORE_BYTE), or when there's not enough
2638      destination area to produce a character (within macro
2639      EMIT_CHAR).  */
2640   unsigned char *src_base;
2641   int c;
2642
       /* TRANSLATION_TABLE is not read directly in this function;
          presumably a macro used below refers to it -- confirm.  */
2643   translation_table = Qnil;
2644   switch (coding->eol_type)
2645     {
2646     case CODING_EOL_CRLF:
2647       while (1)
2648         {
2649           src_base = src;
2650           ONE_MORE_BYTE (c);
2651           if (c == '\r')
2652             {
                   /* Look at the byte after CR.  If it is LF, C is now
                      '\n' and only that is emitted.  */
2653               ONE_MORE_BYTE (c);
2654               if (c != '\n')
2655                 {
2656                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2657                     {
2658                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2659                       goto label_end_of_loop;
2660                     }
                       /* A lone CR: un-read the following byte and emit
                          the CR itself.  */
2661                   src--;
2662                   c = '\r';
2663                 }
2664             }
               /* A bare LF, not preceded by CR.  */
2665           else if (c == '\n'
2666                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2667             {
2668               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2669               goto label_end_of_loop;
2670             }
2671           EMIT_CHAR (c);
2672         }
2673       break;
2674
2675     case CODING_EOL_CR:
2676       while (1)
2677         {
2678           src_base = src;
2679           ONE_MORE_BYTE (c);
2680           if (c == '\n')
2681             {
                   /* An LF is unexpected in CR-terminated text; without
                      the inhibit flag it is emitted unchanged.  */
2682               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2683                 {
2684                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2685                   goto label_end_of_loop;
2686                 }
2687             }
2688           else if (c == '\r')
2689             c = '\n';
2690           EMIT_CHAR (c);
2691         }
2692       break;
2693
2694     default:                    /* no need for EOL handling */
2695       while (1)
2696         {
2697           src_base = src;
2698           ONE_MORE_BYTE (c);
2699           EMIT_CHAR (c);
2700         }
2701     }
2702
2703  label_end_of_loop:
2704   coding->consumed = coding->consumed_char = src_base - source;
2705   coding->produced = dst - destination;
2706   return;
2707 }
2708
2709 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2710    format of end-of-line according to `coding->eol_type'.  It also
2711    convert multibyte form 8-bit characers to unibyte if
2712    CODING->src_multibyte is nonzero.  If `coding->mode &
2713    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2714    also means end-of-line.  */
2715
2716 static void
2717 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2718      struct coding_system *coding;
2719      unsigned char *source, *destination;
2720      int src_bytes, dst_bytes;
2721 {
2722   unsigned char *src = source;
2723   unsigned char *dst = destination;
2724   unsigned char *src_end = src + src_bytes;
2725   unsigned char *dst_end = dst + dst_bytes;
2726   Lisp_Object translation_table;
2727   /* SRC_BASE remembers the start position in source in each loop.
2728      The loop will be exited when there's not enough source text to
2729      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2730      there's not enough destination area to produce encoded codes
2731      (within macro EMIT_BYTES).  */
2732   unsigned char *src_base;
2733   int c;
2734   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2735
2736   translation_table = Qnil;
2737   if (coding->src_multibyte
2738       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2739     {
2740       src_end--;
2741       src_bytes--;
2742       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2743     }
2744
2745   if (coding->eol_type == CODING_EOL_CRLF)
2746     {
2747       while (src < src_end)
2748         {
2749           src_base = src;
2750           c = *src++;
2751           if (c >= 0x20)
2752             EMIT_ONE_BYTE (c);
2753           else if (c == '\n' || (c == '\r' && selective_display))
2754             EMIT_TWO_BYTES ('\r', '\n');
2755           else
2756             EMIT_ONE_BYTE (c);
2757         }
2758       src_base = src;
2759     label_end_of_loop:
2760       ;
2761     }
2762   else
2763     {
2764       if (src_bytes <= dst_bytes)
2765         {
2766           safe_bcopy (src, dst, src_bytes);
2767           src_base = src_end;
2768           dst += src_bytes;
2769         }
2770       else
2771         {
2772           if (coding->src_multibyte
2773               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2774             dst_bytes--;
2775           safe_bcopy (src, dst, dst_bytes);
2776           src_base = src + dst_bytes;
2777           dst = destination + dst_bytes;
2778           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2779         }
2780       if (coding->eol_type == CODING_EOL_CR)
2781         {
2782           for (src = destination; src < dst; src++)
2783             if (*src == '\n') *src = '\r';
2784         }
2785       else if (selective_display)
2786         {
2787           for (src = destination; src < dst; src++)
2788             if (*src == '\r') *src = '\n';
2789         }
2790     }
2791   if (coding->src_multibyte)
2792     dst = destination + str_as_unibyte (destination, dst - destination);
2793
2794   coding->consumed = src_base - source;
2795   coding->produced = dst - destination;
2796 }
2797
2798 \f
2799 /*** 7. C library functions ***/
2800
2801 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2802    has a property `coding-system'.  The value of this property is a
2803    vector of length 5 (called the coding-vector).  Among elements of
2804    this vector, the first (element[0]) and the fifth (element[4])
2805    carry important information for decoding/encoding.  Before
2806    decoding/encoding, this information should be set in fields of a
2807    structure of type `coding_system'.
2808
2809    A value of property `coding-system' can be a symbol of another
2810    subsidiary coding-system.  In that case, Emacs gets coding-vector
2811    from that symbol.
2812
2813    `element[0]' contains information to be set in `coding->type'.  The
2814    value and its meaning is as follows:
2815
2816    0 -- coding_type_emacs_mule
2817    1 -- coding_type_sjis
2818    2 -- coding_type_iso2022
2819    3 -- coding_type_big5
2820    4 -- coding_type_ccl encoder/decoder written in CCL
2821    nil -- coding_type_no_conversion
2822    t -- coding_type_undecided (automatic conversion on decoding,
2823                                no-conversion on encoding)
2824
2825    `element[4]' contains information to be set in `coding->flags' and
2826    `coding->spec'.  The meaning varies by `coding->type'.
2827
2828    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2829    of length 32 (of which the first 13 sub-elements are used now).
2830    Meanings of these sub-elements are:
2831
2832    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2833         If the value is an integer of valid charset, the charset is
2834         assumed to be designated to graphic register N initially.
2835
2836         If the value is minus, it is a minus value of charset which
2837         reserves graphic register N, which means that the charset is
2838         not designated initially but should be designated to graphic
2839         register N just before encoding a character in that charset.
2840
2841         If the value is nil, graphic register N is never used on
2842         encoding.
2843
2844    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2845         Each value takes t or nil.  See the section ISO2022 of
2846         `coding.h' for more information.
2847
2848    If `coding->type' is `coding_type_big5', element[4] is t to denote
2849    BIG5-ETen or nil to denote BIG5-HKU.
2850
2851    If `coding->type' takes the other value, element[4] is ignored.
2852
2853    Emacs Lisp's coding system also carries information about format of
2854    end-of-line in a value of property `eol-type'.  If the value is
2855    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2856    means CODING_EOL_CR.  If it is not integer, it should be a vector
2857    of subsidiary coding systems of which property `eol-type' has one
2858    of above values.
2859
2860 */
2861
2862 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2863    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2864    is setup so that no conversion is necessary and return -1, else
2865    return 0.  */
2866
2867 int
2868 setup_coding_system (coding_system, coding)
2869      Lisp_Object coding_system;
2870      struct coding_system *coding;
2871 {
2872   Lisp_Object coding_spec, coding_type, eol_type, plist;
2873   Lisp_Object val;
2874   int i;
2875
2876   /* Initialize some fields required for all kinds of coding systems.  */
2877   coding->symbol = coding_system;
2878   coding->common_flags = 0;
2879   coding->mode = 0;
2880   coding->heading_ascii = -1;
2881   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2882   coding->composing = COMPOSITION_DISABLED;
2883   coding->cmp_data = NULL;
2884
2885   if (NILP (coding_system))
2886     goto label_invalid_coding_system;
2887
2888   coding_spec = Fget (coding_system, Qcoding_system);
2889
2890   if (!VECTORP (coding_spec)
2891       || XVECTOR (coding_spec)->size != 5
2892       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2893     goto label_invalid_coding_system;
2894
2895   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2896   if (VECTORP (eol_type))
2897     {
2898       coding->eol_type = CODING_EOL_UNDECIDED;
2899       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2900     }
2901   else if (XFASTINT (eol_type) == 1)
2902     {
2903       coding->eol_type = CODING_EOL_CRLF;
2904       coding->common_flags
2905         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2906     }
2907   else if (XFASTINT (eol_type) == 2)
2908     {
2909       coding->eol_type = CODING_EOL_CR;
2910       coding->common_flags
2911         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2912     }
2913   else
2914     coding->eol_type = CODING_EOL_LF;
2915
2916   coding_type = XVECTOR (coding_spec)->contents[0];
2917   /* Try short cut.  */
2918   if (SYMBOLP (coding_type))
2919     {
2920       if (EQ (coding_type, Qt))
2921         {
2922           coding->type = coding_type_undecided;
2923           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2924         }
2925       else
2926         coding->type = coding_type_no_conversion;
2927       return 0;
2928     }
2929
2930   /* Get values of coding system properties:
2931      `post-read-conversion', `pre-write-conversion',
2932      `translation-table-for-decode', `translation-table-for-encode'.  */
2933   plist = XVECTOR (coding_spec)->contents[3];
2934   /* Pre & post conversion functions should be disabled if
2935      inhibit_eol_conversion is nozero.  This is the case that a code
2936      conversion function is called while those functions are running.  */
2937   if (! inhibit_pre_post_conversion)
2938     {
2939       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2940       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2941     }
2942   val = Fplist_get (plist, Qtranslation_table_for_decode);
2943   if (SYMBOLP (val))
2944     val = Fget (val, Qtranslation_table_for_decode);
2945   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2946   val = Fplist_get (plist, Qtranslation_table_for_encode);
2947   if (SYMBOLP (val))
2948     val = Fget (val, Qtranslation_table_for_encode);
2949   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2950   val = Fplist_get (plist, Qcoding_category);
2951   if (!NILP (val))
2952     {
2953       val = Fget (val, Qcoding_category_index);
2954       if (INTEGERP (val))
2955         coding->category_idx = XINT (val);
2956       else
2957         goto label_invalid_coding_system;
2958     }
2959   else
2960     goto label_invalid_coding_system;
2961
2962   val = Fplist_get (plist, Qsafe_charsets);
2963   if (EQ (val, Qt))
2964     {
2965       for (i = 0; i <= MAX_CHARSET; i++)
2966         coding->safe_charsets[i] = 1;
2967     }
2968   else
2969     {
2970       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2971       while (CONSP (val))
2972         {
2973           if ((i = get_charset_id (XCAR (val))) >= 0)
2974             coding->safe_charsets[i] = 1;
2975           val = XCDR (val);
2976         }
2977     }
2978
2979   /* If the coding system has non-nil `composition' property, enable
2980      composition handling.  */
2981   val = Fplist_get (plist, Qcomposition);
2982   if (!NILP (val))
2983     coding->composing = COMPOSITION_NO;
2984
2985   switch (XFASTINT (coding_type))
2986     {
2987     case 0:
2988       coding->type = coding_type_emacs_mule;
2989       if (!NILP (coding->post_read_conversion))
2990         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2991       if (!NILP (coding->pre_write_conversion))
2992         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2993       break;
2994
2995     case 1:
2996       coding->type = coding_type_sjis;
2997       coding->common_flags
2998         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2999       break;
3000
3001     case 2:
3002       coding->type = coding_type_iso2022;
3003       coding->common_flags
3004         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3005       {
3006         Lisp_Object val, temp;
3007         Lisp_Object *flags;
3008         int i, charset, reg_bits = 0;
3009
3010         val = XVECTOR (coding_spec)->contents[4];
3011
3012         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3013           goto label_invalid_coding_system;
3014
3015         flags = XVECTOR (val)->contents;
3016         coding->flags
3017           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3018              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3019              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3020              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3021              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3022              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3023              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3024              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3025              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3026              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3027              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3028              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3029              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3030              );
3031
3032         /* Invoke graphic register 0 to plane 0.  */
3033         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3034         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3035         CODING_SPEC_ISO_INVOCATION (coding, 1)
3036           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3037         /* Not single shifting at first.  */
3038         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3039         /* Beginning of buffer should also be regarded as bol. */
3040         CODING_SPEC_ISO_BOL (coding) = 1;
3041
3042         for (charset = 0; charset <= MAX_CHARSET; charset++)
3043           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3044         val = Vcharset_revision_alist;
3045         while (CONSP (val))
3046           {
3047             charset = get_charset_id (Fcar_safe (XCAR (val)));
3048             if (charset >= 0
3049                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3050                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3051               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3052             val = XCDR (val);
3053           }
3054
3055         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3056            FLAGS[REG] can be one of below:
3057                 integer CHARSET: CHARSET occupies register I,
3058                 t: designate nothing to REG initially, but can be used
3059                   by any charsets,
3060                 list of integer, nil, or t: designate the first
3061                   element (if integer) to REG initially, the remaining
3062                   elements (if integer) is designated to REG on request,
3063                   if an element is t, REG can be used by any charsets,
3064                 nil: REG is never used.  */
3065         for (charset = 0; charset <= MAX_CHARSET; charset++)
3066           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3067             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3068         for (i = 0; i < 4; i++)
3069           {
3070             if (INTEGERP (flags[i])
3071                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3072                 || (charset = get_charset_id (flags[i])) >= 0)
3073               {
3074                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3075                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3076               }
3077             else if (EQ (flags[i], Qt))
3078               {
3079                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3080                 reg_bits |= 1 << i;
3081                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3082               }
3083             else if (CONSP (flags[i]))
3084               {
3085                 Lisp_Object tail;
3086                 tail = flags[i];
3087
3088                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3089                 if (INTEGERP (XCAR (tail))
3090                     && (charset = XINT (XCAR (tail)),
3091                         CHARSET_VALID_P (charset))
3092                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3093                   {
3094                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3095                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3096                   }
3097                 else
3098                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3099                 tail = XCDR (tail);
3100                 while (CONSP (tail))
3101                   {
3102                     if (INTEGERP (XCAR (tail))
3103                         && (charset = XINT (XCAR (tail)),
3104                             CHARSET_VALID_P (charset))
3105                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3106                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3107                         = i;
3108                     else if (EQ (XCAR (tail), Qt))
3109                       reg_bits |= 1 << i;
3110                     tail = XCDR (tail);
3111                   }
3112               }
3113             else
3114               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3115
3116             CODING_SPEC_ISO_DESIGNATION (coding, i)
3117               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3118           }
3119
3120         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3121           {
3122             /* REG 1 can be used only by locking shift in 7-bit env.  */
3123             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3124               reg_bits &= ~2;
3125             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3126               /* Without any shifting, only REG 0 and 1 can be used.  */
3127               reg_bits &= 3;
3128           }
3129
3130         if (reg_bits)
3131           for (charset = 0; charset <= MAX_CHARSET; charset++)
3132             {
3133               if (CHARSET_VALID_P (charset))
3134                 {
3135                   /* There exist some default graphic registers to be
3136                      used CHARSET.  */
3137
3138                   /* We had better avoid designating a charset of
3139                      CHARS96 to REG 0 as far as possible.  */
3140                   if (CHARSET_CHARS (charset) == 96)
3141                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3142                       = (reg_bits & 2
3143                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3144                   else
3145                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3146                       = (reg_bits & 1
3147                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3148                 }
3149             }
3150       }
3151       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3152       coding->spec.iso2022.last_invalid_designation_register = -1;
3153       break;
3154
3155     case 3:
3156       coding->type = coding_type_big5;
3157       coding->common_flags
3158         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3159       coding->flags
3160         = (NILP (XVECTOR (coding_spec)->contents[4])
3161            ? CODING_FLAG_BIG5_HKU
3162            : CODING_FLAG_BIG5_ETEN);
3163       break;
3164
3165     case 4:
3166       coding->type = coding_type_ccl;
3167       coding->common_flags
3168         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3169       {
3170         val = XVECTOR (coding_spec)->contents[4];
3171         if (! CONSP (val)
3172             || setup_ccl_program (&(coding->spec.ccl.decoder),
3173                                   XCAR (val)) < 0
3174             || setup_ccl_program (&(coding->spec.ccl.encoder),
3175                                   XCDR (val)) < 0)
3176           goto label_invalid_coding_system;
3177
3178         bzero (coding->spec.ccl.valid_codes, 256);
3179         val = Fplist_get (plist, Qvalid_codes);
3180         if (CONSP (val))
3181           {
3182             Lisp_Object this;
3183
3184             for (; CONSP (val); val = XCDR (val))
3185               {
3186                 this = XCAR (val);
3187                 if (INTEGERP (this)
3188                     && XINT (this) >= 0 && XINT (this) < 256)
3189                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3190                 else if (CONSP (this)
3191                          && INTEGERP (XCAR (this))
3192                          && INTEGERP (XCDR (this)))
3193                   {
3194                     int start = XINT (XCAR (this));
3195                     int end = XINT (XCDR (this));
3196
3197                     if (start >= 0 && start <= end && end < 256)
3198                       while (start <= end)
3199                         coding->spec.ccl.valid_codes[start++] = 1;
3200                   }
3201               }
3202           }
3203       }
3204       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3205       coding->spec.ccl.cr_carryover = 0;
3206       break;
3207
3208     case 5:
3209       coding->type = coding_type_raw_text;
3210       break;
3211
3212     default:
3213       goto label_invalid_coding_system;
3214     }
3215   return 0;
3216
3217  label_invalid_coding_system:
3218   coding->type = coding_type_no_conversion;
3219   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3220   coding->common_flags = 0;
3221   coding->eol_type = CODING_EOL_LF;
3222   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3223   return -1;
3224 }
3225
3226 /* Free memory blocks allocated for storing composition information.  */
3227
3228 void
3229 coding_free_composition_data (coding)
3230      struct coding_system *coding;
3231 {
3232   struct composition_data *cmp_data = coding->cmp_data, *next;
3233
3234   if (!cmp_data)
3235     return;
3236   /* Memory blocks are chained.  At first, rewind to the first, then,
3237      free blocks one by one.  */
3238   while (cmp_data->prev)
3239     cmp_data = cmp_data->prev;
3240   while (cmp_data)
3241     {
3242       next = cmp_data->next;
3243       xfree (cmp_data);
3244       cmp_data = next;
3245     }
3246   coding->cmp_data = NULL;
3247 }
3248
3249 /* Set `char_offset' member of all memory blocks pointed by
3250    coding->cmp_data to POS.  */
3251
3252 void
3253 coding_adjust_composition_offset (coding, pos)
3254      struct coding_system *coding;
3255      int pos;
3256 {
3257   struct composition_data *cmp_data;
3258
3259   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3260     cmp_data->char_offset = pos;
3261 }
3262
3263 /* Setup raw-text or one of its subsidiaries in the structure
3264    coding_system CODING according to the already setup value eol_type
3265    in CODING.  CODING should be setup for some coding system in
3266    advance.  */
3267
3268 void
3269 setup_raw_text_coding_system (coding)
3270      struct coding_system *coding;
3271 {
3272   if (coding->type != coding_type_raw_text)
3273     {
3274       coding->symbol = Qraw_text;
3275       coding->type = coding_type_raw_text;
3276       if (coding->eol_type != CODING_EOL_UNDECIDED)
3277         {
3278           Lisp_Object subsidiaries;
3279           subsidiaries = Fget (Qraw_text, Qeol_type);
3280
3281           if (VECTORP (subsidiaries)
3282               && XVECTOR (subsidiaries)->size == 3)
3283             coding->symbol
3284               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3285         }
3286       setup_coding_system (coding->symbol, coding);
3287     }
3288   return;
3289 }
3290
3291 /* Emacs has a mechanism to automatically detect a coding system if it
3292    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3293    it's impossible to distinguish some coding systems accurately
3294    because they use the same range of codes.  So, at first, coding
3295    systems are categorized into the following categories:
3296
3297    o coding-category-emacs-mule
3298
3299         The category for a coding system which has the same code range
3300         as Emacs' internal format.  Assigned the coding-system (Lisp
3301         symbol) `emacs-mule' by default.
3302
3303    o coding-category-sjis
3304
3305         The category for a coding system which has the same code range
3306         as SJIS.  Assigned the coding-system (Lisp
3307         symbol) `japanese-shift-jis' by default.
3308
3309    o coding-category-iso-7
3310
3311         The category for a coding system which has the same code range
3312         as ISO2022 of 7-bit environment.  This doesn't use any locking
3313         shift and single shift functions.  This can encode/decode all
3314         charsets.  Assigned the coding-system (Lisp symbol)
3315         `iso-2022-7bit' by default.
3316
3317    o coding-category-iso-7-tight
3318
3319         Same as coding-category-iso-7 except that this can
3320         encode/decode only the specified charsets.
3321
3322    o coding-category-iso-8-1
3323
3324         The category for a coding system which has the same code range
3325         as ISO2022 of 8-bit environment and graphic plane 1 used only
3326         for DIMENSION1 charset.  This doesn't use any locking shift
3327         and single shift functions.  Assigned the coding-system (Lisp
3328         symbol) `iso-latin-1' by default.
3329
3330    o coding-category-iso-8-2
3331
3332         The category for a coding system which has the same code range
3333         as ISO2022 of 8-bit environment and graphic plane 1 used only
3334         for DIMENSION2 charset.  This doesn't use any locking shift
3335         and single shift functions.  Assigned the coding-system (Lisp
3336         symbol) `japanese-iso-8bit' by default.
3337
3338    o coding-category-iso-7-else
3339
3340         The category for a coding system which has the same code range
3341         as ISO2022 of 7-bit environment but uses locking shift or
3342         single shift functions.  Assigned the coding-system (Lisp
3343         symbol) `iso-2022-7bit-lock' by default.
3344
3345    o coding-category-iso-8-else
3346
3347         The category for a coding system which has the same code range
3348         as ISO2022 of 8-bit environment but uses locking shift or
3349         single shift functions.  Assigned the coding-system (Lisp
3350         symbol) `iso-2022-8bit-ss2' by default.
3351
3352    o coding-category-big5
3353
3354         The category for a coding system which has the same code range
3355         as BIG5.  Assigned the coding-system (Lisp symbol)
3356         `cn-big5' by default.
3357
3358    o coding-category-utf-8
3359
3360         The category for a coding system which has the same code range
3361         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3362         symbol) `utf-8' by default.
3363
3364    o coding-category-utf-16-be
3365
3366         The category for a coding system in which a text has an
3367         Unicode signature (cf. Unicode Standard) in the order of BIG
3368         endian at the head.  Assigned the coding-system (Lisp symbol)
3369         `utf-16-be' by default.
3370
3371    o coding-category-utf-16-le
3372
3373         The category for a coding system in which a text has an
3374         Unicode signature (cf. Unicode Standard) in the order of
3375         LITTLE endian at the head.  Assigned the coding-system (Lisp
3376         symbol) `utf-16-le' by default.
3377
3378    o coding-category-ccl
3379
3380         The category for a coding system of which encoder/decoder is
3381         written in CCL programs.  The default value is nil, i.e., no
3382         coding system is assigned.
3383
3384    o coding-category-binary
3385
3386         The category for a coding system not categorized in any of the
3387         above.  Assigned the coding-system (Lisp symbol)
3388         `no-conversion' by default.
3389
3390    Each of them is a Lisp symbol and the value is an actual
3391    `coding-system' (this is also a Lisp symbol) assigned by a user.
3392    What Emacs does actually is to detect a category of coding system.
3393    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3394    decide only one possible category, it selects a category of the
3395    highest priority.  Priorities of categories are also specified by a
3396    user in a Lisp variable `coding-category-list'.
3397
3398 */
3399
3400 static
3401 int ascii_skip_code[256];
3402
3403 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3404    If it detects possible coding systems, return an integer in which
3405    appropriate flag bits are set.  Flag bits are defined by macros
3406    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3407    it should point the table `coding_priorities'.  In that case, only
3408    the flag bit for a coding system of the highest priority is set in
3409    the returned value.
3410
3411    How many ASCII characters are at the head is returned as *SKIP.  */
3412
3413 static int
3414 detect_coding_mask (source, src_bytes, priorities, skip)
3415      unsigned char *source;
3416      int src_bytes, *priorities, *skip;
3417 {
3418   register unsigned char c;
3419   unsigned char *src = source, *src_end = source + src_bytes;
3420   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3421   int i, idx;
3422
3423   /* At first, skip all ASCII characters and control characters except
3424      for three ISO2022 specific control characters.  */
3425   ascii_skip_code[ISO_CODE_SO] = 0;
3426   ascii_skip_code[ISO_CODE_SI] = 0;
3427   ascii_skip_code[ISO_CODE_ESC] = 0;
3428
3429  label_loop_detect_coding:
3430   while (src < src_end && ascii_skip_code[*src]) src++;
3431   *skip = src - source;
3432
3433   if (src >= src_end)
3434     /* We found nothing other than ASCII.  There's nothing to do.  */
3435     return 0;
3436
3437   c = *src;
3438   /* The text seems to be encoded in some multilingual coding system.
3439      Now, try to find in which coding system the text is encoded.  */
3440   if (c < 0x80)
3441     {
3442       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3443       /* C is an ISO2022 specific control code of C0.  */
3444       mask = detect_coding_iso2022 (src, src_end);
3445       if (mask == 0)
3446         {
3447           /* No valid ISO2022 code follows C.  Try again.  */
3448           src++;
3449           if (c == ISO_CODE_ESC)
3450             ascii_skip_code[ISO_CODE_ESC] = 1;
3451           else
3452             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3453           goto label_loop_detect_coding;
3454         }
3455       if (priorities)
3456         {
3457           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3458             {
3459               if (mask & priorities[i])
3460                 return priorities[i];
3461             }
3462           return CODING_CATEGORY_MASK_RAW_TEXT;
3463         }
3464     }
3465   else
3466     {
3467       int try;
3468
3469       if (c < 0xA0)
3470         {
3471           /* C is the first byte of SJIS character code,
3472              or a leading-code of Emacs' internal format (emacs-mule),
3473              or the first byte of UTF-16.  */
3474           try = (CODING_CATEGORY_MASK_SJIS
3475                   | CODING_CATEGORY_MASK_EMACS_MULE
3476                   | CODING_CATEGORY_MASK_UTF_16_BE
3477                   | CODING_CATEGORY_MASK_UTF_16_LE);
3478
3479           /* Or, if C is a special latin extra code,
3480              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3481              or is an ISO2022 control-sequence-introducer (CSI),
3482              we should also consider the possibility of ISO2022 codings.  */
3483           if ((VECTORP (Vlatin_extra_code_table)
3484                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3485               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3486               || (c == ISO_CODE_CSI
3487                   && (src < src_end
3488                       && (*src == ']'
3489                           || ((*src == '0' || *src == '1' || *src == '2')
3490                               && src + 1 < src_end
3491                               && src[1] == ']')))))
3492             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3493                      | CODING_CATEGORY_MASK_ISO_8BIT);
3494         }
3495       else
3496         /* C is a character of ISO2022 in graphic plane right,
3497            or a SJIS's 1-byte character code (i.e. JISX0201),
3498            or the first byte of BIG5's 2-byte code,
3499            or the first byte of UTF-8/16.  */
3500         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3501                 | CODING_CATEGORY_MASK_ISO_8BIT
3502                 | CODING_CATEGORY_MASK_SJIS
3503                 | CODING_CATEGORY_MASK_BIG5
3504                 | CODING_CATEGORY_MASK_UTF_8
3505                 | CODING_CATEGORY_MASK_UTF_16_BE
3506                 | CODING_CATEGORY_MASK_UTF_16_LE);
3507
3508       /* Or, we may have to consider the possibility of CCL.  */
3509       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3510           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511               ->spec.ccl.valid_codes)[c])
3512         try |= CODING_CATEGORY_MASK_CCL;
3513
3514       mask = 0;
3515       utf16_examined_p = iso2022_examined_p = 0;
3516       if (priorities)
3517         {
3518           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3519             {
3520               if (!iso2022_examined_p
3521                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3522                 {
3523                   mask |= detect_coding_iso2022 (src, src_end);
3524                   iso2022_examined_p = 1;
3525                 }
3526               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3527                 mask |= detect_coding_sjis (src, src_end);
3528               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3529                 mask |= detect_coding_utf_8 (src, src_end);
3530               else if (!utf16_examined_p
3531                        && (priorities[i] & try &
3532                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3533                 {
3534                   mask |= detect_coding_utf_16 (src, src_end);
3535                   utf16_examined_p = 1;
3536                 }
3537               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3538                 mask |= detect_coding_big5 (src, src_end);
3539               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3540                 mask |= detect_coding_emacs_mule (src, src_end);
3541               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3542                 mask |= detect_coding_ccl (src, src_end);
3543               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3544                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3545               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3546                 mask |= CODING_CATEGORY_MASK_BINARY;
3547               if (mask & priorities[i])
3548                 return priorities[i];
3549             }
3550           return CODING_CATEGORY_MASK_RAW_TEXT;
3551         }
3552       if (try & CODING_CATEGORY_MASK_ISO)
3553         mask |= detect_coding_iso2022 (src, src_end);
3554       if (try & CODING_CATEGORY_MASK_SJIS)
3555         mask |= detect_coding_sjis (src, src_end);
3556       if (try & CODING_CATEGORY_MASK_BIG5)
3557         mask |= detect_coding_big5 (src, src_end);
3558       if (try & CODING_CATEGORY_MASK_UTF_8)
3559         mask |= detect_coding_utf_8 (src, src_end);
3560       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3561         mask |= detect_coding_utf_16 (src, src_end);
3562       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3563         mask |= detect_coding_emacs_mule (src, src_end);
3564       if (try & CODING_CATEGORY_MASK_CCL)
3565         mask |= detect_coding_ccl (src, src_end);
3566     }
3567   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3568 }
3569
3570 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3571    The information of the detected coding system is set in CODING.  */
3572
3573 void
3574 detect_coding (coding, src, src_bytes)
3575      struct coding_system *coding;
3576      unsigned char *src;
3577      int src_bytes;
3578 {
3579   unsigned int idx;
3580   int skip, mask, i;
3581   Lisp_Object val;
3582
3583   val = Vcoding_category_list;
3584   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3585   coding->heading_ascii = skip;
3586
3587   if (!mask) return;
3588
3589   /* We found a single coding system of the highest priority in MASK.  */
3590   idx = 0;
3591   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3592   if (! mask)
3593     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3594
3595   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3596
3597   if (coding->eol_type != CODING_EOL_UNDECIDED)
3598     {
3599       Lisp_Object tmp;
3600
3601       tmp = Fget (val, Qeol_type);
3602       if (VECTORP (tmp))
3603         val = XVECTOR (tmp)->contents[coding->eol_type];
3604     }
3605
3606   /* Setup this new coding system while preserving some slots.  */
3607   {
3608     int src_multibyte = coding->src_multibyte;
3609     int dst_multibyte = coding->dst_multibyte;
3610
3611     setup_coding_system (val, coding);
3612     coding->src_multibyte = src_multibyte;
3613     coding->dst_multibyte = dst_multibyte;
3614     coding->heading_ascii = skip;
3615   }
3616 }
3617
3618 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3619    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3620    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3621
3622    How many non-eol characters are at the head is returned as *SKIP.  */
3623
3624 #define MAX_EOL_CHECK_COUNT 3
3625
3626 static int
3627 detect_eol_type (source, src_bytes, skip)
3628      unsigned char *source;
3629      int src_bytes, *skip;
3630 {
3631   unsigned char *src = source, *src_end = src + src_bytes;
3632   unsigned char c;
3633   int total = 0;                /* How many end-of-lines are found so far.  */
3634   int eol_type = CODING_EOL_UNDECIDED;
3635   int this_eol_type;
3636
3637   *skip = 0;
3638
3639   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3640     {
3641       c = *src++;
3642       if (c == '\n' || c == '\r')
3643         {
3644           if (*skip == 0)
3645             *skip = src - 1 - source;
3646           total++;
3647           if (c == '\n')
3648             this_eol_type = CODING_EOL_LF;
3649           else if (src >= src_end || *src != '\n')
3650             this_eol_type = CODING_EOL_CR;
3651           else
3652             this_eol_type = CODING_EOL_CRLF, src++;
3653
3654           if (eol_type == CODING_EOL_UNDECIDED)
3655             /* This is the first end-of-line.  */
3656             eol_type = this_eol_type;
3657           else if (eol_type != this_eol_type)
3658             {
3659               /* The found type is different from what found before.  */
3660               eol_type = CODING_EOL_INCONSISTENT;
3661               break;
3662             }
3663         }
3664     }
3665
3666   if (*skip == 0)
3667     *skip = src_end - source;
3668   return eol_type;
3669 }
3670
3671 /* Like detect_eol_type, but detect EOL type in 2-octet
3672    big-endian/little-endian format for coding systems utf-16-be and
3673    utf-16-le.  */
3674
3675 static int
3676 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3677      unsigned char *source;
3678      int src_bytes, *skip;
3679 {
3680   unsigned char *src = source, *src_end = src + src_bytes;
3681   unsigned int c1, c2;
3682   int total = 0;                /* How many end-of-lines are found so far.  */
3683   int eol_type = CODING_EOL_UNDECIDED;
3684   int this_eol_type;
3685   int msb, lsb;
3686
3687   if (big_endian_p)
3688     msb = 0, lsb = 1;
3689   else
3690     msb = 1, lsb = 0;
3691
3692   *skip = 0;
3693
3694   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3695     {
3696       c1 = (src[msb] << 8) | (src[lsb]);
3697       src += 2;
3698
3699       if (c1 == '\n' || c1 == '\r')
3700         {
3701           if (*skip == 0)
3702             *skip = src - 2 - source;
3703           total++;
3704           if (c1 == '\n')
3705             {
3706               this_eol_type = CODING_EOL_LF;
3707             }
3708           else
3709             {
3710               if ((src + 1) >= src_end)
3711                 {
3712                   this_eol_type = CODING_EOL_CR;
3713                 }
3714               else
3715                 {
3716                   c2 = (src[msb] << 8) | (src[lsb]);
3717                   if (c2 == '\n')
3718                     this_eol_type = CODING_EOL_CRLF, src += 2;
3719                   else
3720                     this_eol_type = CODING_EOL_CR;
3721                 }
3722             }
3723
3724           if (eol_type == CODING_EOL_UNDECIDED)
3725             /* This is the first end-of-line.  */
3726             eol_type = this_eol_type;
3727           else if (eol_type != this_eol_type)
3728             {
3729               /* The found type is different from what found before.  */
3730               eol_type = CODING_EOL_INCONSISTENT;
3731               break;
3732             }
3733         }
3734     }
3735
3736   if (*skip == 0)
3737     *skip = src_end - source;
3738   return eol_type;
3739 }
3740
3741 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3742    is encoded.  If it detects an appropriate format of end-of-line, it
3743    sets the information in *CODING.  */
3744
3745 void
3746 detect_eol (coding, src, src_bytes)
3747      struct coding_system *coding;
3748      unsigned char *src;
3749      int src_bytes;
3750 {
3751   Lisp_Object val;
3752   int skip;
3753   int eol_type;
3754
3755   switch (coding->category_idx)
3756     {
3757     case CODING_CATEGORY_IDX_UTF_16_BE:
3758       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3759       break;
3760     case CODING_CATEGORY_IDX_UTF_16_LE:
3761       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3762       break;
3763     default:
3764       eol_type = detect_eol_type (src, src_bytes, &skip);
3765       break;
3766     }
3767
3768   if (coding->heading_ascii > skip)
3769     coding->heading_ascii = skip;
3770   else
3771     skip = coding->heading_ascii;
3772
3773   if (eol_type == CODING_EOL_UNDECIDED)
3774     return;
3775   if (eol_type == CODING_EOL_INCONSISTENT)
3776     {
3777 #if 0
3778       /* This code is suppressed until we find a better way to
3779          distinguish raw text file and binary file.  */
3780
3781       /* If we have already detected that the coding is raw-text, the
3782          coding should actually be no-conversion.  */
3783       if (coding->type == coding_type_raw_text)
3784         {
3785           setup_coding_system (Qno_conversion, coding);
3786           return;
3787         }
3788       /* Else, let's decode only text code anyway.  */
3789 #endif /* 0 */
3790       eol_type = CODING_EOL_LF;
3791     }
3792
3793   val = Fget (coding->symbol, Qeol_type);
3794   if (VECTORP (val) && XVECTOR (val)->size == 3)
3795     {
3796       int src_multibyte = coding->src_multibyte;
3797       int dst_multibyte = coding->dst_multibyte;
3798
3799       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3800       coding->src_multibyte = src_multibyte;
3801       coding->dst_multibyte = dst_multibyte;
3802       coding->heading_ascii = skip;
3803     }
3804 }
3805
3806 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3807
3808 #define DECODING_BUFFER_MAG(coding)                     \
3809   (coding->type == coding_type_iso2022                  \
3810    ? 3                                                  \
3811    : (coding->type == coding_type_ccl                   \
3812       ? coding->spec.ccl.decoder.buf_magnification      \
3813       : 2))
3814
3815 /* Return maximum size (bytes) of a buffer enough for decoding
3816    SRC_BYTES of text encoded in CODING.  */
3817
3818 int
3819 decoding_buffer_size (coding, src_bytes)
3820      struct coding_system *coding;
3821      int src_bytes;
3822 {
3823   return (src_bytes * DECODING_BUFFER_MAG (coding)
3824           + CONVERSION_BUFFER_EXTRA_ROOM);
3825 }
3826
3827 /* Return maximum size (bytes) of a buffer enough for encoding
3828    SRC_BYTES of text to CODING.  */
3829
3830 int
3831 encoding_buffer_size (coding, src_bytes)
3832      struct coding_system *coding;
3833      int src_bytes;
3834 {
3835   int magnification;
3836
3837   if (coding->type == coding_type_ccl)
3838     magnification = coding->spec.ccl.encoder.buf_magnification;
3839   else if (CODING_REQUIRE_ENCODING (coding))
3840     magnification = 3;
3841   else
3842     magnification = 1;
3843
3844   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3845 }
3846
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and
   its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared between calls: when it
   is too small, a larger one is allocated with xmalloc and the old
   one is released with xfree, so previous contents are not kept and
   pointers returned earlier become invalid.

   This function does no out-of-memory handling of its own; whatever
   xmalloc does on failure applies.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling can only grow from a positive start.  If the buffer
         has not been set up yet (size 0), begin at the minimum size
         instead of looping forever on 0 * 2.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3875
3876 int
3877 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3878      struct coding_system *coding;
3879      unsigned char *source, *destination;
3880      int src_bytes, dst_bytes, encodep;
3881 {
3882   struct ccl_program *ccl
3883     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3884   int result;
3885
3886   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3887   if (encodep)
3888     ccl->eol_type = coding->eol_type;
3889   coding->produced = ccl_driver (ccl, source, destination,
3890                                  src_bytes, dst_bytes, &(coding->consumed));
3891   if (encodep)
3892     coding->produced_char = coding->produced;
3893   else
3894     {
3895       int bytes
3896         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3897       coding->produced = str_as_multibyte (destination, bytes,
3898                                            coding->produced,
3899                                            &(coding->produced_char));
3900     }
3901
3902   switch (ccl->status)
3903     {
3904     case CCL_STAT_SUSPEND_BY_SRC:
3905       result = CODING_FINISH_INSUFFICIENT_SRC;
3906       break;
3907     case CCL_STAT_SUSPEND_BY_DST:
3908       result = CODING_FINISH_INSUFFICIENT_DST;
3909       break;
3910     case CCL_STAT_QUIT:
3911     case CCL_STAT_INVALID_CMD:
3912       result = CODING_FINISH_INTERRUPT;
3913       break;
3914     default:
3915       result = CODING_FINISH_NORMAL;
3916       break;
3917     }
3918   return result;
3919 }
3920
3921 /* Decode EOL format of the text at PTR of BYTES length destructively
3922    according to CODING->eol_type.  This is called after the CCL
3923    program produced a decoded text at PTR.  If we do CRLF->LF
3924    conversion, update CODING->produced and CODING->produced_char.  */
3925
3926 static void
3927 decode_eol_post_ccl (coding, ptr, bytes)
3928      struct coding_system *coding;
3929      unsigned char *ptr;
3930      int bytes;
3931 {
3932   Lisp_Object val, saved_coding_symbol;
3933   unsigned char *pend = ptr + bytes;
3934   int dummy;
3935
3936   /* Remember the current coding system symbol.  We set it back when
3937      an inconsistent EOL is found so that `last-coding-system-used' is
3938      set to the coding system that doesn't specify EOL conversion.  */
3939   saved_coding_symbol = coding->symbol;
3940
3941   coding->spec.ccl.cr_carryover = 0;
3942   if (coding->eol_type == CODING_EOL_UNDECIDED)
3943     {
3944       /* Here, to avoid the call of setup_coding_system, we directly
3945          call detect_eol_type.  */
3946       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
3947       if (coding->eol_type == CODING_EOL_INCONSISTENT)
3948         coding->eol_type = CODING_EOL_LF;
3949       if (coding->eol_type != CODING_EOL_UNDECIDED)
3950         {
3951           val = Fget (coding->symbol, Qeol_type);
3952           if (VECTORP (val) && XVECTOR (val)->size == 3)
3953             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
3954         }
3955       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
3956     }
3957
3958   if (coding->eol_type == CODING_EOL_LF
3959       || coding->eol_type == CODING_EOL_UNDECIDED)
3960     {
3961       /* We have nothing to do.  */
3962       ptr = pend;
3963     }
3964   else if (coding->eol_type == CODING_EOL_CRLF)
3965     {
3966       unsigned char *pstart = ptr, *p = ptr;
3967
3968       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
3969           && *(pend - 1) == '\r')
3970         {
3971           /* If the last character is CR, we can't handle it here
3972              because LF will be in the not-yet-decoded source text.
3973              Recorded that the CR is not yet processed.  */
3974           coding->spec.ccl.cr_carryover = 1;
3975           coding->produced--;
3976           coding->produced_char--;
3977           pend--;
3978         }
3979       while (ptr < pend)
3980         {
3981           if (*ptr == '\r')
3982             {
3983               if (ptr + 1 < pend && *(ptr + 1) == '\n')
3984                 {
3985                   *p++ = '\n';
3986                   ptr += 2;
3987                 }
3988               else
3989                 {
3990                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3991                     goto undo_eol_conversion;
3992                   *p++ = *ptr++;
3993                 }
3994             }
3995           else if (*ptr == '\n'
3996                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3997             goto undo_eol_conversion;
3998           else
3999             *p++ = *ptr++;
4000           continue;
4001
4002         undo_eol_conversion:
4003           /* We have faced with inconsistent EOL format at PTR.
4004              Convert all LFs before PTR back to CRLFs.  */
4005           for (p--, ptr--; p >= pstart; p--)
4006             {
4007               if (*p == '\n')
4008                 *ptr-- = '\n', *ptr-- = '\r';
4009               else
4010                 *ptr-- = *p;
4011             }
4012           /*  If carryover is recorded, cancel it because we don't
4013               convert CRLF anymore.  */
4014           if (coding->spec.ccl.cr_carryover)
4015             {
4016               coding->spec.ccl.cr_carryover = 0;
4017               coding->produced++;
4018               coding->produced_char++;
4019               pend++;
4020             }
4021           p = ptr = pend;
4022           coding->eol_type = CODING_EOL_LF;
4023           coding->symbol = saved_coding_symbol;
4024         }
4025       if (p < pend)
4026         {
4027           /* As each two-byte sequence CRLF was converted to LF, (PEND
4028              - P) is the number of deleted characters.  */
4029           coding->produced -= pend - p;
4030           coding->produced_char -= pend - p;
4031         }
4032     }
4033   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4034     {
4035       unsigned char *p = ptr;
4036
4037       for (; ptr < pend; ptr++)
4038         {
4039           if (*ptr == '\r')
4040             *ptr = '\n';
4041           else if (*ptr == '\n'
4042                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4043             {
4044               for (; p < ptr; p++)
4045                 {
4046                   if (*p == '\n')
4047                     *p = '\r';
4048                 }
4049               ptr = pend;
4050               coding->eol_type = CODING_EOL_LF;
4051               coding->symbol = saved_coding_symbol;
4052             }
4053         }
4054     }
4055 }
4056
4057 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4058    decoding, it may detect coding system and format of end-of-line if
4059    those are not yet decided.  The source should be unibyte, the
4060    result is multibyte if CODING->dst_multibyte is nonzero, else
4061    unibyte.  */
4062
4063 int
4064 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4065      struct coding_system *coding;
4066      unsigned char *source, *destination;
4067      int src_bytes, dst_bytes;
4068 {
4069   if (coding->type == coding_type_undecided)
4070     detect_coding (coding, source, src_bytes);
4071
4072   if (coding->eol_type == CODING_EOL_UNDECIDED
4073       && coding->type != coding_type_ccl)
4074     detect_eol (coding, source, src_bytes);
4075
4076   coding->produced = coding->produced_char = 0;
4077   coding->consumed = coding->consumed_char = 0;
4078   coding->errors = 0;
4079   coding->result = CODING_FINISH_NORMAL;
4080
4081   switch (coding->type)
4082     {
4083     case coding_type_sjis:
4084       decode_coding_sjis_big5 (coding, source, destination,
4085                                src_bytes, dst_bytes, 1);
4086       break;
4087
4088     case coding_type_iso2022:
4089       decode_coding_iso2022 (coding, source, destination,
4090                              src_bytes, dst_bytes);
4091       break;
4092
4093     case coding_type_big5:
4094       decode_coding_sjis_big5 (coding, source, destination,
4095                                src_bytes, dst_bytes, 0);
4096       break;
4097
4098     case coding_type_emacs_mule:
4099       decode_coding_emacs_mule (coding, source, destination,
4100                                 src_bytes, dst_bytes);
4101       break;
4102
4103     case coding_type_ccl:
4104       if (coding->spec.ccl.cr_carryover)
4105         {
4106           /* Set the CR which is not processed by the previous call of
4107              decode_eol_post_ccl in DESTINATION.  */
4108           *destination = '\r';
4109           coding->produced++;
4110           coding->produced_char++;
4111           dst_bytes--;
4112         }
4113       ccl_coding_driver (coding, source,
4114                          destination + coding->spec.ccl.cr_carryover,
4115                          src_bytes, dst_bytes, 0);
4116       if (coding->eol_type != CODING_EOL_LF)
4117         decode_eol_post_ccl (coding, destination, coding->produced);
4118       break;
4119
4120     default:
4121       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4122     }
4123
4124   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4125       && coding->consumed == src_bytes)
4126     coding->result = CODING_FINISH_NORMAL;
4127
4128   if (coding->mode & CODING_MODE_LAST_BLOCK
4129       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4130     {
4131       unsigned char *src = source + coding->consumed;
4132       unsigned char *dst = destination + coding->produced;
4133
4134       src_bytes -= coding->consumed;
4135      coding->errors++;
4136       if (COMPOSING_P (coding))
4137         DECODE_COMPOSITION_END ('1');
4138       while (src_bytes--)
4139         {
4140           int c = *src++;
4141           dst += CHAR_STRING (c, dst);
4142           coding->produced_char++;
4143         }
4144       coding->consumed = coding->consumed_char = src - source;
4145       coding->produced = dst - destination;
4146     }
4147
4148   if (!coding->dst_multibyte)
4149     {
4150       coding->produced = str_as_unibyte (destination, coding->produced);
4151       coding->produced_char = coding->produced;
4152     }
4153
4154   return coding->result;
4155 }
4156
4157 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4158    multibyteness of the source is CODING->src_multibyte, the
4159    multibyteness of the result is always unibyte.  */
4160
4161 int
4162 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4163      struct coding_system *coding;
4164      unsigned char *source, *destination;
4165      int src_bytes, dst_bytes;
4166 {
4167   coding->produced = coding->produced_char = 0;
4168   coding->consumed = coding->consumed_char = 0;
4169   coding->errors = 0;
4170   coding->result = CODING_FINISH_NORMAL;
4171
4172   switch (coding->type)
4173     {
4174     case coding_type_sjis:
4175       encode_coding_sjis_big5 (coding, source, destination,
4176                                src_bytes, dst_bytes, 1);
4177       break;
4178
4179     case coding_type_iso2022:
4180       encode_coding_iso2022 (coding, source, destination,
4181                              src_bytes, dst_bytes);
4182       break;
4183
4184     case coding_type_big5:
4185       encode_coding_sjis_big5 (coding, source, destination,
4186                                src_bytes, dst_bytes, 0);
4187       break;
4188
4189     case coding_type_emacs_mule:
4190       encode_coding_emacs_mule (coding, source, destination,
4191                                 src_bytes, dst_bytes);
4192       break;
4193
4194     case coding_type_ccl:
4195       ccl_coding_driver (coding, source, destination,
4196                          src_bytes, dst_bytes, 1);
4197       break;
4198
4199     default:
4200       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4201     }
4202
4203   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4204       && coding->consumed == src_bytes)
4205     coding->result = CODING_FINISH_NORMAL;
4206
4207   if (coding->mode & CODING_MODE_LAST_BLOCK)
4208     {
4209       unsigned char *src = source + coding->consumed;
4210       unsigned char *src_end = src + src_bytes;
4211       unsigned char *dst = destination + coding->produced;
4212
4213       if (coding->type == coding_type_iso2022)
4214         ENCODE_RESET_PLANE_AND_REGISTER;
4215       if (COMPOSING_P (coding))
4216         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4217       if (coding->consumed < src_bytes)
4218         {
4219           int len = src_bytes - coding->consumed;
4220
4221           BCOPY_SHORT (source + coding->consumed, dst, len);
4222           if (coding->src_multibyte)
4223             len = str_as_unibyte (dst, len);
4224           dst += len;
4225           coding->consumed = src_bytes;
4226         }
4227       coding->produced = coding->produced_char = dst - destination;
4228     }
4229
4230   return coding->result;
4231 }
4232
4233 /* Scan text in the region between *BEG and *END (byte positions),
4234    skip characters which we don't have to decode by coding system
4235    CODING at the head and tail, then set *BEG and *END to the region
4236    of the text we actually have to convert.  The caller should move
4237    the gap out of the region in advance if the region is from a
4238    buffer.
4239
4240    If STR is not NULL, *BEG and *END are indices into STR.  */
4241
4242 static void
4243 shrink_decoding_region (beg, end, coding, str)
4244      int *beg, *end;
4245      struct coding_system *coding;
4246      unsigned char *str;
4247 {
4248   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4249   int eol_conversion;
4250   Lisp_Object translation_table;
4251
4252   if (coding->type == coding_type_ccl
4253       || coding->type == coding_type_undecided
4254       || coding->eol_type != CODING_EOL_LF
4255       || !NILP (coding->post_read_conversion)
4256       || coding->composing != COMPOSITION_DISABLED)
4257     {
4258       /* We can't skip any data.  */
4259       return;
4260     }
4261   if (coding->type == coding_type_no_conversion
4262       || coding->type == coding_type_raw_text
4263       || coding->type == coding_type_emacs_mule)
4264     {
4265       /* We need no conversion, but don't have to skip any data here.
4266          Decoding routine handles them effectively anyway.  */
4267       return;
4268     }
4269
4270   translation_table = coding->translation_table_for_decode;
4271   if (NILP (translation_table) && !NILP (Venable_character_translation))
4272     translation_table = Vstandard_translation_table_for_decode;
4273   if (CHAR_TABLE_P (translation_table))
4274     {
4275       int i;
4276       for (i = 0; i < 128; i++)
4277         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4278           break;
4279       if (i < 128)
4280         /* Some ASCII character should be translated.  We give up
4281            shrinking.  */
4282         return;
4283     }
4284
4285   if (coding->heading_ascii >= 0)
4286     /* Detection routine has already found how much we can skip at the
4287        head.  */
4288     *beg += coding->heading_ascii;
4289
4290   if (str)
4291     {
4292       begp_orig = begp = str + *beg;
4293       endp_orig = endp = str + *end;
4294     }
4295   else
4296     {
4297       begp_orig = begp = BYTE_POS_ADDR (*beg);
4298       endp_orig = endp = begp + *end - *beg;
4299     }
4300
4301   eol_conversion = (coding->eol_type == CODING_EOL_CR
4302                     || coding->eol_type == CODING_EOL_CRLF);
4303
4304   switch (coding->type)
4305     {
4306     case coding_type_sjis:
4307     case coding_type_big5:
4308       /* We can skip all ASCII characters at the head.  */
4309       if (coding->heading_ascii < 0)
4310         {
4311           if (eol_conversion)
4312             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4313           else
4314             while (begp < endp && *begp < 0x80) begp++;
4315         }
4316       /* We can skip all ASCII characters at the tail except for the
4317          second byte of SJIS or BIG5 code.  */
4318       if (eol_conversion)
4319         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4320       else
4321         while (begp < endp && endp[-1] < 0x80) endp--;
4322       /* Do not consider LF as ascii if preceded by CR, since that
4323          confuses eol decoding. */
4324       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4325         endp++;
4326       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4327         endp++;
4328       break;
4329
4330     case coding_type_iso2022:
4331       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4332         /* We can't skip any data.  */
4333         break;
4334       if (coding->heading_ascii < 0)
4335         {
4336           /* We can skip all ASCII characters at the head except for a
4337              few control codes.  */
4338           while (begp < endp && (c = *begp) < 0x80
4339                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4340                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4341                  && (!eol_conversion || c != ISO_CODE_LF))
4342             begp++;
4343         }
4344       switch (coding->category_idx)
4345         {
4346         case CODING_CATEGORY_IDX_ISO_8_1:
4347         case CODING_CATEGORY_IDX_ISO_8_2:
4348           /* We can skip all ASCII characters at the tail.  */
4349           if (eol_conversion)
4350             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4351           else
4352             while (begp < endp && endp[-1] < 0x80) endp--;
4353           /* Do not consider LF as ascii if preceded by CR, since that
4354              confuses eol decoding. */
4355           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4356             endp++;
4357           break;
4358
4359         case CODING_CATEGORY_IDX_ISO_7:
4360         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4361           {
4362             /* We can skip all charactes at the tail except for 8-bit
4363                codes and ESC and the following 2-byte at the tail.  */
4364             unsigned char *eight_bit = NULL;
4365
4366             if (eol_conversion)
4367               while (begp < endp
4368                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4369                 {
4370                   if (!eight_bit && c & 0x80) eight_bit = endp;
4371                   endp--;
4372                 }
4373             else
4374               while (begp < endp
4375                      && (c = endp[-1]) != ISO_CODE_ESC)
4376                 {
4377                   if (!eight_bit && c & 0x80) eight_bit = endp;
4378                   endp--;
4379                 }
4380             /* Do not consider LF as ascii if preceded by CR, since that
4381                confuses eol decoding. */
4382             if (begp < endp && endp < endp_orig
4383                 && endp[-1] == '\r' && endp[0] == '\n')
4384               endp++;
4385             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4386               {
4387                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4388                   /* This is an ASCII designation sequence.  We can
4389                      surely skip the tail.  But, if we have
4390                      encountered an 8-bit code, skip only the codes
4391                      after that.  */
4392                   endp = eight_bit ? eight_bit : endp + 2;
4393                 else
4394                   /* Hmmm, we can't skip the tail.  */
4395                   endp = endp_orig;
4396               }
4397             else if (eight_bit)
4398               endp = eight_bit;
4399           }
4400         }
4401       break;
4402
4403     default:
4404       abort ();
4405     }
4406   *beg += begp - begp_orig;
4407   *end += endp - endp_orig;
4408   return;
4409 }
4410
4411 /* Like shrink_decoding_region but for encoding.  */
4412
4413 static void
4414 shrink_encoding_region (beg, end, coding, str)
4415      int *beg, *end;
4416      struct coding_system *coding;
4417      unsigned char *str;
4418 {
4419   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4420   int eol_conversion;
4421   Lisp_Object translation_table;
4422
4423   if (coding->type == coding_type_ccl
4424       || coding->eol_type == CODING_EOL_CRLF
4425       || coding->eol_type == CODING_EOL_CR
4426       || coding->cmp_data && coding->cmp_data->used > 0)
4427     {
4428       /* We can't skip any data.  */
4429       return;
4430     }
4431   if (coding->type == coding_type_no_conversion
4432       || coding->type == coding_type_raw_text
4433       || coding->type == coding_type_emacs_mule
4434       || coding->type == coding_type_undecided)
4435     {
4436       /* We need no conversion, but don't have to skip any data here.
4437          Encoding routine handles them effectively anyway.  */
4438       return;
4439     }
4440
4441   translation_table = coding->translation_table_for_encode;
4442   if (NILP (translation_table) && !NILP (Venable_character_translation))
4443     translation_table = Vstandard_translation_table_for_encode;
4444   if (CHAR_TABLE_P (translation_table))
4445     {
4446       int i;
4447       for (i = 0; i < 128; i++)
4448         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4449           break;
4450       if (i < 128)
4451         /* Some ASCII character should be tranlsated.  We give up
4452            shrinking.  */
4453         return;
4454     }
4455
4456   if (str)
4457     {
4458       begp_orig = begp = str + *beg;
4459       endp_orig = endp = str + *end;
4460     }
4461   else
4462     {
4463       begp_orig = begp = BYTE_POS_ADDR (*beg);
4464       endp_orig = endp = begp + *end - *beg;
4465     }
4466
4467   eol_conversion = (coding->eol_type == CODING_EOL_CR
4468                     || coding->eol_type == CODING_EOL_CRLF);
4469
4470   /* Here, we don't have to check coding->pre_write_conversion because
4471      the caller is expected to have handled it already.  */
4472   switch (coding->type)
4473     {
4474     case coding_type_iso2022:
4475       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4476         /* We can't skip any data.  */
4477         break;
4478       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4479         {
4480           unsigned char *bol = begp;
4481           while (begp < endp && *begp < 0x80)
4482             {
4483               begp++;
4484               if (begp[-1] == '\n')
4485                 bol = begp;
4486             }
4487           begp = bol;
4488           goto label_skip_tail;
4489         }
4490       /* fall down ... */
4491
4492     case coding_type_sjis:
4493     case coding_type_big5:
4494       /* We can skip all ASCII characters at the head and tail.  */
4495       if (eol_conversion)
4496         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4497       else
4498         while (begp < endp && *begp < 0x80) begp++;
4499     label_skip_tail:
4500       if (eol_conversion)
4501         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4502       else
4503         while (begp < endp && *(endp - 1) < 0x80) endp--;
4504       break;
4505
4506     default:
4507       abort ();
4508     }
4509
4510   *beg += begp - begp_orig;
4511   *end += endp - endp_orig;
4512   return;
4513 }
4514
4515 /* Shrinking a conversion region has some overhead, so the macro below
4516    calls shrink_encoding_region (if ENCODEP) or shrink_decoding_region
4517    only when the region length *END - *BEG is greater than this value.  */
4518 static int shrink_conversion_region_threshhold = 1024;
4519
4520 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
4521   do {                                                                  \
4522     if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
4523       {                                                                 \
4524         if (encodep) shrink_encoding_region (beg, end, coding, str);    \
4525         else shrink_decoding_region (beg, end, coding, str);            \
4526       }                                                                 \
4527   } while (0)
4528 /* Unwind handler that re-enables pre-write/post-read conversion functions.  */
4529 static Lisp_Object
4530 code_convert_region_unwind (dummy)
4531      Lisp_Object dummy;         /* Unused.  */
4532 {
4533   inhibit_pre_post_conversion = 0;
4534   return Qnil;
4535 }
4536
4537 /* Store information about all compositions between FROM and TO of OBJ
4538    (a buffer or a string, defaults to the current buffer) in the memory
4539    blocks pointed to by CODING->cmp_data; positions are relative to FROM.  */
4540
4541 void
4542 coding_save_composition (coding, from, to, obj)
4543      struct coding_system *coding;
4544      int from, to;
4545      Lisp_Object obj;
4546 {
4547   Lisp_Object prop;
4548   int start, end;
4549
4550   if (coding->composing == COMPOSITION_DISABLED)
4551     return;
4552   if (!coding->cmp_data)
4553     coding_allocate_composition_data (coding, from);
4554   if (!find_composition (from, to, &start, &end, &prop, obj)
4555       || end > to)
4556     return;                     /* Nothing found that ends by TO.  */
4557   if (start < from
4558       && (!find_composition (end, to, &start, &end, &prop, obj)
4559           || end > to))
4560     return;                     /* Only one starting before FROM.  */
4561   coding->composing = COMPOSITION_NO;
4562   do
4563     {
4564       if (COMPOSITION_VALID_P (start, end, prop))
4565         {
4566           enum composition_method method = COMPOSITION_METHOD (prop);
4567           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4568               >= COMPOSITION_DATA_SIZE)
4569             coding_allocate_composition_data (coding, from);
4570           /* For relative composition, we remember start and end
4571              positions, for the other compositions, we also remember
4572              components.  */
4573           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4574           if (method != COMPOSITION_RELATIVE)
4575             {
4576               /* We must store the components as well.  */
4577               Lisp_Object val, ch;
4578
4579               val = COMPOSITION_COMPONENTS (prop);
4580               if (CONSP (val))
4581                 while (CONSP (val))
4582                   {
4583                     ch = XCAR (val), val = XCDR (val);
4584                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4585                   }
4586               else if (VECTORP (val) || STRINGP (val))
4587                 {
4588                   int len = (VECTORP (val)
4589                              ? XVECTOR (val)->size : XSTRING (val)->size);
4590                   int i;
4591                   for (i = 0; i < len; i++)
4592                     {
4593                       ch = (STRINGP (val)
4594                             ? Faref (val, make_number (i))
4595                             : XVECTOR (val)->contents[i]);
4596                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4597                     }
4598                 }
4599               else              /* INTEGERP (val) */
4600                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4601             }
4602           CODING_ADD_COMPOSITION_END (coding, end - from);
4603         }
4604       start = end;              /* Search again after this composition.  */
4605     }
4606   while (start < to
4607          && find_composition (start, to, &start, &end, &prop, obj)
4608          && end <= to);
4609
4610   /* Make coding->cmp_data point to the first memory block.  */
4611   while (coding->cmp_data->prev)
4612     coding->cmp_data = coding->cmp_data->prev;
4613   coding->cmp_data_start = 0;
4614 }
4615
4616 /* Reflect the saved information about compositions to OBJ.
4617    CODING->cmp_data points to a memory block for the information.  OBJ
4618    is a buffer or a string, defaults to the current buffer.  */
4619
4620 void
4621 coding_restore_composition (coding, obj)
4622      struct coding_system *coding;
4623      Lisp_Object obj;
4624 {
4625   struct composition_data *cmp_data = coding->cmp_data;
4626
4627   if (!cmp_data)
4628     return;
4629
4630   while (cmp_data->prev)        /* Rewind to the first memory block.  */
4631     cmp_data = cmp_data->prev;
4632
4633   while (cmp_data)
4634     {
4635       int i;
4636       /* The first element of each entry is its length; it advances I.  */
4637       for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4638         {
4639           int *data = cmp_data->data + i;
4640           enum composition_method method = (enum composition_method) data[3];
4641           Lisp_Object components;
4642
4643           if (method == COMPOSITION_RELATIVE)
4644             components = Qnil;
4645           else
4646             {
4647               int len = data[0] - 4, j; /* Number of components.  */
4648               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4649
4650               for (j = 0; j < len; j++)
4651                 args[j] = make_number (data[4 + j]);
4652               components = (method == COMPOSITION_WITH_ALTCHARS
4653                             ? Fstring (len, args) : Fvector (len, args));
4654             }
4655           compose_text (data[1], data[2], components, Qnil, obj);
4656         }
4657       cmp_data = cmp_data->next;
4658     }
4659 }
4660
4661 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4662    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4663    coding system CODING, and return the status code of code conversion
4664    (currently always 0; this value has no meaning).
4665
4666    How many characters (and bytes) are converted to how many
4667    characters (and bytes) are recorded in members of the structure
4668    CODING.
4669
4670    If REPLACE is nonzero, we do various things as if the original text
4671    is deleted and a new text is inserted.  See the comments in
4672    replace_range (insdel.c) to know what we are doing.
4673
4674    If REPLACE is zero, it is assumed that the source text is unibyte.
4675    Otherwise, it is assumed that the source text is multibyte.  */
4676
4677 int
4678 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4679      int from, from_byte, to, to_byte, encodep, replace;
4680      struct coding_system *coding;
4681 {
4682   int len = to - from, len_byte = to_byte - from_byte;
4683   int require, inserted, inserted_byte;
4684   int head_skip, tail_skip, total_skip = 0;
4685   Lisp_Object saved_coding_symbol;
4686   int first = 1;
4687   unsigned char *src, *dst;
4688   Lisp_Object deletion;
4689   int orig_point = PT, orig_len = len;
4690   int prev_Z;
4691   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4692
4693   coding->src_multibyte = replace && multibyte_p;
4694   coding->dst_multibyte = multibyte_p;
4695
4696   deletion = Qnil;
4697   saved_coding_symbol = Qnil;
4698
4699   if (from < PT && PT < to)
4700     {
4701       TEMP_SET_PT_BOTH (from, from_byte);
4702       orig_point = from;
4703     }
4704
4705   if (replace)
4706     {
4707       int saved_from = from;
4708
4709       prepare_to_modify_buffer (from, to, &from);
4710       if (saved_from != from)
4711         {
4712           to = from + len;
4713           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4714           len_byte = to_byte - from_byte;
4715         }
4716     }
4717
4718   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4719     {
4720       /* We must detect encoding of text and eol format.  */
4721
4722       if (from < GPT && to > GPT)
4723         move_gap_both (from, from_byte);
4724       if (coding->type == coding_type_undecided)
4725         {
4726           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4727           if (coding->type == coding_type_undecided)
4728             /* It seems that the text contains only ASCII, but we
4729                should not leave it undecided because the deeper
4730                decoding routine (decode_coding) tries to detect the
4731                encodings again in vain.  */
4732             coding->type = coding_type_emacs_mule;
4733         }
4734       if (coding->eol_type == CODING_EOL_UNDECIDED
4735           && coding->type != coding_type_ccl)
4736         {
4737           saved_coding_symbol = coding->symbol;
4738           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4739           if (coding->eol_type == CODING_EOL_UNDECIDED)
4740             coding->eol_type = CODING_EOL_LF;
4741           /* We had better recover the original eol format if we
4742              encounter an inconsistent eol format while decoding.  */
4743           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4744         }
4745     }
4746
4747   /* Now we convert the text.  */
4748
4749   /* For encoding, we must process pre-write-conversion in advance.  */
4750   if (! inhibit_pre_post_conversion
4751       && encodep
4752       && SYMBOLP (coding->pre_write_conversion)
4753       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4754     {
4755       /* The function in pre-write-conversion may put a new text in a
4756          new buffer.  */
4757       struct buffer *prev = current_buffer;
4758       Lisp_Object new;
4759       int count = specpdl_ptr - specpdl;
4760
4761       record_unwind_protect (code_convert_region_unwind, Qnil);
4762       /* We should not call any more pre-write/post-read-conversion
4763          functions while this pre-write-conversion is running.  */
4764       inhibit_pre_post_conversion = 1;
4765       call2 (coding->pre_write_conversion,
4766              make_number (from), make_number (to));
4767       inhibit_pre_post_conversion = 0;
4768       /* Discard the unwind protect.  */
4769       specpdl_ptr--;
4770
4771       if (current_buffer != prev)
4772         {
4773           len = ZV - BEGV;
4774           new = Fcurrent_buffer ();
4775           set_buffer_internal_1 (prev);
4776           del_range_2 (from, from_byte, to, to_byte, 0);
4777           TEMP_SET_PT_BOTH (from, from_byte);
4778           insert_from_buffer (XBUFFER (new), 1, len, 0);
4779           Fkill_buffer (new);
4780           if (orig_point >= to)
4781             orig_point += len - orig_len;
4782           else if (orig_point > from)
4783             orig_point = from;
4784           orig_len = len;
4785           to = from + len;
4786           from_byte = CHAR_TO_BYTE (from);
4787           to_byte = CHAR_TO_BYTE (to);
4788           len_byte = to_byte - from_byte;
4789           TEMP_SET_PT_BOTH (from, from_byte);
4790         }
4791     }
4792
4793   if (replace)
4794     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4795
4796   if (coding->composing != COMPOSITION_DISABLED)
4797     {
4798       if (encodep)
4799         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4800       else
4801         coding_allocate_composition_data (coding, from);
4802     }
4803
4804   /* Try to skip the leading and trailing ASCIIs.  */
4805   if (coding->type != coding_type_ccl)
4806     {
4807       int from_byte_orig = from_byte, to_byte_orig = to_byte;
4808
4809       if (from < GPT && GPT < to)
4810         move_gap_both (from, from_byte);
4811       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4812       if (from_byte == to_byte
4813           && (encodep || NILP (coding->post_read_conversion))
4814           && ! CODING_REQUIRE_FLUSHING (coding))
4815         {
4816           coding->produced = len_byte;
4817           coding->produced_char = len;
4818           if (!replace)
4819             /* We must record and adjust for this new text now.  */
4820             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4821           coding_free_composition_data (coding); /* Set up above.  */
4822           return 0;
4823         }
4824       head_skip = from_byte - from_byte_orig;
4825       tail_skip = to_byte_orig - to_byte;
4826       total_skip = head_skip + tail_skip;
4827       from += head_skip;
4828       to -= tail_skip;
4829       len -= total_skip; len_byte -= total_skip;
4830     }
4831
4832   /* The code conversion routine can not preserve text properties for
4833      now.  So, we must remove all text properties in the region.
4834      Here, we must suppress all modification hooks.  */
4835   if (replace)
4836     {
4837       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4838       inhibit_modification_hooks = 1;
4839       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4840       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4841     }
4842
4843   /* For conversion, we must put the gap before the text in addition to
4844      making the gap larger for efficient decoding.  The required gap
4845      size starts from 2000 which is the magic number used in make_gap.
4846      But, after one batch of conversion, it will be incremented if we
4847      find that it is not enough.  */
4848   require = 2000;
4849
4850   if (GAP_SIZE  < require)
4851     make_gap (require - GAP_SIZE);
4852   move_gap_both (from, from_byte);
4853
4854   inserted = inserted_byte = 0;
4855
4856   GAP_SIZE += len_byte;
4857   ZV -= len;
4858   Z -= len;
4859   ZV_BYTE -= len_byte;
4860   Z_BYTE -= len_byte;
4861
4862   if (GPT - BEG < BEG_UNCHANGED)
4863     BEG_UNCHANGED = GPT - BEG;
4864   if (Z - GPT < END_UNCHANGED)
4865     END_UNCHANGED = Z - GPT;
4866
4867   if (!encodep && coding->src_multibyte)
4868     {
4869       /* Decoding routines expect that the source text is unibyte.
4870          We must convert 8-bit characters of multibyte form to
4871          unibyte.  */
4872       int len_byte_orig = len_byte;
4873       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4874       if (len_byte < len_byte_orig)
4875         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4876                     len_byte);
4877       coding->src_multibyte = 0;
4878     }
4879
4880   for (;;)
4881     {
4882       int result;
4883
4884       /* The buffer memory is now:
4885          +--------+converted-text+---------+-------original-text-------+---+
4886          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4887                   |<---------------------- GAP ----------------------->|  */
4888       src = GAP_END_ADDR - len_byte;
4889       dst = GPT_ADDR + inserted_byte;
4890
4891       if (encodep)
4892         result = encode_coding (coding, src, dst, len_byte, 0);
4893       else
4894         result = decode_coding (coding, src, dst, len_byte, 0);
4895
4896       /* The buffer memory is now:
4897          +--------+-------converted-text----+--+------original-text----+---+
4898          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4899                   |<---------------------- GAP ----------------------->|  */
4900
4901       inserted += coding->produced_char;
4902       inserted_byte += coding->produced;
4903       len_byte -= coding->consumed;
4904
4905       if (result == CODING_FINISH_INSUFFICIENT_CMP)
4906         {
4907           coding_allocate_composition_data (coding, from + inserted);
4908           continue;
4909         }
4910
4911       src += coding->consumed;
4912       dst += coding->produced;
4913
4914       if (result == CODING_FINISH_NORMAL)
4915         {
4916           src += len_byte;
4917           break;
4918         }
4919       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4920         {
4921           unsigned char *pend = dst, *p = pend - inserted_byte;
4922           Lisp_Object eol_type;
4923
4924           /* Encode LFs back to the original eol format (CR or CRLF).  */
4925           if (coding->eol_type == CODING_EOL_CR)
4926             {
4927               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4928             }
4929           else
4930             {
4931               int count = 0;
4932
4933               while (p < pend) if (*p++ == '\n') count++;
4934               if (src - dst < count)
4935                 {
4936                   /* We don't have sufficient room for encoding LFs
4937                      back to CRLF.  We must record converted and
4938                      not-yet-converted text back to the buffer
4939                      content, enlarge the gap, then record them out of
4940                      the buffer contents again.  */
4941                   int add = len_byte + inserted_byte;
4942
4943                   GAP_SIZE -= add;
4944                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4945                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4946                   make_gap (count - GAP_SIZE);
4947                   GAP_SIZE += add;
4948                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4949                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4950                   /* Don't forget to update SRC, DST, and PEND.  */
4951                   src = GAP_END_ADDR - len_byte;
4952                   dst = GPT_ADDR + inserted_byte;
4953                   pend = dst;
4954                 }
4955               inserted += count;
4956               inserted_byte += count;
4957               coding->produced += count;
4958               p = dst = pend + count;
4959               while (count)
4960                 {
4961                   *--p = *--pend;
4962                   if (*p == '\n') count--, *--p = '\r';
4963                 }
4964             }
4965
4966           /* Suppress eol-format conversion in the further conversion.  */
4967           coding->eol_type = CODING_EOL_LF;
4968
4969           /* Set the coding system symbol to that for Unix-like EOL.  */
4970           eol_type = Fget (saved_coding_symbol, Qeol_type);
4971           if (VECTORP (eol_type)
4972               && XVECTOR (eol_type)->size == 3
4973               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4974             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4975           else
4976             coding->symbol = saved_coding_symbol;
4977
4978           continue;
4979         }
4980       if (len_byte <= 0)
4981         {
4982           if (coding->type != coding_type_ccl
4983               || coding->mode & CODING_MODE_LAST_BLOCK)
4984             break;
4985           coding->mode |= CODING_MODE_LAST_BLOCK;
4986           continue;
4987         }
4988       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4989         {
4990           /* The source text ends in invalid codes.  Let's just
4991              make them valid buffer contents, and finish conversion.  */
4992           inserted += len_byte;
4993           inserted_byte += len_byte;
4994           while (len_byte--)
4995             *dst++ = *src++;
4996           break;
4997         }
4998       if (result == CODING_FINISH_INTERRUPT)
4999         {
5000           /* The conversion procedure was interrupted by a user.  */
5001           break;
5002         }
5003       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5004       if (coding->consumed < 1)
5005         {
5006           /* It's quite strange to require more memory without
5007              consuming any bytes.  Perhaps CCL program bug.  */
5008           break;
5009         }
5010       if (first)
5011         {
5012           /* We have just done the first batch of conversion which was
5013              stopped because of insufficient gap.  Let's reconsider the
5014              required gap size (i.e. SRC - DST) now.
5015
5016              We have converted ORIG bytes (== coding->consumed) into
5017              NEW bytes (coding->produced).  To convert the remaining
5018              LEN bytes, we may need REQUIRE bytes of gap, where:
5019                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5020                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5021              Here, we are sure that NEW >= ORIG.  */
5022           float ratio = coding->produced - coding->consumed;
5023           ratio /= coding->consumed;
5024           require = len_byte * ratio;
5025           first = 0;
5026         }
5027       if ((src - dst) < (require + 2000))
5028         {
5029           /* See the comment above the previous call of make_gap.  */
5030           int add = len_byte + inserted_byte;
5031
5032           GAP_SIZE -= add;
5033           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5034           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5035           make_gap (require + 2000);
5036           GAP_SIZE += add;
5037           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5038           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5039         }
5040     }
5041   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5042
5043   if (encodep && coding->dst_multibyte)
5044     {
5045       /* The output is unibyte.  We must convert 8-bit characters to
5046          multibyte form.  */
5047       if (inserted_byte * 2 > GAP_SIZE)
5048         {
5049           GAP_SIZE -= inserted_byte;
5050           ZV += inserted_byte; Z += inserted_byte;
5051           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5052           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5053           make_gap (inserted_byte - GAP_SIZE);
5054           GAP_SIZE += inserted_byte;
5055           ZV -= inserted_byte; Z -= inserted_byte;
5056           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5057           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5058         }
5059       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5060     }
5061
5062   /* If we have shrunk the conversion area, adjust it now.  */
5063   if (total_skip > 0)
5064     {
5065       if (tail_skip > 0)
5066         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5067       inserted += total_skip; inserted_byte += total_skip;
5068       GAP_SIZE += total_skip;
5069       GPT -= head_skip; GPT_BYTE -= head_skip;
5070       ZV -= total_skip; ZV_BYTE -= total_skip;
5071       Z -= total_skip; Z_BYTE -= total_skip;
5072       from -= head_skip; from_byte -= head_skip;
5073       to += tail_skip; to_byte += tail_skip;
5074     }
5075
5076   prev_Z = Z;
5077   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5078   inserted = Z - prev_Z;
5079
5080   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5081     coding_restore_composition (coding, Fcurrent_buffer ());
5082   coding_free_composition_data (coding);
5083
5084   if (! inhibit_pre_post_conversion
5085       && ! encodep && ! NILP (coding->post_read_conversion))
5086     {
5087       Lisp_Object val;
5088       int count = specpdl_ptr - specpdl;
5089
5090       if (from != PT)
5091         TEMP_SET_PT_BOTH (from, from_byte);
5092       prev_Z = Z;
5093       record_unwind_protect (code_convert_region_unwind, Qnil);
5094       /* We should not call any more pre-write/post-read-conversion
5095          functions while this post-read-conversion is running.  */
5096       inhibit_pre_post_conversion = 1;
5097       val = call1 (coding->post_read_conversion, make_number (inserted));
5098       inhibit_pre_post_conversion = 0;
5099       /* Discard the unwind protect.  */
5100       specpdl_ptr--;
5101       CHECK_NUMBER (val, 0);
5102       inserted += Z - prev_Z;
5103     }
5104
5105   if (orig_point >= from)
5106     {
5107       if (orig_point >= from + orig_len)
5108         orig_point += inserted - orig_len;
5109       else
5110         orig_point = from;
5111       TEMP_SET_PT (orig_point);
5112     }
5113
5114   if (replace)
5115     {
5116       signal_after_change (from, to - from, inserted);
5117       update_compositions (from, from + inserted, CHECK_BORDER);
5118     }
5119
5120   {
5121     coding->consumed = to_byte - from_byte;
5122     coding->consumed_char = to - from;
5123     coding->produced = inserted_byte;
5124     coding->produced_char = inserted;
5125   }
5126
5127   return 0;
5128 }
5129
5130 /* Apply CODING's pre-write (if ENCODEP) or post-read conversion to STR.  */
5131 Lisp_Object
5132 run_pre_post_conversion_on_str (str, coding, encodep)
5133      Lisp_Object str;
5134      struct coding_system *coding;
5135      int encodep;
5136 {
5137   int count = specpdl_ptr - specpdl;
5138   struct gcpro gcpro1;
5139   int multibyte = STRING_MULTIBYTE (str);
5140
5141   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5142   record_unwind_protect (code_convert_region_unwind, Qnil);
5143   GCPRO1 (str);
5144   temp_output_buffer_setup (" *code-converting-work*");
5145   set_buffer_internal (XBUFFER (Vstandard_output));
5146   /* We must insert the contents of STR as is without
5147      unibyte<->multibyte conversion.  For that, we adjust the
5148      multibyteness of the working buffer to that of STR.  */
5149   Ferase_buffer ();
5150   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5151   insert_from_string (str, 0, 0,
5152                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5153   UNGCPRO;
5154   inhibit_pre_post_conversion = 1;
5155   if (encodep)
5156     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5157   else
5158     {
5159       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5160       call1 (coding->post_read_conversion, make_number (Z - BEG));
5161     }
5162   inhibit_pre_post_conversion = 0;
5163   str = make_buffer_string (BEG, Z, 0); /* Whole buffer contents.  */
5164   return unbind_to (count, str);
5165 }
5166
5167 /* Decode string STR by CODING and return the result.  If NOCOPY is
5168    nonzero, STR itself may be returned when no decoding is needed.  */
5169 Lisp_Object
5170 decode_coding_string (str, coding, nocopy)
5171      Lisp_Object str;
5172      struct coding_system *coding;
5173      int nocopy;
5174 {
5175   int len, from, to_byte, result;
5176   char *buf;
5177   struct gcpro gcpro1;
5178   Lisp_Object saved_coding_symbol;
5179
5180   from = 0;
5181   to_byte = STRING_BYTES (XSTRING (str));
5182
5183   saved_coding_symbol = Qnil;
5184   if (CODING_REQUIRE_DETECTION (coding))
5185     {
5186       /* See the comments in code_convert_region.  */
5187       if (coding->type == coding_type_undecided)
5188         {
5189           detect_coding (coding, XSTRING (str)->data, to_byte);
5190           if (coding->type == coding_type_undecided)
5191             coding->type = coding_type_emacs_mule;
5192         }
5193       if (coding->eol_type == CODING_EOL_UNDECIDED
5194           && coding->type != coding_type_ccl)
5195         {
5196           saved_coding_symbol = coding->symbol;
5197           detect_eol (coding, XSTRING (str)->data, to_byte);
5198           if (coding->eol_type == CODING_EOL_UNDECIDED)
5199             coding->eol_type = CODING_EOL_LF;
5200           /* We had better recover the original eol format if we
5201              encounter an inconsistent eol format while decoding.  */
5202           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5203         }
5204     }
5205
5206   if (! CODING_REQUIRE_DECODING (coding))
5207     {
5208       if (!STRING_MULTIBYTE (str))
5209         {
5210           str = Fstring_as_multibyte (str);
5211           nocopy = 1;
5212         }
5213       return (nocopy ? str : Fcopy_sequence (str));
5214     }
5215
5216   if (STRING_MULTIBYTE (str))
5217     {
5218       /* Decoding routines expect the source text to be unibyte.  */
5219       str = Fstring_as_unibyte (str);
5220       to_byte = STRING_BYTES (XSTRING (str));
5221       nocopy = 1;
5222       coding->src_multibyte = 0;
5223     }
5224   coding->dst_multibyte = 1;
5225
5226   if (coding->composing != COMPOSITION_DISABLED)
5227     coding_allocate_composition_data (coding, from);
5228
5229   /* Try to skip the leading and trailing ASCIIs.  */
5230   if (coding->type != coding_type_ccl)
5231     {
5232       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5233                                 0);
5234       if (from == to_byte)
5235         {
5236           coding_free_composition_data (coding); /* Allocated above.  */
5237           return (nocopy ? str : Fcopy_sequence (str));
5238         }
5239     }
5240
5241   len = decoding_buffer_size (coding, to_byte - from);
5242   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5243   GCPRO1 (str);
5244   buf = get_conversion_buffer (len);
5245   UNGCPRO;
5246
5247   if (from > 0)
5248     bcopy (XSTRING (str)->data, buf, from);
5249   result = decode_coding (coding, XSTRING (str)->data + from,
5250                          buf + from, to_byte - from, len);
5251   if (result == CODING_FINISH_INCONSISTENT_EOL)
5252     {
5253       /* We simply try to decode the whole string again but without
5254          eol-conversion this time.  */
5255       coding->eol_type = CODING_EOL_LF;
5256       coding->symbol = saved_coding_symbol;
5257       coding_free_composition_data (coding);
5258       return decode_coding_string (str, coding, nocopy);
5259     }
5260
5261   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5262          STRING_BYTES (XSTRING (str)) - to_byte);
5263
5264   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5265   str = make_multibyte_string (buf, len + coding->produced_char,
5266                                len + coding->produced);
5267
5268   if (coding->cmp_data && coding->cmp_data->used)
5269     coding_restore_composition (coding, str);
5270   coding_free_composition_data (coding);
5271
5272   if (SYMBOLP (coding->post_read_conversion)
5273       && !NILP (Ffboundp (coding->post_read_conversion)))
5274     str = run_pre_post_conversion_on_str (str, coding, 0);
5275
5276   return str;
5277 }
5278
5279 /* Encode string STR by CODING and return the result.  If NOCOPY is
5280    nonzero, STR itself may be returned when no encoding is needed.  */
5281 Lisp_Object
5282 encode_coding_string (str, coding, nocopy)
5283      Lisp_Object str;
5284      struct coding_system *coding;
5285      int nocopy;
5286 {
5287   int len;
5288   char *buf;
5289   int from, to, to_byte;
5290   struct gcpro gcpro1;
5291
5292   if (SYMBOLP (coding->pre_write_conversion)
5293       && !NILP (Ffboundp (coding->pre_write_conversion)))
5294     str = run_pre_post_conversion_on_str (str, coding, 1);
5295
5296   from = 0;
5297   to = XSTRING (str)->size;
5298   to_byte = STRING_BYTES (XSTRING (str));
5299
5300   if (! CODING_REQUIRE_ENCODING (coding))
5301     {
5302       if (STRING_MULTIBYTE (str))
5303         {
5304           str = Fstring_as_unibyte (str);
5305           nocopy = 1;
5306         }
5307       return (nocopy ? str : Fcopy_sequence (str));
5308     }
5309
5310   /* Encoding routines determine the multibyteness of the source text
5311      by coding->src_multibyte.  */
5312   coding->src_multibyte = STRING_MULTIBYTE (str);
5313   coding->dst_multibyte = 0;
5314
5315   if (coding->composing != COMPOSITION_DISABLED)
5316     coding_save_composition (coding, from, to, str);
5317
5318   /* Try to skip the leading and trailing ASCIIs.  */
5319   if (coding->type != coding_type_ccl)
5320     {
5321       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5322                                 1);
5323       if (from == to_byte)
5324         {
5325           coding_free_composition_data (coding); /* Saved above.  */
5326           return (nocopy ? str : Fcopy_sequence (str));
5327         }
5328     }
5329
5330   len = encoding_buffer_size (coding, to_byte - from);
5331   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5332   GCPRO1 (str);
5333   buf = get_conversion_buffer (len);
5334   UNGCPRO;
5335
5336   if (from > 0)
5337     bcopy (XSTRING (str)->data, buf, from);
5338   encode_coding (coding, XSTRING (str)->data + from,
5339                  buf + from, to_byte - from, len);
5340   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5341          STRING_BYTES (XSTRING (str)) - to_byte);
5342
5343   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5344   str = make_unibyte_string (buf, len + coding->produced);
5345   coding_free_composition_data (coding);
5346
5347   return str;
5348 }
5349
5350 \f
5351 #ifdef emacs
5352 /*** 8. Emacs Lisp library functions ***/
5353
5354 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5355   "Return t if OBJECT is nil or a coding-system.\n\
5356 See the documentation of `make-coding-system' for information\n\
5357 about coding-system objects.")
5358   (obj)
5359      Lisp_Object obj;
5360 {
5361   if (NILP (obj))
5362     return Qt;
5363   if (!SYMBOLP (obj))
5364     return Qnil;
5365   /* Get coding-spec vector for OBJ.  */
5366   obj = Fget (obj, Qcoding_system);
5367   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5368           ? Qt : Qnil);
5369 }
5370
5371 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5372        Sread_non_nil_coding_system, 1, 1, 0,
5373   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5374   (prompt)
5375      Lisp_Object prompt;
5376 {
5377   Lisp_Object val;
5378   do
5379     {
5380       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5381                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5382     }
5383   while (XSTRING (val)->size == 0);
5384   return (Fintern (val, Qnil));
5385 }
5386
5387 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5388   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5389 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5390   (prompt, default_coding_system)
5391      Lisp_Object prompt, default_coding_system;
5392 {
5393   Lisp_Object val;
5394   if (SYMBOLP (default_coding_system))
5395     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5396   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5397                           Qt, Qnil, Qcoding_system_history,
5398                           default_coding_system, Qnil);
5399   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5400 }
5401
5402 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5403        1, 1, 0,
5404   "Check validity of CODING-SYSTEM.\n\
5405 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5406 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5407 The value of property should be a vector of length 5.")
5408   (coding_system)
5409      Lisp_Object coding_system;
5410 {
5411   CHECK_SYMBOL (coding_system, 0);
5412   if (!NILP (Fcoding_system_p (coding_system)))
5413     return coding_system;
5414   while (1)
5415     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5416 }
5417 \f
5418 Lisp_Object
5419 detect_coding_system (src, src_bytes, highest)
5420      unsigned char *src;
5421      int src_bytes, highest;
5422 {
5423   int coding_mask, eol_type;
5424   Lisp_Object val, tmp;
5425   int dummy;
5426
5427   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5428   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5429   if (eol_type == CODING_EOL_INCONSISTENT)
5430     eol_type = CODING_EOL_UNDECIDED;
5431
5432   if (!coding_mask)
5433     {
5434       val = Qundecided;
5435       if (eol_type != CODING_EOL_UNDECIDED)
5436         {
5437           Lisp_Object val2;
5438           val2 = Fget (Qundecided, Qeol_type);
5439           if (VECTORP (val2))
5440             val = XVECTOR (val2)->contents[eol_type];
5441         }
5442       return (highest ? val : Fcons (val, Qnil));
5443     }
5444
5445   /* At first, gather possible coding systems in VAL.  */
5446   val = Qnil;
5447   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5448     {
5449       Lisp_Object category_val, category_index;
5450
5451       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5452       category_val = Fsymbol_value (XCAR (tmp));
5453       if (!NILP (category_val)
5454           && NATNUMP (category_index)
5455           && (coding_mask & (1 << XFASTINT (category_index))))
5456         {
5457           val = Fcons (category_val, val);
5458           if (highest)
5459             break;
5460         }
5461     }
5462   if (!highest)
5463     val = Fnreverse (val);
5464
5465   /* Then, replace the elements with subsidiary coding systems.  */
5466   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5467     {
5468       if (eol_type != CODING_EOL_UNDECIDED
5469           && eol_type != CODING_EOL_INCONSISTENT)
5470         {
5471           Lisp_Object eol;
5472           eol = Fget (XCAR (tmp), Qeol_type);
5473           if (VECTORP (eol))
5474             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5475         }
5476     }
5477   return (highest ? XCAR (val) : val);
5478 }
5479
5480 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5481        2, 3, 0,
5482   "Detect coding system of the text in the region between START and END.\n\
5483 Return a list of possible coding systems ordered by priority.\n\
5484 \n\
5485 If only ASCII characters are found, it returns a list of single element\n\
5486 `undecided' or its subsidiary coding system according to a detected\n\
5487 end-of-line format.\n\
5488 \n\
5489 If optional argument HIGHEST is non-nil, return the coding system of\n\
5490 highest priority.")
5491   (start, end, highest)
5492      Lisp_Object start, end, highest;
5493 {
5494   int from, to;
5495   int from_byte, to_byte;
5496
5497   CHECK_NUMBER_COERCE_MARKER (start, 0);
5498   CHECK_NUMBER_COERCE_MARKER (end, 1);
5499
5500   validate_region (&start, &end);
5501   from = XINT (start), to = XINT (end);
5502   from_byte = CHAR_TO_BYTE (from);
5503   to_byte = CHAR_TO_BYTE (to);
5504
5505   if (from < GPT && to >= GPT)
5506     move_gap_both (to, to_byte);
5507
5508   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5509                                to_byte - from_byte,
5510                                !NILP (highest));
5511 }
5512
5513 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5514        1, 2, 0,
5515   "Detect coding system of the text in STRING.\n\
5516 Return a list of possible coding systems ordered by priority.\n\
5517 \n\
5518 If only ASCII characters are found, it returns a list of single element\n\
5519 `undecided' or its subsidiary coding system according to a detected\n\
5520 end-of-line format.\n\
5521 \n\
5522 If optional argument HIGHEST is non-nil, return the coding system of\n\
5523 highest priority.")
5524   (string, highest)
5525      Lisp_Object string, highest;
5526 {
5527   CHECK_STRING (string, 0);
5528
5529   return detect_coding_system (XSTRING (string)->data,
5530                                STRING_BYTES (XSTRING (string)),
5531                                !NILP (highest));
5532 }
5533
5534 Lisp_Object
5535 code_convert_region1 (start, end, coding_system, encodep)
5536      Lisp_Object start, end, coding_system;
5537      int encodep;
5538 {
5539   struct coding_system coding;
5540   int from, to, len;
5541
5542   CHECK_NUMBER_COERCE_MARKER (start, 0);
5543   CHECK_NUMBER_COERCE_MARKER (end, 1);
5544   CHECK_SYMBOL (coding_system, 2);
5545
5546   validate_region (&start, &end);
5547   from = XFASTINT (start);
5548   to = XFASTINT (end);
5549
5550   if (NILP (coding_system))
5551     return make_number (to - from);
5552
5553   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5554     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5555
5556   coding.mode |= CODING_MODE_LAST_BLOCK;
5557   coding.src_multibyte = coding.dst_multibyte
5558     = !NILP (current_buffer->enable_multibyte_characters);
5559   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5560                        &coding, encodep, 1);
5561   Vlast_coding_system_used = coding.symbol;
5562   return make_number (coding.produced_char);
5563 }
5564
5565 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5566        3, 3, "r\nzCoding system: ",
5567   "Decode the current region by specified coding system.\n\
5568 When called from a program, takes three arguments:\n\
5569 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5570 This function sets `last-coding-system-used' to the precise coding system\n\
5571 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5572 not fully specified.)\n\
5573 It returns the length of the decoded text.")
5574   (start, end, coding_system)
5575      Lisp_Object start, end, coding_system;
5576 {
5577   return code_convert_region1 (start, end, coding_system, 0);
5578 }
5579
5580 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5581        3, 3, "r\nzCoding system: ",
5582   "Encode the current region by specified coding system.\n\
5583 When called from a program, takes three arguments:\n\
5584 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5585 This function sets `last-coding-system-used' to the precise coding system\n\
5586 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5587 not fully specified.)\n\
5588 It returns the length of the encoded text.")
5589   (start, end, coding_system)
5590      Lisp_Object start, end, coding_system;
5591 {
5592   return code_convert_region1 (start, end, coding_system, 1);
5593 }
5594
5595 Lisp_Object
5596 code_convert_string1 (string, coding_system, nocopy, encodep)
5597      Lisp_Object string, coding_system, nocopy;
5598      int encodep;
5599 {
5600   struct coding_system coding;
5601
5602   CHECK_STRING (string, 0);
5603   CHECK_SYMBOL (coding_system, 1);
5604
5605   if (NILP (coding_system))
5606     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5607
5608   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5609     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5610
5611   coding.mode |= CODING_MODE_LAST_BLOCK;
5612   string = (encodep
5613             ? encode_coding_string (string, &coding, !NILP (nocopy))
5614             : decode_coding_string (string, &coding, !NILP (nocopy)));
5615   Vlast_coding_system_used = coding.symbol;
5616
5617   return string;
5618 }
5619
5620 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5621        2, 3, 0,
5622   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5623 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5624 if the decoding operation is trivial.\n\
5625 This function sets `last-coding-system-used' to the precise coding system\n\
5626 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5627 not fully specified.)")
5628   (string, coding_system, nocopy)
5629      Lisp_Object string, coding_system, nocopy;
5630 {
5631   return code_convert_string1 (string, coding_system, nocopy, 0);
5632 }
5633
5634 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5635        2, 3, 0,
5636   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5637 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5638 if the encoding operation is trivial.\n\
5639 This function sets `last-coding-system-used' to the precise coding system\n\
5640 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5641 not fully specified.)")
5642   (string, coding_system, nocopy)
5643      Lisp_Object string, coding_system, nocopy;
5644 {
5645   return code_convert_string1 (string, coding_system, nocopy, 1);
5646 }
5647
5648 /* Encode or decode STRING according to CODING_SYSTEM.
5649    Do not set Vlast_coding_system_used.
5650
5651    This function is called only from macros DECODE_FILE and
5652    ENCODE_FILE, thus we ignore character composition.  */
5653
5654 Lisp_Object
5655 code_convert_string_norecord (string, coding_system, encodep)
5656      Lisp_Object string, coding_system;
5657      int encodep;
5658 {
5659   struct coding_system coding;
5660
5661   CHECK_STRING (string, 0);
5662   CHECK_SYMBOL (coding_system, 1);
5663
5664   if (NILP (coding_system))
5665     return string;
5666
5667   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5668     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5669
5670   coding.composing = COMPOSITION_DISABLED;
5671   coding.mode |= CODING_MODE_LAST_BLOCK;
5672   return (encodep
5673           ? encode_coding_string (string, &coding, 1)
5674           : decode_coding_string (string, &coding, 1));
5675 }
5676 \f
5677 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5678   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5679 Return the corresponding character.")
5680   (code)
5681      Lisp_Object code;
5682 {
5683   unsigned char c1, c2, s1, s2;
5684   Lisp_Object val;
5685
5686   CHECK_NUMBER (code, 0);
5687   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5688   if (s1 == 0)
5689     {
5690       if (s2 < 0x80)
5691         XSETFASTINT (val, s2);
5692       else if (s2 >= 0xA0 || s2 <= 0xDF)
5693         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5694       else
5695         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5696     }
5697   else
5698     {
5699       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5700           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5701         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5702       DECODE_SJIS (s1, s2, c1, c2);
5703       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5704     }
5705   return val;
5706 }
5707
5708 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5709   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5710 Return the corresponding code in SJIS.")
5711   (ch)
5712      Lisp_Object ch;
5713 {
5714   int charset, c1, c2, s1, s2;
5715   Lisp_Object val;
5716
5717   CHECK_NUMBER (ch, 0);
5718   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5719   if (charset == CHARSET_ASCII)
5720     {
5721       val = ch;
5722     }
5723   else if (charset == charset_jisx0208
5724            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5725     {
5726       ENCODE_SJIS (c1, c2, s1, s2);
5727       XSETFASTINT (val, (s1 << 8) | s2);
5728     }
5729   else if (charset == charset_katakana_jisx0201
5730            && c1 > 0x20 && c2 < 0xE0)
5731     {
5732       XSETFASTINT (val, c1 | 0x80);
5733     }
5734   else
5735     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5736   return val;
5737 }
5738
5739 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5740   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5741 Return the corresponding character.")
5742   (code)
5743      Lisp_Object code;
5744 {
5745   int charset;
5746   unsigned char b1, b2, c1, c2;
5747   Lisp_Object val;
5748
5749   CHECK_NUMBER (code, 0);
5750   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5751   if (b1 == 0)
5752     {
5753       if (b2 >= 0x80)
5754         error ("Invalid BIG5 code: %x", XFASTINT (code));
5755       val = code;
5756     }
5757   else
5758     {
5759       if ((b1 < 0xA1 || b1 > 0xFE)
5760           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5761         error ("Invalid BIG5 code: %x", XFASTINT (code));
5762       DECODE_BIG5 (b1, b2, charset, c1, c2);
5763       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5764     }
5765   return val;
5766 }
5767
5768 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5769   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5770 Return the corresponding character code in Big5.")
5771   (ch)
5772      Lisp_Object ch;
5773 {
5774   int charset, c1, c2, b1, b2;
5775   Lisp_Object val;
5776
5777   CHECK_NUMBER (ch, 0);
5778   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5779   if (charset == CHARSET_ASCII)
5780     {
5781       val = ch;
5782     }
5783   else if ((charset == charset_big5_1
5784             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5785            || (charset == charset_big5_2
5786                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5787     {
5788       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5789       XSETFASTINT (val, (b1 << 8) | b2);
5790     }
5791   else
5792     error ("Can't encode to Big5: %d", XFASTINT (ch));
5793   return val;
5794 }
5795 \f
5796 DEFUN ("set-terminal-coding-system-internal",
5797        Fset_terminal_coding_system_internal,
5798        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5799   (coding_system)
5800      Lisp_Object coding_system;
5801 {
5802   CHECK_SYMBOL (coding_system, 0);
5803   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5804   /* We had better not send unsafe characters to terminal.  */
5805   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5806   /* Characer composition should be disabled.  */
5807   terminal_coding.composing = COMPOSITION_DISABLED;
5808   terminal_coding.src_multibyte = 1;
5809   terminal_coding.dst_multibyte = 0;
5810   return Qnil;
5811 }
5812
5813 DEFUN ("set-safe-terminal-coding-system-internal",
5814        Fset_safe_terminal_coding_system_internal,
5815        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5816   (coding_system)
5817      Lisp_Object coding_system;
5818 {
5819   CHECK_SYMBOL (coding_system, 0);
5820   setup_coding_system (Fcheck_coding_system (coding_system),
5821                        &safe_terminal_coding);
5822   /* Characer composition should be disabled.  */
5823   safe_terminal_coding.composing = COMPOSITION_DISABLED;
5824   safe_terminal_coding.src_multibyte = 1;
5825   safe_terminal_coding.dst_multibyte = 0;
5826   return Qnil;
5827 }
5828
5829 DEFUN ("terminal-coding-system",
5830        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5831   "Return coding system specified for terminal output.")
5832   ()
5833 {
5834   return terminal_coding.symbol;
5835 }
5836
5837 DEFUN ("set-keyboard-coding-system-internal",
5838        Fset_keyboard_coding_system_internal,
5839        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5840   (coding_system)
5841      Lisp_Object coding_system;
5842 {
5843   CHECK_SYMBOL (coding_system, 0);
5844   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5845   /* Characer composition should be disabled.  */
5846   keyboard_coding.composing = COMPOSITION_DISABLED;
5847   return Qnil;
5848 }
5849
5850 DEFUN ("keyboard-coding-system",
5851        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5852   "Return coding system specified for decoding keyboard input.")
5853   ()
5854 {
5855   return keyboard_coding.symbol;
5856 }
5857
5858 \f
5859 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5860        Sfind_operation_coding_system,  1, MANY, 0,
5861   "Choose a coding system for an operation based on the target name.\n\
5862 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5863 DECODING-SYSTEM is the coding system to use for decoding\n\
5864 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5865 for encoding (in case OPERATION does encoding).\n\
5866 \n\
5867 The first argument OPERATION specifies an I/O primitive:\n\
5868   For file I/O, `insert-file-contents' or `write-region'.\n\
5869   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5870   For network I/O, `open-network-stream'.\n\
5871 \n\
5872 The remaining arguments should be the same arguments that were passed\n\
5873 to the primitive.  Depending on which primitive, one of those arguments\n\
5874 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5875 whichever argument specifies the file name is TARGET.\n\
5876 \n\
5877 TARGET has a meaning which depends on OPERATION:\n\
5878   For file I/O, TARGET is a file name.\n\
5879   For process I/O, TARGET is a process name.\n\
5880   For network I/O, TARGET is a service name or a port number\n\
5881 \n\
5882 This function looks up what specified for TARGET in,\n\
5883 `file-coding-system-alist', `process-coding-system-alist',\n\
5884 or `network-coding-system-alist' depending on OPERATION.\n\
5885 They may specify a coding system, a cons of coding systems,\n\
5886 or a function symbol to call.\n\
5887 In the last case, we call the function with one argument,\n\
5888 which is a list of all the arguments given to this function.")
5889   (nargs, args)
5890      int nargs;
5891      Lisp_Object *args;
5892 {
5893   Lisp_Object operation, target_idx, target, val;
5894   register Lisp_Object chain;
5895
5896   if (nargs < 2)
5897     error ("Too few arguments");
5898   operation = args[0];
5899   if (!SYMBOLP (operation)
5900       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5901     error ("Invalid first arguement");
5902   if (nargs < 1 + XINT (target_idx))
5903     error ("Too few arguments for operation: %s",
5904            XSYMBOL (operation)->name->data);
5905   target = args[XINT (target_idx) + 1];
5906   if (!(STRINGP (target)
5907         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5908     error ("Invalid %dth argument", XINT (target_idx) + 1);
5909
5910   chain = ((EQ (operation, Qinsert_file_contents)
5911             || EQ (operation, Qwrite_region))
5912            ? Vfile_coding_system_alist
5913            : (EQ (operation, Qopen_network_stream)
5914               ? Vnetwork_coding_system_alist
5915               : Vprocess_coding_system_alist));
5916   if (NILP (chain))
5917     return Qnil;
5918
5919   for (; CONSP (chain); chain = XCDR (chain))
5920     {
5921       Lisp_Object elt;
5922       elt = XCAR (chain);
5923
5924       if (CONSP (elt)
5925           && ((STRINGP (target)
5926                && STRINGP (XCAR (elt))
5927                && fast_string_match (XCAR (elt), target) >= 0)
5928               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5929         {
5930           val = XCDR (elt);
5931           /* Here, if VAL is both a valid coding system and a valid
5932              function symbol, we return VAL as a coding system.  */
5933           if (CONSP (val))
5934             return val;
5935           if (! SYMBOLP (val))
5936             return Qnil;
5937           if (! NILP (Fcoding_system_p (val)))
5938             return Fcons (val, val);
5939           if (! NILP (Ffboundp (val)))
5940             {
5941               val = call1 (val, Flist (nargs, args));
5942               if (CONSP (val))
5943                 return val;
5944               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5945                 return Fcons (val, val);
5946             }
5947           return Qnil;
5948         }
5949     }
5950   return Qnil;
5951 }
5952
5953 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5954        Supdate_coding_systems_internal, 0, 0, 0,
5955   "Update internal database for ISO2022 and CCL based coding systems.\n\
5956 When values of any coding categories are changed, you must\n\
5957 call this function")
5958   ()
5959 {
5960   int i;
5961
5962   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5963     {
5964       Lisp_Object val;
5965
5966       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5967       if (!NILP (val))
5968         {
5969           if (! coding_system_table[i])
5970             coding_system_table[i] = ((struct coding_system *)
5971                                       xmalloc (sizeof (struct coding_system)));
5972           setup_coding_system (val, coding_system_table[i]);
5973         }
5974       else if (coding_system_table[i])
5975         {
5976           xfree (coding_system_table[i]);
5977           coding_system_table[i] = NULL;
5978         }
5979     }
5980
5981   return Qnil;
5982 }
5983
5984 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5985        Sset_coding_priority_internal, 0, 0, 0,
5986   "Update internal database for the current value of `coding-category-list'.\n\
5987 This function is internal use only.")
5988   ()
5989 {
5990   int i = 0, idx;
5991   Lisp_Object val;
5992
5993   val = Vcoding_category_list;
5994
5995   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5996     {
5997       if (! SYMBOLP (XCAR (val)))
5998         break;
5999       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6000       if (idx >= CODING_CATEGORY_IDX_MAX)
6001         break;
6002       coding_priorities[i++] = (1 << idx);
6003       val = XCDR (val);
6004     }
6005   /* If coding-category-list is valid and contains all coding
6006      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
6007      the following code saves Emacs from crashing.  */
6008   while (i < CODING_CATEGORY_IDX_MAX)
6009     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6010
6011   return Qnil;
6012 }
6013
6014 #endif /* emacs */
6015
6016 \f
6017 /*** 9. Post-amble ***/
6018
6019 void
6020 init_coding ()
6021 {
6022   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6023 }
6024
6025 void
6026 init_coding_once ()
6027 {
6028   int i;
6029
6030   /* Emacs' internal format specific initialize routine.  */
6031   for (i = 0; i <= 0x20; i++)
6032     emacs_code_class[i] = EMACS_control_code;
6033   emacs_code_class[0x0A] = EMACS_linefeed_code;
6034   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6035   for (i = 0x21 ; i < 0x7F; i++)
6036     emacs_code_class[i] = EMACS_ascii_code;
6037   emacs_code_class[0x7F] = EMACS_control_code;
6038   for (i = 0x80; i < 0xFF; i++)
6039     emacs_code_class[i] = EMACS_invalid_code;
6040   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6041   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6042   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6043   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6044
6045   /* ISO2022 specific initialize routine.  */
6046   for (i = 0; i < 0x20; i++)
6047     iso_code_class[i] = ISO_control_0;
6048   for (i = 0x21; i < 0x7F; i++)
6049     iso_code_class[i] = ISO_graphic_plane_0;
6050   for (i = 0x80; i < 0xA0; i++)
6051     iso_code_class[i] = ISO_control_1;
6052   for (i = 0xA1; i < 0xFF; i++)
6053     iso_code_class[i] = ISO_graphic_plane_1;
6054   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6055   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6056   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6057   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6058   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6059   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6060   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6061   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6062   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6063   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6064
6065   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6066
6067   setup_coding_system (Qnil, &keyboard_coding);
6068   setup_coding_system (Qnil, &terminal_coding);
6069   setup_coding_system (Qnil, &safe_terminal_coding);
6070   setup_coding_system (Qnil, &default_buffer_file_coding);
6071
6072   bzero (coding_system_table, sizeof coding_system_table);
6073
6074   bzero (ascii_skip_code, sizeof ascii_skip_code);
6075   for (i = 0; i < 128; i++)
6076     ascii_skip_code[i] = 1;
6077
6078 #if defined (MSDOS) || defined (WINDOWSNT)
6079   system_eol_type = CODING_EOL_CRLF;
6080 #else
6081   system_eol_type = CODING_EOL_LF;
6082 #endif
6083
6084   inhibit_pre_post_conversion = 0;
6085 }
6086
6087 #ifdef emacs
6088
6089 void
6090 syms_of_coding ()
6091 {
6092   Qtarget_idx = intern ("target-idx");
6093   staticpro (&Qtarget_idx);
6094
6095   Qcoding_system_history = intern ("coding-system-history");
6096   staticpro (&Qcoding_system_history);
6097   Fset (Qcoding_system_history, Qnil);
6098
6099   /* Target FILENAME is the first argument.  */
6100   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6101   /* Target FILENAME is the third argument.  */
6102   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6103
6104   Qcall_process = intern ("call-process");
6105   staticpro (&Qcall_process);
6106   /* Target PROGRAM is the first argument.  */
6107   Fput (Qcall_process, Qtarget_idx, make_number (0));
6108
6109   Qcall_process_region = intern ("call-process-region");
6110   staticpro (&Qcall_process_region);
6111   /* Target PROGRAM is the third argument.  */
6112   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6113
6114   Qstart_process = intern ("start-process");
6115   staticpro (&Qstart_process);
6116   /* Target PROGRAM is the third argument.  */
6117   Fput (Qstart_process, Qtarget_idx, make_number (2));
6118
6119   Qopen_network_stream = intern ("open-network-stream");
6120   staticpro (&Qopen_network_stream);
6121   /* Target SERVICE is the fourth argument.  */
6122   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6123
6124   Qcoding_system = intern ("coding-system");
6125   staticpro (&Qcoding_system);
6126
6127   Qeol_type = intern ("eol-type");
6128   staticpro (&Qeol_type);
6129
6130   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6131   staticpro (&Qbuffer_file_coding_system);
6132
6133   Qpost_read_conversion = intern ("post-read-conversion");
6134   staticpro (&Qpost_read_conversion);
6135
6136   Qpre_write_conversion = intern ("pre-write-conversion");
6137   staticpro (&Qpre_write_conversion);
6138
6139   Qno_conversion = intern ("no-conversion");
6140   staticpro (&Qno_conversion);
6141
6142   Qundecided = intern ("undecided");
6143   staticpro (&Qundecided);
6144
6145   Qcoding_system_p = intern ("coding-system-p");
6146   staticpro (&Qcoding_system_p);
6147
6148   Qcoding_system_error = intern ("coding-system-error");
6149   staticpro (&Qcoding_system_error);
6150
6151   Fput (Qcoding_system_error, Qerror_conditions,
6152         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6153   Fput (Qcoding_system_error, Qerror_message,
6154         build_string ("Invalid coding system"));
6155
6156   Qcoding_category = intern ("coding-category");
6157   staticpro (&Qcoding_category);
6158   Qcoding_category_index = intern ("coding-category-index");
6159   staticpro (&Qcoding_category_index);
6160
6161   Vcoding_category_table
6162     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6163   staticpro (&Vcoding_category_table);
6164   {
6165     int i;
6166     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6167       {
6168         XVECTOR (Vcoding_category_table)->contents[i]
6169           = intern (coding_category_name[i]);
6170         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6171               Qcoding_category_index, make_number (i));
6172       }
6173   }
6174
6175   Qtranslation_table = intern ("translation-table");
6176   staticpro (&Qtranslation_table);
6177   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6178
6179   Qtranslation_table_id = intern ("translation-table-id");
6180   staticpro (&Qtranslation_table_id);
6181
6182   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6183   staticpro (&Qtranslation_table_for_decode);
6184
6185   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6186   staticpro (&Qtranslation_table_for_encode);
6187
6188   Qsafe_charsets = intern ("safe-charsets");
6189   staticpro (&Qsafe_charsets);
6190
6191   Qvalid_codes = intern ("valid-codes");
6192   staticpro (&Qvalid_codes);
6193
6194   Qemacs_mule = intern ("emacs-mule");
6195   staticpro (&Qemacs_mule);
6196
6197   Qraw_text = intern ("raw-text");
6198   staticpro (&Qraw_text);
6199
6200   defsubr (&Scoding_system_p);
6201   defsubr (&Sread_coding_system);
6202   defsubr (&Sread_non_nil_coding_system);
6203   defsubr (&Scheck_coding_system);
6204   defsubr (&Sdetect_coding_region);
6205   defsubr (&Sdetect_coding_string);
6206   defsubr (&Sdecode_coding_region);
6207   defsubr (&Sencode_coding_region);
6208   defsubr (&Sdecode_coding_string);
6209   defsubr (&Sencode_coding_string);
6210   defsubr (&Sdecode_sjis_char);
6211   defsubr (&Sencode_sjis_char);
6212   defsubr (&Sdecode_big5_char);
6213   defsubr (&Sencode_big5_char);
6214   defsubr (&Sset_terminal_coding_system_internal);
6215   defsubr (&Sset_safe_terminal_coding_system_internal);
6216   defsubr (&Sterminal_coding_system);
6217   defsubr (&Sset_keyboard_coding_system_internal);
6218   defsubr (&Skeyboard_coding_system);
6219   defsubr (&Sfind_operation_coding_system);
6220   defsubr (&Supdate_coding_systems_internal);
6221   defsubr (&Sset_coding_priority_internal);
6222
6223   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6224     "List of coding systems.\n\
6225 \n\
6226 Do not alter the value of this variable manually.  This variable should be\n\
6227 updated by the functions `make-coding-system' and\n\
6228 `define-coding-system-alias'.");
6229   Vcoding_system_list = Qnil;
6230
6231   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6232     "Alist of coding system names.\n\
6233 Each element is one element list of coding system name.\n\
6234 This variable is given to `completing-read' as TABLE argument.\n\
6235 \n\
6236 Do not alter the value of this variable manually.  This variable should be\n\
6237 updated by the functions `make-coding-system' and\n\
6238 `define-coding-system-alias'.");
6239   Vcoding_system_alist = Qnil;
6240
6241   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6242     "List of coding-categories (symbols) ordered by priority.");
6243   {
6244     int i;
6245
6246     Vcoding_category_list = Qnil;
6247     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6248       Vcoding_category_list
6249         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6250                  Vcoding_category_list);
6251   }
6252
6253   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6254     "Specify the coding system for read operations.\n\
6255 It is useful to bind this variable with `let', but do not set it globally.\n\
6256 If the value is a coding system, it is used for decoding on read operation.\n\
6257 If not, an appropriate element is used from one of the coding system alists:\n\
6258 There are three such tables, `file-coding-system-alist',\n\
6259 `process-coding-system-alist', and `network-coding-system-alist'.");
6260   Vcoding_system_for_read = Qnil;
6261
6262   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6263     "Specify the coding system for write operations.\n\
6264 Programs bind this variable with `let', but you should not set it globally.\n\
6265 If the value is a coding system, it is used for encoding of output,\n\
6266 when writing it to a file and when sending it to a file or subprocess.\n\
6267 \n\
6268 If this does not specify a coding system, an appropriate element\n\
6269 is used from one of the coding system alists:\n\
6270 There are three such tables, `file-coding-system-alist',\n\
6271 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6272 For output to files, if the above procedure does not specify a coding system,\n\
6273 the value of `buffer-file-coding-system' is used.");
6274   Vcoding_system_for_write = Qnil;
6275
6276   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6277     "Coding system used in the latest file or process I/O.");
6278   Vlast_coding_system_used = Qnil;
6279
6280   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6281     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6282 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6283 such conversion.");
6284   inhibit_eol_conversion = 0;
6285
6286   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6287     "Non-nil means process buffer inherits coding system of process output.\n\
6288 Bind it to t if the process output is to be treated as if it were a file\n\
6289 read from some filesystem.");
6290   inherit_process_coding_system = 0;
6291
6292   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6293     "Alist to decide a coding system to use for a file I/O operation.\n\
6294 The format is ((PATTERN . VAL) ...),\n\
6295 where PATTERN is a regular expression matching a file name,\n\
6296 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6297 If VAL is a coding system, it is used for both decoding and encoding\n\
6298 the file contents.\n\
6299 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6300 and the cdr part is used for encoding.\n\
6301 If VAL is a function symbol, the function must return a coding system\n\
6302 or a cons of coding systems which are used as above.\n\
6303 \n\
6304 See also the function `find-operation-coding-system'\n\
6305 and the variable `auto-coding-alist'.");
6306   Vfile_coding_system_alist = Qnil;
6307
6308   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6309     "Alist to decide a coding system to use for a process I/O operation.\n\
6310 The format is ((PATTERN . VAL) ...),\n\
6311 where PATTERN is a regular expression matching a program name,\n\
6312 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6313 If VAL is a coding system, it is used for both decoding what received\n\
6314 from the program and encoding what sent to the program.\n\
6315 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6316 and the cdr part is used for encoding.\n\
6317 If VAL is a function symbol, the function must return a coding system\n\
6318 or a cons of coding systems which are used as above.\n\
6319 \n\
6320 See also the function `find-operation-coding-system'.");
6321   Vprocess_coding_system_alist = Qnil;
6322
6323   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6324     "Alist to decide a coding system to use for a network I/O operation.\n\
6325 The format is ((PATTERN . VAL) ...),\n\
6326 where PATTERN is a regular expression matching a network service name\n\
6327 or is a port number to connect to,\n\
6328 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6329 If VAL is a coding system, it is used for both decoding what received\n\
6330 from the network stream and encoding what sent to the network stream.\n\
6331 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6332 and the cdr part is used for encoding.\n\
6333 If VAL is a function symbol, the function must return a coding system\n\
6334 or a cons of coding systems which are used as above.\n\
6335 \n\
6336 See also the function `find-operation-coding-system'.");
6337   Vnetwork_coding_system_alist = Qnil;
6338
6339   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6340     "Coding system to use with system messages.");
6341   Vlocale_coding_system = Qnil;
6342
6343   /* The eol mnemonics are reset in startup.el system-dependently.  */
6344   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6345     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6346   eol_mnemonic_unix = build_string (":");
6347
6348   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6349     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6350   eol_mnemonic_dos = build_string ("\\");
6351
6352   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6353     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6354   eol_mnemonic_mac = build_string ("/");
6355
6356   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6357     "*String displayed in mode line when end-of-line format is not yet determined.");
6358   eol_mnemonic_undecided = build_string (":");
6359
6360   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6361     "*Non-nil enables character translation while encoding and decoding.");
6362   Venable_character_translation = Qt;
6363
6364   DEFVAR_LISP ("standard-translation-table-for-decode",
6365     &Vstandard_translation_table_for_decode,
6366     "Table for translating characters while decoding.");
6367   Vstandard_translation_table_for_decode = Qnil;
6368
6369   DEFVAR_LISP ("standard-translation-table-for-encode",
6370     &Vstandard_translation_table_for_encode,
6371     "Table for translationg characters while encoding.");
6372   Vstandard_translation_table_for_encode = Qnil;
6373
6374   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6375     "Alist of charsets vs revision numbers.\n\
6376 While encoding, if a charset (car part of an element) is found,\n\
6377 designate it with the escape sequence identifing revision (cdr part of the element).");
6378   Vcharset_revision_alist = Qnil;
6379
6380   DEFVAR_LISP ("default-process-coding-system",
6381                &Vdefault_process_coding_system,
6382     "Cons of coding systems used for process I/O by default.\n\
6383 The car part is used for decoding a process output,\n\
6384 the cdr part is used for encoding a text to be sent to a process.");
6385   Vdefault_process_coding_system = Qnil;
6386
6387   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6388     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6389 This is a vector of length 256.\n\
6390 If Nth element is non-nil, the existence of code N in a file\n\
6391 \(or output of subprocess) doesn't prevent it to be detected as\n\
6392 a coding system of ISO 2022 variant which has a flag\n\
6393 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6394 or reading output of a subprocess.\n\
6395 Only 128th through 159th elements has a meaning.");
6396   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6397
6398   DEFVAR_LISP ("select-safe-coding-system-function",
6399                &Vselect_safe_coding_system_function,
6400     "Function to call to select safe coding system for encoding a text.\n\
6401 \n\
6402 If set, this function is called to force a user to select a proper\n\
6403 coding system which can encode the text in the case that a default\n\
6404 coding system used in each operation can't encode the text.\n\
6405 \n\
6406 The default value is `select-safe-coding-system' (which see).");
6407   Vselect_safe_coding_system_function = Qnil;
6408
6409 }
6410
6411 char *
6412 emacs_strerror (error_number)
6413      int error_number;
6414 {
6415   char *str;
6416
6417   synchronize_system_messages_locale ();
6418   str = strerror (error_number);
6419
6420   if (! NILP (Vlocale_coding_system))
6421     {
6422       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6423                                                       Vlocale_coding_system,
6424                                                       0);
6425       str = (char *) XSTRING (dec)->data;
6426     }
6427
6428   return str;
6429 }
6430
6431 #endif /* emacs */
6432