code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. CCL handlers
  29   6. End-of-line handlers
  30   7. C library functions
  31   8. Emacs Lisp library functions
  32   9. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   Coding system is an encoding mechanism of one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format (emacs-mule), and when we say "encode",
  42   it means converting the coding system emacs-mule to some other
  43   coding system.
  44
  45   0. Emacs' internal format (emacs-mule)
  46
  47   Emacs itself holds a multi-lingual character in a buffer and a string
  48   in a special format.  Details are described in section 2.
  49
  50   1. ISO2022
  51
  52   The most famous coding system for multiple character sets.  X's
  53   Compound Text, various EUCs (Extended Unix Code), and coding
  54   systems used in Internet communication such as ISO-2022-JP are
  55   all variants of ISO2022.  Details are described in section 3.
  56
  57   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  58
  59   A coding system to encode character sets: ASCII, JISX0201, and
  60   JISX0208.  Widely used for PC's in Japan.  Details are described in
  61   section 4.
  62
  63   3. BIG5
  64
  65   A coding system to encode character sets: ASCII and Big5.  Widely
  66   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  67   described in section 4.  In this file, when we write "BIG5"
  68   (all uppercase), we mean the coding system, and when we write
  69   "Big5" (capitalized), we mean the character set.
  70
  71   4. Raw text
  72
  73   A coding system for a text containing random 8-bit code.  Emacs does
  74   no code conversion on such a text except for end-of-line format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How the end-of-line of a text is encoded depends on the system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is usually one byte of
  96   `carriage-return'.
  97
  98   Since text characters encoding and end-of-line encoding are
  99   independent, any coding system described above can take
 100   any format of end-of-line.  So, Emacs has information of format of
 101   end-of-line in each coding-system.  See section 6 for more details.
 102
 103 */
 104
 105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 106
 107   These functions check if a text between SRC and SRC_END is encoded
 108   in the coding system category XXX.  Each returns an integer value in
 109   which appropriate flag bits for the category XXX are set.  The flag
 110   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 111   template of these functions.  */
 112 #if 0
 113 int
 114 detect_coding_emacs_mule (src, src_end)
 115      unsigned char *src, *src_end;
 116 {
 117   ...
 118 }
 119 #endif
 120
 121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 122
 123   These functions decode SRC_BYTES length text at SOURCE encoded in
 124   CODING to Emacs' internal format (emacs-mule).  The resulting text
 125   goes to a place pointed to by DESTINATION, the length of which
 126   should not exceed DST_BYTES.  These functions set the information of
 127   original and decoded texts in the members produced, produced_char,
 128   consumed, and consumed_char of the structure *CODING.
 129
 130   The return value is an integer (CODING_FINISH_XXX) indicating how
 131   the decoding finished.
 132
 133   DST_BYTES zero means that source area and destination area are
 134   overlapped, which means that we can produce decoded text until it
 135   reaches the head of the not-yet-decoded source text.
 136
 137   Below is a template of these functions.  */
 138 #if 0
 139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 140      struct coding_system *coding;
 141      unsigned char *source, *destination;
 142      int src_bytes, dst_bytes;
 143 {
 144   ...
 145 }
 146 #endif
 147
 148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 149
 150   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 151   internal format (emacs-mule) to CODING.  The resulting text goes to
 152   a place pointed to by DESTINATION, the length of which should not
 153   exceed DST_BYTES.  These functions set the information of
 154   original and encoded texts in the members produced, produced_char,
 155   consumed, and consumed_char of the structure *CODING.
 156
 157   The return value is an integer (CODING_FINISH_XXX) indicating how
 158   the encoding finished.
 159
 160   DST_BYTES zero means that source area and destination area are
 161   overlapped, which means that we can produce encoded text until it
 162   reaches the head of the not-yet-encoded source text.
 163
 164   Below is a template of these functions.  */
 165 #if 0
 166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 167      struct coding_system *coding;
 168      unsigned char *source, *destination;
 169      int src_bytes, dst_bytes;
 170 {
 171   ...
 172 }
 173 #endif
 174
 175 /*** COMMONLY USED MACROS ***/
 176
 177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 178    THREE_MORE_BYTES safely get one, two, and three bytes from the
 179    source text respectively.  If there are not enough bytes in the
 180    source, they jump to `label_end_of_loop'.  The caller should set
 181    variables `src' and `src_end' to appropriate areas in advance.  */
 182
 183 #define ONE_MORE_BYTE(c1)       \
 184   do {                          \
 185     if (src < src_end)          \
 186       c1 = *src++;              \
 187     else                        \
 188       goto label_end_of_loop;   \
 189   } while (0)
 190
 191 #define TWO_MORE_BYTES(c1, c2)  \
 192   do {                          \
 193     if (src + 1 < src_end)      \
 194       c1 = *src++, c2 = *src++; \
 195     else                        \
 196       goto label_end_of_loop;   \
 197   } while (0)
 198
 199 #define THREE_MORE_BYTES(c1, c2, c3)            \
 200   do {                                          \
 201     if (src + 2 < src_end)                      \
 202       c1 = *src++, c2 = *src++, c3 = *src++;    \
 203     else                                        \
 204       goto label_end_of_loop;                   \
 205   } while (0)
 206
 207 /* The following three macros DECODE_CHARACTER_ASCII,
 208    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 209    the multi-byte form of a character of each class at the place
 210    pointed by `dst'.  The caller should set the variable `dst' to
 211    point to an appropriate area and the variable `coding' to point to
 212    the coding-system of the currently decoding text in advance.  */
 213
 214 /* Decode one ASCII character C.  */
 215
 216 #define DECODE_CHARACTER_ASCII(c)       \
 217   do {                                  \
 218     *dst++ = (c) & 0x7F;                \
 219     coding->produced_char++;            \
 220   } while (0)
 221
 222 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 223    position-code is C.  */
 224
 225 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 226   do {                                                                  \
 227     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 228                                                                         \
 229     *dst++ = leading_code;                                              \
 230     if ((leading_code = CHARSET_LEADING_CODE_EXT (charset)) > 0)        \
 231       *dst++ = leading_code;                                            \
 232     *dst++ = (c) | 0x80;                                                \
 233     coding->produced_char++;                                            \
 234   } while (0)
 235
 236 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 237    position-codes are C1 and C2.  */
 238
 239 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 240   do {                                                  \
 241     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 242     *dst++ = (c2) | 0x80;                               \
 243   } while (0)
 244
 245 \f
 246 /*** 1. Preamble ***/
 247
 248 #ifdef emacs
 249 #include <config.h>
 250 #endif
 251
 252 #include <stdio.h>
 253
 254 #ifdef emacs
 255
 256 #include "lisp.h"
 257 #include "buffer.h"
 258 #include "charset.h"
 259 #include "composite.h"
 260 #include "ccl.h"
 261 #include "coding.h"
 262 #include "window.h"
 263
 264 #else  /* not emacs */
 265
 266 #include "mulelib.h"
 267
 268 #endif /* not emacs */
 269
 270 Lisp_Object Qcoding_system, Qeol_type;
 271 Lisp_Object Qbuffer_file_coding_system;
 272 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 273 Lisp_Object Qno_conversion, Qundecided;
 274 Lisp_Object Qcoding_system_history;
 275 Lisp_Object Qsafe_charsets;
 276 Lisp_Object Qvalid_codes;
 277
 278 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 279 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 280 Lisp_Object Qstart_process, Qopen_network_stream;
 281 Lisp_Object Qtarget_idx;
 282
 283 Lisp_Object Vselect_safe_coding_system_function;
 284
 285 /* Mnemonic string for each format of end-of-line.  */
 286 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 287 /* Mnemonic string to indicate format of end-of-line is not yet
 288    decided.  */
 289 Lisp_Object eol_mnemonic_undecided;
 290
 291 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 292    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 293 int system_eol_type;
 294
 295 #ifdef emacs
 296
 297 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 298
 299 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 300
 301 /* Coding system emacs-mule and raw-text are for converting only
 302    end-of-line format.  */
 303 Lisp_Object Qemacs_mule, Qraw_text;
 304
 305 /* Coding-systems are handed between Emacs Lisp programs and C internal
 306    routines by the following three variables.  */
 307 /* Coding-system for reading files and receiving data from process.  */
 308 Lisp_Object Vcoding_system_for_read;
 309 /* Coding-system for writing files and sending data to process.  */
 310 Lisp_Object Vcoding_system_for_write;
 311 /* Coding-system actually used in the latest I/O.  */
 312 Lisp_Object Vlast_coding_system_used;
 313
 314 /* A vector of length 256 which contains information about special
 315    Latin codes (especially for dealing with Microsoft codes).  */
 316 Lisp_Object Vlatin_extra_code_table;
 317
 318 /* Flag to inhibit code conversion of end-of-line format.  */
 319 int inhibit_eol_conversion;
 320
 321 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 322 int inherit_process_coding_system;
 323
 324 /* Coding system to be used to encode text for terminal display.  */
 325 struct coding_system terminal_coding;
 326
 327 /* Coding system to be used to encode text for terminal display when
 328    terminal coding system is nil.  */
 329 struct coding_system safe_terminal_coding;
 330
 331 /* Coding system of what is sent from terminal keyboard.  */
 332 struct coding_system keyboard_coding;
 333
 334 /* Default coding system to be used to write a file.  */
 335 struct coding_system default_buffer_file_coding;
 336
 337 Lisp_Object Vfile_coding_system_alist;
 338 Lisp_Object Vprocess_coding_system_alist;
 339 Lisp_Object Vnetwork_coding_system_alist;
 340
 341 Lisp_Object Vlocale_coding_system;
 342
 343 #endif /* emacs */
 344
 345 Lisp_Object Qcoding_category, Qcoding_category_index;
 346
 347 /* List of symbols `coding-category-xxx' ordered by priority.  */
 348 Lisp_Object Vcoding_category_list;
 349
 350 /* Table of coding categories (Lisp symbols).  */
 351 Lisp_Object Vcoding_category_table;
 352
 353 /* Table of names of symbol for each coding-category.  */
 354 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 355   "coding-category-emacs-mule",
 356   "coding-category-sjis",
 357   "coding-category-iso-7",
 358   "coding-category-iso-7-tight",
 359   "coding-category-iso-8-1",
 360   "coding-category-iso-8-2",
 361   "coding-category-iso-7-else",
 362   "coding-category-iso-8-else",
 363   "coding-category-ccl",
 364   "coding-category-big5",
 365   "coding-category-utf-8",
 366   "coding-category-utf-16-be",
 367   "coding-category-utf-16-le",
 368   "coding-category-raw-text",
 369   "coding-category-binary"
 370 };
 371
 372 /* Table of pointers to coding systems corresponding to each coding
 373    categories.  */
 374 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 375
 376 /* Table of coding category masks.  Nth element is a mask for a coding
 377    cateogry of which priority is Nth.  */
 378 static
 379 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 380
 381 /* Flag to tell if we look up translation table on character code
 382    conversion.  */
 383 Lisp_Object Venable_character_translation;
 384 /* Standard translation table to look up on decoding (reading).  */
 385 Lisp_Object Vstandard_translation_table_for_decode;
 386 /* Standard translation table to look up on encoding (writing).  */
 387 Lisp_Object Vstandard_translation_table_for_encode;
 388
 389 Lisp_Object Qtranslation_table;
 390 Lisp_Object Qtranslation_table_id;
 391 Lisp_Object Qtranslation_table_for_decode;
 392 Lisp_Object Qtranslation_table_for_encode;
 393
 394 /* Alist of charsets vs revision number.  */
 395 Lisp_Object Vcharset_revision_alist;
 396
 397 /* Default coding systems used for process I/O.  */
 398 Lisp_Object Vdefault_process_coding_system;
 399
 400 /* Global flag to tell that we can't call post-read-conversion and
 401    pre-write-conversion functions.  Usually the value is zero, but it
 402    is set to 1 temporarily while such functions are running.  This is
 403    to avoid infinite recursive call.  */
 404 static int inhibit_pre_post_conversion;
 405
 406 \f
 407 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 408
 409 /* Emacs' internal format for encoding multiple character sets is a
 410    kind of multi-byte encoding, i.e. characters are encoded by
 411    variable-length sequences of one-byte codes.  ASCII characters
 412    and control characters (e.g. `tab', `newline') are represented by
 413    one-byte sequences which are their ASCII codes, in the range 0x00
 414    through 0x7F.  The other characters are represented by a sequence
 415    of `base leading-code', optional `extended leading-code', and one
 416    or two `position-code's.  The length of the sequence is determined
 417    by the base leading-code.  Leading-code takes the range 0x80
 418    through 0x9F, whereas extended leading-code and position-code take
 419    the range 0xA0 through 0xFF.  See `charset.h' for more details
 420    about leading-code and position-code.
 421
 422    --- CODE RANGE of Emacs' internal format ---
 423    (character set)      (range)
 424    ASCII                0x00 .. 0x7F
 425    ELSE (1st byte)      0x81 .. 0x9F
 426         (rest bytes)    0xA0 .. 0xFF
 427    ---------------------------------------------
 428
 429   */
 430
 431 enum emacs_code_class_type emacs_code_class[256];
 432
 433 /* Go to the next statement only if *SRC is accessible and the code is
 434    greater than 0xA0.  */
 435 #define CHECK_CODE_RANGE_A0_FF  \
 436   do {                          \
 437     if (src >= src_end)         \
 438       goto label_end_of_switch; \
 439     else if (*src++ < 0xA0)     \
 440       return 0;                 \
 441   } while (0)
 442
 443 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 444    Check if a text is encoded in Emacs' internal format.  If it is,
 445    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 446
 447 int
 448 detect_coding_emacs_mule (src, src_end)
 449      unsigned char *src, *src_end;
 450 {
 451   unsigned char c;
 452   int composing = 0;
 453
 454   while (src < src_end)
 455     {
 456       c = *src++;
 457
 458       if (composing)
 459         {
 460           if (c < 0xA0)
 461             composing = 0;
 462           else
 463             c -= 0x20;
 464         }
 465
 466       switch (emacs_code_class[c])
 467         {
 468         case EMACS_ascii_code:
 469         case EMACS_linefeed_code:
 470           break;
 471
 472         case EMACS_control_code:
 473           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 474             return 0;
 475           break;
 476
 477         case EMACS_invalid_code:
 478           return 0;
 479
 480         case EMACS_leading_code_4:
 481           CHECK_CODE_RANGE_A0_FF;
 482           /* fall down to check it two more times ...  */
 483
 484         case EMACS_leading_code_3:
 485           CHECK_CODE_RANGE_A0_FF;
 486           /* fall down to check it one more time ...  */
 487
 488         case EMACS_leading_code_2:
 489           CHECK_CODE_RANGE_A0_FF;
 490           break;
 491
 492         case 0x80:      /* Old leading code for a composite character.  */
 493           if (composing)
 494             CHECK_CODE_RANGE_A0_FF;
 495           else
 496             composing = 1;
 497           break;
 498
 499         default:
 500         label_end_of_switch:
 501           break;
 502         }
 503     }
 504   return CODING_CATEGORY_MASK_EMACS_MULE;
 505 }
 506
 507 \f
 508 /*** 3. ISO2022 handlers ***/
 509
 510 /* The following note describes the coding system ISO2022 briefly.
 511    Since the intention of this note is to help understand the
 512    functions in this file, some parts are NOT ACCURATE or OVERLY
 513    SIMPLIFIED.  For thorough understanding, please refer to the
 514    original document of ISO2022.
 515
 516    ISO2022 provides many mechanisms to encode several character sets
 517    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 518    is encoded using bytes less than 128.  This may make the encoded
 519    text a little bit longer, but the text passes more easily through
 520    several gateways, some of which strip off MSB (Most Significant Bit).
 521
 522    There are two kinds of character sets: control character set and
 523    graphic character set.  The former contains control characters such
 524    as `newline' and `escape' to provide control functions (control
 525    functions are also provided by escape sequences).  The latter
 526    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 527    two control character sets and many graphic character sets.
 528
 529    Graphic character sets are classified into one of the following
 530    four classes, according to the number of bytes (DIMENSION) and
 531    number of characters in one dimension (CHARS) of the set:
 532    - DIMENSION1_CHARS94
 533    - DIMENSION1_CHARS96
 534    - DIMENSION2_CHARS94
 535    - DIMENSION2_CHARS96
 536
 537    In addition, each character set is assigned an identification tag,
 538    unique for each set, called "final character" (denoted as <F>
 539    hereafter).  The <F> of each character set is decided by ECMA(*)
 540    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 541    (0x30..0x3F are for private use only).
 542
 543    Note (*): ECMA = European Computer Manufacturers Association
 544
 545    Here are examples of graphic character set [NAME(<F>)]:
 546         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 547         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 548         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 549         o DIMENSION2_CHARS96 -- none for the moment
 550
 551    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 552         C0 [0x00..0x1F] -- control character plane 0
 553         GL [0x20..0x7F] -- graphic character plane 0
 554         C1 [0x80..0x9F] -- control character plane 1
 555         GR [0xA0..0xFF] -- graphic character plane 1
 556
 557    A control character set is directly designated and invoked to C0 or
 558    C1 by an escape sequence.  The most common case is that:
 559    - ISO646's  control character set is designated/invoked to C0, and
 560    - ISO6429's control character set is designated/invoked to C1,
 561    and usually these designations/invocations are omitted in encoded
 562    text.  In a 7-bit environment, only C0 can be used, and a control
 563    character for C1 is encoded by an appropriate escape sequence to
 564    fit into the environment.  All control characters for C1 are
 565    defined to have corresponding escape sequences.
 566
 567    A graphic character set is at first designated to one of four
 568    graphic registers (G0 through G3), then these graphic registers are
 569    invoked to GL or GR.  These designations and invocations can be
 570    done independently.  The most common case is that G0 is invoked to
 571    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 572    these invocations and designations are omitted in encoded text.
 573    In a 7-bit environment, only GL can be used.
 574
 575    When a graphic character set of CHARS94 is invoked to GL, codes
 576    0x20 and 0x7F of the GL area work as control characters SPACE and
 577    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 578    be used.
 579
 580    There are two ways of invocation: locking-shift and single-shift.
 581    With locking-shift, the invocation lasts until the next different
 582    invocation, whereas with single-shift, the invocation affects the
 583    following character only and doesn't affect the locking-shift
 584    state.  Invocations are done by the following control characters or
 585    escape sequences:
 586
 587    ----------------------------------------------------------------------
 588    abbrev  function                  cntrl escape seq   description
 589    ----------------------------------------------------------------------
 590    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 591    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 592    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 593    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 594    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 595    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 596    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 597    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 598    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 599    ----------------------------------------------------------------------
 600    (*) These are not used by any known coding system.
 601
 602    Control characters for these functions are defined by macros
 603    ISO_CODE_XXX in `coding.h'.
 604
 605    Designations are done by the following escape sequences:
 606    ----------------------------------------------------------------------
 607    escape sequence      description
 608    ----------------------------------------------------------------------
 609    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 610    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 611    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 612    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 613    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 614    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 615    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 616    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 617    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 618    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 619    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 620    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 621    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 622    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 623    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 624    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 625    ----------------------------------------------------------------------
 626
 627    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 628    of dimension 1, chars 94, and final character <F>, etc...
 629
 630    Note (*): Although these designations are not allowed in ISO2022,
 631    Emacs accepts them on decoding, and produces them on encoding
 632    CHARS96 character sets in a coding system which is characterized as
 633    7-bit environment, non-locking-shift, and non-single-shift.
 634
 635    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 636    '(' can be omitted.  We refer to this as "short-form" hereafter.
 637
 638    Now you may notice that there are a lot of ways for encoding the
 639    same multilingual text in ISO2022.  Actually, there exist many
 640    coding systems such as Compound Text (used in X11's inter client
 641    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 642    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 643    localized platforms), and all of these are variants of ISO2022.
 644
 645    In addition to the above, Emacs handles two more kinds of escape
 646    sequences: ISO6429's direction specification and Emacs' private
 647    sequence for specifying character composition.
 648
 649    ISO6429's direction specification takes the following form:
 650         o CSI ']'      -- end of the current direction
 651         o CSI '0' ']'  -- end of the current direction
 652         o CSI '1' ']'  -- start of left-to-right text
 653         o CSI '2' ']'  -- start of right-to-left text
 654    The control character CSI (0x9B: control sequence introducer) is
 655    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 656
 657    Character composition specification takes the following form:
 658         o ESC '0' -- start relative composition
 659         o ESC '1' -- end composition
 660         o ESC '2' -- start rule-base composition (*)
 661         o ESC '3' -- start relative composition with alternate chars  (**)
 662         o ESC '4' -- start rule-base composition with alternate chars  (**)
 663    Since these are not standard escape sequences of any ISO standard,
 664    the use of them for these meanings is restricted to Emacs only.
 665
 666    (*) This form is used only in Emacs 20.5 and the older versions,
 667    but the newer versions can safely decode it.
 668    (**) This form is used only in Emacs 21.1 and the newer versions,
 669    and the older versions can't decode it.
 670
 671    Here's a list of example usages of these composition escape
 672    sequences (categorized by `enum composition_method').
 673
 674    COMPOSITION_RELATIVE:
 675         ESC 0 CHAR [ CHAR ] ESC 1
 676    COMPOSITION_WITH_RULE:
 677         ESC 2 CHAR [ RULE CHAR ] ESC 1
 678    COMPOSITION_WITH_ALTCHARS:
 679         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 680    COMPOSITION_WITH_RULE_ALTCHARS:
 681         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 682
 683 enum iso_code_class_type iso_code_class[256];
 684
 685 #define CHARSET_OK(idx, charset)                                \
 686   (coding_system_table[idx]                                     \
 687    && (coding_system_table[idx]->safe_charsets[charset]         \
 688        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 689             (coding_system_table[idx], charset)                 \
 690            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 691
 692 #define SHIFT_OUT_OK(idx) \
 693   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 694
 695 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 696    Check if a text is encoded in ISO2022.  If it is, return an
 697    integer in which the appropriate flag bits among:
 698         CODING_CATEGORY_MASK_ISO_7
 699         CODING_CATEGORY_MASK_ISO_7_TIGHT
 700         CODING_CATEGORY_MASK_ISO_8_1
 701         CODING_CATEGORY_MASK_ISO_8_2
 702         CODING_CATEGORY_MASK_ISO_7_ELSE
 703         CODING_CATEGORY_MASK_ISO_8_ELSE
 704    are set.  If a code which should never appear in ISO2022 is found,
 705    returns 0.  */
 706
 707 int
 708 detect_coding_iso2022 (src, src_end)
 709      unsigned char *src, *src_end;
 710 {
 711   int mask = CODING_CATEGORY_MASK_ISO;
 712   int mask_found = 0;
 713   int reg[4], shift_out = 0, single_shifting = 0;
 714   int c, c1, i, charset;
 715
 716   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 717   while (mask && src < src_end)
 718     {
 719       c = *src++;
 720       switch (c)
 721         {
 722         case ISO_CODE_ESC:
 723           single_shifting = 0;
 724           if (src >= src_end)
 725             break;
 726           c = *src++;
 727           if (c >= '(' && c <= '/')
 728             {
 729               /* Designation sequence for a charset of dimension 1.  */
 730               if (src >= src_end)
 731                 break;
 732               c1 = *src++;
 733               if (c1 < ' ' || c1 >= 0x80
 734                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 735                 /* Invalid designation sequence.  Just ignore.  */
 736                 break;
 737               reg[(c - '(') % 4] = charset;
 738             }
 739           else if (c == '$')
 740             {
 741               /* Designation sequence for a charset of dimension 2.  */
 742               if (src >= src_end)
 743                 break;
 744               c = *src++;
 745               if (c >= '@' && c <= 'B')
 746                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 747                 reg[0] = charset = iso_charset_table[1][0][c];
 748               else if (c >= '(' && c <= '/')
 749                 {
 750                   if (src >= src_end)
 751                     break;
 752                   c1 = *src++;
 753                   if (c1 < ' ' || c1 >= 0x80
 754                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 755                     /* Invalid designation sequence.  Just ignore.  */
 756                     break;
 757                   reg[(c - '(') % 4] = charset;
 758                 }
 759               else
 760                 /* Invalid designation sequence.  Just ignore.  */
 761                 break;
 762             }
 763           else if (c == 'N' || c == 'O')
 764             {
 765               /* ESC <Fe> for SS2 or SS3.  */
 766               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 767               break;
 768             }
 769           else if (c >= '0' && c <= '4')
 770             {
 771               /* ESC <Fp> for start/end composition.  */
 772               mask_found |= CODING_CATEGORY_MASK_ISO;
 773               break;
 774             }
 775           else
 776             /* Invalid escape sequence.  Just ignore.  */
 777             break;
 778
 779           /* We found a valid designation sequence for CHARSET.  */
 780           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 781           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 782             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 783           else
 784             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 785           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 786             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 787           else
 788             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 789           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 790             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 791           else
 792             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 793           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 794             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 795           else
 796             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 797           break;
 798
 799         case ISO_CODE_SO:
 800           single_shifting = 0;
 801           if (shift_out == 0
 802               && (reg[1] >= 0
 803                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 804                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 805             {
 806               /* Locking shift out.  */
 807               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 808               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 809             }
 810           break;
 811
 812         case ISO_CODE_SI:
 813           single_shifting = 0;
 814           if (shift_out == 1)
 815             {
 816               /* Locking shift in.  */
 817               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 818               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 819             }
 820           break;
 821
 822         case ISO_CODE_CSI:
 823           single_shifting = 0;
 824         case ISO_CODE_SS2:
 825         case ISO_CODE_SS3:
 826           {
 827             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 828
 829             if (c != ISO_CODE_CSI)
 830               {
 831                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 832                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 833                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 834                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 835                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 836                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 837                 single_shifting = 1;
 838               }
 839             if (VECTORP (Vlatin_extra_code_table)
 840                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 841               {
 842                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 843                     & CODING_FLAG_ISO_LATIN_EXTRA)
 844                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 845                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 846                     & CODING_FLAG_ISO_LATIN_EXTRA)
 847                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 848               }
 849             mask &= newmask;
 850             mask_found |= newmask;
 851           }
 852           break;
 853
 854         default:
 855           if (c < 0x80)
 856             {
 857               single_shifting = 0;
 858               break;
 859             }
 860           else if (c < 0xA0)
 861             {
 862               single_shifting = 0;
 863               if (VECTORP (Vlatin_extra_code_table)
 864                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 865                 {
 866                   int newmask = 0;
 867
 868                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 869                       & CODING_FLAG_ISO_LATIN_EXTRA)
 870                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 871                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 872                       & CODING_FLAG_ISO_LATIN_EXTRA)
 873                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 874                   mask &= newmask;
 875                   mask_found |= newmask;
 876                 }
 877               else
 878                 return 0;
 879             }
 880           else
 881             {
 882               unsigned char *src_begin = src;
 883
 884               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 885                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 886               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 887               /* Check the length of succeeding codes of the range
 888                  0xA0..0FF.  If the byte length is odd, we exclude
 889                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 890                  when we are not single shifting.  */
 891               if (!single_shifting)
 892                 {
 893                   while (src < src_end && *src >= 0xA0)
 894                     src++;
 895                   if ((src - src_begin - 1) & 1 && src < src_end)
 896                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 897                   else
 898                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
 899                 }
 900             }
 901           break;
 902         }
 903     }
 904
 905   return (mask & mask_found);
 906 }
 907
 908 /* Decode a character of which charset is CHARSET and the 1st position
 909    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 910    fetched from SRC and set to C2; when the class of that byte (masked
 911    to 7 bits) is neither ISO_0x20_or_0x7F nor ISO_graphic_plane_0, it
 912    is put back and C1 is handled as ASCII.  If CHARSET is negative, we
 913    are decoding ill-formed text, and just read C1 as is.  C2, SRC,
 914    CODING and TRANSLATION_TABLE are those of the caller.
 915    If we are now in the middle of a composition sequence, the decoded
 916    character may be ALTCHAR (see the comment above).  In that case,
 917    the character goes to coding->cmp_data->data instead of DST.  */
 918 #define DECODE_ISO_CHARACTER(charset, c1)                                 \
 919   do {                                                                    \
 920     int c_alt = -1, charset_alt = (charset);                              \
 921     if (charset_alt >= 0)                                                 \
 922       {                                                                   \
 923         if (CHARSET_DIMENSION (charset_alt) == 2)                         \
 924           {                                                               \
 925             ONE_MORE_BYTE (c2);                                           \
 926             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F           \
 927                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)    \
 928               {                                                           \
 929                 src--;                                                    \
 930                 charset_alt = CHARSET_ASCII;                              \
 931               }                                                           \
 932           }                                                               \
 933         if (!NILP (translation_table)                                     \
 934             && ((c_alt = translate_char (translation_table,               \
 935                                          -1, charset_alt, c1, c2)) >= 0)) \
 936           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                        \
 937       }                                                                   \
 938     if (! COMPOSING_P (coding)                                            \
 939         || coding->composing == COMPOSITION_RELATIVE                      \
 940         || coding->composing == COMPOSITION_WITH_RULE)                    \
 941       {                                                                   \
 942         if (charset_alt == CHARSET_ASCII || charset_alt < 0)              \
 943           DECODE_CHARACTER_ASCII (c1);                                    \
 944         else if (CHARSET_DIMENSION (charset_alt) == 1)                    \
 945           DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
 946         else                                                              \
 947           DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
 948       }                                                                   \
 949     if (COMPOSING_P (coding)                                              \
 950         && coding->composing != COMPOSITION_RELATIVE)                     \
 951       {                                                                   \
 952         if (c_alt < 0)                                                    \
 953           c_alt = MAKE_CHAR (charset_alt, c1, c2);                        \
 954         CODING_ADD_COMPOSITION_COMPONENT (coding, c_alt);                 \
 955         coding->composition_rule_follows                                  \
 956           = coding->composing != COMPOSITION_WITH_ALTCHARS;               \
 957       }                                                                   \
 958   } while (0)
 959 /* Designate the charset given by DIMENSION, CHARS, and FINAL_CHAR to register
 960    REG of CODING; jump to the caller's label_invalid_code if not accepted.  */
 961 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
 962   do {                                                                     \
 963     int charset;                                                           \
 964                                                                            \
 965     if (final_char < '0' || final_char >= 128)                             \
 966       goto label_invalid_code;                                             \
 967     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
 968                                  make_number (chars),                      \
 969                                  make_number (final_char));                \
 970     if (charset >= 0                                                       \
 971         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
 972             || coding->safe_charsets[charset]))                            \
 973       {                                                                    \
 974         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
 975             && reg == 0                                                    \
 976             && charset == CHARSET_ASCII)                                   \
 977           {                                                                \
 978             /* We should insert this designation sequence as is so         \
 979                that it is surely written back to a file.  */               \
 980             coding->spec.iso2022.last_invalid_designation_register = -1;   \
 981             goto label_invalid_code;                                       \
 982           }                                                                \
 983         coding->spec.iso2022.last_invalid_designation_register = -1;       \
 984         if ((coding->mode & CODING_MODE_DIRECTION)                         \
 985             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
 986           charset = CHARSET_REVERSE_CHARSET (charset);                     \
 987         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
 988       }                                                                    \
 989     else                                                                   \
 990       {                                                                    \
 991         coding->spec.iso2022.last_invalid_designation_register = reg;      \
 992         goto label_invalid_code;                                           \
 993       }                                                                    \
 994   } while (0)
 995
 996 /* Allocate one more memory block for composition information and
 997    link it after the blocks CODING already has.  */
 998
 999 static void
1000 coding_allocate_composition_data (coding, char_offset)
1001      struct coding_system *coding;
1002      int char_offset;
1003 {
1004   struct composition_data *block;
1005
1006   block = (struct composition_data *) xmalloc (sizeof *block);
1007   block->prev = coding->cmp_data;
1008   block->next = NULL;
1009   block->used = 0;
1010   block->char_offset = char_offset;
1011   if (block->prev != NULL)
1012     block->prev->next = block;
1013   coding->cmp_data_start = 0;
1014   coding->cmp_data = block;
1015 }
1016
1017 /* Record the starting position START and METHOD of one composition in a
1018    4-int header; CODING_ADD_COMPOSITION_END later fills slots 0 and 2.  */
1019 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
1020   do {                                                          \
1021     struct composition_data *cmp_data = coding->cmp_data;       \
1022     int *data = cmp_data->data + cmp_data->used;                \
1023     coding->cmp_data_start = cmp_data->used;                    \
1024     data[0] = -1;                                               \
1025     data[1] = cmp_data->char_offset + start;                    \
1026     data[3] = (int) method;                                     \
1027     cmp_data->used += 4;                                        \
1028   } while (0)
1029
1030 /* Record the ending position END of the current composition, and the
1031    number of ints used since its header, in that header.  */
1032 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
1033   do {                                                          \
1034     struct composition_data *cmp_data = coding->cmp_data;       \
1035     int *data = cmp_data->data + coding->cmp_data_start;        \
1036     data[0] = cmp_data->used - coding->cmp_data_start;          \
1037     data[2] = cmp_data->char_offset + end;                      \
1038   } while (0)
1039
1040 /* Append one COMPONENT (alternate character or composition rule) to the
1041    data of the current block.  No check for overflow is done here.  */
1042 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
1043   (coding->cmp_data->data[coding->cmp_data->used++] = component)
1044
1045 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4; C1 is
1046    the final byte.  This uses CODING, DST, and RESULT of the caller.  */
1047 #define DECODE_COMPOSITION_START(c1)                                    \
1048   do {                                                                  \
1049     if (coding->composing == COMPOSITION_DISABLED)                      \
1050       {                                                                 \
1051         *dst++ = ISO_CODE_ESC;                                          \
1052         *dst++ = c1 & 0x7f;                                             \
1053         coding->produced_char += 2;                                     \
1054       }                                                                 \
1055     else if (!COMPOSING_P (coding))                                     \
1056       {                                                                 \
1057         /* This is surely the start of a composition.  We must be sure  \
1058            that coding->cmp_data has enough space to store the          \
1059            information about the composition.  If not, terminate the    \
1060            current decoding loop, allocate one more memory block for    \
1061            coding->cmp_data in the caller, then start the decoding      \
1062            loop again.  We can't allocate memory here directly because  \
1063            it may cause buffer/string relocation.  */                   \
1064         if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH  \
1065             >= COMPOSITION_DATA_SIZE)                                   \
1066           {                                                             \
1067             result = CODING_FINISH_INSUFFICIENT_CMP;                    \
1068             goto label_end_of_loop_2;                                   \
1069           }                                                             \
1070         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE           \
1071                              : c1 == '2' ? COMPOSITION_WITH_RULE        \
1072                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS    \
1073                              : COMPOSITION_WITH_RULE_ALTCHARS);         \
1074         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,    \
1075                                       coding->composing);               \
1076         coding->composition_rule_follows = 0;                           \
1077       }                                                                 \
1078     else                                                                \
1079       {                                                                 \
1080         /* We are already handling a composition.  If the method is     \
1081            the following two, the codes following the current escape    \
1082            sequence are actual characters stored in a buffer.  */       \
1083         if (coding->composing == COMPOSITION_WITH_ALTCHARS              \
1084             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)     \
1085           {                                                             \
1086             coding->composing = COMPOSITION_RELATIVE;                   \
1087             coding->composition_rule_follows = 0;                       \
1088           }                                                             \
1089       }                                                                 \
1090   } while (0)
1091
1092 /* Handle composition end sequence ESC 1.  If composition is disabled, the
1093    sequence is copied to DST; otherwise the composition is closed.  */
1094 #define DECODE_COMPOSITION_END(c1)                                      \
1095   do {                                                                  \
1096     if (coding->composing == COMPOSITION_DISABLED)                      \
1097       {                                                                 \
1098         *dst++ = ISO_CODE_ESC;                                          \
1099         *dst++ = c1;                                                    \
1100         coding->produced_char += 2;                                     \
1101       }                                                                 \
1102     else                                                                \
1103       {                                                                 \
1104         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1105         coding->composing = COMPOSITION_NO;                             \
1106       }                                                                 \
1107   } while (0)
1108
1109 /* Decode a composition rule from the byte C1 (and, in the new format,
1110    one more byte from SRC) and store the encoded rule in coding->cmp_data.
1111    C1 is modified in place; a value out of both ranges stores rule 0.  */
1112
1113 #define DECODE_COMPOSITION_RULE(c1)                                     \
1114   do {                                                                  \
1115     int rule = 0;                                                       \
1116     (c1) -= 32;                                                         \
1117     if (c1 < 81)                /* old format (before ver.21) */        \
1118       {                                                                 \
1119         int gref = (c1) / 9;                                            \
1120         int nref = (c1) % 9;                                            \
1121         if (gref == 4) gref = 10;                                       \
1122         if (nref == 4) nref = 10;                                       \
1123         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1124       }                                                                 \
1125     else if (c1 < 93)           /* new format (after ver.21) */         \
1126       {                                                                 \
1127         ONE_MORE_BYTE (c2);                                             \
1128         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1129       }                                                                 \
1130     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1131     coding->composition_rule_follows = 0;                               \
1132   } while (0)
1133
1134
1135 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1136
1137 int
1138 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1139      struct coding_system *coding;
1140      unsigned char *source, *destination;
1141      int src_bytes, dst_bytes;
1142 {
1143   unsigned char *src = source;
1144   unsigned char *src_end = source + src_bytes;
1145   unsigned char *dst = destination;
1146   unsigned char *dst_end = destination + dst_bytes;
1147   /* Since the maximum bytes produced by each loop is 7, we subtract 6
1148      from DST_END to assure that overflow checking is necessary only
1149      at the head of loop.  */
1150   unsigned char *adjusted_dst_end = dst_end - 6;
1151   int charset;
1152   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1153   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1154   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1155   Lisp_Object translation_table
1156     = coding->translation_table_for_decode;
1157   int result = CODING_FINISH_NORMAL;
1158
1159   if (!NILP (Venable_character_translation) && NILP (translation_table))
1160     translation_table = Vstandard_translation_table_for_decode;
1161
1162   coding->produced_char = 0;
1163   coding->fake_multibyte = 0;
1164   while (src < src_end && (dst_bytes
1165                            ? (dst < adjusted_dst_end)
1166                            : (dst < src - 6)))
1167     {
1168       /* SRC_BASE remembers the start position in source in each loop.
1169          The loop will be exited when there's not enough source text
1170          to analyze long escape sequence or 2-byte code (within macros
1171          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
1172          to SRC_BASE before exiting.  */
1173       unsigned char *src_base = src;
1174       int c1 = *src++, c2;
1175
1176       /* We produce no character or one character.  */
1177       switch (iso_code_class [c1])
1178         {
1179         case ISO_0x20_or_0x7F:
1180           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1181             {
1182               DECODE_COMPOSITION_RULE (c1);
1183               break;
1184             }
1185           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1186             {
1187               /* This is SPACE or DEL.  */
1188               *dst++ = c1;
1189               coding->produced_char++;
1190               break;
1191             }
1192           /* This is a graphic character, we fall down ...  */
1193
1194         case ISO_graphic_plane_0:
1195           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1196             DECODE_COMPOSITION_RULE (c1);
1197           else
1198             DECODE_ISO_CHARACTER (charset0, c1);
1199           break;
1200
1201         case ISO_0xA0_or_0xFF:
1202           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1203               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1204             goto label_invalid_code;
1205           /* This is a graphic character, we fall down ... */
1206
1207         case ISO_graphic_plane_1:
1208           if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1209             goto label_invalid_code;
1210           DECODE_ISO_CHARACTER (charset1, c1);
1211           break;
1212
1213         case ISO_control_code:
1214           if (COMPOSING_P (coding))
1215             DECODE_COMPOSITION_END ('1');
1216
1217           /* All ISO2022 control characters in this class have the
1218              same representation in Emacs internal format.  */
1219           if (c1 == '\n'
1220               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1221               && (coding->eol_type == CODING_EOL_CR
1222                   || coding->eol_type == CODING_EOL_CRLF))
1223             {
1224               result = CODING_FINISH_INCONSISTENT_EOL;
1225               goto label_end_of_loop_2;
1226             }
1227           *dst++ = c1;
1228           coding->produced_char++;
1229           break;
1230
1231         case ISO_carriage_return:
1232           if (COMPOSING_P (coding))
1233             DECODE_COMPOSITION_END ('1');
1234
1235           if (coding->eol_type == CODING_EOL_CR)
1236             *dst++ = '\n';
1237           else if (coding->eol_type == CODING_EOL_CRLF)
1238             {
1239               ONE_MORE_BYTE (c1);
1240               if (c1 == ISO_CODE_LF)
1241                 *dst++ = '\n';
1242               else
1243                 {
1244                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1245                     {
1246                       result = CODING_FINISH_INCONSISTENT_EOL;
1247                       goto label_end_of_loop_2;
1248                     }
1249                   src--;
1250                   *dst++ = '\r';
1251                 }
1252             }
1253           else
1254             *dst++ = c1;
1255           coding->produced_char++;
1256           break;
1257
1258         case ISO_shift_out:
1259           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1260               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1261             goto label_invalid_code;
1262           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1263           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1264           break;
1265
1266         case ISO_shift_in:
1267           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1268             goto label_invalid_code;
1269           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1270           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1271           break;
1272
1273         case ISO_single_shift_2_7:
1274         case ISO_single_shift_2:
1275           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1276             goto label_invalid_code;
1277           /* SS2 is handled as an escape sequence of ESC 'N' */
1278           c1 = 'N';
1279           goto label_escape_sequence;
1280
1281         case ISO_single_shift_3:
1282           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1283             goto label_invalid_code;
1284           /* SS3 is handled as an escape sequence of ESC 'O' */
1285           c1 = 'O';
1286           goto label_escape_sequence;
1287
1288         case ISO_control_sequence_introducer:
1289           /* CSI is handled as an escape sequence of ESC '[' ...  */
1290           c1 = '[';
1291           goto label_escape_sequence;
1292
1293         case ISO_escape:
1294           ONE_MORE_BYTE (c1);
1295         label_escape_sequence:
1296           /* Escape sequences handled by Emacs are invocation,
1297              designation, direction specification, and character
1298              composition specification.  */
1299           switch (c1)
1300             {
1301             case '&':           /* revision of following character set */
1302               ONE_MORE_BYTE (c1);
1303               if (!(c1 >= '@' && c1 <= '~'))
1304                 goto label_invalid_code;
1305               ONE_MORE_BYTE (c1);
1306               if (c1 != ISO_CODE_ESC)
1307                 goto label_invalid_code;
1308               ONE_MORE_BYTE (c1);
1309               goto label_escape_sequence;
1310
1311             case '$':           /* designation of 2-byte character set */
1312               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1313                 goto label_invalid_code;
1314               ONE_MORE_BYTE (c1);
1315               if (c1 >= '@' && c1 <= 'B')
1316                 {       /* designation of JISX0208.1978, GB2312.1980,
1317                            or JISX0208.1980 */
1318                   DECODE_DESIGNATION (0, 2, 94, c1);
1319                 }
1320               else if (c1 >= 0x28 && c1 <= 0x2B)
1321                 {       /* designation of DIMENSION2_CHARS94 character set */
1322                   ONE_MORE_BYTE (c2);
1323                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1324                 }
1325               else if (c1 >= 0x2C && c1 <= 0x2F)
1326                 {       /* designation of DIMENSION2_CHARS96 character set */
1327                   ONE_MORE_BYTE (c2);
1328                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1329                 }
1330               else
1331                 goto label_invalid_code;
1332               break;
1333
1334             case 'n':           /* invocation of locking-shift-2 */
1335               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1336                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1337                 goto label_invalid_code;
1338               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1339               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1340               break;
1341
1342             case 'o':           /* invocation of locking-shift-3 */
1343               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1344                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1345                 goto label_invalid_code;
1346               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1347               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1348               break;
1349
1350             case 'N':           /* invocation of single-shift-2 */
1351               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1352                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1353                 goto label_invalid_code;
1354               ONE_MORE_BYTE (c1);
1355               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1356               DECODE_ISO_CHARACTER (charset, c1);
1357               break;
1358
1359             case 'O':           /* invocation of single-shift-3 */
1360               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1361                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1362                 goto label_invalid_code;
1363               ONE_MORE_BYTE (c1);
1364               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1365               DECODE_ISO_CHARACTER (charset, c1);
1366               break;
1367
1368             case '0': case '2': case '3': case '4': /* start composition */
1369               DECODE_COMPOSITION_START (c1);
1370               break;
1371
1372             case '1':           /* end composition */
1373               DECODE_COMPOSITION_END (c1);
1374               break;
1375
1376             case '[':           /* specification of direction */
1377               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1378                 goto label_invalid_code;
1379               /* For the moment, nested direction is not supported.
1380                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1381                  left-to-right, and nozero means right-to-left.  */
1382               ONE_MORE_BYTE (c1);
1383               switch (c1)
1384                 {
1385                 case ']':       /* end of the current direction */
1386                   coding->mode &= ~CODING_MODE_DIRECTION;
1387
1388                 case '0':       /* end of the current direction */
1389                 case '1':       /* start of left-to-right direction */
1390                   ONE_MORE_BYTE (c1);
1391                   if (c1 == ']')
1392                     coding->mode &= ~CODING_MODE_DIRECTION;
1393                   else
1394                     goto label_invalid_code;
1395                   break;
1396
1397                 case '2':       /* start of right-to-left direction */
1398                   ONE_MORE_BYTE (c1);
1399                   if (c1 == ']')
1400                     coding->mode |= CODING_MODE_DIRECTION;
1401                   else
1402                     goto label_invalid_code;
1403                   break;
1404
1405                 default:
1406                   goto label_invalid_code;
1407                 }
1408               break;
1409
1410             default:
1411               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1412                 goto label_invalid_code;
1413               if (c1 >= 0x28 && c1 <= 0x2B)
1414                 {       /* designation of DIMENSION1_CHARS94 character set */
1415                   ONE_MORE_BYTE (c2);
1416                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1417                 }
1418               else if (c1 >= 0x2C && c1 <= 0x2F)
1419                 {       /* designation of DIMENSION1_CHARS96 character set */
1420                   ONE_MORE_BYTE (c2);
1421                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1422                 }
1423               else
1424                 {
1425                   goto label_invalid_code;
1426                 }
1427             }
1428           /* We must update these variables now.  */
1429           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1430           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1431           break;
1432
1433         label_invalid_code:
1434           if (COMPOSING_P (coding))
1435             DECODE_COMPOSITION_END ('1');
1436           coding->produced_char += src - src_base;
1437           while (src_base < src)
1438             *dst++ = (*src_base++) & 0x7F;
1439         }
1440       continue;
1441
1442     label_end_of_loop:
1443       result = CODING_FINISH_INSUFFICIENT_SRC;
1444     label_end_of_loop_2:
1445       src = src_base;
1446       break;
1447     }
1448
1449   if (src < src_end)
1450     {
1451       if (result == CODING_FINISH_NORMAL)
1452         result = CODING_FINISH_INSUFFICIENT_DST;
1453       else if (result != CODING_FINISH_INCONSISTENT_EOL
1454                && coding->mode & CODING_MODE_LAST_BLOCK)
1455         {
1456           /* This is the last block of the text to be decoded.  We had
1457              better just flush out all remaining codes in the text
1458              although they are not valid characters.  */
1459           if (COMPOSING_P (coding))
1460             DECODE_COMPOSITION_END ('1');
1461           src_bytes = src_end - src;
1462           if (dst_bytes && (dst_end - dst < src_end - src))
1463             src_end = src + (dst_end - dst);
1464           coding->produced_char += src_end - src;
1465           while (src < src_end)
1466             *dst++ = (*src++) & 0x7F;
1467         }
1468     }
1469
1470   coding->consumed = coding->consumed_char = src - source;
1471   coding->produced = dst - destination;
1472   return result;
1473 }
1474
1475 /* ISO2022 encoding stuff.  */
1476
1477 /*
1478    It is not enough to say just "ISO2022" on encoding, we have to
1479    specify more details.  In Emacs, each coding system of ISO2022
1480    variant has the following specifications:
1481         1. Initial designation to G0 thru G3.
1482         2. Allows short-form designation?
1483         3. ASCII should be designated to G0 before control characters?
1484         4. ASCII should be designated to G0 at end of line?
1485         5. 7-bit environment or 8-bit environment?
1486         6. Use locking-shift?
1487         7. Use Single-shift?
1488    And the following two are only for Japanese:
1489         8. Use ASCII in place of JIS0201-1976-Roman?
1490         9. Use JISX0208-1983 in place of JISX0208-1978?
1491    These specifications are encoded in `coding->flags' as flag bits
1492    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1493    details.
1494 */
1495
/* Emit at `dst' the escape sequence that designates CHARSET to graphic
   register REG, and record that designation in CODING.

   Layout of the output:
     ESC '&' <'@' + revision>   only if the revision number that
                                CODING holds for CHARSET is below 255
     ESC
     '$'                        only if CHARSET's dimension is not 1
     <intermediate char>        REG-th char of "()*+" for a 94-char
                                set, of ",-./" otherwise
     <final-char> of CHARSET

   The intermediate char is omitted (short form) only for a 94-char
   set whose dimension is not 1, designated to register 0, whose
   <final-char> is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   This macro writes through, and advances, the variable `dst' of the
   place where it is expanded.  CHARSET and REG are evaluated more
   than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    const char *inter_94 = "()*+";                                      \
    const char *inter_96 = ",-./";                                      \
    int revision_no = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
                                                                        \
    if (revision_no < 255)                                              \
      {                                                                 \
        /* Revision prefix: ESC & <'@' + revision>.  */                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + revision_no;                                     \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        *dst++ = '$';                                                   \
        if (CHARSET_CHARS (charset) != 94)                              \
          *dst++ = (unsigned char) inter_96[reg];                       \
        else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM)        \
                    && reg == 0                                         \
                    && final_byte >= '@' && final_byte <= 'B'))         \
          /* The short form does not apply; keep the intermediate.  */  \
          *dst++ = (unsigned char) inter_94[reg];                       \
      }                                                                 \
    else if (CHARSET_CHARS (charset) == 94)                             \
      *dst++ = (unsigned char) inter_94[reg];                           \
    else                                                                \
      *dst++ = (unsigned char) inter_96[reg];                           \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1537
/* Single-shift-2 and single-shift-3.

   When CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags', the
   function is produced as an escape sequence (ESC 'N' or ESC 'O').
   Otherwise the single control byte ISO_CODE_SS2 or ISO_CODE_SS3 is
   produced and `coding->fake_multibyte' is set to 1.

   In both cases CODING_SPEC_ISO_SINGLE_SHIFTING (coding) is set to 1
   afterwards.  The macros use the variables `coding' and `dst' of
   the place where they are expanded.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        coding->fake_multibyte = 1;                             \
        *dst++ = ISO_CODE_SS2;                                  \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        coding->fake_multibyte = 1;                             \
        *dst++ = ISO_CODE_SS3;                                  \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1565
/* Locking-shift functions.  Each macro records in CODING which
   graphic register is now invoked to graphic plane 0, and emits the
   corresponding code at `dst':

     ENCODE_SHIFT_IN          register 0   ISO_CODE_SI
     ENCODE_SHIFT_OUT         register 1   ISO_CODE_SO
     ENCODE_LOCKING_SHIFT_2   register 2   ESC 'n'
     ENCODE_LOCKING_SHIFT_3   register 3   ESC 'o'

   The macros use the variables `coding' and `dst' of the place where
   they are expanded.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
  } while (0)
1593
/* Emit at `dst' the single byte for a DIMENSION1 character of
   character set CHARSET whose position-code is C1.

   The cases are tried in this order:
     - a single-shift is pending: emit C1 with the high bit cleared
       (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise), and clear
       the pending single-shift;
     - CHARSET is on graphic plane 0: emit C1 with the high bit cleared;
     - CHARSET is on graphic plane 1: emit C1 with the high bit set;
     - CODING_FLAG_ISO_SAFE is set and CHARSET is not marked in
       `coding->safe_charsets': emit
       CODING_INHIBIT_CHARACTER_SUBSTITUTION instead, twice if
       CHARSET_WIDTH (charset) is 2;
     - otherwise call encode_invocation_designation to produce the
       designation and invocation codes, then try all the cases again.

   The macro uses the variables `coding' and `dst' of the place where
   it is expanded.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  for (;;)                                                              \
    {                                                                   \
      if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
        {                                                               \
          CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                 \
          if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)               \
            *dst++ = c1 & 0x7F;                                         \
          else                                                          \
            *dst++ = c1 | 0x80;                                         \
          break;                                                        \
        }                                                               \
      if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
        {                                                               \
          *dst++ = c1 & 0x7F;                                           \
          break;                                                        \
        }                                                               \
      if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
        {                                                               \
          *dst++ = c1 | 0x80;                                           \
          break;                                                        \
        }                                                               \
      if (coding->flags & CODING_FLAG_ISO_SAFE                          \
          && !coding->safe_charsets[charset])                           \
        {                                                               \
          /* Do not encode this character; produce one or two           \
             substitution characters in its place.  */                  \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
          if (CHARSET_WIDTH (charset) == 2)                             \
            *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;             \
          break;                                                        \
        }                                                               \
      /* CHARSET is on neither graphic plane yet.  Produce whatever     \
         designation and invocation it needs, then go round again to    \
         emit the character itself.  */                                 \
      dst = encode_invocation_designation (charset, coding, dst);       \
    }
1637
/* Emit at `dst' the two bytes for a DIMENSION2 character of character
   set CHARSET whose position-codes are C1 and C2.

   The cases are tried in this order:
     - a single-shift is pending: emit C1 and C2 with the high bit
       cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise),
       and clear the pending single-shift;
     - CHARSET is on graphic plane 0: emit both bytes with the high
       bit cleared;
     - CHARSET is on graphic plane 1: emit both bytes with the high
       bit set;
     - CODING_FLAG_ISO_SAFE is set and CHARSET is not marked in
       `coding->safe_charsets': emit
       CODING_INHIBIT_CHARACTER_SUBSTITUTION instead, twice if
       CHARSET_WIDTH (charset) is 2;
     - otherwise call encode_invocation_designation to produce the
       designation and invocation codes, then try all the cases again.

   The macro uses the variables `coding' and `dst' of the place where
   it is expanded.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  for (;;)                                                              \
    {                                                                   \
      if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
        {                                                               \
          CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                 \
          if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)               \
            {                                                           \
              *dst++ = c1 & 0x7F;                                       \
              *dst++ = c2 & 0x7F;                                       \
            }                                                           \
          else                                                          \
            {                                                           \
              *dst++ = c1 | 0x80;                                       \
              *dst++ = c2 | 0x80;                                       \
            }                                                           \
          break;                                                        \
        }                                                               \
      if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
        {                                                               \
          *dst++ = c1 & 0x7F;                                           \
          *dst++ = c2 & 0x7F;                                           \
          break;                                                        \
        }                                                               \
      if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
        {                                                               \
          *dst++ = c1 | 0x80;                                           \
          *dst++ = c2 | 0x80;                                           \
          break;                                                        \
        }                                                               \
      if (coding->flags & CODING_FLAG_ISO_SAFE                          \
          && !coding->safe_charsets[charset])                           \
        {                                                               \
          /* Do not encode this character; produce one or two           \
             substitution characters in its place.  */                  \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
          if (CHARSET_WIDTH (charset) == 2)                             \
            *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;             \
          break;                                                        \
        }                                                               \
      /* CHARSET is on neither graphic plane yet.  Produce whatever     \
         designation and invocation it needs, then go round again to    \
         emit the character itself.  */                                 \
      dst = encode_invocation_designation (charset, coding, dst);       \
    }
1680
/* Encode at `dst' one character given as CHARSET and position-codes
   C1 and C2, then increment `coding->consumed_char'.

   If `translation_table' is non-nil and translate_char returns a
   non-negative character for (CHARSET, C1, C2), that character is
   encoded instead: SPLIT_CHAR is handed C1 and C2 together with the
   translated character, so C1 and C2 should be assignable variables
   (presumably they receive the translated position-codes -- confirm
   against SPLIT_CHAR).

   For a defined charset the character goes through
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to its dimension, after these substitutions:
     - ASCII becomes charset_latin_jisx0201 when
       CODING_FLAG_ISO_USE_ROMAN is set;
     - charset_jisx0208 becomes charset_jisx0208_1978 when
       CODING_FLAG_ISO_USE_OLDJIS is set.
   Both substitutions look at the charset that is actually going to be
   encoded, i.e. the one obtained after translation.  Testing the
   untranslated CHARSET here would relabel a character that was
   translated out of ASCII or JISX0208 with the wrong charset, and
   would skip the substitution for one translated into them.

   For an undefined charset the bytes CHARSET, C1 and (if nonzero) C2
   are copied out, masked to 7 bits when CODING_FLAG_ISO_SEVEN_BITS is
   set.

   The macro uses the variables `coding', `dst' and
   `translation_table' of the place where it is expanded.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
                                                                        \
    if (!NILP (translation_table)                                       \
        && ((c_alt = translate_char (translation_table, -1,             \
                                     charset, c1, c2))                  \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = charset;                                            \
    if (CHARSET_DEFINED_P (charset_alt))                                \
      {                                                                 \
        if (CHARSET_DIMENSION (charset_alt) == 1)                       \
          {                                                             \
            if (charset_alt == CHARSET_ASCII                            \
                && coding->flags & CODING_FLAG_ISO_USE_ROMAN)           \
              charset_alt = charset_latin_jisx0201;                     \
            ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);          \
          }                                                             \
        else                                                            \
          {                                                             \
            if (charset_alt == charset_jisx0208                         \
                && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)          \
              charset_alt = charset_jisx0208_1978;                      \
            ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);      \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = charset & 0x7f;                                    \
            *dst++ = c1 & 0x7f;                                         \
            if (c2)                                                     \
              *dst++ = c2 & 0x7f;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset;                                           \
            *dst++ = c1;                                                \
            if (c2)                                                     \
              *dst++ = c2;                                              \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
1728
1729 /* Produce designation and invocation codes at a place pointed by DST
1730    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1731    Return new DST.  */
1732
1733 unsigned char *
1734 encode_invocation_designation (charset, coding, dst)
1735      int charset;
1736      struct coding_system *coding;
1737      unsigned char *dst;
1738 {
1739   int reg;                      /* graphic register number */
1740
1741   /* At first, check designations.  */
1742   for (reg = 0; reg < 4; reg++)
1743     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1744       break;
1745
1746   if (reg >= 4)
1747     {
1748       /* CHARSET is not yet designated to any graphic registers.  */
1749       /* At first check the requested designation.  */
1750       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1751       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1752         /* Since CHARSET requests no special designation, designate it
1753            to graphic register 0.  */
1754         reg = 0;
1755
1756       ENCODE_DESIGNATION (charset, reg, coding);
1757     }
1758
1759   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1760       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1761     {
1762       /* Since the graphic register REG is not invoked to any graphic
1763          planes, invoke it to graphic plane 0.  */
1764       switch (reg)
1765         {
1766         case 0:                 /* graphic register 0 */
1767           ENCODE_SHIFT_IN;
1768           break;
1769
1770         case 1:                 /* graphic register 1 */
1771           ENCODE_SHIFT_OUT;
1772           break;
1773
1774         case 2:                 /* graphic register 2 */
1775           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1776             ENCODE_SINGLE_SHIFT_2;
1777           else
1778             ENCODE_LOCKING_SHIFT_2;
1779           break;
1780
1781         case 3:                 /* graphic register 3 */
1782           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1783             ENCODE_SINGLE_SHIFT_3;
1784           else
1785             ENCODE_LOCKING_SHIFT_3;
1786           break;
1787         }
1788     }
1789   return dst;
1790 }
1791
/* Emit at `dst' the two bytes that stand for composition rule RULE.
   COMPOSITION_DECODE_RULE splits RULE into two values; the first is
   written offset by 32 + 81, the second offset by 32.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
1801
/* Emit at `dst' the code that starts a composition sequence, and
   remember the composition method DATA[3] in `coding->composing'.
   DATA points to an array of integers describing the composition;
   see the comment in coding.h for its format.

     COMPOSITION_RELATIVE        ESC 0
     COMPOSITION_WITH_ALTCHARS   ESC 3
     any other method            ESC 4

   For the last two, `coding->cmp_data_index' is also set to
   `coding->cmp_data_start' + 4 and `coding->composition_rule_follows'
   is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    coding->composing = data[3];                                        \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
1821
/* Emit at `dst' the code that ends the current composition (ESC 1) and
   step past its composition data: `coding->cmp_data_start' is advanced
   by DATA[0] and `coding->composing' becomes COMPOSITION_NO.  If that
   leaves `coding->cmp_data_start' equal to `coding->cmp_data->used'
   and a next block exists, move on to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
1837
/* Emit the composition start sequence ESC 0 at `dst' and switch
   `coding->composing' to COMPOSITION_RELATIVE.  This does not begin a
   new composition: it marks the point where the components (alternate
   chars and composition rules) of the current composition have all
   been produced and the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
1849
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags', and the single
   byte ISO_CODE_CSI otherwise.  The flag is tested as a bit, as in the
   single-shift macros; comparing the whole of `coding->flags' against
   it would pick the one-byte form whenever any other flag is also set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by '2' ']' and '0' ']' respectively.  They are complete
   statements wrapped in do ... while (0), because the introducer is
   itself such a statement and cannot be an operand of the comma
   operator.

   The macros use the variables `coding' and `dst' of the place where
   they are expanded.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1865
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: a shift-in if graphic plane 0 does not
   currently have register 0 invoked, then, for each of the four
   graphic registers that has an initial designation (a non-negative
   value) different from its current one, the sequence designating the
   initial charset again.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int greg = 0;                                                           \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (greg < 4)                                                        \
      {                                                                     \
        int initial = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, greg);   \
        if (initial >= 0                                                    \
            && initial != CODING_SPEC_ISO_DESIGNATION (coding, greg))       \
          ENCODE_DESIGNATION (initial, greg, coding);                       \
        greg++;                                                             \
      }                                                                     \
  } while (0)
1880
1881 /* Produce designation sequences of charsets in the line started from
1882    SRC to a place pointed by *DSTP, and update DSTP.
1883
1884    If the current block ends before any end-of-line, we may fail to
1885    find all the necessary designations.  */
1886
1887 void
1888 encode_designation_at_bol (coding, table, src, src_end, dstp)
1889      struct coding_system *coding;
1890      Lisp_Object table;
1891      unsigned char *src, *src_end, **dstp;
1892 {
1893   int charset, c, found = 0, reg;
1894   /* Table of charsets to be designated to each graphic register.  */
1895   int r[4];
1896   unsigned char *dst = *dstp;
1897
1898   for (reg = 0; reg < 4; reg++)
1899     r[reg] = -1;
1900
1901   while (src < src_end && *src != '\n' && found < 4)
1902     {
1903       int bytes = BYTES_BY_CHAR_HEAD (*src);
1904
1905       if (NILP (table))
1906         charset = CHARSET_AT (src);
1907       else
1908         {
1909           int c_alt;
1910           unsigned char c1, c2;
1911
1912           SPLIT_STRING(src, bytes, charset, c1, c2);
1913           if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
1914             charset = CHAR_CHARSET (c_alt);
1915         }
1916
1917       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1918       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1919         {
1920           found++;
1921           r[reg] = charset;
1922         }
1923
1924       src += bytes;
1925     }
1926
1927   if (found)
1928     {
1929       for (reg = 0; reg < 4; reg++)
1930         if (r[reg] >= 0
1931             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1932           ENCODE_DESIGNATION (r[reg], reg, coding);
1933       *dstp = dst;
1934     }
1935 }
1936
1937 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the SRC_BYTES bytes at SOURCE into ISO2022 at DESTINATION.
        When DST_BYTES is nonzero it bounds the output.  When it is zero
        the loop instead continues only while DST is more than 13 bytes
        behind SRC (presumably SOURCE and DESTINATION then share one
        buffer -- confirm against the callers).

        On return CODING->consumed is the number of source bytes handled,
        CODING->consumed_char the count kept while encoding characters,
        and CODING->produced and CODING->produced_char the number of
        bytes written.  The value is CODING_FINISH_NORMAL,
        CODING_FINISH_INSUFFICIENT_SRC (the source ended inside a
        multi-byte code) or CODING_FINISH_INSUFFICIENT_DST (the loop
        stopped with source text still unread).  */
1938
1939 int
1940 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1941      struct coding_system *coding;
1942      unsigned char *source, *destination;
1943      int src_bytes, dst_bytes;
1944 {
1945   unsigned char *src = source;
1946   unsigned char *src_end = source + src_bytes;
1947   unsigned char *dst = destination;
1948   unsigned char *dst_end = destination + dst_bytes;
1949   /* Since the maximum bytes produced by each loop is 14, we subtract 13
1950      from DST_END to assure overflow checking is necessary only at the
1951      head of loop.  */
1952   unsigned char *adjusted_dst_end = dst_end - 13;
1953   Lisp_Object translation_table
1954       = coding->translation_table_for_encode;
1955   int result = CODING_FINISH_NORMAL;
1956
       /* When character translation is enabled and the coding system has
          no table of its own, use the standard one.  */
1957   if (!NILP (Venable_character_translation) && NILP (translation_table))
1958     translation_table = Vstandard_translation_table_for_encode;
1959
1960   coding->consumed_char = 0;
1961   coding->fake_multibyte = 0;
1962   while (src < src_end && (dst_bytes
1963                            ? (dst < adjusted_dst_end)
1964                            : (dst < src - 13)))
1965     {
1966       /* SRC_BASE remembers the start position in source in each loop.
1967          The loop will be exited when there's not enough source text
1968          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1969          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1970          reset to SRC_BASE before exiting.  */
1971       unsigned char *src_base = src;
1972       int charset, c1, c2, c3, c4;
1973
1974       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1975           && CODING_SPEC_ISO_BOL (coding))
1976         {
1977           /* We have to produce designation sequences if any now.  */
1978           encode_designation_at_bol (coding, translation_table,
1979                                      src, src_end, &dst);
1980           CODING_SPEC_ISO_BOL (coding) = 0;
1981         }
1982
1983       /* Check composition start and end.  */
1984       if (coding->composing != COMPOSITION_DISABLED
1985           && coding->cmp_data_start < coding->cmp_data->used)
1986         {
1987           struct composition_data *cmp_data = coding->cmp_data;
1988           int *data = cmp_data->data + coding->cmp_data_start;
1989           int this_pos = cmp_data->char_offset + coding->consumed_char;
1990
1991           if (coding->composing == COMPOSITION_RELATIVE)
1992             {
1993               if (this_pos == data[2])
1994                 {
1995                   ENCODE_COMPOSITION_END (coding, data);
                       /* CMP_DATA and DATA are re-read after the macro;
                          NOTE(review): presumably it can advance
                          CODING->cmp_data or cmp_data_start -- confirm.  */
1996                   cmp_data = coding->cmp_data;
1997                   data = cmp_data->data + coding->cmp_data_start;
1998                 }
1999             }
2000           else if (COMPOSING_P (coding))
2001             {
2002               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS  */
2003               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2004                 /* We have consumed components of the composition.
2005                    What follows in SRC is the composition's base
2006                    text.  */
2007                 ENCODE_COMPOSITION_FAKE_START (coding);
2008               else
2009                 {
2010                   int c = cmp_data->data[coding->cmp_data_index++];
2011                   if (coding->composition_rule_follows)
2012                     {
2013                       ENCODE_COMPOSITION_RULE (c);
2014                       coding->composition_rule_follows = 0;
2015                     }
2016                   else
2017                     {
2018                       SPLIT_CHAR (c, charset, c1, c2);
2019                       ENCODE_ISO_CHARACTER (charset, c1, c2);
2020                       /* But, we didn't consume a character in SRC.  */
2021                       coding->consumed_char--;
2022                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2023                         coding->composition_rule_follows = 1;
2024                     }
                       /* This iteration read nothing from SRC.  */
2025                   continue;
2026                 }
2027             }
2028           if (!COMPOSING_P (coding))
2029             {
2030               if (this_pos == data[1])
2031                 {
2032                   ENCODE_COMPOSITION_START (coding, data);
                       /* This iteration read nothing from SRC.  */
2033                   continue;
2034                 }
2035             }
2036         }
2037
2038       c1 = *src++;
2039       /* Now encode one character.  C1 is a control character, an
2040          ASCII character, or a leading-code of multi-byte character.  */
2041       switch (emacs_code_class[c1])
2042         {
2043         case EMACS_ascii_code:
2044           c2 = 0;
2045           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
2046           break;
2047
2048         case EMACS_control_code:
2049           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2050             ENCODE_RESET_PLANE_AND_REGISTER;
2051           *dst++ = c1;
2052           coding->consumed_char++;
2053           break;
2054
2055         case EMACS_carriage_return_code:
2056           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2057             {
2058               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2059                 ENCODE_RESET_PLANE_AND_REGISTER;
2060               *dst++ = c1;
2061               coding->consumed_char++;
2062               break;
2063             }
2064           /* fall down to treat '\r' as '\n' ...  */
2065
2066         case EMACS_linefeed_code:
2067           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2068             ENCODE_RESET_PLANE_AND_REGISTER;
2069           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2070             bcopy (coding->spec.iso2022.initial_designation,
2071                    coding->spec.iso2022.current_designation,
2072                    sizeof coding->spec.iso2022.initial_designation);
               /* Write the end-of-line in the form selected by
                  CODING->eol_type: LF (also when undecided), CR LF, or CR.  */
2073           if (coding->eol_type == CODING_EOL_LF
2074               || coding->eol_type == CODING_EOL_UNDECIDED)
2075             *dst++ = ISO_CODE_LF;
2076           else if (coding->eol_type == CODING_EOL_CRLF)
2077             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2078           else
2079             *dst++ = ISO_CODE_CR;
               /* The next iteration starts at the beginning of a line.  */
2080           CODING_SPEC_ISO_BOL (coding) = 1;
2081           coding->consumed_char++;
2082           break;
2083
2084         case EMACS_leading_code_2:
2085           ONE_MORE_BYTE (c2);
2086           c3 = 0;
2087           if (c2 < 0xA0)
2088             {
2089               /* invalid sequence */
                   /* Copy the leading byte as is and step SRC back so
                      that C2 is read again as the start of a character.  */
2090               *dst++ = c1;
2091               src--;
2092               coding->consumed_char++;
2093             }
2094           else
2095             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
2096           break;
2097
2098         case EMACS_leading_code_3:
2099           TWO_MORE_BYTES (c2, c3);
2100           c4 = 0;
2101           if (c2 < 0xA0 || c3 < 0xA0)
2102             {
2103               /* invalid sequence */
2104               *dst++ = c1;
2105               src -= 2;
2106               coding->consumed_char++;
2107             }
               /* Below LEADING_CODE_PRIVATE_11, C1 itself is passed as the
                  charset; otherwise C2 is the charset and C3 the only
                  position code.  */
2108           else if (c1 < LEADING_CODE_PRIVATE_11)
2109             ENCODE_ISO_CHARACTER (c1, c2, c3);
2110           else
2111             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
2112           break;
2113
2114         case EMACS_leading_code_4:
2115           THREE_MORE_BYTES (c2, c3, c4);
2116           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
2117             {
2118               /* invalid sequence */
2119               *dst++ = c1;
2120               src -= 3;
2121               coding->consumed_char++;
2122             }
2123           else
2124             ENCODE_ISO_CHARACTER (c2, c3, c4);
2125           break;
2126
2127         case EMACS_invalid_code:
2128           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2129             ENCODE_RESET_PLANE_AND_REGISTER;
2130           *dst++ = c1;
2131           coding->consumed_char++;
2132           break;
2133         }
2134       continue;
           /* Reached only by a jump, when the source ran out in the middle
              of a multi-byte code (see the comment on SRC_BASE).  */
2135     label_end_of_loop:
2136       result = CODING_FINISH_INSUFFICIENT_SRC;
2137       src = src_base;
2138       break;
2139     }
2140
       /* Source text is left although nothing ran short of source: the
          loop must have stopped on its output-space condition.  */
2141   if (src < src_end && result == CODING_FINISH_NORMAL)
2142     result = CODING_FINISH_INSUFFICIENT_DST;
2143
2144   /* If this is the last block of the text to be encoded, we must
2145      reset graphic planes and registers to the initial state, and
2146      flush out the carryover if any.  */
2147   if (coding->mode & CODING_MODE_LAST_BLOCK)
2148     {
2149       ENCODE_RESET_PLANE_AND_REGISTER;
2150       if (COMPOSING_P (coding))
2151         *dst++ = ISO_CODE_ESC, *dst++ = '1';
2152       if (result == CODING_FINISH_INSUFFICIENT_SRC)
2153         {
               /* Copy the incomplete trailing bytes unencoded, as far as
                  DST_END allows.  */
2154           while (src < src_end && dst < dst_end)
2155             *dst++ = *src++;
2156         }
2157     }
2158   coding->consumed = src - source;
2159   coding->produced = coding->produced_char = dst - destination;
2160   return result;
2161 }
2162
2163 \f
2164 /*** 4. SJIS and BIG5 handlers ***/
2165
2166 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2167    quite widely.  So, for the moment, Emacs supports them in the bare
2168    C code.  But, in the future, they may be supported only by CCL.  */
2169
2170 /* SJIS is a coding system encoding three character sets: ASCII, the
2171    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
2172    encoded as is.  A character of charset katakana-jisx0201 is encoded
2173    as "position-code + 0x80".  A character of charset japanese-jisx0208
2174    is encoded in two bytes, with its two position-codes divided and
2175    shifted so that they fit in the ranges below.
2176
2177    --- CODE RANGE of SJIS ---
2178    (character set)      (range)
2179    ASCII                0x00 .. 0x7F
2180    KATAKANA-JISX0201    0xA0 .. 0xDF
2181    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2182             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2183    -------------------------------
2184
2185 */
2186
2187 /* BIG5 is a coding system encoding two character sets: ASCII and
2188    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2189    character set and is encoded in two bytes.
2190
2191    --- CODE RANGE of BIG5 ---
2192    (character set)      (range)
2193    ASCII                0x00 .. 0x7F
2194    Big5 (1st byte)      0xA1 .. 0xFE
2195         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2196    --------------------------
2197
2198    Since the number of characters in Big5 is larger than the maximum
2199    number of characters in an Emacs charset (96x96), it can't be
2200    handled as one charset.  So, in Emacs, Big5 is divided into two:
2201    `charset-big5-1' and `charset-big5-2'.  Both are DIMENSION2 and
2202    CHARS94.  The former contains frequently used characters and the
2203    latter contains less frequently used characters.  */
2204
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every parameter is parenthesized in the expansions, so an
   expression such as `c & 0x7F' may be passed safely.  The inputs
   (and, in ENCODE_BIG5, the output B2) are still evaluated more than
   once, so do not pass expressions with side effects.  Both
   expansions declare a local named `temp'; do not pass an argument of
   that name.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* From the BIG5 byte pair B1, B2 compute the linear index of the
   character, store `charset_big5_1' or `charset_big5_2' in CHARSET
   (the split is at B1 == 0xC9), and store the two position-codes,
   each offset by 0x21, in C1 and C2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* The inverse of DECODE_BIG5: from CHARSET and the position-codes C1
   and C2 compute the BIG5 byte pair and store it in B1 and B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2237
/* Hand one decoded character of CHARSET, with position-codes C1 and
   C2, to the matching DECODE_CHARACTER_* macro.  If the variable
   `translation_table' of the expansion site is non-nil and
   translate_char returns a non-negative character for it, that
   character replaces the original: SPLIT_CHAR then overwrites C1 and
   C2, which therefore must be modifiable variables.  A resulting
   charset that is CHARSET_ASCII or negative takes the ASCII path; the
   others are dispatched on CHARSET_DIMENSION.  */
2238 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
2239   do {                                                                  \
2240     int c_alt, charset_alt = (charset);                                 \
2241     if (!NILP (translation_table)                                       \
2242         && ((c_alt = translate_char (translation_table,                 \
2243                                      -1, (charset), c1, c2)) >= 0))     \
2244       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
2245     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
2246       DECODE_CHARACTER_ASCII (c1);                                      \
2247     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
2248       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
2249     else                                                                \
2250       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
2251   } while (0)
2252
/* Encode one character of CHARSET, with position-codes C1 and C2, and
   store the result through DST.  The expansion uses the variables
   `translation_table', `sjis_p', `dst' and `coding' of the expansion
   site.  C1 and C2 must be modifiable variables: they are overwritten
   by SPLIT_CHAR when a translation applies, and masked to 7 bits for
   charsets whose dimension is not 1.

   The bytes written are:
     - C1 alone for ASCII, and when SJIS_P for katakana-jisx0201 (C1 as
       is) and latin-jisx0201 (C1 & 0x7F);
     - the charset followed by C1 for any other dimension-1 charset;
     - two bytes from ENCODE_SJIS for jisx0208 / jisx0208-1978 when
       SJIS_P, or from ENCODE_BIG5 for big5-1 / big5-2 when not SJIS_P;
     - the charset followed by C1 and C2 for everything else.
   So up to three bytes are stored, and no bound on DST is checked
   here.  CODING->fake_multibyte is set in every multi-byte case except
   the ENCODE_BIG5 one, and CODING->consumed_char is always
   incremented.  */
2253 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)             \
2254   do {                                                          \
2255     int c_alt, charset_alt;                                     \
2256     if (!NILP (translation_table)                               \
2257         && ((c_alt = translate_char (translation_table, -1,     \
2258                                      charset, c1, c2))          \
2259             >= 0))                                              \
2260       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
2261     else                                                        \
2262       charset_alt = charset;                                    \
2263     if (charset_alt == charset_ascii)                           \
2264       *dst++ = c1;                                              \
2265     else if (CHARSET_DIMENSION (charset_alt) == 1)              \
2266       {                                                         \
2267         if (sjis_p && charset_alt == charset_katakana_jisx0201) \
2268           *dst++ = c1;                                          \
2269         else if (sjis_p && charset_alt == charset_latin_jisx0201) \
2270           *dst++ = c1 & 0x7F;                                   \
2271         else                                                    \
2272           {                                                     \
2273             *dst++ = charset_alt, *dst++ = c1;                  \
2274             coding->fake_multibyte = 1;                         \
2275           }                                                     \
2276       }                                                         \
2277     else                                                        \
2278       {                                                         \
2279         c1 &= 0x7F, c2 &= 0x7F;                                 \
2280         if (sjis_p && (charset_alt == charset_jisx0208          \
2281                        || charset_alt == charset_jisx0208_1978))\
2282           {                                                     \
2283             unsigned char s1, s2;                               \
2284                                                                 \
2285             ENCODE_SJIS (c1, c2, s1, s2);                       \
2286             *dst++ = s1, *dst++ = s2;                           \
2287             coding->fake_multibyte = 1;                         \
2288           }                                                     \
2289         else if (!sjis_p                                        \
2290                  && (charset_alt == charset_big5_1              \
2291                      || charset_alt == charset_big5_2))         \
2292           {                                                     \
2293             unsigned char b1, b2;                               \
2294                                                                 \
2295             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);          \
2296             *dst++ = b1, *dst++ = b2;                           \
2297           }                                                     \
2298         else                                                    \
2299           {                                                     \
2300             *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;     \
2301             coding->fake_multibyte = 1;                         \
2302           }                                                     \
2303       }                                                         \
2304     coding->consumed_char++;                                    \
2305   } while (0)
2306
2307 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2308    Check if a text is encoded in SJIS.  If it is, return
2309    CODING_CATEGORY_MASK_SJIS, else return 0.
        SRC and SRC_END delimit the text.  A byte in 0x80..0x9F or at or
        above 0xE0 is taken as the first byte of a two-byte code; the text
        is rejected when the byte after it is outside 0x40..0xFC or is
        0x7F, the same second-byte test the SJIS decoder applies.  A first
        byte that ends the text is accepted.  */
2310
2311 int
2312 detect_coding_sjis (src, src_end)
2313      unsigned char *src, *src_end;
2314 {
2315   unsigned char c;
2316
2317   while (src < src_end)
2318     {
2319       c = *src++;
2320       if (! ((c >= 0x80 && c < 0xA0) || c >= 0xE0) || src >= src_end)
2321         continue;               /* not a first byte, or text cut short */
2322       c = *src++;               /* second byte of a two-byte code */
2323       if (c < 0x40 || c == 0x7F || c > 0xFC)
2324         return 0;
2325     }
2326   return CODING_CATEGORY_MASK_SJIS;
2327 }
2328
2329 /* BIG5 detector; see "GENERAL NOTES on `detect_coding_XXX ()' functions"
2330    above.  Return 0 as soon as a byte >= 0xA1 is followed by a byte below
2331    0x40 or within 0x7F..0xA0; otherwise CODING_CATEGORY_MASK_BIG5.  */
2332
2333 int
2334 detect_coding_big5 (src, src_end)
2335      unsigned char *src, *src_end;
2336 {
2337   unsigned char *p;
2338
2339   for (p = src; p < src_end; p++)
2340     {
2341       unsigned char trail;
2342
2343       if (*p < 0xA1)
2344         continue;               /* single byte, nothing to verify */
2345       if (++p == src_end)
2346         break;                  /* first byte is the last byte of the text */
2347       trail = *p;
2348       if (! (trail >= 0x40 && (trail < 0x7F || trail > 0xA0)))
2349         return 0;
2350     }
2351   return CODING_CATEGORY_MASK_BIG5;
2352 }
2353
2354 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2355    Check if a text is encoded in UTF-8.  If it is, return
2356    CODING_CATEGORY_MASK_UTF_8, else return 0.
        The macros below classify one octet C by its high-order bits; the
        bit patterns in the comments assume C holds a value in 0..255.  */
2357
2358 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)              /* 0xxxxxxx */
2359 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)    /* 10xxxxxx */
2360 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)    /* 110xxxxx */
2361 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)    /* 1110xxxx */
2362 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)    /* 11110xxx */
2363 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)    /* 111110xx */
2364 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)    /* 1111110x */
2365
2366 int
2367 detect_coding_utf_8 (src, src_end)
2368      unsigned char *src, *src_end;
2369 {
2370   unsigned char *p = src;
2371
2372   /* Walk the text one UTF-8 sequence at a time.  */
2373   while (p < src_end)
2374     {
2375       unsigned char lead = *p++;
2376       int pending;      /* continuation octets still expected */
2377
2378       if (UTF_8_1_OCTET_P (lead))
2379         continue;
2380
2381       pending = (UTF_8_2_OCTET_LEADING_P (lead) ? 1
2382                  : UTF_8_3_OCTET_LEADING_P (lead) ? 2
2383                  : UTF_8_4_OCTET_LEADING_P (lead) ? 3
2384                  : UTF_8_5_OCTET_LEADING_P (lead) ? 4
2385                  : UTF_8_6_OCTET_LEADING_P (lead) ? 5
2386                  : 0);
2387       if (pending == 0)
2388         /* LEAD matches no leading-octet pattern.  */
2389         return 0;
2390
2391       for (; pending > 0; pending--)
2392         {
2393           /* Text that ends in the middle of a sequence is still
2394              reported as UTF-8.  */
2395           if (p >= src_end)
2396             return CODING_CATEGORY_MASK_UTF_8;
2397           if (!UTF_8_EXTRA_OCTET_P (*p))
2398             return 0;
2399           p++;
2400         }
2401     }
2402
2403   /* Nothing in the text contradicted UTF-8.  */
2404   return CODING_CATEGORY_MASK_UTF_8;
2405 }
2406
2407 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2408    Check if a text begins with a UTF-16 byte order mark.  Return
2409    CODING_CATEGORY_MASK_UTF_16_BE if it starts with the bytes 0xFE 0xFF,
2410    CODING_CATEGORY_MASK_UTF_16_LE if it starts with 0xFF 0xFE, and 0
2411    otherwise (including when the text is shorter than two bytes).  */
2412
/* Predicates on a 16-bit UTF-16 code unit VAL.  */

/* Nonzero if VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits select the surrogate block, so the mask is 0xFC00.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2422
2423 int
2424 detect_coding_utf_16 (src, src_end)
2425      unsigned char *src, *src_end;
2426 {
2427   /* A byte order mark occupies two bytes.  */
2428   if (src_end <= src + 1)
2429     return 0;
2430   if (src[0] == 0xFF)
2431     return src[1] == 0xFE ? CODING_CATEGORY_MASK_UTF_16_LE : 0;
2432   if (src[0] == 0xFE)
2433     return src[1] == 0xFF ? CODING_CATEGORY_MASK_UTF_16_BE : 0;
2434   return 0;
2435 }
2436
2437 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2438    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.
        Decode the SRC_BYTES bytes at SOURCE into DESTINATION.  When
        DST_BYTES is nonzero it bounds the output; when it is zero the
        loop instead continues only while DST is more than 3 bytes behind
        SRC (presumably the two then share one buffer -- confirm against
        the callers).

        On return CODING->consumed and CODING->consumed_char are the number
        of source bytes handled and CODING->produced the number of bytes
        written.  The value is CODING_FINISH_NORMAL,
        CODING_FINISH_INSUFFICIENT_SRC (the source ended inside a two-byte
        code or after a lone CR), CODING_FINISH_INSUFFICIENT_DST (the loop
        stopped with source text still unread) or
        CODING_FINISH_INCONSISTENT_EOL.  */
2439
2440 int
2441 decode_coding_sjis_big5 (coding, source, destination,
2442                          src_bytes, dst_bytes, sjis_p)
2443      struct coding_system *coding;
2444      unsigned char *source, *destination;
2445      int src_bytes, dst_bytes;
2446      int sjis_p;
2447 {
2448   unsigned char *src = source;
2449   unsigned char *src_end = source + src_bytes;
2450   unsigned char *dst = destination;
2451   unsigned char *dst_end = destination + dst_bytes;
2452   /* Since the maximum bytes produced by each loop is 4, we subtract 3
2453      from DST_END to assure overflow checking is necessary only at the
2454      head of loop.  */
2455   unsigned char *adjusted_dst_end = dst_end - 3;
2456   Lisp_Object translation_table
2457       = coding->translation_table_for_decode;
2458   int result = CODING_FINISH_NORMAL;
2459
       /* When character translation is enabled and the coding system has
          no table of its own, use the standard one.  */
2460   if (!NILP (Venable_character_translation) && NILP (translation_table))
2461     translation_table = Vstandard_translation_table_for_decode;
2462
2463   coding->produced_char = 0;
2464   coding->fake_multibyte = 0;
2465   while (src < src_end && (dst_bytes
2466                            ? (dst < adjusted_dst_end)
2467                            : (dst < src - 3)))
2468     {
2469       /* SRC_BASE remembers the start position in source in each loop.
2470          The loop will be exited when there's not enough source text
2471          to analyze two-byte character (within macro ONE_MORE_BYTE).
2472          In that case, SRC is reset to SRC_BASE before exiting.  */
2473       unsigned char *src_base = src;
2474       unsigned char c1 = *src++, c2, c3, c4;
2475
2476       if (c1 < 0x20)
2477         {
2478           if (c1 == '\r')
2479             {
2480               if (coding->eol_type == CODING_EOL_CRLF)
2481                 {
2482                   ONE_MORE_BYTE (c2);
2483                   if (c2 == '\n')
2484                     *dst++ = c2;
2485                   else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2486                     {
2487                       result = CODING_FINISH_INCONSISTENT_EOL;
2488                       goto label_end_of_loop_2;
2489                     }
2490                   else
2491                     /* To process C2 again, SRC is subtracted by 1.  */
2492                     *dst++ = c1, src--;
2493                 }
2494               else if (coding->eol_type == CODING_EOL_CR)
2495                 *dst++ = '\n';
2496               else
2497                 *dst++ = c1;
2498             }
2499           else if (c1 == '\n'
2500                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2501                    && (coding->eol_type == CODING_EOL_CR
2502                        || coding->eol_type == CODING_EOL_CRLF))
2503             {
2504               result = CODING_FINISH_INCONSISTENT_EOL;
2505               goto label_end_of_loop_2;
2506             }
2507           else
2508             *dst++ = c1;
2509           coding->produced_char++;
2510         }
2511       else if (c1 < 0x80)
2512         {
2513           c2 = 0;               /* avoid warning */
2514           DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2515         }
2516       else
2517         {
2518           if (sjis_p)
2519             {
2520               if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2521                 {
2522                   /* SJIS -> JISX0208 */
2523                   ONE_MORE_BYTE (c2);
2524                   if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2525                     {
2526                       DECODE_SJIS (c1, c2, c3, c4);
2527                       DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2528                     }
2529                   else
2530                     goto label_invalid_code_2;
2531                 }
2532               else if (c1 < 0xE0)
2533                 /* SJIS -> JISX0201-Kana */
2534                 {
2535                   c2 = 0;       /* avoid warning */
2536                   DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2537                                               /* dummy */ c2);
2538                 }
2539               else
2540                 goto label_invalid_code_1;
2541             }
2542           else
2543             {
2544               /* BIG5 -> Big5 */
2545               if (c1 >= 0xA1 && c1 <= 0xFE)
2546                 {
2547                   ONE_MORE_BYTE (c2);
2548                   if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2549                     {
2550                       int charset;
2551
2552                       DECODE_BIG5 (c1, c2, charset, c3, c4);
2553                       DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2554                     }
2555                   else
2556                     goto label_invalid_code_2;
2557                 }
2558               else
2559                 goto label_invalid_code_1;
2560             }
2561         }
2562       continue;
2563
           /* C1 cannot start a character: pass it through unchanged.  */
2564     label_invalid_code_1:
2565       *dst++ = c1;
2566       coding->produced_char++;
2567       coding->fake_multibyte = 1;
2568       continue;
2569
           /* C1 could start a character but C2 cannot follow it: pass
              both bytes through unchanged.  */
2570     label_invalid_code_2:
2571       *dst++ = c1; *dst++= c2;
2572       coding->produced_char += 2;
2573       coding->fake_multibyte = 1;
2574       continue;
2575
2576     label_end_of_loop:
2577       result = CODING_FINISH_INSUFFICIENT_SRC;
2578     label_end_of_loop_2:
2579       src = src_base;
2580       break;
2581     }
2582
2583   if (src < src_end)
2584     {
2585       if (result == CODING_FINISH_NORMAL)
2586         result = CODING_FINISH_INSUFFICIENT_DST;
2587       else if (result != CODING_FINISH_INCONSISTENT_EOL
2588                && coding->mode & CODING_MODE_LAST_BLOCK)
2589         {
               /* This is the last block and the source ends inside a
                  character: flush the leftover bytes undecoded, limited
                  to the room left when DST_BYTES is given.  */
2590           src_bytes = src_end - src;
2591           if (dst_bytes && (dst_end - dst < src_bytes))
2592             src_bytes = dst_end - dst;
               /* bcopy takes the source first and the destination second.
                  The arguments used to be reversed, which overwrote the
                  unread source with whatever DST pointed at and left the
                  output bytes unset.  */
2593           bcopy (src, dst, src_bytes);
2594           src += src_bytes;
2595           dst += src_bytes;
2596           coding->fake_multibyte = 1;
2597         }
2598     }
2599
2600   coding->consumed = coding->consumed_char = src - source;
2601   coding->produced = dst - destination;
2602   return result;
2603 }
2604
2605 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2606    This function can encode `charset_ascii', `charset_katakana_jisx0201',
2607    `charset_jisx0208', `charset_big5_1', and `charset_big5_2'.  We are
2608    sure that all these charsets are registered as official charset
2609    (i.e. do not have extended leading-codes).  Characters of other
2610    charsets are produced without any encoding.  If SJIS_P is 1, encode
2611    SJIS text, else encode BIG5 text.
        Encode the SRC_BYTES bytes at SOURCE into DESTINATION.  When
        DST_BYTES is nonzero it bounds the output; when it is zero the
        loop instead continues only while DST is more than 2 bytes behind
        SRC.

        On return CODING->consumed is the number of source bytes handled
        and CODING->produced and CODING->produced_char the number of bytes
        written.  The value is CODING_FINISH_NORMAL,
        CODING_FINISH_INSUFFICIENT_SRC (the source ended inside a
        multi-byte code) or CODING_FINISH_INSUFFICIENT_DST (the loop
        stopped with source text still unread).  */
2612
2613 int
2614 encode_coding_sjis_big5 (coding, source, destination,
2615                          src_bytes, dst_bytes, sjis_p)
2616      struct coding_system *coding;
2617      unsigned char *source, *destination;
2618      int src_bytes, dst_bytes;
2619      int sjis_p;
2620 {
2621   unsigned char *src = source;
2622   unsigned char *src_end = source + src_bytes;
2623   unsigned char *dst = destination;
2624   unsigned char *dst_end = destination + dst_bytes;
2625   /* Since the maximum bytes produced by each loop is 3 (a two-byte
2626      character that ENCODE_SJIS_BIG5_CHARACTER cannot convert is written
2627      as its charset followed by both position-codes), we subtract 2
          from DST_END to assure overflow checking is necessary only at the
          head of loop.  */
2628   unsigned char *adjusted_dst_end = dst_end - 2;
2629   Lisp_Object translation_table
2630       = coding->translation_table_for_encode;
2631   int result = CODING_FINISH_NORMAL;
2632
       /* When character translation is enabled and the coding system has
          no table of its own, use the standard one.  */
2633   if (!NILP (Venable_character_translation) && NILP (translation_table))
2634     translation_table = Vstandard_translation_table_for_encode;
2635
2636   coding->consumed_char = 0;
2637   coding->fake_multibyte = 0;
2638   while (src < src_end && (dst_bytes
2639                            ? (dst < adjusted_dst_end)
2640                            : (dst < src - 2)))
2641     {
2642       /* SRC_BASE remembers the start position in source in each loop.
2643          The loop will be exited when there's not enough source text
2644          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2645          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2646          before exiting.  */
2647       unsigned char *src_base = src;
           /* C2 and C3 start at 0 because they are handed to
              ENCODE_SJIS_BIG5_CHARACTER as dummy position-codes, and the
              macro reads them, before anything else has stored a value.  */
2648       unsigned char c1 = *src++, c2 = 0, c3 = 0, c4;
2649
2650       switch (emacs_code_class[c1])
2651         {
2652         case EMACS_ascii_code:
2653           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2654           break;
2655
2656         case EMACS_control_code:
2657           *dst++ = c1;
2658           coding->consumed_char++;
2659           break;
2660
2661         case EMACS_carriage_return_code:
2662           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2663             {
2664               *dst++ = c1;
2665               coding->consumed_char++;
2666               break;
2667             }
2668           /* fall down to treat '\r' as '\n' ...  */
2669
2670         case EMACS_linefeed_code:
               /* Write the end-of-line in the form selected by
                  CODING->eol_type: LF (also when undecided), CR LF, or CR.  */
2671           if (coding->eol_type == CODING_EOL_LF
2672               || coding->eol_type == CODING_EOL_UNDECIDED)
2673             *dst++ = '\n';
2674           else if (coding->eol_type == CODING_EOL_CRLF)
2675             *dst++ = '\r', *dst++ = '\n';
2676           else
2677             *dst++ = '\r';
2678           coding->consumed_char++;
2679           break;
2680
2681         case EMACS_leading_code_2:
2682           ONE_MORE_BYTE (c2);
2683           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2684           break;
2685
2686         case EMACS_leading_code_3:
2687           TWO_MORE_BYTES (c2, c3);
2688           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2689           break;
2690
2691         case EMACS_leading_code_4:
2692           THREE_MORE_BYTES (c2, c3, c4);
2693           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2694           break;
2695
2696         default:                /* i.e. case EMACS_invalid_code: */
2697           *dst++ = c1;
2698           coding->consumed_char++;
2699         }
2700       continue;
2701
2702     label_end_of_loop:
2703       result = CODING_FINISH_INSUFFICIENT_SRC;
2704       src = src_base;
2705       break;
2706     }
2707
       /* Source text is left although nothing ran short of source: the
          loop must have stopped on its output-space condition.  */
2708   if (result == CODING_FINISH_NORMAL
2709       && src < src_end)
2710     result = CODING_FINISH_INSUFFICIENT_DST;
2711   coding->consumed = src - source;
2712   coding->produced = coding->produced_char = dst - destination;
2713   return result;
2714 }
2715
2716 \f
2717 /*** 5. CCL handlers ***/
2718
2719 /* CCL detector; see "GENERAL NOTES on `detect_coding_XXX ()' functions"
2720    above.  Return CODING_CATEGORY_MASK_CCL when every byte of the text
2721    has a nonzero entry in the valid_codes table of the coding system
2722    in slot CODING_CATEGORY_IDX_CCL of coding_system_table, else 0.  */
2723
2724 int
2725 detect_coding_ccl (src, src_end)
2726      unsigned char *src, *src_end;
2727 {
2728   unsigned char *codes, *p;
2729
2730   if (coding_system_table[CODING_CATEGORY_IDX_CCL] == 0)
2731     /* No coding system is assigned to coding-category-ccl.  */
2732     return 0;
2733
2734   codes = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2735   /* Every byte of the text must be marked valid in that table.  */
2736   for (p = src; p < src_end; p++)
2737     if (codes[*p] == 0)
2738       return 0;
2739
2740   return CODING_CATEGORY_MASK_CCL;
2741 }
2742
2743 \f
2744 /*** 6. End-of-line handlers ***/
2745
2746 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2747    This function is called only when `coding->eol_type' is
2748    CODING_EOL_CRLF or CODING_EOL_CR.

        Decode the end-of-line format of the SRC_BYTES bytes at SOURCE
        into DESTINATION:

          CODING_EOL_CRLF -- each "\r\n" pair is stored as one '\n';
          CODING_EOL_CR   -- every '\r' is stored as '\n';
          any other value -- the bytes are copied unchanged (the
                             `default' branch below).

        A nonzero DST_BYTES limits how many bytes may be stored at
        DESTINATION.  When DST_BYTES is zero the block copies use
        safe_bcopy and the CRLF loop is bounded by `dst < src - 1'
        instead.  NOTE(review): presumably SOURCE and DESTINATION then
        lie in the same buffer -- confirm against the callers.

        The return value is CODING_FINISH_NORMAL,
        CODING_FINISH_INSUFFICIENT_SRC, CODING_FINISH_INSUFFICIENT_DST
        or CODING_FINISH_INCONSISTENT_EOL; the last one is produced only
        when CODING_MODE_INHIBIT_INCONSISTENT_EOL is set in
        `coding->mode'.  On return the members `consumed',
        `consumed_char', `produced' and `produced_char' of CODING hold
        the number of bytes read from SOURCE and stored at DESTINATION.
        `fake_multibyte' is cleared on entry and set to 1 by the
        branches below as commented there.  */
2749
2750 int
2751 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2752      struct coding_system *coding;
2753      unsigned char *source, *destination;
2754      int src_bytes, dst_bytes;
2755 {
2756   unsigned char *src = source;
2757   unsigned char *src_end = source + src_bytes;
2758   unsigned char *dst = destination;
2759   unsigned char *dst_end = destination + dst_bytes;
2760   unsigned char c;
2761   int result = CODING_FINISH_NORMAL;
2762
2763   coding->fake_multibyte = 0;
2764
       /* Nothing to decode: report zero bytes consumed and produced.
          This also guarantees that the branches below see at least one
          source byte.  */
2765   if (src_bytes <= 0)
2766     {
2767       coding->produced = coding->produced_char = 0;
2768       coding->consumed = coding->consumed_char = 0;
2769       return result;
2770     }
2771
2772   switch (coding->eol_type)
2773     {
2774     case CODING_EOL_CRLF:
2775       {
2776         /* Since the maximum bytes produced by each loop is 2, we
2777            subtract 1 from DST_END to assure overflow checking is
2778            necessary only at the head of loop.
                NOTE(review): every path through the loop body below
                stores exactly one byte, so this margin looks more
                conservative than the decoder needs -- confirm before
                relying on it.  */
2779         unsigned char *adjusted_dst_end = dst_end - 1;
2780
2781         while (src < src_end && (dst_bytes
2782                                  ? (dst < adjusted_dst_end)
2783                                  : (dst < src - 1)))
2784           {
                 /* Remember where this iteration started so that the
                    exit labels at the bottom can un-read it.  */
2785             unsigned char *src_base = src;
2786
2787             c = *src++;
2788             if (c == '\r')
2789               {
                     /* Fetch the byte following the '\r'.  */
2790                 ONE_MORE_BYTE (c);
2791                 if (c == '\n')
2792                   *dst++ = c;
2793                 else
2794                   {
                         /* A '\r' that is not followed by '\n'.  */
2795                     if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2796                       {
2797                         result = CODING_FINISH_INCONSISTENT_EOL;
2798                         goto label_end_of_loop_2;
2799                       }
                         /* Keep the lone '\r' as is and push back the
                            byte after it for the next iteration.  */
2800                     src--;
2801                     *dst++ = '\r';
2802                     if (BASE_LEADING_CODE_P (c))
2803                       coding->fake_multibyte = 1;
2804                   }
2805               }
2806             else if (c == '\n'
2807                      && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2808               {
                     /* A '\n' that was not preceded by '\r'.  */
2809                 result = CODING_FINISH_INCONSISTENT_EOL;
2810                 goto label_end_of_loop_2;
2811               }
2812             else
2813               {
2814                 *dst++ = c;
2815                 if (BASE_LEADING_CODE_P (c))
2816                   coding->fake_multibyte = 1;
2817               }
2818             continue;
2819
                 /* No `goto' written in this function names
                    label_end_of_loop.  NOTE(review): presumably
                    ONE_MORE_BYTE (defined elsewhere) jumps here when the
                    source is exhausted -- confirm at its definition.  */
2820           label_end_of_loop:
2821             result = CODING_FINISH_INSUFFICIENT_SRC;
2822           label_end_of_loop_2:
                 /* Un-read the bytes of the unfinished iteration.  */
2823             src = src_base;
2824             break;
2825           }
2826         if (src < src_end)
2827           {
                 /* The loop stopped with RESULT untouched, i.e. on the
                    destination bound in the loop condition.  */
2828             if (result == CODING_FINISH_NORMAL)
2829               result = CODING_FINISH_INSUFFICIENT_DST;
2830             else if (result != CODING_FINISH_INCONSISTENT_EOL
2831                      && coding->mode & CODING_MODE_LAST_BLOCK)
2832               {
2833                 /* This is the last block of the text to be decoded.
2834                    We flush out all remaining codes.  */
2835                 src_bytes = src_end - src;
                     /* With an explicit destination size, copy no more
                        than the space that is left.  */
2836                 if (dst_bytes && (dst_end - dst < src_bytes))
2837                   src_bytes = dst_end - dst;
2838                 bcopy (src, dst, src_bytes);
2839                 dst += src_bytes;
2840                 src += src_bytes;
2841               }
2842           }
2843       }
2844       break;
2845
2846     case CODING_EOL_CR:
2847       if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2848         {
               /* Look for a '\n', which is inconsistent with CR
                  end-of-lines.  Note that `fake_multibyte' is updated
                  only by this scan, i.e. only when the mode flag is
                  set.  */
2849           while (src < src_end)
2850             {
2851               if ((c = *src++) == '\n')
2852                 break;
2853               if (BASE_LEADING_CODE_P (c))
2854                 coding->fake_multibyte = 1;
2855             }
               /* Step back to the last byte examined; it is '\n' only
                  if the loop above left through its `break'.  */
2856           if (*--src == '\n')
2857             {
                   /* Convert only the bytes in front of the '\n'.  */
2858               src_bytes = src - source;
2859               result = CODING_FINISH_INCONSISTENT_EOL;
2860             }
2861         }
2862       if (dst_bytes && src_bytes > dst_bytes)
2863         {
2864           result = CODING_FINISH_INSUFFICIENT_DST;
2865           src_bytes = dst_bytes;
2866         }
2867       if (dst_bytes)
2868         bcopy (source, destination, src_bytes);
2869       else
2870         safe_bcopy (source, destination, src_bytes);
2871       src = source + src_bytes;
           /* Rewrite each '\r' of the copied bytes to '\n' in place;
              this also advances DST past the copied bytes.  */
2872       while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2873       break;
2874
2875     default:                    /* i.e. case: CODING_EOL_LF */
           /* No end-of-line conversion: copy as many bytes as fit.  */
2876       if (dst_bytes && src_bytes > dst_bytes)
2877         {
2878           result = CODING_FINISH_INSUFFICIENT_DST;
2879           src_bytes = dst_bytes;
2880         }
2881       if (dst_bytes)
2882         bcopy (source, destination, src_bytes);
2883       else
2884         safe_bcopy (source, destination, src_bytes);
2885       src += src_bytes;
2886       dst += src_bytes;
           /* The copied bytes are not inspected, so the flag is set
              unconditionally in this branch.  */
2887       coding->fake_multibyte = 1;
2888       break;
2889     }
2890
       /* Byte counts and character counts are reported as equal.  */
2891   coding->consumed = coding->consumed_char = src - source;
2892   coding->produced = coding->produced_char = dst - destination;
2893   return result;
2894 }
2895
2896 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2897    format of end-of-line according to `coding->eol_type'.  If
2898    `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2899    '\r' in source text also means end-of-line.

        The SRC_BYTES bytes at SOURCE are written to DESTINATION as
        follows:

          CODING_EOL_CRLF -- each '\n' (and, with selective display,
                             each '\r') becomes the pair "\r\n";
          CODING_EOL_CR   -- each '\n' becomes '\r';
          any other value -- bytes are copied, and with selective
                             display each '\r' becomes '\n'.

        A nonzero DST_BYTES limits how many bytes may be stored at
        DESTINATION.  When DST_BYTES is zero the block copy uses
        safe_bcopy and the CRLF loop is bounded by `dst < src - 1'
        instead.  NOTE(review): presumably SOURCE and DESTINATION then
        lie in the same buffer -- confirm against the callers.

        The return value is CODING_FINISH_NORMAL, or
        CODING_FINISH_INSUFFICIENT_DST when not all of the source could
        be processed.  On return the members `consumed',
        `consumed_char', `produced' and `produced_char' of CODING hold
        the number of bytes read from SOURCE and stored at
        DESTINATION.  */
2900
2901 int
2902 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2903      struct coding_system *coding;
2904      unsigned char *source, *destination;
2905      int src_bytes, dst_bytes;
2906 {
2907   unsigned char *src = source;
2908   unsigned char *dst = destination;
2909   int result = CODING_FINISH_NORMAL;
2910
2911   coding->fake_multibyte = 0;
2912
2913   if (coding->eol_type == CODING_EOL_CRLF)
2914     {
2915       unsigned char c;
2916       unsigned char *src_end = source + src_bytes;
2917       unsigned char *dst_end = destination + dst_bytes;
2918       /* Since the maximum bytes produced by each loop is 2, we
2919          subtract 1 from DST_END to assure overflow checking is
2920          necessary only at the head of loop.  */
2921       unsigned char *adjusted_dst_end = dst_end - 1;
2922
2923       while (src < src_end && (dst_bytes
2924                                ? (dst < adjusted_dst_end)
2925                                : (dst < src - 1)))
2926         {
2927           c = *src++;
2928           if (c == '\n'
2929               || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2930             *dst++ = '\r', *dst++ = '\n';
2931           else
2932             {
2933               *dst++ = c;
2934               if (BASE_LEADING_CODE_P (c))
2935                 coding->fake_multibyte = 1;
2936             }
2937         }
           /* Source bytes are left over only when the destination bound
              in the loop condition stopped the loop.  */
2938       if (src < src_end)
2939         result = CODING_FINISH_INSUFFICIENT_DST;
2940     }
2941   else
2942     {
2943       unsigned char c;
2944
           /* The output has the same length as the input here, so copy
              the bytes first and patch them in DESTINATION afterwards.  */
2945       if (dst_bytes && src_bytes > dst_bytes)
2946         {
2947           src_bytes = dst_bytes;
2948           result = CODING_FINISH_INSUFFICIENT_DST;
2949         }
2950       if (dst_bytes)
2951         bcopy (source, destination, src_bytes);
2952       else
2953         safe_bcopy (source, destination, src_bytes);
           /* From here on DST_BYTES holds the number of bytes copied;
              SRC_BYTES is counted down by the loops below.  */
2954       dst_bytes = src_bytes;
2955       if (coding->eol_type == CODING_EOL_CR)
2956         {
2957           while (src_bytes--)
2958             {
2959               if ((c = *dst++) == '\n')
2960                 dst[-1] = '\r';
2961               else if (BASE_LEADING_CODE_P (c))
2962                 coding->fake_multibyte = 1;
2963             }
2964         }
2965       else
2966         {
2967           if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2968             {
2969               while (src_bytes--)
2970                 if (*dst++ == '\r') dst[-1] = '\n';
2971             }
               /* The bytes are not inspected for leading codes in this
                  branch, so the flag is set unconditionally.  */
2972           coding->fake_multibyte = 1;
2973         }
           /* DST was advanced only if one of the loops above ran, so
              both pointers are recomputed from the saved count.  */
2974       src = source + dst_bytes;
2975       dst = destination + dst_bytes;
2976     }
2977
2978   coding->consumed = coding->consumed_char = src - source;
2979   coding->produced = coding->produced_char = dst - destination;
2980   return result;
2981 }
2982
2983 \f
2984 /*** 7. C library functions ***/
2985
2986 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2987    has a property `coding-system'.  The value of this property is a
2988    vector of length 5 (called as coding-vector).  Among elements of
2989    this vector, the first (element[0]) and the fifth (element[4])
2990    carry important information for decoding/encoding.  Before
2991    decoding/encoding, this information should be set in fields of a
2992    structure of type `coding_system'.
2993
2994    A value of property `coding-system' can be a symbol of another
2995    subsidiary coding-system.  In that case, Emacs gets coding-vector
2996    from that symbol.
2997
2998    `element[0]' contains information to be set in `coding->type'.  The
2999    value and its meaning is as follows:
3000
3001    0 -- coding_type_emacs_mule
3002    1 -- coding_type_sjis
3003    2 -- coding_type_iso2022
3004    3 -- coding_type_big5
3005    4 -- coding_type_ccl encoder/decoder written in CCL
        5 -- coding_type_raw_text
3006    nil -- coding_type_no_conversion
3007    t -- coding_type_undecided (automatic conversion on decoding,
3008                                no-conversion on encoding)
3009
3010    `element[4]' contains information to be set in `coding->flags' and
3011    `coding->spec'.  The meaning varies by `coding->type'.
3012
3013    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3014    of length 32 (of which the first 17 sub-elements are used now).
3015    Meanings of these sub-elements are:
3016
3017    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3018         If the value is an integer of valid charset, the charset is
3019         assumed to be designated to graphic register N initially.
3020
3021         If the value is minus, it is a minus value of charset which
3022         reserves graphic register N, which means that the charset is
3023         not designated initially but should be designated to graphic
3024         register N just before encoding a character in that charset.
3025
3026         If the value is nil, graphic register N is never used on
3027         encoding.
3028
3029    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3030         Each value takes t or nil.  See the section ISO2022 of
3031         `coding.h' for more information.
3032
3033    If `coding->type' is `coding_type_big5', element[4] is t to denote
3034    BIG5-ETen or nil to denote BIG5-HKU.
3035
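3036    If `coding->type' is `coding_type_ccl', element[4] is a cons whose
        car and cdr specify the CCL programs for decoding and encoding
        respectively.

        If `coding->type' takes any other value, element[4] is ignored.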
3037
3038    Emacs Lisp's coding system also carries information about format of
3039    end-of-line in a value of property `eol-type'.  If the value is
3040    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3041    means CODING_EOL_CR.  If it is not integer, it should be a vector
3042    of subsidiary coding systems of which property `eol-type' has one
3043    of above values.
3044
3045 */
3046
3047 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3048    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3049    is setup so that no conversion is necessary and return -1, else
3050    return 0.

        In the parameter list below the symbol is named CODING_SYSTEM.
        It is treated as invalid when it is nil, when its
        `coding-system' property is not a vector of length 5 whose
        element[3] is a cons (the property list), when that property
        list yields no usable `coding-category', or when the
        type-specific part of the vector is malformed.  */
3051
3052 int
3053 setup_coding_system (coding_system, coding)
3054      Lisp_Object coding_system;
3055      struct coding_system *coding;
3056 {
3057   Lisp_Object coding_spec, coding_type, eol_type, plist;
3058   Lisp_Object val;
3059   int i;
3060
3061   /* Initialize some fields required for all kinds of coding systems.  */
3062   coding->symbol = coding_system;
3063   coding->common_flags = 0;
3064   coding->mode = 0;
3065   coding->heading_ascii = -1;
3066   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3067   coding->composing = COMPOSITION_DISABLED;
3068   coding->cmp_data = NULL;
3069
3070   if (NILP (coding_system))
3071     goto label_invalid_coding_system;
3072
3073   coding_spec = Fget (coding_system, Qcoding_system);
3074
3075   if (!VECTORP (coding_spec)
3076       || XVECTOR (coding_spec)->size != 5
3077       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3078     goto label_invalid_coding_system;
3079
       /* With inhibit_eol_conversion set, EOL_TYPE is forced to nil and
          ends up in the last `else' below (CODING_EOL_LF).
          NOTE(review): XFASTINT is applied without checking INTEGERP
          first, so a non-vector, non-integer value is compared by its
          raw bits -- confirm this is intended.  */
3080   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3081   if (VECTORP (eol_type))
3082     {
3083       coding->eol_type = CODING_EOL_UNDECIDED;
3084       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3085     }
3086   else if (XFASTINT (eol_type) == 1)
3087     {
3088       coding->eol_type = CODING_EOL_CRLF;
3089       coding->common_flags
3090         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3091     }
3092   else if (XFASTINT (eol_type) == 2)
3093     {
3094       coding->eol_type = CODING_EOL_CR;
3095       coding->common_flags
3096         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3097     }
3098   else
3099     coding->eol_type = CODING_EOL_LF;
3100
3101   coding_type = XVECTOR (coding_spec)->contents[0];
3102   /* Try short cut.  */
       /* A symbol here is either t (undecided) or anything else
          (no-conversion).  This path returns before the property list
          is read, so `category_idx', `safe_charsets' and the
          translation tables of CODING are not assigned by it.  */
3103   if (SYMBOLP (coding_type))
3104     {
3105       if (EQ (coding_type, Qt))
3106         {
3107           coding->type = coding_type_undecided;
3108           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3109         }
3110       else
3111         coding->type = coding_type_no_conversion;
3112       return 0;
3113     }
3114
3115   /* Get values of coding system properties:
3116      `post-read-conversion', `pre-write-conversion',
3117      `translation-table-for-decode', `translation-table-for-encode'.  */
3118   plist = XVECTOR (coding_spec)->contents[3];
3119   /* Pre & post conversion functions should be disabled if
3120      inhibit_pre_post_conversion is nonzero.  This is the case that a
3121      code conversion function is called while those functions are running.  */
3122   if (! inhibit_pre_post_conversion)
3123     {
3124       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3125       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3126     }
       /* A translation table may be given directly or through a symbol
          that carries it as a property; anything that does not end up
          as a char-table is stored as nil.  */
3127   val = Fplist_get (plist, Qtranslation_table_for_decode);
3128   if (SYMBOLP (val))
3129     val = Fget (val, Qtranslation_table_for_decode);
3130   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3131   val = Fplist_get (plist, Qtranslation_table_for_encode);
3132   if (SYMBOLP (val))
3133     val = Fget (val, Qtranslation_table_for_encode);
3134   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
       /* The `coding-category' property must be present and must carry
          an integer category index; otherwise the coding system is
          invalid.  */
3135   val = Fplist_get (plist, Qcoding_category);
3136   if (!NILP (val))
3137     {
3138       val = Fget (val, Qcoding_category_index);
3139       if (INTEGERP (val))
3140         coding->category_idx = XINT (val);
3141       else
3142         goto label_invalid_coding_system;
3143     }
3144   else
3145     goto label_invalid_coding_system;
3146
       /* `safe-charsets' is either t (every charset is safe) or a list
          of charsets; list elements that get_charset_id does not
          resolve are skipped.  */
3147   val = Fplist_get (plist, Qsafe_charsets);
3148   if (EQ (val, Qt))
3149     {
3150       for (i = 0; i <= MAX_CHARSET; i++)
3151         coding->safe_charsets[i] = 1;
3152     }
3153   else
3154     {
3155       bzero (coding->safe_charsets, MAX_CHARSET + 1);
3156       while (CONSP (val))
3157         {
3158           if ((i = get_charset_id (XCAR (val))) >= 0)
3159             coding->safe_charsets[i] = 1;
3160           val = XCDR (val);
3161         }
3162     }
3163
3164   /* If the coding system has non-nil `composition' property, enable
3165      composition handling.  */
3166   val = Fplist_get (plist, Qcomposition);
3167   if (!NILP (val))
3168     coding->composing = COMPOSITION_NO;
3169
       /* CODING_TYPE is not a symbol at this point; dispatch on its
          numeric value.  */
3170   switch (XFASTINT (coding_type))
3171     {
3172     case 0:
3173       coding->type = coding_type_emacs_mule;
           /* Decoding/encoding is required here only for running the
              post-read / pre-write conversion functions.  */
3174       if (!NILP (coding->post_read_conversion))
3175         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3176       if (!NILP (coding->pre_write_conversion))
3177         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3178       break;
3179
3180     case 1:
3181       coding->type = coding_type_sjis;
3182       coding->common_flags
3183         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3184       break;
3185
3186     case 2:
3187       coding->type = coding_type_iso2022;
3188       coding->common_flags
3189         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3190       {
             /* These VAL and I shadow the function-level variables of
                the same names for the rest of this block.  */
3191         Lisp_Object val, temp;
3192         Lisp_Object *flags;
3193         int i, charset, reg_bits = 0;
3194
3195         val = XVECTOR (coding_spec)->contents[4];
3196
3197         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3198           goto label_invalid_coding_system;
3199
             /* Sub-elements 4 through 16 are booleans, each mapped to
                one CODING_FLAG_ISO_XXX bit.  */
3200         flags = XVECTOR (val)->contents;
3201         coding->flags
3202           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3203              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3204              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3205              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3206              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3207              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3208              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3209              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3210              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3211              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3212              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3213              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3214              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3215              );
3216
3217         /* Invoke graphic register 0 to plane 0.  */
3218         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3219         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3220         CODING_SPEC_ISO_INVOCATION (coding, 1)
3221           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3222         /* Not single shifting at first.  */
3223         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3224         /* Beginning of buffer should also be regarded as bol. */
3225         CODING_SPEC_ISO_BOL (coding) = 1;
3226
             /* Start every charset at revision 255, then take the
                revisions listed in Vcharset_revision_alist.  An entry is
                used only if its car resolves to a charset and its cdr is
                an integer I with 0 <= I and I + '@' < 128.  */
3227         for (charset = 0; charset <= MAX_CHARSET; charset++)
3228           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3229         val = Vcharset_revision_alist;
3230         while (CONSP (val))
3231           {
3232             charset = get_charset_id (Fcar_safe (XCAR (val)));
3233             if (charset >= 0
3234                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3235                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3236               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3237             val = XCDR (val);
3238           }
3239
3240         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3241            FLAGS[REG] can be one of below:
3242                 integer CHARSET: CHARSET occupies register I,
3243                 t: designate nothing to REG initially, but can be used
3244                   by any charsets,
3245                 list of integer, nil, or t: designate the first
3246                   element (if integer) to REG initially, the remaining
3247                   elements (if integer) is designated to REG on request,
3248                   if an element is t, REG can be used by any charsets,
3249                 nil: REG is never used.  */
3250         for (charset = 0; charset <= MAX_CHARSET; charset++)
3251           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3252             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3253         for (i = 0; i < 4; i++)
3254           {
                 /* A charset is accepted either as a valid charset
                    number or as anything get_charset_id resolves.  */
3255             if (INTEGERP (flags[i])
3256                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3257                 || (charset = get_charset_id (flags[i])) >= 0)
3258               {
3259                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3260                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3261               }
3262             else if (EQ (flags[i], Qt))
3263               {
                     /* Bit I of REG_BITS records that register I may be
                        used by any charset.  */
3264                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3265                 reg_bits |= 1 << i;
3266                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3267               }
3268             else if (CONSP (flags[i]))
3269               {
3270                 Lisp_Object tail;
3271                 tail = flags[i];
3272
3273                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
                     /* The first list element gives the initial
                        designation, if it names a charset.  */
3274                 if (INTEGERP (XCAR (tail))
3275                     && (charset = XINT (XCAR (tail)),
3276                         CHARSET_VALID_P (charset))
3277                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3278                   {
3279                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3280                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3281                   }
3282                 else
3283                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
                     /* The remaining elements are charsets designated to
                        this register on request, or t.  */
3284                 tail = XCDR (tail);
3285                 while (CONSP (tail))
3286                   {
3287                     if (INTEGERP (XCAR (tail))
3288                         && (charset = XINT (XCAR (tail)),
3289                             CHARSET_VALID_P (charset))
3290                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3291                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3292                         = i;
3293                     else if (EQ (XCAR (tail), Qt))
3294                       reg_bits |= 1 << i;
3295                     tail = XCDR (tail);
3296                   }
3297               }
3298             else
3299               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3300
                 /* The current designation starts out as the initial
                    one.  */
3301             CODING_SPEC_ISO_DESIGNATION (coding, i)
3302               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3303           }
3304
3305         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3306           {
3307             /* REG 1 can be used only by locking shift in 7-bit env.  */
3308             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3309               reg_bits &= ~2;
3310             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3311               /* Without any shifting, only REG 0 and 1 can be used.  */
3312               reg_bits &= 3;
3313           }
3314
             /* If some register is still open to any charset, assign a
                requested register to every valid charset.  Note that
                this also overwrites the per-charset requests recorded
                by the loop over FLAGS above.  */
3315         if (reg_bits)
3316           for (charset = 0; charset <= MAX_CHARSET; charset++)
3317             {
3318               if (CHARSET_VALID_P (charset))
3319                 {
3320                   /* There exist some default graphic registers to be
3321                      used CHARSET.  */
3322
3323                   /* We had better avoid designating a charset of
3324                      CHARS96 to REG 0 as far as possible.  */
3325                   if (CHARSET_CHARS (charset) == 96)
3326                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3327                       = (reg_bits & 2
3328                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3329                   else
3330                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3331                       = (reg_bits & 1
3332                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3333                 }
3334             }
3335       }
3336       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3337       coding->spec.iso2022.last_invalid_designation_register = -1;
3338       break;
3339
3340     case 3:
3341       coding->type = coding_type_big5;
3342       coding->common_flags
3343         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3344       coding->flags
3345         = (NILP (XVECTOR (coding_spec)->contents[4])
3346            ? CODING_FLAG_BIG5_HKU
3347            : CODING_FLAG_BIG5_ETEN);
3348       break;
3349
3350     case 4:
3351       coding->type = coding_type_ccl;
3352       coding->common_flags
3353         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3354       {
             /* element[4] must be a cons; its car sets up the decoder
                and its cdr the encoder.  */
3355         val = XVECTOR (coding_spec)->contents[4];
3356         if (! CONSP (val)
3357             || setup_ccl_program (&(coding->spec.ccl.decoder),
3358                                   XCAR (val)) < 0
3359             || setup_ccl_program (&(coding->spec.ccl.encoder),
3360                                   XCDR (val)) < 0)
3361           goto label_invalid_coding_system;
3362
             /* Build the 256-entry table of valid byte values from the
                `valid-codes' property.  Each list element is a single
                code or a cons (START . END) giving an inclusive range;
                elements outside 0..255 are ignored.  */
3363         bzero (coding->spec.ccl.valid_codes, 256);
3364         val = Fplist_get (plist, Qvalid_codes);
3365         if (CONSP (val))
3366           {
3367             Lisp_Object this;
3368
3369             for (; CONSP (val); val = XCDR (val))
3370               {
3371                 this = XCAR (val);
3372                 if (INTEGERP (this)
3373                     && XINT (this) >= 0 && XINT (this) < 256)
3374                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3375                 else if (CONSP (this)
3376                          && INTEGERP (XCAR (this))
3377                          && INTEGERP (XCDR (this)))
3378                   {
3379                     int start = XINT (XCAR (this));
3380                     int end = XINT (XCDR (this));
3381
3382                     if (start >= 0 && start <= end && end < 256)
3383                       while (start <= end)
3384                         coding->spec.ccl.valid_codes[start++] = 1;
3385                   }
3386               }
3387           }
3388       }
3389       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3390       break;
3391
3392     case 5:
3393       coding->type = coding_type_raw_text;
3394       break;
3395
3396     default:
3397       goto label_invalid_coding_system;
3398     }
3399   return 0;
3400
       /* Fall back to a setup that performs no conversion at all.  */
3401  label_invalid_coding_system:
3402   coding->type = coding_type_no_conversion;
3403   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3404   coding->common_flags = 0;
3405   coding->eol_type = CODING_EOL_LF;
3406   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3407   return -1;
3408 }
3409
3410 /* Free memory blocks allocated for storing composition information.

        Every block of the chain that `coding->cmp_data' belongs to is
        released with xfree, whichever block of the chain the member
        currently points at, and the member is then reset to NULL.
        Nothing is done when it is already NULL.  */
3411
3412 void
3413 coding_free_composition_data (coding)
3414      struct coding_system *coding;
3415 {
3416   struct composition_data *cmp_data = coding->cmp_data, *next;
3417
3418   if (!cmp_data)
3419     return;
3420   /* Memory blocks are chained.  At first, rewind to the first, then,
3421      free blocks one by one.  */
3422   while (cmp_data->prev)
3423     cmp_data = cmp_data->prev;
3424   while (cmp_data)
3425     {
           /* Read the link before the block that holds it is freed.  */
3426       next = cmp_data->next;
3427       xfree (cmp_data);
3428       cmp_data = next;
3429     }
3430   coding->cmp_data = NULL;
3431 }
3432
3433 /* Set `char_offset' member of all memory blocks pointed by
3434    coding->cmp_data to POS.

        The walk starts at the block `coding->cmp_data' points at and
        follows the `next' links only, so blocks reachable solely
        through `prev' are left untouched.  */
3435
3436 void
3437 coding_adjust_composition_offset (coding, pos)
3438      struct coding_system *coding;
3439      int pos;
3440 {
3441   struct composition_data *cmp_data;
3442
3443   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3444     cmp_data->char_offset = pos;
3445 }
3446
3447 /* Setup raw-text or one of its subsidiaries in the structure
3448    coding_system CODING according to the already setup value eol_type
3449    in CODING.  CODING should be setup for some coding system in
3450    advance.

        Nothing is done when CODING is already of type raw-text.  The
        value returned by setup_coding_system is not examined.  */
3451
3452 void
3453 setup_raw_text_coding_system (coding)
3454      struct coding_system *coding;
3455 {
3456   if (coding->type != coding_type_raw_text)
3457     {
3458       coding->symbol = Qraw_text;
3459       coding->type = coding_type_raw_text;
3460       if (coding->eol_type != CODING_EOL_UNDECIDED)
3461         {
               /* The end-of-line format is already decided, so select
                  the matching subsidiary of raw-text from the
                  three-element vector stored as its `eol-type'
                  property.  NOTE(review): the vector is indexed by
                  `coding->eol_type' without a range check; this assumes
                  the value is 0, 1 or 2 here -- confirm.  */
3462           Lisp_Object subsidiaries;
3463           subsidiaries = Fget (Qraw_text, Qeol_type);
3464
3465           if (VECTORP (subsidiaries)
3466               && XVECTOR (subsidiaries)->size == 3)
3467             coding->symbol
3468               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3469         }
           /* Redo the whole setup for the symbol chosen above.  */
3470       setup_coding_system (coding->symbol, coding);
3471     }
3472   return;
3473 }
3474
3475 /* Emacs has a mechanism to automatically detect a coding system if it
3476    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3477    it's impossible to distinguish some coding systems accurately
3478    because they use the same range of codes.  So, at first, coding
3479    systems are categorized into the following 14 categories:
3480
3481    o coding-category-emacs-mule
3482
3483         The category for a coding system which has the same code range
3484         as Emacs' internal format.  Assigned the coding-system (Lisp
3485         symbol) `emacs-mule' by default.
3486
3487    o coding-category-sjis
3488
3489         The category for a coding system which has the same code range
3490         as SJIS.  Assigned the coding-system (Lisp
3491         symbol) `japanese-shift-jis' by default.
3492
3493    o coding-category-iso-7
3494
3495         The category for a coding system which has the same code range
3496         as ISO2022 of 7-bit environment.  This doesn't use any locking
3497         shift and single shift functions.  This can encode/decode all
3498         charsets.  Assigned the coding-system (Lisp symbol)
3499         `iso-2022-7bit' by default.
3500
3501    o coding-category-iso-7-tight
3502
3503         Same as coding-category-iso-7 except that this can
3504         encode/decode only the specified charsets.
3505
3506    o coding-category-iso-8-1
3507
3508         The category for a coding system which has the same code range
3509         as ISO2022 of 8-bit environment and graphic plane 1 used only
3510         for DIMENSION1 charset.  This doesn't use any locking shift
3511         and single shift functions.  Assigned the coding-system (Lisp
3512         symbol) `iso-latin-1' by default.
3513
3514    o coding-category-iso-8-2
3515
3516         The category for a coding system which has the same code range
3517         as ISO2022 of 8-bit environment and graphic plane 1 used only
3518         for DIMENSION2 charset.  This doesn't use any locking shift
3519         and single shift functions.  Assigned the coding-system (Lisp
3520         symbol) `japanese-iso-8bit' by default.
3521
3522    o coding-category-iso-7-else
3523
3524         The category for a coding system which has the same code range
3525         as ISO2022 of 7-bit environment but uses locking shift or
3526         single shift functions.  Assigned the coding-system (Lisp
3527         symbol) `iso-2022-7bit-lock' by default.
3528
3529    o coding-category-iso-8-else
3530
3531         The category for a coding system which has the same code range
3532         as ISO2022 of 8-bit environment but uses locking shift or
3533         single shift functions.  Assigned the coding-system (Lisp
3534         symbol) `iso-2022-8bit-ss2' by default.
3535
3536    o coding-category-big5
3537
3538         The category for a coding system which has the same code range
3539         as BIG5.  Assigned the coding-system (Lisp symbol)
3540         `cn-big5' by default.
3541
3542    o coding-category-utf-8
3543
3544         The category for a coding system which has the same code range
3545         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3546         symbol) `utf-8' by default.
3547
3548    o coding-category-utf-16-be
3549
3550         The category for a coding system in which a text has an
3551         Unicode signature (cf. Unicode Standard) in the order of BIG
3552         endian at the head.  Assigned the coding-system (Lisp symbol)
3553         `utf-16-be' by default.
3554
3555    o coding-category-utf-16-le
3556
3557         The category for a coding system in which a text has an
3558         Unicode signature (cf. Unicode Standard) in the order of
3559         LITTLE endian at the head.  Assigned the coding-system (Lisp
3560         symbol) `utf-16-le' by default.
3561
3562    o coding-category-ccl
3563
3564         The category for a coding system of which encoder/decoder is
3565         written in CCL programs.  The default value is nil, i.e., no
3566         coding system is assigned.
3567
3568    o coding-category-binary
3569
3570         The category for a coding system not categorized in any of the
3571         above.  Assigned the coding-system (Lisp symbol)
3572         `no-conversion' by default.
3573
3574    Each of them is a Lisp symbol whose value is an actual
3575    `coding-system' (also a Lisp symbol) assigned by a user.
3576    What Emacs does actually is to detect a category of coding system.
3577    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3578    decide only one possible category, it selects a category of the
3579    highest priority.  Priorities of categories are also specified by a
3580    user in a Lisp variable `coding-category-list'.
3581
3582 */
3583
3584 static
3585 int ascii_skip_code[256];
3586
3587 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3588    If it detects possible coding systems, return an integer in which
3589    appropriate flag bits are set.  Flag bits are defined by macros
3590    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3591    it should point the table `coding_priorities'.  In that case, only
3592    the flag bit for a coding system of the highest priority is set in
3593    the returned value.
3594
3595    How many ASCII characters are at the head is returned as *SKIP.  */
3596
3597 static int
3598 detect_coding_mask (source, src_bytes, priorities, skip)
3599      unsigned char *source;
3600      int src_bytes, *priorities, *skip;
3601 {
3602   register unsigned char c;
3603   unsigned char *src = source, *src_end = source + src_bytes;
3604   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3605   int i, idx;
3606
3607   /* At first, skip all ASCII characters and control characters except
3608      for three ISO2022 specific control characters.  */
3609   ascii_skip_code[ISO_CODE_SO] = 0;
3610   ascii_skip_code[ISO_CODE_SI] = 0;
3611   ascii_skip_code[ISO_CODE_ESC] = 0;
3612
3613  label_loop_detect_coding:
3614   while (src < src_end && ascii_skip_code[*src]) src++;
3615   *skip = src - source;
3616
3617   if (src >= src_end)
3618     /* We found nothing other than ASCII.  There's nothing to do.  */
3619     return 0;
3620
3621   c = *src;
3622   /* The text seems to be encoded in some multilingual coding system.
3623      Now, try to find in which coding system the text is encoded.  */
3624   if (c < 0x80)
3625     {
3626       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3627       /* C is an ISO2022 specific control code of C0.  */
3628       mask = detect_coding_iso2022 (src, src_end);
3629       if (mask == 0)
3630         {
3631           /* No valid ISO2022 code follows C.  Try again.  */
3632           src++;
3633           if (c == ISO_CODE_ESC)
3634             ascii_skip_code[ISO_CODE_ESC] = 1;
3635           else
3636             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3637           goto label_loop_detect_coding;
3638         }
3639       if (priorities)
3640         {
3641           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3642             {
3643               if (mask & priorities[i])
3644                 return priorities[i];
3645             }
3646           return CODING_CATEGORY_MASK_RAW_TEXT;
3647         }
3648     }
3649   else
3650     {
3651       int try;
3652
3653       if (c < 0xA0)
3654         {
3655           /* C is the first byte of SJIS character code,
3656              or a leading-code of Emacs' internal format (emacs-mule),
3657              or the first byte of UTF-16.  */
3658           try = (CODING_CATEGORY_MASK_SJIS
3659                   | CODING_CATEGORY_MASK_EMACS_MULE
3660                   | CODING_CATEGORY_MASK_UTF_16_BE
3661                   | CODING_CATEGORY_MASK_UTF_16_LE);
3662
3663           /* Or, if C is a special latin extra code,
3664              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3665              or is an ISO2022 control-sequence-introducer (CSI),
3666              we should also consider the possibility of ISO2022 codings.  */
3667           if ((VECTORP (Vlatin_extra_code_table)
3668                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3669               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3670               || (c == ISO_CODE_CSI
3671                   && (src < src_end
3672                       && (*src == ']'
3673                           || ((*src == '0' || *src == '1' || *src == '2')
3674                               && src + 1 < src_end
3675                               && src[1] == ']')))))
3676             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3677                      | CODING_CATEGORY_MASK_ISO_8BIT);
3678         }
3679       else
3680         /* C is a character of ISO2022 in graphic plane right,
3681            or a SJIS's 1-byte character code (i.e. JISX0201),
3682            or the first byte of BIG5's 2-byte code,
3683            or the first byte of UTF-8/16.  */
3684         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3685                 | CODING_CATEGORY_MASK_ISO_8BIT
3686                 | CODING_CATEGORY_MASK_SJIS
3687                 | CODING_CATEGORY_MASK_BIG5
3688                 | CODING_CATEGORY_MASK_UTF_8
3689                 | CODING_CATEGORY_MASK_UTF_16_BE
3690                 | CODING_CATEGORY_MASK_UTF_16_LE);
3691
3692       /* Or, we may have to consider the possibility of CCL.  */
3693       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3694           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3695               ->spec.ccl.valid_codes)[c])
3696         try |= CODING_CATEGORY_MASK_CCL;
3697
3698       mask = 0;
3699       utf16_examined_p = iso2022_examined_p = 0;
3700       if (priorities)
3701         {
3702           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3703             {
3704               if (!iso2022_examined_p
3705                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3706                 {
3707                   mask |= detect_coding_iso2022 (src, src_end);
3708                   iso2022_examined_p = 1;
3709                 }
3710               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3711                 mask |= detect_coding_sjis (src, src_end);
3712               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3713                 mask |= detect_coding_utf_8 (src, src_end);
3714               else if (!utf16_examined_p
3715                        && (priorities[i] & try &
3716                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3717                 {
3718                   mask |= detect_coding_utf_16 (src, src_end);
3719                   utf16_examined_p = 1;
3720                 }
3721               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3722                 mask |= detect_coding_big5 (src, src_end);
3723               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3724                 mask |= detect_coding_emacs_mule (src, src_end);
3725               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3726                 mask |= detect_coding_ccl (src, src_end);
3727               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3728                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3729               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3730                 mask |= CODING_CATEGORY_MASK_BINARY;
3731               if (mask & priorities[i])
3732                 return priorities[i];
3733             }
3734           return CODING_CATEGORY_MASK_RAW_TEXT;
3735         }
3736       if (try & CODING_CATEGORY_MASK_ISO)
3737         mask |= detect_coding_iso2022 (src, src_end);
3738       if (try & CODING_CATEGORY_MASK_SJIS)
3739         mask |= detect_coding_sjis (src, src_end);
3740       if (try & CODING_CATEGORY_MASK_BIG5)
3741         mask |= detect_coding_big5 (src, src_end);
3742       if (try & CODING_CATEGORY_MASK_UTF_8)
3743         mask |= detect_coding_utf_8 (src, src_end);
3744       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3745         mask |= detect_coding_utf_16 (src, src_end);
3746       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3747         mask |= detect_coding_emacs_mule (src, src_end);
3748       if (try & CODING_CATEGORY_MASK_CCL)
3749         mask |= detect_coding_ccl (src, src_end);
3750     }
3751   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3752 }
3753
3754 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3755    The information of the detected coding system is set in CODING.  */
3756
3757 void
3758 detect_coding (coding, src, src_bytes)
3759      struct coding_system *coding;
3760      unsigned char *src;
3761      int src_bytes;
3762 {
3763   unsigned int idx;
3764   int skip, mask, i;
3765   Lisp_Object val;
3766
3767   val = Vcoding_category_list;
3768   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3769   coding->heading_ascii = skip;
3770
3771   if (!mask) return;
3772
3773   /* We found a single coding system of the highest priority in MASK.  */
3774   idx = 0;
3775   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3776   if (! mask)
3777     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3778
3779   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3780
3781   if (coding->eol_type != CODING_EOL_UNDECIDED)
3782     {
3783       Lisp_Object tmp;
3784
3785       tmp = Fget (val, Qeol_type);
3786       if (VECTORP (tmp))
3787         val = XVECTOR (tmp)->contents[coding->eol_type];
3788     }
3789   setup_coding_system (val, coding);
3790   /* Set this again because setup_coding_system reset this member.  */
3791   coding->heading_ascii = skip;
3792 }
3793
3794 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3795    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3796    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3797
3798    How many non-eol characters are at the head is returned as *SKIP.  */
3799
3800 #define MAX_EOL_CHECK_COUNT 3
3801
3802 static int
3803 detect_eol_type (source, src_bytes, skip)
3804      unsigned char *source;
3805      int src_bytes, *skip;
3806 {
3807   unsigned char *src = source, *src_end = src + src_bytes;
3808   unsigned char c;
3809   int total = 0;                /* How many end-of-lines are found so far.  */
3810   int eol_type = CODING_EOL_UNDECIDED;
3811   int this_eol_type;
3812
3813   *skip = 0;
3814
3815   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3816     {
3817       c = *src++;
3818       if (c == '\n' || c == '\r')
3819         {
3820           if (*skip == 0)
3821             *skip = src - 1 - source;
3822           total++;
3823           if (c == '\n')
3824             this_eol_type = CODING_EOL_LF;
3825           else if (src >= src_end || *src != '\n')
3826             this_eol_type = CODING_EOL_CR;
3827           else
3828             this_eol_type = CODING_EOL_CRLF, src++;
3829
3830           if (eol_type == CODING_EOL_UNDECIDED)
3831             /* This is the first end-of-line.  */
3832             eol_type = this_eol_type;
3833           else if (eol_type != this_eol_type)
3834             {
3835               /* The found type is different from what found before.  */
3836               eol_type = CODING_EOL_INCONSISTENT;
3837               break;
3838             }
3839         }
3840     }
3841
3842   if (*skip == 0)
3843     *skip = src_end - source;
3844   return eol_type;
3845 }
3846
3847 /* Like detect_eol_type, but detect EOL type in 2-octet
3848    big-endian/little-endian format for coding systems utf-16-be and
3849    utf-16-le.  */
3850
3851 static int
3852 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3853      unsigned char *source;
3854      int src_bytes, *skip;
3855 {
3856   unsigned char *src = source, *src_end = src + src_bytes;
3857   unsigned int c1, c2;
3858   int total = 0;                /* How many end-of-lines are found so far.  */
3859   int eol_type = CODING_EOL_UNDECIDED;
3860   int this_eol_type;
3861   int msb, lsb;
3862
3863   if (big_endian_p)
3864     msb = 0, lsb = 1;
3865   else
3866     msb = 1, lsb = 0;
3867
3868   *skip = 0;
3869
3870   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3871     {
3872       c1 = (src[msb] << 8) | (src[lsb]);
3873       src += 2;
3874
3875       if (c1 == '\n' || c1 == '\r')
3876         {
3877           if (*skip == 0)
3878             *skip = src - 2 - source;
3879           total++;
3880           if (c1 == '\n')
3881             {
3882               this_eol_type = CODING_EOL_LF;
3883             }
3884           else
3885             {
3886               if ((src + 1) >= src_end)
3887                 {
3888                   this_eol_type = CODING_EOL_CR;
3889                 }
3890               else
3891                 {
3892                   c2 = (src[msb] << 8) | (src[lsb]);
3893                   if (c2 == '\n')
3894                     this_eol_type = CODING_EOL_CRLF, src += 2;
3895                   else
3896                     this_eol_type = CODING_EOL_CR;
3897                 }
3898             }
3899
3900           if (eol_type == CODING_EOL_UNDECIDED)
3901             /* This is the first end-of-line.  */
3902             eol_type = this_eol_type;
3903           else if (eol_type != this_eol_type)
3904             {
3905               /* The found type is different from what found before.  */
3906               eol_type = CODING_EOL_INCONSISTENT;
3907               break;
3908             }
3909         }
3910     }
3911
3912   if (*skip == 0)
3913     *skip = src_end - source;
3914   return eol_type;
3915 }
3916
3917 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3918    is encoded.  If it detects an appropriate format of end-of-line, it
3919    sets the information in *CODING.  */
3920
3921 void
3922 detect_eol (coding, src, src_bytes)
3923      struct coding_system *coding;
3924      unsigned char *src;
3925      int src_bytes;
3926 {
3927   Lisp_Object val;
3928   int skip;
3929   int eol_type;
3930
3931   switch (coding->category_idx)
3932     {
3933     case CODING_CATEGORY_IDX_UTF_16_BE:
3934       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3935       break;
3936     case CODING_CATEGORY_IDX_UTF_16_LE:
3937       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3938       break;
3939     default:
3940       eol_type = detect_eol_type (src, src_bytes, &skip);
3941       break;
3942     }
3943
3944   if (coding->heading_ascii > skip)
3945     coding->heading_ascii = skip;
3946   else
3947     skip = coding->heading_ascii;
3948
3949   if (eol_type == CODING_EOL_UNDECIDED)
3950     return;
3951   if (eol_type == CODING_EOL_INCONSISTENT)
3952     {
3953 #if 0
3954       /* This code is suppressed until we find a better way to
3955          distinguish raw text file and binary file.  */
3956
3957       /* If we have already detected that the coding is raw-text, the
3958          coding should actually be no-conversion.  */
3959       if (coding->type == coding_type_raw_text)
3960         {
3961           setup_coding_system (Qno_conversion, coding);
3962           return;
3963         }
3964       /* Else, let's decode only text code anyway.  */
3965 #endif /* 0 */
3966       eol_type = CODING_EOL_LF;
3967     }
3968
3969   val = Fget (coding->symbol, Qeol_type);
3970   if (VECTORP (val) && XVECTOR (val)->size == 3)
3971     {
3972       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3973       coding->heading_ascii = skip;
3974     }
3975 }
3976
3977 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3978
3979 #define DECODING_BUFFER_MAG(coding)                                          \
3980   (coding->type == coding_type_iso2022                                       \
3981    ? 3                                                                       \
3982    : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3983       ? 2                                                                    \
3984       : (coding->type == coding_type_raw_text                                \
3985          ? 1                                                                 \
3986          : (coding->type == coding_type_ccl                                  \
3987             ? coding->spec.ccl.decoder.buf_magnification                     \
3988             : 2))))
3989
3990 /* Return maximum size (bytes) of a buffer enough for decoding
3991    SRC_BYTES of text encoded in CODING.  */
3992
3993 int
3994 decoding_buffer_size (coding, src_bytes)
3995      struct coding_system *coding;
3996      int src_bytes;
3997 {
3998   return (src_bytes * DECODING_BUFFER_MAG (coding)
3999           + CONVERSION_BUFFER_EXTRA_ROOM);
4000 }
4001
4002 /* Return maximum size (bytes) of a buffer enough for encoding
4003    SRC_BYTES of text to CODING.  */
4004
4005 int
4006 encoding_buffer_size (coding, src_bytes)
4007      struct coding_system *coding;
4008      int src_bytes;
4009 {
4010   int magnification;
4011
4012   if (coding->type == coding_type_ccl)
4013     magnification = coding->spec.ccl.encoder.buf_magnification;
4014   else
4015     magnification = 3;
4016
4017   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4018 }
4019
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Shared work buffer for encoding and decoding, and its current size
   in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   the current buffer is too small it is replaced by a new one whose
   size is the current size doubled until it reaches SIZE.  The
   contents of the old buffer are NOT preserved.

   NOTE(review): allocation failure is not checked here; it is
   presumably handled inside xmalloc -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling can never grow a size of zero, so make sure we start
         from a positive size even if the buffer has not been set up
         yet.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
4048
4049 int
4050 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4051      struct coding_system *coding;
4052      unsigned char *source, *destination;
4053      int src_bytes, dst_bytes, encodep;
4054 {
4055   struct ccl_program *ccl
4056     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4057   int result;
4058
4059   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4060
4061   coding->produced = ccl_driver (ccl, source, destination,
4062                                  src_bytes, dst_bytes, &(coding->consumed));
4063   coding->produced_char
4064     = (encodep
4065        ? coding->produced
4066        : multibyte_chars_in_text (destination, coding->produced));
4067   coding->consumed_char
4068     = multibyte_chars_in_text (source, coding->consumed);
4069
4070   switch (ccl->status)
4071     {
4072     case CCL_STAT_SUSPEND_BY_SRC:
4073       result = CODING_FINISH_INSUFFICIENT_SRC;
4074       break;
4075     case CCL_STAT_SUSPEND_BY_DST:
4076       result = CODING_FINISH_INSUFFICIENT_DST;
4077       break;
4078     case CCL_STAT_QUIT:
4079     case CCL_STAT_INVALID_CMD:
4080       result = CODING_FINISH_INTERRUPT;
4081       break;
4082     default:
4083       result = CODING_FINISH_NORMAL;
4084       break;
4085     }
4086   return result;
4087 }
4088
4089 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4090    decoding, it may detect coding system and format of end-of-line if
4091    those are not yet decided.
4092
4093    This function does not make full use of DESTINATION buffer.  For
4094    instance, if coding->type is coding_type_iso2022, it uses only
4095    (DST_BYTES - 7) bytes of DESTINATION buffer.  In the case that
4096    DST_BYTES is decided by the function decoding_buffer_size, it
4097    contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
4098    So, this function can decode the full SOURCE.  But, in the other
4099    case, if you want to avoid carry over, you must supply at least 7
4100    bytes more area in DESTINATION buffer than expected maximum bytes
4101    that will be produced by this function.  */
4102
4103 int
4104 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4105      struct coding_system *coding;
4106      unsigned char *source, *destination;
4107      int src_bytes, dst_bytes;
4108 {
4109   int result;
4110
4111   if (src_bytes <= 0
4112       && coding->type != coding_type_ccl
4113       && ! (coding->mode & CODING_MODE_LAST_BLOCK
4114             && CODING_REQUIRE_FLUSHING (coding)))
4115     {
4116       coding->produced = coding->produced_char = 0;
4117       coding->consumed = coding->consumed_char = 0;
4118       coding->fake_multibyte = 0;
4119       return CODING_FINISH_NORMAL;
4120     }
4121
4122   if (coding->type == coding_type_undecided)
4123     detect_coding (coding, source, src_bytes);
4124
4125   if (coding->eol_type == CODING_EOL_UNDECIDED)
4126     detect_eol (coding, source, src_bytes);
4127
4128   switch (coding->type)
4129     {
4130     case coding_type_emacs_mule:
4131     case coding_type_undecided:
4132     case coding_type_raw_text:
4133       if (coding->eol_type == CODING_EOL_LF
4134           ||  coding->eol_type == CODING_EOL_UNDECIDED)
4135         goto label_no_conversion;
4136       result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4137       break;
4138
4139     case coding_type_sjis:
4140       result = decode_coding_sjis_big5 (coding, source, destination,
4141                                         src_bytes, dst_bytes, 1);
4142       break;
4143
4144     case coding_type_iso2022:
4145       result = decode_coding_iso2022 (coding, source, destination,
4146                                       src_bytes, dst_bytes);
4147       break;
4148
4149     case coding_type_big5:
4150       result = decode_coding_sjis_big5 (coding, source, destination,
4151                                         src_bytes, dst_bytes, 0);
4152       break;
4153
4154     case coding_type_ccl:
4155       result = ccl_coding_driver (coding, source, destination,
4156                                   src_bytes, dst_bytes, 0);
4157       break;
4158
4159     default:                    /* i.e. case coding_type_no_conversion: */
4160     label_no_conversion:
4161       if (dst_bytes && src_bytes > dst_bytes)
4162         {
4163           coding->produced = dst_bytes;
4164           result = CODING_FINISH_INSUFFICIENT_DST;
4165         }
4166       else
4167         {
4168           coding->produced = src_bytes;
4169           result = CODING_FINISH_NORMAL;
4170         }
4171       if (dst_bytes)
4172         bcopy (source, destination, coding->produced);
4173       else
4174         safe_bcopy (source, destination, coding->produced);
4175       coding->fake_multibyte = 1;
4176       coding->consumed
4177         = coding->consumed_char = coding->produced_char = coding->produced;
4178       break;
4179     }
4180
4181   return result;
4182 }
4183
4184 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
4185
4186    This function does not make full use of DESTINATION buffer.  For
4187    instance, if coding->type is coding_type_iso2022, it uses only
4188    (DST_BYTES - 20) bytes of DESTINATION buffer.  In the case that
4189    DST_BYTES is decided by the function encoding_buffer_size, it
4190    contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
4191    So, this function can encode the full SOURCE.  But, in the other
4192    case, if you want to avoid carry over, you must supply at least 20
4193    bytes more area in DESTINATION buffer than expected maximum bytes
4194    that will be produced by this function.  */
4195
4196 int
4197 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4198      struct coding_system *coding;
4199      unsigned char *source, *destination;
4200      int src_bytes, dst_bytes;
4201 {
4202   int result;
4203
4204   if (src_bytes <= 0
4205       && ! (coding->mode & CODING_MODE_LAST_BLOCK
4206             && CODING_REQUIRE_FLUSHING (coding)))
4207     {
4208       coding->produced = coding->produced_char = 0;
4209       coding->consumed = coding->consumed_char = 0;
4210       coding->fake_multibyte = 0;
4211       return CODING_FINISH_NORMAL;
4212     }
4213
4214   switch (coding->type)
4215     {
4216     case coding_type_emacs_mule:
4217     case coding_type_undecided:
4218     case coding_type_raw_text:
4219       if (coding->eol_type == CODING_EOL_LF
4220           ||  coding->eol_type == CODING_EOL_UNDECIDED)
4221         goto label_no_conversion;
4222       result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4223       break;
4224
4225     case coding_type_sjis:
4226       result = encode_coding_sjis_big5 (coding, source, destination,
4227                                         src_bytes, dst_bytes, 1);
4228       break;
4229
4230     case coding_type_iso2022:
4231       result = encode_coding_iso2022 (coding, source, destination,
4232                                       src_bytes, dst_bytes);
4233       break;
4234
4235     case coding_type_big5:
4236       result = encode_coding_sjis_big5 (coding, source, destination,
4237                                         src_bytes, dst_bytes, 0);
4238       break;
4239
4240     case coding_type_ccl:
4241       result = ccl_coding_driver (coding, source, destination,
4242                                   src_bytes, dst_bytes, 1);
4243       break;
4244
4245     default:                    /* i.e. case coding_type_no_conversion: */
4246     label_no_conversion:
4247       if (dst_bytes && src_bytes > dst_bytes)
4248         {
4249           coding->produced = dst_bytes;
4250           result = CODING_FINISH_INSUFFICIENT_DST;
4251         }
4252       else
4253         {
4254           coding->produced = src_bytes;
4255           result = CODING_FINISH_NORMAL;
4256         }
4257       if (dst_bytes)
4258         bcopy (source, destination, coding->produced);
4259       else
4260         safe_bcopy (source, destination, coding->produced);
4261       if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
4262         {
4263           unsigned char *p = destination, *pend = p + coding->produced;
4264           while (p < pend)
4265             if (*p++ == '\015') p[-1] = '\n';
4266         }
4267       coding->fake_multibyte = 1;
4268       coding->consumed
4269         = coding->consumed_char = coding->produced_char = coding->produced;
4270       break;
4271     }
4272
4273   return result;
4274 }
4275
4276 /* Scan text in the region between *BEG and *END (byte positions),
4277    skip characters which we don't have to decode by coding system
4278    CODING at the head and tail, then set *BEG and *END to the region
4279    of the text we actually have to convert.  The caller should move
4280    the gap out of the region in advance.
4281
4282    If STR is not NULL, *BEG and *END are indices into STR.  */
4283
4284 static void
4285 shrink_decoding_region (beg, end, coding, str)
4286      int *beg, *end;
4287      struct coding_system *coding;
4288      unsigned char *str;
4289 {
4290   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4291   int eol_conversion;
4292   Lisp_Object translation_table;
4293
4294   if (coding->type == coding_type_ccl
4295       || coding->type == coding_type_undecided
4296       || !NILP (coding->post_read_conversion))
4297     {
4298       /* We can't skip any data.  */
4299       return;
4300     }
4301   else if (coding->type == coding_type_no_conversion)
4302     {
4303       /* We need no conversion, but don't have to skip any data here.
4304          Decoding routine handles them effectively anyway.  */
4305       return;
4306     }
4307
4308   translation_table = coding->translation_table_for_decode;
4309   if (NILP (translation_table) && !NILP (Venable_character_translation))
4310     translation_table = Vstandard_translation_table_for_decode;
4311   if (CHAR_TABLE_P (translation_table))
4312     {
4313       int i;
4314       for (i = 0; i < 128; i++)
4315         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4316           break;
4317       if (i < 128)
4318         /* Some ASCII character should be translated.  We give up
4319            shrinking.  */
4320         return;
4321     }
4322
4323   eol_conversion = (coding->eol_type != CODING_EOL_LF);
4324
4325   if ((! eol_conversion) && (coding->heading_ascii >= 0))
4326     /* Detection routine has already found how much we can skip at the
4327        head.  */
4328     *beg += coding->heading_ascii;
4329
4330   if (str)
4331     {
4332       begp_orig = begp = str + *beg;
4333       endp_orig = endp = str + *end;
4334     }
4335   else
4336     {
4337       begp_orig = begp = BYTE_POS_ADDR (*beg);
4338       endp_orig = endp = begp + *end - *beg;
4339     }
4340
4341   switch (coding->type)
4342     {
4343     case coding_type_emacs_mule:
4344     case coding_type_raw_text:
4345       if (eol_conversion)
4346         {
4347           if (coding->heading_ascii < 0)
4348             while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
4349           while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
4350             endp--;
4351           /* Do not consider LF as ascii if preceded by CR, since that
4352              confuses eol decoding. */
4353           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4354             endp++;
4355         }
4356       else
4357         begp = endp;
4358       break;
4359
4360     case coding_type_sjis:
4361     case coding_type_big5:
4362       /* We can skip all ASCII characters at the head.  */
4363       if (coding->heading_ascii < 0)
4364         {
4365           if (eol_conversion)
4366             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4367           else
4368             while (begp < endp && *begp < 0x80) begp++;
4369         }
4370       /* We can skip all ASCII characters at the tail except for the
4371          second byte of SJIS or BIG5 code.  */
4372       if (eol_conversion)
4373         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4374       else
4375         while (begp < endp && endp[-1] < 0x80) endp--;
4376       /* Do not consider LF as ascii if preceded by CR, since that
4377          confuses eol decoding. */
4378       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4379         endp++;
4380       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4381         endp++;
4382       break;
4383
4384     default:            /* i.e. case coding_type_iso2022: */
4385       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4386         /* We can't skip any data.  */
4387         break;
4388       if (coding->heading_ascii < 0)
4389         {
4390           /* We can skip all ASCII characters at the head except for a
4391              few control codes.  */
4392           while (begp < endp && (c = *begp) < 0x80
4393                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4394                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4395                  && (!eol_conversion || c != ISO_CODE_LF))
4396             begp++;
4397         }
4398       switch (coding->category_idx)
4399         {
4400         case CODING_CATEGORY_IDX_ISO_8_1:
4401         case CODING_CATEGORY_IDX_ISO_8_2:
4402           /* We can skip all ASCII characters at the tail.  */
4403           if (eol_conversion)
4404             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4405           else
4406             while (begp < endp && endp[-1] < 0x80) endp--;
4407           /* Do not consider LF as ascii if preceded by CR, since that
4408              confuses eol decoding. */
4409           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4410             endp++;
4411           break;
4412
4413         case CODING_CATEGORY_IDX_ISO_7:
4414         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4415           {
4416             /* We can skip all charactes at the tail except for 8-bit
4417                codes and ESC and the following 2-byte at the tail.  */
4418             unsigned char *eight_bit = NULL;
4419
4420             if (eol_conversion)
4421               while (begp < endp
4422                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4423                 {
4424                   if (!eight_bit && c & 0x80) eight_bit = endp;
4425                   endp--;
4426                 }
4427             else
4428               while (begp < endp
4429                      && (c = endp[-1]) != ISO_CODE_ESC)
4430                 {
4431                   if (!eight_bit && c & 0x80) eight_bit = endp;
4432                   endp--;
4433                 }
4434             /* Do not consider LF as ascii if preceded by CR, since that
4435                confuses eol decoding. */
4436             if (begp < endp && endp < endp_orig
4437                 && endp[-1] == '\r' && endp[0] == '\n')
4438               endp++;
4439             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4440               {
4441                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4442                   /* This is an ASCII designation sequence.  We can
4443                      surely skip the tail.  But, if we have
4444                      encountered an 8-bit code, skip only the codes
4445                      after that.  */
4446                   endp = eight_bit ? eight_bit : endp + 2;
4447                 else
4448                   /* Hmmm, we can't skip the tail.  */
4449                   endp = endp_orig;
4450               }
4451             else if (eight_bit)
4452               endp = eight_bit;
4453           }
4454         }
4455     }
4456   *beg += begp - begp_orig;
4457   *end += endp - endp_orig;
4458   return;
4459 }
4460
4461 /* Like shrink_decoding_region but for encoding.  */
4462
4463 static void
4464 shrink_encoding_region (beg, end, coding, str)
4465      int *beg, *end;
4466      struct coding_system *coding;
4467      unsigned char *str;
4468 {
4469   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4470   int eol_conversion;
4471   Lisp_Object translation_table;
4472
4473   if (coding->type == coding_type_ccl)
4474     /* We can't skip any data.  */
4475     return;
4476   else if (coding->type == coding_type_no_conversion)
4477     {
4478       /* We need no conversion.  */
4479       *beg = *end;
4480       return;
4481     }
4482
4483   translation_table = coding->translation_table_for_encode;
4484   if (NILP (translation_table) && !NILP (Venable_character_translation))
4485     translation_table = Vstandard_translation_table_for_encode;
4486   if (CHAR_TABLE_P (translation_table))
4487     {
4488       int i;
4489       for (i = 0; i < 128; i++)
4490         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4491           break;
4492       if (i < 128)
4493         /* Some ASCII character should be tranlsated.  We give up
4494            shrinking.  */
4495         return;
4496     }
4497
4498   if (str)
4499     {
4500       begp_orig = begp = str + *beg;
4501       endp_orig = endp = str + *end;
4502     }
4503   else
4504     {
4505       begp_orig = begp = BYTE_POS_ADDR (*beg);
4506       endp_orig = endp = begp + *end - *beg;
4507     }
4508
4509   eol_conversion = (coding->eol_type == CODING_EOL_CR
4510                     || coding->eol_type == CODING_EOL_CRLF);
4511
4512   /* Here, we don't have to check coding->pre_write_conversion because
4513      the caller is expected to have handled it already.  */
4514   switch (coding->type)
4515     {
4516     case coding_type_undecided:
4517     case coding_type_emacs_mule:
4518     case coding_type_raw_text:
4519       if (eol_conversion)
4520         {
4521           while (begp < endp && *begp != '\n') begp++;
4522           while (begp < endp && endp[-1] != '\n') endp--;
4523         }
4524       else
4525         begp = endp;
4526       break;
4527
4528     case coding_type_iso2022:
4529       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4530         /* We can't skip any data.  */
4531         break;
4532       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4533         {
4534           unsigned char *bol = begp;
4535           while (begp < endp && *begp < 0x80)
4536             {
4537               begp++;
4538               if (begp[-1] == '\n')
4539                 bol = begp;
4540             }
4541           begp = bol;
4542           goto label_skip_tail;
4543         }
4544       /* fall down ... */
4545
4546     default:
4547       /* We can skip all ASCII characters at the head and tail.  */
4548       if (eol_conversion)
4549         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4550       else
4551         while (begp < endp && *begp < 0x80) begp++;
4552     label_skip_tail:
4553       if (eol_conversion)
4554         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4555       else
4556         while (begp < endp && *(endp - 1) < 0x80) endp--;
4557       break;
4558     }
4559
4560   *beg += begp - begp_orig;
4561   *end += endp - endp_orig;
4562   return;
4563 }
4564
/* Shrinking the conversion region has some overhead of its own, so
   we don't try it unless the length of the conversion region is
   greater than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region *BEG .. *END is longer than
   shrink_conversion_region_threshhold, narrow it by calling
   shrink_encoding_region (when ENCODEP is nonzero) or
   shrink_decoding_region (otherwise) with BEG, END, CODING and STR.
   Otherwise do nothing.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4578
4579 static Lisp_Object
4580 code_convert_region_unwind (dummy)
4581      Lisp_Object dummy;
4582 {
4583   inhibit_pre_post_conversion = 0;
4584   return Qnil;
4585 }
4586
4587 /* Store information about all compositions in the range FROM and TO
4588    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4589    buffer or a string, defaults to the current buffer.  */
4590
4591 void
4592 coding_save_composition (coding, from, to, obj)
4593      struct coding_system *coding;
4594      int from, to;
4595      Lisp_Object obj;
4596 {
4597   Lisp_Object prop;
4598   int start, end;
4599
4600   if (coding->composing == COMPOSITION_DISABLED)
4601     return;
4602   if (!coding->cmp_data)
4603     coding_allocate_composition_data (coding, from);
4604   if (!find_composition (from, to, &start, &end, &prop, obj)
4605       || end > to)
4606     return;
4607   if (start < from
4608       && (!find_composition (end, to, &start, &end, &prop, obj)
4609           || end > to))
4610     return;
4611   coding->composing = COMPOSITION_NO;
4612   do
4613     {
4614       if (COMPOSITION_VALID_P (start, end, prop))
4615         {
4616           enum composition_method method = COMPOSITION_METHOD (prop);
4617           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4618               >= COMPOSITION_DATA_SIZE)
4619             coding_allocate_composition_data (coding, from);
4620           /* For relative composition, we remember start and end
4621              positions, for the other compositions, we also remember
4622              components.  */
4623           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4624           if (method != COMPOSITION_RELATIVE)
4625             {
4626               /* We must store a*/
4627               Lisp_Object val, ch;
4628
4629               val = COMPOSITION_COMPONENTS (prop);
4630               if (CONSP (val))
4631                 while (CONSP (val))
4632                   {
4633                     ch = XCAR (val), val = XCDR (val);
4634                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4635                   }
4636               else if (VECTORP (val) || STRINGP (val))
4637                 {
4638                   int len = (VECTORP (val)
4639                              ? XVECTOR (val)->size : XSTRING (val)->size);
4640                   int i;
4641                   for (i = 0; i < len; i++)
4642                     {
4643                       ch = (STRINGP (val)
4644                             ? Faref (val, make_number (i))
4645                             : XVECTOR (val)->contents[i]);
4646                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4647                     }
4648                 }
4649               else              /* INTEGERP (val) */
4650                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4651             }
4652           CODING_ADD_COMPOSITION_END (coding, end - from);
4653         }
4654       start = end;
4655     }
4656   while (start < to
4657          && find_composition (start, to, &start, &end, &prop, obj)
4658          && end <= to);
4659
4660   /* Make coding->cmp_data point to the first memory block.  */
4661   while (coding->cmp_data->prev)
4662     coding->cmp_data = coding->cmp_data->prev;
4663   coding->cmp_data_start = 0;
4664 }
4665
4666 /* Reflect the saved information about compositions to OBJ.
4667    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4668    is a buffer or a string, defaults to the current buffer.  */
4669
4670 static void
4671 coding_restore_composition (coding, obj)
4672      struct coding_system *coding;
4673      Lisp_Object obj;
4674 {
4675   struct composition_data *cmp_data = coding->cmp_data;
4676
4677   if (!cmp_data)
4678     return;
4679
4680   while (cmp_data->prev)
4681     cmp_data = cmp_data->prev;
4682
4683   while (cmp_data)
4684     {
4685       int i;
4686
4687       for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4688         {
4689           int *data = cmp_data->data + i;
4690           enum composition_method method = (enum composition_method) data[3];
4691           Lisp_Object components;
4692
4693           if (method == COMPOSITION_RELATIVE)
4694             components = Qnil;
4695           else
4696             {
4697               int len = data[0] - 4, j;
4698               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4699
4700               for (j = 0; j < len; j++)
4701                 args[j] = make_number (data[4 + j]);
4702               components = (method == COMPOSITION_WITH_ALTCHARS
4703                             ? Fstring (len, args) : Fvector (len, args));
4704             }
4705           compose_text (data[1], data[2], components, Qnil, obj);
4706         }
4707       cmp_data = cmp_data->next;
4708     }
4709 }
4710
4711 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4712    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4713    coding system CODING, and return the status code of code conversion
4714    (currently, this value has no meaning).
4715
4716    How many characters (and bytes) are converted to how many
4717    characters (and bytes) are recorded in members of the structure
4718    CODING.
4719
4720    If REPLACE is nonzero, we do various things as if the original text
4721    is deleted and a new text is inserted.  See the comments in
4722    replace_range (insdel.c) to know what we are doing.  */
4723
4724 int
4725 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4726      int from, from_byte, to, to_byte, encodep, replace;
4727      struct coding_system *coding;
4728 {
4729   int len = to - from, len_byte = to_byte - from_byte;
4730   int require, inserted, inserted_byte;
4731   int head_skip, tail_skip, total_skip = 0;
4732   Lisp_Object saved_coding_symbol;
4733   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4734   int first = 1;
4735   int fake_multibyte = 0;
4736   unsigned char *src, *dst;
4737   Lisp_Object deletion;
4738   int orig_point = PT, orig_len = len;
4739   int prev_Z;
4740
4741   deletion = Qnil;
4742   saved_coding_symbol = Qnil;
4743
4744   if (from < PT && PT < to)
4745     {
4746       TEMP_SET_PT_BOTH (from, from_byte);
4747       orig_point = from;
4748     }
4749
4750   if (replace)
4751     {
4752       int saved_from = from;
4753
4754       prepare_to_modify_buffer (from, to, &from);
4755       if (saved_from != from)
4756         {
4757           to = from + len;
4758           if (multibyte)
4759             from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4760           else
4761             from_byte = from, to_byte = to;
4762           len_byte = to_byte - from_byte;
4763         }
4764     }
4765
4766   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4767     {
4768       /* We must detect encoding of text and eol format.  */
4769
4770       if (from < GPT && to > GPT)
4771         move_gap_both (from, from_byte);
4772       if (coding->type == coding_type_undecided)
4773         {
4774           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4775           if (coding->type == coding_type_undecided)
4776             /* It seems that the text contains only ASCII, but we
4777                should not left it undecided because the deeper
4778                decoding routine (decode_coding) tries to detect the
4779                encodings again in vain.  */
4780             coding->type = coding_type_emacs_mule;
4781         }
4782       if (coding->eol_type == CODING_EOL_UNDECIDED)
4783         {
4784           saved_coding_symbol = coding->symbol;
4785           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4786           if (coding->eol_type == CODING_EOL_UNDECIDED)
4787             coding->eol_type = CODING_EOL_LF;
4788           /* We had better recover the original eol format if we
4789              encounter an inconsitent eol format while decoding.  */
4790           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4791         }
4792     }
4793
4794   if (encodep
4795       ? ! CODING_REQUIRE_ENCODING (coding)
4796       : ! CODING_REQUIRE_DECODING (coding))
4797     {
4798       coding->consumed_char = len;
4799       coding->consumed = len_byte;
4800       coding->produced = len_byte;
4801       if (multibyte
4802           && ! replace
4803           /* See the comment of the member heading_ascii in coding.h.  */
4804           && coding->heading_ascii < len_byte)
4805         {
4806           /* We still may have to combine byte at the head and the
4807              tail of the text in the region.  */
4808           if (from < GPT && GPT < to)
4809             move_gap_both (to, to_byte);
4810           len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4811           adjust_after_insert (from, from_byte, to, to_byte, len);
4812           coding->produced_char = len;
4813         }
4814       else
4815         {
4816           if (!replace)
4817             adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4818           coding->produced_char = len_byte;
4819         }
4820       return 0;
4821     }
4822
4823   /* Now we convert the text.  */
4824
4825   /* For encoding, we must process pre-write-conversion in advance.  */
4826   if (encodep
4827       && ! NILP (coding->pre_write_conversion)
4828       && SYMBOLP (coding->pre_write_conversion)
4829       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4830     {
4831       /* The function in pre-write-conversion may put a new text in a
4832          new buffer.  */
4833       struct buffer *prev = current_buffer;
4834       Lisp_Object new;
4835       int count = specpdl_ptr - specpdl;
4836
4837       record_unwind_protect (code_convert_region_unwind, Qnil);
4838       /* We should not call any more pre-write/post-read-conversion
4839          functions while this pre-write-conversion is running.  */
4840       inhibit_pre_post_conversion = 1;
4841       call2 (coding->pre_write_conversion,
4842              make_number (from), make_number (to));
4843       inhibit_pre_post_conversion = 0;
4844       /* Discard the unwind protect.  */
4845       specpdl_ptr--;
4846
4847       if (current_buffer != prev)
4848         {
4849           len = ZV - BEGV;
4850           new = Fcurrent_buffer ();
4851           set_buffer_internal_1 (prev);
4852           del_range_2 (from, from_byte, to, to_byte, 0);
4853           TEMP_SET_PT_BOTH (from, from_byte);
4854           insert_from_buffer (XBUFFER (new), 1, len, 0);
4855           Fkill_buffer (new);
4856           if (orig_point >= to)
4857             orig_point += len - orig_len;
4858           else if (orig_point > from)
4859             orig_point = from;
4860           orig_len = len;
4861           to = from + len;
4862           from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4863           to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4864           len_byte = to_byte - from_byte;
4865           TEMP_SET_PT_BOTH (from, from_byte);
4866         }
4867     }
4868
4869   if (replace)
4870     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4871
4872   if (coding->composing != COMPOSITION_DISABLED)
4873     {
4874       if (encodep)
4875         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4876       else
4877         coding_allocate_composition_data (coding, from);
4878     }
4879
4880   /* For conversion by CCL program and for encoding with composition
4881      handling, we can't skip any character because we may convert or
4882      compose even ASCII characters.  */
4883   if (coding->type != coding_type_ccl
4884       && (!encodep || coding->cmp_data == NULL))
4885     {
4886       /* Try to skip the heading and tailing ASCIIs.  */
4887       int from_byte_orig = from_byte, to_byte_orig = to_byte;
4888
4889       if (from < GPT && GPT < to)
4890         move_gap_both (from, from_byte);
4891       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4892       if (from_byte == to_byte
4893           && (encodep || NILP (coding->post_read_conversion))
4894           && ! CODING_REQUIRE_FLUSHING (coding))
4895         {
4896           coding->produced = len_byte;
4897           coding->produced_char = multibyte ? len : len_byte;
4898           if (!replace)
4899             /* We must record and adjust for this new text now.  */
4900             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4901           return 0;
4902         }
4903
4904       head_skip = from_byte - from_byte_orig;
4905       tail_skip = to_byte_orig - to_byte;
4906       total_skip = head_skip + tail_skip;
4907       from += head_skip;
4908       to -= tail_skip;
4909       len -= total_skip; len_byte -= total_skip;
4910
4911       if (coding->cmp_data)
4912         coding->cmp_data->char_offset = from;
4913     }
4914
4915   /* The code conversion routine can not preserve text properties for
4916      now.  So, we must remove all text properties in the region.
4917      Here, we must suppress all modification hooks.  */
4918   if (replace)
4919     {
4920       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4921       inhibit_modification_hooks = 1;
4922       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4923       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4924     }
4925
4926   /* For converion, we must put the gap before the text in addition to
4927      making the gap larger for efficient decoding.  The required gap
4928      size starts from 2000 which is the magic number used in make_gap.
4929      But, after one batch of conversion, it will be incremented if we
4930      find that it is not enough .  */
4931   require = 2000;
4932
4933   if (GAP_SIZE  < require)
4934     make_gap (require - GAP_SIZE);
4935   move_gap_both (from, from_byte);
4936
4937   inserted = inserted_byte = 0;
4938
4939   GAP_SIZE += len_byte;
4940   ZV -= len;
4941   Z -= len;
4942   ZV_BYTE -= len_byte;
4943   Z_BYTE -= len_byte;
4944
4945   if (GPT - BEG < BEG_UNCHANGED)
4946     BEG_UNCHANGED = GPT - BEG;
4947   if (Z - GPT < END_UNCHANGED)
4948     END_UNCHANGED = Z - GPT;
4949
4950   for (;;)
4951     {
4952       int result;
4953
4954       /* The buffer memory is now:
4955          +--------+converted-text+---------+-------original-text------+---+
4956          |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4957                   |<------------------- GAP_SIZE -------------------->|  */
4958       src = GAP_END_ADDR - len_byte;
4959       dst = GPT_ADDR + inserted_byte;
4960
4961       if (encodep)
4962         result = encode_coding (coding, src, dst, len_byte, 0);
4963       else
4964         result = decode_coding (coding, src, dst, len_byte, 0);
4965
4966       /* The buffer memory is now:
4967          +--------+-------converted-text--------+--+---original-text--+---+
4968          |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4969                   |<------------------- GAP_SIZE -------------------->|  */
4970
4971       if (coding->fake_multibyte)
4972         fake_multibyte = 1;
4973
4974       if (!encodep && !multibyte)
4975         coding->produced_char = coding->produced;
4976       inserted += coding->produced_char;
4977       inserted_byte += coding->produced;
4978       len_byte -= coding->consumed;
4979
4980       if (result == CODING_FINISH_INSUFFICIENT_CMP)
4981         {
4982           coding_allocate_composition_data (coding, from + inserted);
4983           continue;
4984         }
4985
4986       src += coding->consumed;
4987       dst += coding->produced;
4988
4989       if (result == CODING_FINISH_NORMAL)
4990         {
4991           src += len_byte;
4992           break;
4993         }
4994       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4995         {
4996           unsigned char *pend = dst, *p = pend - inserted_byte;
4997           Lisp_Object eol_type;
4998
4999           /* Encode LFs back to the original eol format (CR or CRLF).  */
5000           if (coding->eol_type == CODING_EOL_CR)
5001             {
5002               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5003             }
5004           else
5005             {
5006               int count = 0;
5007
5008               while (p < pend) if (*p++ == '\n') count++;
5009               if (src - dst < count)
5010                 {
5011                   /* We don't have sufficient room for encoding LFs
5012                      back to CRLF.  We must record converted and
5013                      not-yet-converted text back to the buffer
5014                      content, enlarge the gap, then record them out of
5015                      the buffer contents again.  */
5016                   int add = len_byte + inserted_byte;
5017
5018                   GAP_SIZE -= add;
5019                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5020                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5021                   make_gap (count - GAP_SIZE);
5022                   GAP_SIZE += add;
5023                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5024                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5025                   /* Don't forget to update SRC, DST, and PEND.  */
5026                   src = GAP_END_ADDR - len_byte;
5027                   dst = GPT_ADDR + inserted_byte;
5028                   pend = dst;
5029                 }
5030               inserted += count;
5031               inserted_byte += count;
5032               coding->produced += count;
5033               p = dst = pend + count;
5034               while (count)
5035                 {
5036                   *--p = *--pend;
5037                   if (*p == '\n') count--, *--p = '\r';
5038                 }
5039             }
5040
5041           /* Suppress eol-format conversion in the further conversion.  */
5042           coding->eol_type = CODING_EOL_LF;
5043
5044           /* Set the coding system symbol to that for Unix-like EOL.  */
5045           eol_type = Fget (saved_coding_symbol, Qeol_type);
5046           if (VECTORP (eol_type)
5047               && XVECTOR (eol_type)->size == 3
5048               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5049             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5050           else
5051             coding->symbol = saved_coding_symbol;
5052
5053           continue;
5054         }
5055       if (len_byte <= 0)
5056         {
5057           if (coding->type != coding_type_ccl
5058               || coding->mode & CODING_MODE_LAST_BLOCK)
5059             break;
5060           coding->mode |= CODING_MODE_LAST_BLOCK;
5061           continue;
5062         }
5063       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5064         {
5065           /* The source text ends in invalid codes.  Let's just
5066              make them valid buffer contents, and finish conversion.  */
5067           inserted += len_byte;
5068           inserted_byte += len_byte;
5069           while (len_byte--)
5070             *dst++ = *src++;
5071           fake_multibyte = 1;
5072           break;
5073         }
5074       if (result == CODING_FINISH_INTERRUPT)
5075         {
5076           /* The conversion procedure was interrupted by a user.  */
5077           fake_multibyte = 1;
5078           break;
5079         }
5080       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5081       if (coding->consumed < 1)
5082         {
5083           /* It's quite strange to require more memory without
5084              consuming any bytes.  Perhaps CCL program bug.  */
5085           fake_multibyte = 1;
5086           break;
5087         }
5088       if (first)
5089         {
5090           /* We have just done the first batch of conversion which was
5091              stoped because of insufficient gap.  Let's reconsider the
5092              required gap size (i.e. SRT - DST) now.
5093
5094              We have converted ORIG bytes (== coding->consumed) into
5095              NEW bytes (coding->produced).  To convert the remaining
5096              LEN bytes, we may need REQUIRE bytes of gap, where:
5097                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5098                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5099              Here, we are sure that NEW >= ORIG.  */
5100           float ratio = coding->produced - coding->consumed;
5101           ratio /= coding->consumed;
5102           require = len_byte * ratio;
5103           first = 0;
5104         }
5105       if ((src - dst) < (require + 2000))
5106         {
5107           /* See the comment above the previous call of make_gap.  */
5108           int add = len_byte + inserted_byte;
5109
5110           GAP_SIZE -= add;
5111           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5112           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5113           make_gap (require + 2000);
5114           GAP_SIZE += add;
5115           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5116           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5117         }
5118     }
5119   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5120
5121   if (multibyte
5122       && (encodep
5123           || fake_multibyte
5124           || (to - from) != (to_byte - from_byte)))
5125     inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
5126
5127   /* If we have shrinked the conversion area, adjust it now.  */
5128   if (total_skip > 0)
5129     {
5130       if (tail_skip > 0)
5131         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5132       inserted += total_skip; inserted_byte += total_skip;
5133       GAP_SIZE += total_skip;
5134       GPT -= head_skip; GPT_BYTE -= head_skip;
5135       ZV -= total_skip; ZV_BYTE -= total_skip;
5136       Z -= total_skip; Z_BYTE -= total_skip;
5137       from -= head_skip; from_byte -= head_skip;
5138       to += tail_skip; to_byte += tail_skip;
5139     }
5140
5141   prev_Z = Z;
5142   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5143   inserted = Z - prev_Z;
5144
5145   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5146     coding_restore_composition (coding, Fcurrent_buffer ());
5147   coding_free_composition_data (coding);
5148
5149   if (! encodep && ! NILP (coding->post_read_conversion))
5150     {
5151       Lisp_Object val;
5152       int count = specpdl_ptr - specpdl;
5153
5154       if (from != PT)
5155         TEMP_SET_PT_BOTH (from, from_byte);
5156       prev_Z = Z;
5157       record_unwind_protect (code_convert_region_unwind, Qnil);
5158       /* We should not call any more pre-write/post-read-conversion
5159          functions while this post-read-conversion is running.  */
5160       inhibit_pre_post_conversion = 1;
5161       val = call1 (coding->post_read_conversion, make_number (inserted));
5162       inhibit_pre_post_conversion = 0;
5163       /* Discard the unwind protect.  */
5164       specpdl_ptr--;
5165       CHECK_NUMBER (val, 0);
5166       inserted += Z - prev_Z;
5167     }
5168
5169   if (orig_point >= from)
5170     {
5171       if (orig_point >= from + orig_len)
5172         orig_point += inserted - orig_len;
5173       else
5174         orig_point = from;
5175       TEMP_SET_PT (orig_point);
5176     }
5177
5178   if (replace)
5179     {
5180       signal_after_change (from, to - from, inserted);
5181       update_compositions (from, from + inserted, CHECK_BORDER);
5182     }
5183
5184   {
5185     coding->consumed = to_byte - from_byte;
5186     coding->consumed_char = to - from;
5187     coding->produced = inserted_byte;
5188     coding->produced_char = inserted;
5189   }
5190
5191   return 0;
5192 }
5193
5194 Lisp_Object
5195 code_convert_string (str, coding, encodep, nocopy)
5196      Lisp_Object str;
5197      struct coding_system *coding;
5198      int encodep, nocopy;
5199 {
5200   int len;
5201   char *buf;
5202   int from = 0, to = XSTRING (str)->size;
5203   int to_byte = STRING_BYTES (XSTRING (str));
5204   struct gcpro gcpro1;
5205   Lisp_Object saved_coding_symbol;
5206   int result;
5207
5208   saved_coding_symbol = Qnil;
5209   if ((encodep && !NILP (coding->pre_write_conversion)
5210        || !encodep && !NILP (coding->post_read_conversion)))
5211     {
5212       /* Since we have to call Lisp functions which assume target text
5213          is in a buffer, after setting a temporary buffer, call
5214          code_convert_region.  */
5215       int count = specpdl_ptr - specpdl;
5216       struct buffer *prev = current_buffer;
5217       int multibyte = STRING_MULTIBYTE (str);
5218
5219       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5220       record_unwind_protect (code_convert_region_unwind, Qnil);
5221       inhibit_pre_post_conversion = 1;
5222       GCPRO1 (str);
5223       temp_output_buffer_setup (" *code-converting-work*");
5224       set_buffer_internal (XBUFFER (Vstandard_output));
5225       /* We must insert the contents of STR as is without
5226          unibyte<->multibyte conversion.  For that, we adjust the
5227          multibyteness of the working buffer to that of STR.  */
5228       Ferase_buffer ();         /* for safety */
5229       current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5230       insert_from_string (str, 0, 0, to, to_byte, 0);
5231       UNGCPRO;
5232       code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
5233       /* Make a unibyte string if we are encoding, otherwise make a
5234          multibyte string.  */
5235       Fset_buffer_multibyte (encodep ? Qnil : Qt);
5236       str = make_buffer_string (BEGV, ZV, 0);
5237       return unbind_to (count, str);
5238     }
5239
5240   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5241     {
5242       /* See the comments in code_convert_region.  */
5243       if (coding->type == coding_type_undecided)
5244         {
5245           detect_coding (coding, XSTRING (str)->data, to_byte);
5246           if (coding->type == coding_type_undecided)
5247             coding->type = coding_type_emacs_mule;
5248         }
5249       if (coding->eol_type == CODING_EOL_UNDECIDED)
5250         {
5251           saved_coding_symbol = coding->symbol;
5252           detect_eol (coding, XSTRING (str)->data, to_byte);
5253           if (coding->eol_type == CODING_EOL_UNDECIDED)
5254             coding->eol_type = CODING_EOL_LF;
5255           /* We had better recover the original eol format if we
5256              encounter an inconsitent eol format while decoding.  */
5257           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5258         }
5259     }
5260
5261   if (encodep
5262       ? ! CODING_REQUIRE_ENCODING (coding)
5263       : ! CODING_REQUIRE_DECODING (coding))
5264     return (nocopy ? str : Fcopy_sequence (str));
5265
5266   if (coding->composing != COMPOSITION_DISABLED)
5267     {
5268       if (encodep)
5269         coding_save_composition (coding, from, to, str);
5270       else
5271         coding_allocate_composition_data (coding, from);
5272     }
5273
5274   /* For conversion by CCL program and for encoding with composition
5275      handling, we can't skip any character because we may convert or
5276      compose even ASCII characters.  */
5277   if (coding->type != coding_type_ccl
5278       && (!encodep || coding->cmp_data == NULL))
5279     {
5280       /* Try to skip the heading and tailing ASCIIs.  */
5281       int from_orig = from;
5282
5283       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5284                                 encodep);
5285       if (from == to_byte)
5286         return (nocopy ? str : Fcopy_sequence (str));
5287
5288       if (coding->cmp_data)
5289         coding->cmp_data->char_offset = from;
5290     }
5291
5292   if (encodep)
5293     len = encoding_buffer_size (coding, to_byte - from);
5294   else
5295     len = decoding_buffer_size (coding, to_byte - from);
5296   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5297   GCPRO1 (str);
5298   buf = get_conversion_buffer (len);
5299   UNGCPRO;
5300
5301   if (from > 0)
5302     bcopy (XSTRING (str)->data, buf, from);
5303   result = (encodep
5304             ? encode_coding (coding, XSTRING (str)->data + from,
5305                              buf + from, to_byte - from, len)
5306             : decode_coding (coding, XSTRING (str)->data + from,
5307                              buf + from, to_byte - from, len));
5308   if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5309     {
5310       /* We simply try to decode the whole string again but without
5311          eol-conversion this time.  */
5312       coding->eol_type = CODING_EOL_LF;
5313       coding->symbol = saved_coding_symbol;
5314       coding_free_composition_data (coding);
5315       return code_convert_string (str, coding, encodep, nocopy);
5316     }
5317
5318   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5319          STRING_BYTES (XSTRING (str)) - to_byte);
5320
5321   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5322   if (encodep)
5323     str = make_unibyte_string (buf, len + coding->produced);
5324   else
5325     {
5326       int chars= (coding->fake_multibyte
5327                   ? multibyte_chars_in_text (buf + from, coding->produced)
5328                   : coding->produced_char);
5329       str = make_multibyte_string (buf, len + chars, len + coding->produced);
5330     }
5331
5332   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5333     coding_restore_composition (coding, str);
5334
5335   coding_free_composition_data (coding);
5336   return str;
5337 }
5338
5339 \f
5340 #ifdef emacs
5341 /*** 8. Emacs Lisp library functions ***/
5342
5343 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5344   "Return t if OBJECT is nil or a coding-system.\n\
5345 See the documentation of `make-coding-system' for information\n\
5346 about coding-system objects.")
5347   (obj)
5348      Lisp_Object obj;
5349 {
5350   if (NILP (obj))
5351     return Qt;
5352   if (!SYMBOLP (obj))
5353     return Qnil;
5354   /* Get coding-spec vector for OBJ.  */
5355   obj = Fget (obj, Qcoding_system);
5356   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5357           ? Qt : Qnil);
5358 }
5359
5360 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5361        Sread_non_nil_coding_system, 1, 1, 0,
5362   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5363   (prompt)
5364      Lisp_Object prompt;
5365 {
5366   Lisp_Object val;
5367   do
5368     {
5369       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5370                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5371     }
5372   while (XSTRING (val)->size == 0);
5373   return (Fintern (val, Qnil));
5374 }
5375
5376 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5377   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5378 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5379   (prompt, default_coding_system)
5380      Lisp_Object prompt, default_coding_system;
5381 {
5382   Lisp_Object val;
5383   if (SYMBOLP (default_coding_system))
5384     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5385   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5386                           Qt, Qnil, Qcoding_system_history,
5387                           default_coding_system, Qnil);
5388   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5389 }
5390
5391 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5392        1, 1, 0,
5393   "Check validity of CODING-SYSTEM.\n\
5394 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5395 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5396 The value of property should be a vector of length 5.")
5397   (coding_system)
5398      Lisp_Object coding_system;
5399 {
5400   CHECK_SYMBOL (coding_system, 0);
5401   if (!NILP (Fcoding_system_p (coding_system)))
5402     return coding_system;
5403   while (1)
5404     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5405 }
5406 \f
5407 Lisp_Object
5408 detect_coding_system (src, src_bytes, highest)
5409      unsigned char *src;
5410      int src_bytes, highest;
5411 {
5412   int coding_mask, eol_type;
5413   Lisp_Object val, tmp;
5414   int dummy;
5415
5416   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5417   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5418   if (eol_type == CODING_EOL_INCONSISTENT)
5419     eol_type = CODING_EOL_UNDECIDED;
5420
5421   if (!coding_mask)
5422     {
5423       val = Qundecided;
5424       if (eol_type != CODING_EOL_UNDECIDED)
5425         {
5426           Lisp_Object val2;
5427           val2 = Fget (Qundecided, Qeol_type);
5428           if (VECTORP (val2))
5429             val = XVECTOR (val2)->contents[eol_type];
5430         }
5431       return (highest ? val : Fcons (val, Qnil));
5432     }
5433
5434   /* At first, gather possible coding systems in VAL.  */
5435   val = Qnil;
5436   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5437     {
5438       Lisp_Object category_val, category_index;
5439
5440       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5441       category_val = Fsymbol_value (XCAR (tmp));
5442       if (!NILP (category_val)
5443           && NATNUMP (category_index)
5444           && (coding_mask & (1 << XFASTINT (category_index))))
5445         {
5446           val = Fcons (category_val, val);
5447           if (highest)
5448             break;
5449         }
5450     }
5451   if (!highest)
5452     val = Fnreverse (val);
5453
5454   /* Then, replace the elements with subsidiary coding systems.  */
5455   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5456     {
5457       if (eol_type != CODING_EOL_UNDECIDED
5458           && eol_type != CODING_EOL_INCONSISTENT)
5459         {
5460           Lisp_Object eol;
5461           eol = Fget (XCAR (tmp), Qeol_type);
5462           if (VECTORP (eol))
5463             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5464         }
5465     }
5466   return (highest ? XCAR (val) : val);
5467 }
5468
5469 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5470        2, 3, 0,
5471   "Detect coding system of the text in the region between START and END.\n\
5472 Return a list of possible coding systems ordered by priority.\n\
5473 \n\
5474 If only ASCII characters are found, it returns a list of single element\n\
5475 `undecided' or its subsidiary coding system according to a detected\n\
5476 end-of-line format.\n\
5477 \n\
5478 If optional argument HIGHEST is non-nil, return the coding system of\n\
5479 highest priority.")
5480   (start, end, highest)
5481      Lisp_Object start, end, highest;
5482 {
5483   int from, to;
5484   int from_byte, to_byte;
5485
5486   CHECK_NUMBER_COERCE_MARKER (start, 0);
5487   CHECK_NUMBER_COERCE_MARKER (end, 1);
5488
5489   validate_region (&start, &end);
5490   from = XINT (start), to = XINT (end);
5491   from_byte = CHAR_TO_BYTE (from);
5492   to_byte = CHAR_TO_BYTE (to);
5493
5494   if (from < GPT && to >= GPT)
5495     move_gap_both (to, to_byte);
5496
5497   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5498                                to_byte - from_byte,
5499                                !NILP (highest));
5500 }
5501
5502 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5503        1, 2, 0,
5504   "Detect coding system of the text in STRING.\n\
5505 Return a list of possible coding systems ordered by priority.\n\
5506 \n\
5507 If only ASCII characters are found, it returns a list of single element\n\
5508 `undecided' or its subsidiary coding system according to a detected\n\
5509 end-of-line format.\n\
5510 \n\
5511 If optional argument HIGHEST is non-nil, return the coding system of\n\
5512 highest priority.")
5513   (string, highest)
5514      Lisp_Object string, highest;
5515 {
5516   CHECK_STRING (string, 0);
5517
5518   return detect_coding_system (XSTRING (string)->data,
5519                                STRING_BYTES (XSTRING (string)),
5520                                !NILP (highest));
5521 }
5522
5523 Lisp_Object
5524 code_convert_region1 (start, end, coding_system, encodep)
5525      Lisp_Object start, end, coding_system;
5526      int encodep;
5527 {
5528   struct coding_system coding;
5529   int from, to, len;
5530
5531   CHECK_NUMBER_COERCE_MARKER (start, 0);
5532   CHECK_NUMBER_COERCE_MARKER (end, 1);
5533   CHECK_SYMBOL (coding_system, 2);
5534
5535   validate_region (&start, &end);
5536   from = XFASTINT (start);
5537   to = XFASTINT (end);
5538
5539   if (NILP (coding_system))
5540     return make_number (to - from);
5541
5542   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5543     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5544
5545   coding.mode |= CODING_MODE_LAST_BLOCK;
5546   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5547                        &coding, encodep, 1);
5548   Vlast_coding_system_used = coding.symbol;
5549   return make_number (coding.produced_char);
5550 }
5551
5552 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5553        3, 3, "r\nzCoding system: ",
5554   "Decode the current region by specified coding system.\n\
5555 When called from a program, takes three arguments:\n\
5556 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5557 This function sets `last-coding-system-used' to the precise coding system\n\
5558 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5559 not fully specified.)\n\
5560 It returns the length of the decoded text.")
5561   (start, end, coding_system)
5562      Lisp_Object start, end, coding_system;
5563 {
5564   return code_convert_region1 (start, end, coding_system, 0);
5565 }
5566
5567 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5568        3, 3, "r\nzCoding system: ",
5569   "Encode the current region by specified coding system.\n\
5570 When called from a program, takes three arguments:\n\
5571 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5572 This function sets `last-coding-system-used' to the precise coding system\n\
5573 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5574 not fully specified.)\n\
5575 It returns the length of the encoded text.")
5576   (start, end, coding_system)
5577      Lisp_Object start, end, coding_system;
5578 {
5579   return code_convert_region1 (start, end, coding_system, 1);
5580 }
5581
5582 Lisp_Object
5583 code_convert_string1 (string, coding_system, nocopy, encodep)
5584      Lisp_Object string, coding_system, nocopy;
5585      int encodep;
5586 {
5587   struct coding_system coding;
5588
5589   CHECK_STRING (string, 0);
5590   CHECK_SYMBOL (coding_system, 1);
5591
5592   if (NILP (coding_system))
5593     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5594
5595   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5596     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5597
5598   coding.mode |= CODING_MODE_LAST_BLOCK;
5599   string = code_convert_string (string, &coding, encodep, !NILP (nocopy));
5600   Vlast_coding_system_used = coding.symbol;
5601
5602   return string;
5603 }
5604
5605 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5606        2, 3, 0,
5607   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5608 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5609 if the decoding operation is trivial.\n\
5610 This function sets `last-coding-system-used' to the precise coding system\n\
5611 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5612 not fully specified.)")
5613   (string, coding_system, nocopy)
5614      Lisp_Object string, coding_system, nocopy;
5615 {
5616   return code_convert_string1 (string, coding_system, nocopy, 0);
5617 }
5618
5619 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5620        2, 3, 0,
5621   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5622 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5623 if the encoding operation is trivial.\n\
5624 This function sets `last-coding-system-used' to the precise coding system\n\
5625 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5626 not fully specified.)")
5627   (string, coding_system, nocopy)
5628      Lisp_Object string, coding_system, nocopy;
5629 {
5630   return code_convert_string1 (string, coding_system, nocopy, 1);
5631 }
5632
5633 /* Encode or decode STRING according to CODING_SYSTEM.
5634    Do not set Vlast_coding_system_used.
5635
5636    This function is called only from macros DECODE_FILE and
5637    ENCODE_FILE, thus we ignore character composition.  */
5638
5639 Lisp_Object
5640 code_convert_string_norecord (string, coding_system, encodep)
5641      Lisp_Object string, coding_system;
5642      int encodep;
5643 {
5644   struct coding_system coding;
5645
5646   CHECK_STRING (string, 0);
5647   CHECK_SYMBOL (coding_system, 1);
5648
5649   if (NILP (coding_system))
5650     return string;
5651
5652   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5653     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5654
5655   coding.composing = COMPOSITION_DISABLED;
5656   coding.mode |= CODING_MODE_LAST_BLOCK;
5657   return code_convert_string (string, &coding, encodep, Qt);
5658 }
5659 \f
5660 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5661   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5662 Return the corresponding character.")
5663   (code)
5664      Lisp_Object code;
5665 {
5666   unsigned char c1, c2, s1, s2;
5667   Lisp_Object val;
5668
5669   CHECK_NUMBER (code, 0);
5670   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5671   if (s1 == 0)
5672     {
5673       if (s2 < 0x80)
5674         XSETFASTINT (val, s2);
5675       else if (s2 >= 0xA0 || s2 <= 0xDF)
5676         XSETFASTINT (val,
5677                      MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5678       else
5679         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5680     }
5681   else
5682     {
5683       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5684           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5685         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5686       DECODE_SJIS (s1, s2, c1, c2);
5687       XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5688     }
5689   return val;
5690 }
5691
5692 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5693   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5694 Return the corresponding code in SJIS.")
5695   (ch)
5696      Lisp_Object ch;
5697 {
5698   int charset, c1, c2, s1, s2;
5699   Lisp_Object val;
5700
5701   CHECK_NUMBER (ch, 0);
5702   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5703   if (charset == CHARSET_ASCII)
5704     {
5705       val = ch;
5706     }
5707   else if (charset == charset_jisx0208
5708            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5709     {
5710       ENCODE_SJIS (c1, c2, s1, s2);
5711       XSETFASTINT (val, (s1 << 8) | s2);
5712     }
5713   else if (charset == charset_katakana_jisx0201
5714            && c1 > 0x20 && c2 < 0xE0)
5715     {
5716       XSETFASTINT (val, c1 | 0x80);
5717     }
5718   else
5719     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5720   return val;
5721 }
5722
5723 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5724   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5725 Return the corresponding character.")
5726   (code)
5727      Lisp_Object code;
5728 {
5729   int charset;
5730   unsigned char b1, b2, c1, c2;
5731   Lisp_Object val;
5732
5733   CHECK_NUMBER (code, 0);
5734   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5735   if (b1 == 0)
5736     {
5737       if (b2 >= 0x80)
5738         error ("Invalid BIG5 code: %x", XFASTINT (code));
5739       val = code;
5740     }
5741   else
5742     {
5743       if ((b1 < 0xA1 || b1 > 0xFE)
5744           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5745         error ("Invalid BIG5 code: %x", XFASTINT (code));
5746       DECODE_BIG5 (b1, b2, charset, c1, c2);
5747       XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5748     }
5749   return val;
5750 }
5751
5752 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5753   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5754 Return the corresponding character code in Big5.")
5755   (ch)
5756      Lisp_Object ch;
5757 {
5758   int charset, c1, c2, b1, b2;
5759   Lisp_Object val;
5760
5761   CHECK_NUMBER (ch, 0);
5762   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5763   if (charset == CHARSET_ASCII)
5764     {
5765       val = ch;
5766     }
5767   else if ((charset == charset_big5_1
5768             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5769            || (charset == charset_big5_2
5770                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5771     {
5772       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5773       XSETFASTINT (val, (b1 << 8) | b2);
5774     }
5775   else
5776     error ("Can't encode to Big5: %d", XFASTINT (ch));
5777   return val;
5778 }
5779 \f
5780 DEFUN ("set-terminal-coding-system-internal",
5781        Fset_terminal_coding_system_internal,
5782        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5783   (coding_system)
5784      Lisp_Object coding_system;
5785 {
5786   CHECK_SYMBOL (coding_system, 0);
5787   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5788   /* We had better not send unsafe characters to terminal.  */
5789   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5790   /* Characer composition should be disabled.  */
5791   terminal_coding.composing = COMPOSITION_DISABLED;
5792   return Qnil;
5793 }
5794
5795 DEFUN ("set-safe-terminal-coding-system-internal",
5796        Fset_safe_terminal_coding_system_internal,
5797        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5798   (coding_system)
5799      Lisp_Object coding_system;
5800 {
5801   CHECK_SYMBOL (coding_system, 0);
5802   setup_coding_system (Fcheck_coding_system (coding_system),
5803                        &safe_terminal_coding);
5804   /* Characer composition should be disabled.  */
5805   safe_terminal_coding.composing = COMPOSITION_DISABLED;
5806   return Qnil;
5807 }
5808
5809 DEFUN ("terminal-coding-system",
5810        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5811   "Return coding system specified for terminal output.")
5812   ()
5813 {
5814   return terminal_coding.symbol;
5815 }
5816
5817 DEFUN ("set-keyboard-coding-system-internal",
5818        Fset_keyboard_coding_system_internal,
5819        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5820   (coding_system)
5821      Lisp_Object coding_system;
5822 {
5823   CHECK_SYMBOL (coding_system, 0);
5824   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5825   /* Characer composition should be disabled.  */
5826   keyboard_coding.composing = COMPOSITION_DISABLED;
5827   return Qnil;
5828 }
5829
5830 DEFUN ("keyboard-coding-system",
5831        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5832   "Return coding system specified for decoding keyboard input.")
5833   ()
5834 {
5835   return keyboard_coding.symbol;
5836 }
5837
5838 \f
5839 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5840        Sfind_operation_coding_system,  1, MANY, 0,
5841   "Choose a coding system for an operation based on the target name.\n\
5842 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5843 DECODING-SYSTEM is the coding system to use for decoding\n\
5844 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5845 for encoding (in case OPERATION does encoding).\n\
5846 \n\
5847 The first argument OPERATION specifies an I/O primitive:\n\
5848   For file I/O, `insert-file-contents' or `write-region'.\n\
5849   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5850   For network I/O, `open-network-stream'.\n\
5851 \n\
5852 The remaining arguments should be the same arguments that were passed\n\
5853 to the primitive.  Depending on which primitive, one of those arguments\n\
5854 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5855 whichever argument specifies the file name is TARGET.\n\
5856 \n\
5857 TARGET has a meaning which depends on OPERATION:\n\
5858   For file I/O, TARGET is a file name.\n\
5859   For process I/O, TARGET is a process name.\n\
5860   For network I/O, TARGET is a service name or a port number\n\
5861 \n\
5862 This function looks up what specified for TARGET in,\n\
5863 `file-coding-system-alist', `process-coding-system-alist',\n\
5864 or `network-coding-system-alist' depending on OPERATION.\n\
5865 They may specify a coding system, a cons of coding systems,\n\
5866 or a function symbol to call.\n\
5867 In the last case, we call the function with one argument,\n\
5868 which is a list of all the arguments given to this function.")
5869   (nargs, args)
5870      int nargs;
5871      Lisp_Object *args;
5872 {
5873   Lisp_Object operation, target_idx, target, val;
5874   register Lisp_Object chain;
5875
5876   if (nargs < 2)
5877     error ("Too few arguments");
5878   operation = args[0];
5879   if (!SYMBOLP (operation)
5880       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5881     error ("Invalid first arguement");
5882   if (nargs < 1 + XINT (target_idx))
5883     error ("Too few arguments for operation: %s",
5884            XSYMBOL (operation)->name->data);
5885   target = args[XINT (target_idx) + 1];
5886   if (!(STRINGP (target)
5887         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5888     error ("Invalid %dth argument", XINT (target_idx) + 1);
5889
5890   chain = ((EQ (operation, Qinsert_file_contents)
5891             || EQ (operation, Qwrite_region))
5892            ? Vfile_coding_system_alist
5893            : (EQ (operation, Qopen_network_stream)
5894               ? Vnetwork_coding_system_alist
5895               : Vprocess_coding_system_alist));
5896   if (NILP (chain))
5897     return Qnil;
5898
5899   for (; CONSP (chain); chain = XCDR (chain))
5900     {
5901       Lisp_Object elt;
5902       elt = XCAR (chain);
5903
5904       if (CONSP (elt)
5905           && ((STRINGP (target)
5906                && STRINGP (XCAR (elt))
5907                && fast_string_match (XCAR (elt), target) >= 0)
5908               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5909         {
5910           val = XCDR (elt);
5911           /* Here, if VAL is both a valid coding system and a valid
5912              function symbol, we return VAL as a coding system.  */
5913           if (CONSP (val))
5914             return val;
5915           if (! SYMBOLP (val))
5916             return Qnil;
5917           if (! NILP (Fcoding_system_p (val)))
5918             return Fcons (val, val);
5919           if (! NILP (Ffboundp (val)))
5920             {
5921               val = call1 (val, Flist (nargs, args));
5922               if (CONSP (val))
5923                 return val;
5924               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5925                 return Fcons (val, val);
5926             }
5927           return Qnil;
5928         }
5929     }
5930   return Qnil;
5931 }
5932
5933 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5934        Supdate_coding_systems_internal, 0, 0, 0,
5935   "Update internal database for ISO2022 and CCL based coding systems.\n\
5936 When values of any coding categories are changed, you must\n\
5937 call this function")
5938   ()
5939 {
5940   int i;
5941
5942   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5943     {
5944       Lisp_Object val;
5945
5946       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5947       if (!NILP (val))
5948         {
5949           if (! coding_system_table[i])
5950             coding_system_table[i] = ((struct coding_system *)
5951                                       xmalloc (sizeof (struct coding_system)));
5952           setup_coding_system (val, coding_system_table[i]);
5953         }
5954       else if (coding_system_table[i])
5955         {
5956           xfree (coding_system_table[i]);
5957           coding_system_table[i] = NULL;
5958         }
5959     }
5960
5961   return Qnil;
5962 }
5963
5964 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5965        Sset_coding_priority_internal, 0, 0, 0,
5966   "Update internal database for the current value of `coding-category-list'.\n\
5967 This function is internal use only.")
5968   ()
5969 {
5970   int i = 0, idx;
5971   Lisp_Object val;
5972
5973   val = Vcoding_category_list;
5974
5975   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5976     {
5977       if (! SYMBOLP (XCAR (val)))
5978         break;
5979       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5980       if (idx >= CODING_CATEGORY_IDX_MAX)
5981         break;
5982       coding_priorities[i++] = (1 << idx);
5983       val = XCDR (val);
5984     }
5985   /* If coding-category-list is valid and contains all coding
5986      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5987      the following code saves Emacs from crashing.  */
5988   while (i < CODING_CATEGORY_IDX_MAX)
5989     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5990
5991   return Qnil;
5992 }
5993
5994 #endif /* emacs */
5995
5996 \f
5997 /*** 9. Post-amble ***/
5998
5999 void
6000 init_coding ()
6001 {
6002   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6003 }
6004
6005 void
6006 init_coding_once ()
6007 {
6008   int i;
6009
6010   /* Emacs' internal format specific initialize routine.  */
6011   for (i = 0; i <= 0x20; i++)
6012     emacs_code_class[i] = EMACS_control_code;
6013   emacs_code_class[0x0A] = EMACS_linefeed_code;
6014   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6015   for (i = 0x21 ; i < 0x7F; i++)
6016     emacs_code_class[i] = EMACS_ascii_code;
6017   emacs_code_class[0x7F] = EMACS_control_code;
6018   for (i = 0x80; i < 0xFF; i++)
6019     emacs_code_class[i] = EMACS_invalid_code;
6020   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6021   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6022   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6023   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6024
6025   /* ISO2022 specific initialize routine.  */
6026   for (i = 0; i < 0x20; i++)
6027     iso_code_class[i] = ISO_control_code;
6028   for (i = 0x21; i < 0x7F; i++)
6029     iso_code_class[i] = ISO_graphic_plane_0;
6030   for (i = 0x80; i < 0xA0; i++)
6031     iso_code_class[i] = ISO_control_code;
6032   for (i = 0xA1; i < 0xFF; i++)
6033     iso_code_class[i] = ISO_graphic_plane_1;
6034   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6035   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6036   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6037   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6038   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6039   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6040   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6041   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6042   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6043   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6044
6045   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6046
6047   setup_coding_system (Qnil, &keyboard_coding);
6048   setup_coding_system (Qnil, &terminal_coding);
6049   setup_coding_system (Qnil, &safe_terminal_coding);
6050   setup_coding_system (Qnil, &default_buffer_file_coding);
6051
6052   bzero (coding_system_table, sizeof coding_system_table);
6053
6054   bzero (ascii_skip_code, sizeof ascii_skip_code);
6055   for (i = 0; i < 128; i++)
6056     ascii_skip_code[i] = 1;
6057
6058 #if defined (MSDOS) || defined (WINDOWSNT)
6059   system_eol_type = CODING_EOL_CRLF;
6060 #else
6061   system_eol_type = CODING_EOL_LF;
6062 #endif
6063
6064   inhibit_pre_post_conversion = 0;
6065 }
6066
6067 #ifdef emacs
6068
6069 void
6070 syms_of_coding ()
6071 {
6072   Qtarget_idx = intern ("target-idx");
6073   staticpro (&Qtarget_idx);
6074
6075   Qcoding_system_history = intern ("coding-system-history");
6076   staticpro (&Qcoding_system_history);
6077   Fset (Qcoding_system_history, Qnil);
6078
6079   /* Target FILENAME is the first argument.  */
6080   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6081   /* Target FILENAME is the third argument.  */
6082   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6083
6084   Qcall_process = intern ("call-process");
6085   staticpro (&Qcall_process);
6086   /* Target PROGRAM is the first argument.  */
6087   Fput (Qcall_process, Qtarget_idx, make_number (0));
6088
6089   Qcall_process_region = intern ("call-process-region");
6090   staticpro (&Qcall_process_region);
6091   /* Target PROGRAM is the third argument.  */
6092   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6093
6094   Qstart_process = intern ("start-process");
6095   staticpro (&Qstart_process);
6096   /* Target PROGRAM is the third argument.  */
6097   Fput (Qstart_process, Qtarget_idx, make_number (2));
6098
6099   Qopen_network_stream = intern ("open-network-stream");
6100   staticpro (&Qopen_network_stream);
6101   /* Target SERVICE is the fourth argument.  */
6102   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6103
6104   Qcoding_system = intern ("coding-system");
6105   staticpro (&Qcoding_system);
6106
6107   Qeol_type = intern ("eol-type");
6108   staticpro (&Qeol_type);
6109
6110   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6111   staticpro (&Qbuffer_file_coding_system);
6112
6113   Qpost_read_conversion = intern ("post-read-conversion");
6114   staticpro (&Qpost_read_conversion);
6115
6116   Qpre_write_conversion = intern ("pre-write-conversion");
6117   staticpro (&Qpre_write_conversion);
6118
6119   Qno_conversion = intern ("no-conversion");
6120   staticpro (&Qno_conversion);
6121
6122   Qundecided = intern ("undecided");
6123   staticpro (&Qundecided);
6124
6125   Qcoding_system_p = intern ("coding-system-p");
6126   staticpro (&Qcoding_system_p);
6127
6128   Qcoding_system_error = intern ("coding-system-error");
6129   staticpro (&Qcoding_system_error);
6130
6131   Fput (Qcoding_system_error, Qerror_conditions,
6132         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6133   Fput (Qcoding_system_error, Qerror_message,
6134         build_string ("Invalid coding system"));
6135
6136   Qcoding_category = intern ("coding-category");
6137   staticpro (&Qcoding_category);
6138   Qcoding_category_index = intern ("coding-category-index");
6139   staticpro (&Qcoding_category_index);
6140
6141   Vcoding_category_table
6142     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6143   staticpro (&Vcoding_category_table);
6144   {
6145     int i;
6146     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6147       {
6148         XVECTOR (Vcoding_category_table)->contents[i]
6149           = intern (coding_category_name[i]);
6150         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6151               Qcoding_category_index, make_number (i));
6152       }
6153   }
6154
6155   Qtranslation_table = intern ("translation-table");
6156   staticpro (&Qtranslation_table);
6157   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6158
6159   Qtranslation_table_id = intern ("translation-table-id");
6160   staticpro (&Qtranslation_table_id);
6161
6162   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6163   staticpro (&Qtranslation_table_for_decode);
6164
6165   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6166   staticpro (&Qtranslation_table_for_encode);
6167
6168   Qsafe_charsets = intern ("safe-charsets");
6169   staticpro (&Qsafe_charsets);
6170
6171   Qvalid_codes = intern ("valid-codes");
6172   staticpro (&Qvalid_codes);
6173
6174   Qemacs_mule = intern ("emacs-mule");
6175   staticpro (&Qemacs_mule);
6176
6177   Qraw_text = intern ("raw-text");
6178   staticpro (&Qraw_text);
6179
6180   defsubr (&Scoding_system_p);
6181   defsubr (&Sread_coding_system);
6182   defsubr (&Sread_non_nil_coding_system);
6183   defsubr (&Scheck_coding_system);
6184   defsubr (&Sdetect_coding_region);
6185   defsubr (&Sdetect_coding_string);
6186   defsubr (&Sdecode_coding_region);
6187   defsubr (&Sencode_coding_region);
6188   defsubr (&Sdecode_coding_string);
6189   defsubr (&Sencode_coding_string);
6190   defsubr (&Sdecode_sjis_char);
6191   defsubr (&Sencode_sjis_char);
6192   defsubr (&Sdecode_big5_char);
6193   defsubr (&Sencode_big5_char);
6194   defsubr (&Sset_terminal_coding_system_internal);
6195   defsubr (&Sset_safe_terminal_coding_system_internal);
6196   defsubr (&Sterminal_coding_system);
6197   defsubr (&Sset_keyboard_coding_system_internal);
6198   defsubr (&Skeyboard_coding_system);
6199   defsubr (&Sfind_operation_coding_system);
6200   defsubr (&Supdate_coding_systems_internal);
6201   defsubr (&Sset_coding_priority_internal);
6202
6203   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6204     "List of coding systems.\n\
6205 \n\
6206 Do not alter the value of this variable manually.  This variable should be\n\
6207 updated by the functions `make-coding-system' and\n\
6208 `define-coding-system-alias'.");
6209   Vcoding_system_list = Qnil;
6210
6211   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6212     "Alist of coding system names.\n\
6213 Each element is one element list of coding system name.\n\
6214 This variable is given to `completing-read' as TABLE argument.\n\
6215 \n\
6216 Do not alter the value of this variable manually.  This variable should be\n\
6217 updated by the functions `make-coding-system' and\n\
6218 `define-coding-system-alias'.");
6219   Vcoding_system_alist = Qnil;
6220
6221   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6222     "List of coding-categories (symbols) ordered by priority.");
6223   {
6224     int i;
6225
6226     Vcoding_category_list = Qnil;
6227     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6228       Vcoding_category_list
6229         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6230                  Vcoding_category_list);
6231   }
6232
6233   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6234     "Specify the coding system for read operations.\n\
6235 It is useful to bind this variable with `let', but do not set it globally.\n\
6236 If the value is a coding system, it is used for decoding on read operation.\n\
6237 If not, an appropriate element is used from one of the coding system alists:\n\
6238 There are three such tables, `file-coding-system-alist',\n\
6239 `process-coding-system-alist', and `network-coding-system-alist'.");
6240   Vcoding_system_for_read = Qnil;
6241
6242   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6243     "Specify the coding system for write operations.\n\
6244 Programs bind this variable with `let', but you should not set it globally.\n\
6245 If the value is a coding system, it is used for encoding of output,\n\
6246 when writing it to a file and when sending it to a file or subprocess.\n\
6247 \n\
6248 If this does not specify a coding system, an appropriate element\n\
6249 is used from one of the coding system alists:\n\
6250 There are three such tables, `file-coding-system-alist',\n\
6251 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6252 For output to files, if the above procedure does not specify a coding system,\n\
6253 the value of `buffer-file-coding-system' is used.");
6254   Vcoding_system_for_write = Qnil;
6255
6256   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6257     "Coding system used in the latest file or process I/O.");
6258   Vlast_coding_system_used = Qnil;
6259
6260   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6261     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6262 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6263 such conversion.");
6264   inhibit_eol_conversion = 0;
6265
6266   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6267     "Non-nil means process buffer inherits coding system of process output.\n\
6268 Bind it to t if the process output is to be treated as if it were a file\n\
6269 read from some filesystem.");
6270   inherit_process_coding_system = 0;
6271
6272   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6273     "Alist to decide a coding system to use for a file I/O operation.\n\
6274 The format is ((PATTERN . VAL) ...),\n\
6275 where PATTERN is a regular expression matching a file name,\n\
6276 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6277 If VAL is a coding system, it is used for both decoding and encoding\n\
6278 the file contents.\n\
6279 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6280 and the cdr part is used for encoding.\n\
6281 If VAL is a function symbol, the function must return a coding system\n\
6282 or a cons of coding systems which are used as above.\n\
6283 \n\
6284 See also the function `find-operation-coding-system'\n\
6285 and the variable `auto-coding-alist'.");
6286   Vfile_coding_system_alist = Qnil;
6287
6288   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6289     "Alist to decide a coding system to use for a process I/O operation.\n\
6290 The format is ((PATTERN . VAL) ...),\n\
6291 where PATTERN is a regular expression matching a program name,\n\
6292 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6293 If VAL is a coding system, it is used for both decoding what received\n\
6294 from the program and encoding what sent to the program.\n\
6295 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6296 and the cdr part is used for encoding.\n\
6297 If VAL is a function symbol, the function must return a coding system\n\
6298 or a cons of coding systems which are used as above.\n\
6299 \n\
6300 See also the function `find-operation-coding-system'.");
6301   Vprocess_coding_system_alist = Qnil;
6302
6303   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6304     "Alist to decide a coding system to use for a network I/O operation.\n\
6305 The format is ((PATTERN . VAL) ...),\n\
6306 where PATTERN is a regular expression matching a network service name\n\
6307 or is a port number to connect to,\n\
6308 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6309 If VAL is a coding system, it is used for both decoding what received\n\
6310 from the network stream and encoding what sent to the network stream.\n\
6311 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6312 and the cdr part is used for encoding.\n\
6313 If VAL is a function symbol, the function must return a coding system\n\
6314 or a cons of coding systems which are used as above.\n\
6315 \n\
6316 See also the function `find-operation-coding-system'.");
6317   Vnetwork_coding_system_alist = Qnil;
6318
6319   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6320     "Coding system to use with system messages.");
6321   Vlocale_coding_system = Qnil;
6322
6323   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6324     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6325   eol_mnemonic_unix = build_string (":");
6326
6327   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6328     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6329   eol_mnemonic_dos = build_string ("\\");
6330
6331   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6332     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6333   eol_mnemonic_mac = build_string ("/");
6334
6335   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6336     "*String displayed in mode line when end-of-line format is not yet determined.");
6337   eol_mnemonic_undecided = build_string (":");
6338
6339   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6340     "*Non-nil enables character translation while encoding and decoding.");
6341   Venable_character_translation = Qt;
6342
6343   DEFVAR_LISP ("standard-translation-table-for-decode",
6344     &Vstandard_translation_table_for_decode,
6345     "Table for translating characters while decoding.");
6346   Vstandard_translation_table_for_decode = Qnil;
6347
6348   DEFVAR_LISP ("standard-translation-table-for-encode",
6349     &Vstandard_translation_table_for_encode,
6350     "Table for translationg characters while encoding.");
6351   Vstandard_translation_table_for_encode = Qnil;
6352
6353   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6354     "Alist of charsets vs revision numbers.\n\
6355 While encoding, if a charset (car part of an element) is found,\n\
6356 designate it with the escape sequence identifing revision (cdr part of the element).");
6357   Vcharset_revision_alist = Qnil;
6358
6359   DEFVAR_LISP ("default-process-coding-system",
6360                &Vdefault_process_coding_system,
6361     "Cons of coding systems used for process I/O by default.\n\
6362 The car part is used for decoding a process output,\n\
6363 the cdr part is used for encoding a text to be sent to a process.");
6364   Vdefault_process_coding_system = Qnil;
6365
6366   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6367     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6368 This is a vector of length 256.\n\
6369 If Nth element is non-nil, the existence of code N in a file\n\
6370 \(or output of subprocess) doesn't prevent it to be detected as\n\
6371 a coding system of ISO 2022 variant which has a flag\n\
6372 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6373 or reading output of a subprocess.\n\
6374 Only 128th through 159th elements has a meaning.");
6375   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6376
6377   DEFVAR_LISP ("select-safe-coding-system-function",
6378                &Vselect_safe_coding_system_function,
6379     "Function to call to select safe coding system for encoding a text.\n\
6380 \n\
6381 If set, this function is called to force a user to select a proper\n\
6382 coding system which can encode the text in the case that a default\n\
6383 coding system used in each operation can't encode the text.\n\
6384 \n\
6385 The default value is `select-safe-coding-system' (which see).");
6386   Vselect_safe_coding_system_function = Qnil;
6387
6388 }
6389
6390 char *
6391 emacs_strerror (error_number)
6392      int error_number;
6393 {
6394   char *str;
6395
6396   synchronize_system_messages_locale ();
6397   str = strerror (error_number);
6398
6399   if (! NILP (Vlocale_coding_system))
6400     {
6401       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6402                                                       Vlocale_coding_system,
6403                                                       0);
6404       str = (char *) XSTRING (dec)->data;
6405     }
6406
6407   return str;
6408 }
6409
6410 #endif /* emacs */