code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How the end of a line of text is encoded depends on the operating
 123   system.  For instance, Unix's format is just one byte of LF
 124   (line-feed) code, whereas DOS's format is a two-byte sequence of
 125   `carriage-return' and `line-feed' codes.  MacOS's format is usually
 126   one byte of `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO: `rejected' on a non-conforming byte, `found' otherwise.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
           /* Only a byte that strongly suggests XXX counts as evidence.  */
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
     /* Presumably the hash table that maps a coding system symbol to its
        vector of attributes (see "CODING SYSTEM" in the general comments)
        -- TODO confirm against the code that fills it.  */
 304 Lisp_Object Vcoding_system_hash_table;
 305
     /* Lisp objects named Q... and QC...; presumably symbols and keywords
        set up at initialization, which is not visible in this part of the
        file.  Those without `static' are visible to other files.  */
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
     /* Declared only on WINDOWSNT and CYGWIN builds.  */
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.
        NOTE(review): only one variable, safe_terminal_coding, follows in
        this section; the count above looks stale -- confirm.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
     /* The integer stored at index REG of the `coding_attr_iso_initial'
        attribute of CODING (looked up through CODING_ID_ATTRS).  */
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
     /* CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is above
        CODING->max_charset_id or the stored value is 255.  Both arguments
        are evaluated more than once, so they must be free of side
        effects.  */
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
     /* Accessors for the members of CODING->spec.iso_2022.  Except for
        CODING_ISO_CMP_STATUS, which yields a pointer to the member, each
        expands to the member itself and can be assigned to.  */
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
     /* The current designation of the register that is currently invoked
        to PLANE.  */
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    following.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 4 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
     /* NOTE(review): the remaining flags carry no description in this
        file; their meaning has to be taken from the code that tests
        them.  Bit 0x4000 is unassigned (its definition is commented
        out), and bits 0x40000 and 0x80000 are not assigned here.  */
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 497
 498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 499
 500 /* A character to be produced on output if encoding of the original
 501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
 504 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 505 #define CODING_UTF_8_BOM(coding)        \
 506   ((coding)->spec.utf_8_bom)
 507
 508 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 509 #define CODING_UTF_16_BOM(coding)       \
 510   ((coding)->spec.utf_16.bom)
 511
 512 #define CODING_UTF_16_ENDIAN(coding)    \
 513   ((coding)->spec.utf_16.endian)
 514
 515 #define CODING_UTF_16_SURROGATE(coding) \
 516   ((coding)->spec.utf_16.surrogate)
 517
 518
 519 /* CCL section: the decoder, encoder and valids attributes of CODING,
        fetched from its attribute vector (CODING_ID_ATTRS).
        CODING_CCL_VALIDS applies SDATA to the attribute, so the attribute
        is presumably a Lisp string -- TODO confirm.  */
 520 #define CODING_CCL_DECODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 522 #define CODING_CCL_ENCODER(coding)      \
 523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 524 #define CODING_CCL_VALIDS(coding)                                          \
 525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
 527 /* Index for each coding category in `coding_categories'.  Each value
        is also used as a bit position by the CATEGORY_MASK_XXX macros
        (1 << coding_category_xxx), and coding_category_max sizes the
        `coding_priorities' and `coding_categories' arrays.  */
 528
 529 enum coding_category
 530   {
 531     coding_category_iso_7,
 532     coding_category_iso_7_tight,
 533     coding_category_iso_8_1,
 534     coding_category_iso_8_2,
 535     coding_category_iso_7_else,
 536     coding_category_iso_8_else,
 537     coding_category_utf_8_auto,
 538     coding_category_utf_8_nosig,
 539     coding_category_utf_8_sig,
 540     coding_category_utf_16_auto,
 541     coding_category_utf_16_be,
 542     coding_category_utf_16_le,
 543     coding_category_utf_16_be_nosig,
 544     coding_category_utf_16_le_nosig,
 545     coding_category_charset,
 546     coding_category_sjis,
 547     coding_category_big5,
 548     coding_category_ccl,
 549     coding_category_emacs_mule,
 550     /* All above are targets of code detection.  */
 551     coding_category_raw_text,
 552     coding_category_undecided,
 553     coding_category_max
 554   };
 555
 556 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        the single bit whose position is the matching enum coding_category
        value.  No mask is defined for coding_category_undecided.  */
 557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 577
 578 /* This value is returned if detect_coding_mask () finds nothing other
 579    than ASCII characters.  It is the union of every mask defined above
        except CATEGORY_MASK_RAW_TEXT.  */
 580 #define CATEGORY_MASK_ANY               \
 581   (CATEGORY_MASK_ISO_7                  \
 582    | CATEGORY_MASK_ISO_7_TIGHT          \
 583    | CATEGORY_MASK_ISO_8_1              \
 584    | CATEGORY_MASK_ISO_8_2              \
 585    | CATEGORY_MASK_ISO_7_ELSE           \
 586    | CATEGORY_MASK_ISO_8_ELSE           \
 587    | CATEGORY_MASK_UTF_8_AUTO           \
 588    | CATEGORY_MASK_UTF_8_NOSIG          \
 589    | CATEGORY_MASK_UTF_8_SIG            \
 590    | CATEGORY_MASK_UTF_16_AUTO          \
 591    | CATEGORY_MASK_UTF_16_BE            \
 592    | CATEGORY_MASK_UTF_16_LE            \
 593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 595    | CATEGORY_MASK_CHARSET              \
 596    | CATEGORY_MASK_SJIS                 \
 597    | CATEGORY_MASK_BIG5                 \
 598    | CATEGORY_MASK_CCL                  \
 599    | CATEGORY_MASK_EMACS_MULE)
 600
 601
     /* Unions of related masks: the two 7-bit ISO categories, the two
        8-bit ISO categories, and the two "else" ISO categories.  */
 602 #define CATEGORY_MASK_ISO_7BIT \
 603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 604
 605 #define CATEGORY_MASK_ISO_8BIT \
 606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 607
 608 #define CATEGORY_MASK_ISO_ELSE \
 609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 610
     /* Every ISO category except ISO_8_1 and ISO_8_2.  NOTE(review):
        presumably the categories that can use escape sequences --
        confirm against detect_coding_iso_2022.  */
 611 #define CATEGORY_MASK_ISO_ESCAPE        \
 612   (CATEGORY_MASK_ISO_7                  \
 613    | CATEGORY_MASK_ISO_7_TIGHT          \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE)
 616
     /* All six ISO categories.  */
 617 #define CATEGORY_MASK_ISO       \
 618   (  CATEGORY_MASK_ISO_7BIT     \
 619      | CATEGORY_MASK_ISO_8BIT   \
 620      | CATEGORY_MASK_ISO_ELSE)
 621
     /* All five UTF-16 categories.  */
 622 #define CATEGORY_MASK_UTF_16            \
 623   (CATEGORY_MASK_UTF_16_AUTO            \
 624    | CATEGORY_MASK_UTF_16_BE            \
 625    | CATEGORY_MASK_UTF_16_LE            \
 626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 628
     /* All three UTF-8 categories.  */
 629 #define CATEGORY_MASK_UTF_8     \
 630   (CATEGORY_MASK_UTF_8_AUTO     \
 631    | CATEGORY_MASK_UTF_8_NOSIG  \
 632    | CATEGORY_MASK_UTF_8_SIG)
 633
 634 /* Table of coding categories (Lisp symbols).  This variable is for
 635    internal use only.  */
 636 static Lisp_Object Vcoding_category_table;
 637
 638 /* Table of coding-categories ordered by priority.  It has one slot per
        category (coding_category_max of them).  NOTE(review): presumably
        element 0 is the highest priority -- confirm.  */
 639 static enum coding_category coding_priorities[coding_category_max];
 640
 641 /* Nth element is a coding context for the coding system bound to the
 642    Nth coding category.  */
 643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
     /* Fallback definitions of min and max, used only when no earlier
        header has defined them.  Each argument may be evaluated twice, so
        do not pass expressions with side effects.  */
 647 #ifndef min
 648 #define min(a, b) ((a) < (b) ? (a) : (b))
 649 #endif
 650 #ifndef max
 651 #define max(a, b) ((a) > (b) ? (a) : (b))
 652 #endif
 653
 654 /* Encode FLAG as an int: nil -> -1, t -> 1, any other value -> 0.
        The result is meant to be passed to inhibit_flag.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return NILP (flag) ? -1 : EQ (flag, Qt);
 660 }
 661
 662 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 663    1 means yes, -1 means no, 0 means ask the user variable VAR.
        ENCODED_FLAG is expected to be one of the three values produced by
        encode_inhibit_flag.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
       /* VAR is 0 or 1, so the sum is positive for 1 regardless of VAR,
          never positive for -1, and positive for 0 exactly when VAR.  */
 668   return 0 < encoded_flag + var;
 669 }
 670
     /* Set ATTRS to CODING_ID_ATTRS of CODING's id, then set CHARSET_LIST
        to CODING_ATTR_CHARSET_LIST of ATTRS.  ATTRS and CHARSET_LIST must
        be assignable; ATTRS is evaluated twice.  */
 671 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 672   do {                                                  \
 673     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 674     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 675   } while (0)
 676
     /* Apply CHECK_NATNUM to the car of X and store the checked value back
        into the car.  X must be a cons.  NOTE(review): the write-back only
        matters if CHECK_NATNUM can modify its argument -- confirm.  */
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object tmp = XCAR (x);
 681   CHECK_NATNUM (tmp);
 682   XSETCAR (x, tmp);
 683 }
 684
     /* Apply CHECK_NATNUM to the cdr of X and store the checked value back
        into the cdr.  X must be a cons.  NOTE(review): the write-back only
        matters if CHECK_NATNUM can modify its argument -- confirm.  */
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tmp = XCDR (x);
 689   CHECK_NATNUM (tmp);
 690   XSETCDR (x, tmp);
 691 }
 692
 693
 694 /* Safely get one byte from the source text pointed to by SRC, which
 695    ends at SRC_END, set C to that byte and increment CONSUMED_CHARS.
 696    If the source is exhausted, jump to 'no_more_source', first
 697    recording CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC.  If
 698    MULTIBYTEP and the byte has its high bit set: a two-byte sequence
 699    led by 0xC0 or 0xC1 is folded into the single byte
 700    ((lead & 1) << 6) | trail; for any other multibyte character C is
        set to the negative value of the character code and
        CODING_RESULT_INVALID_SRC is recorded.  The caller should declare
        and set these variables appropriately in advance:
             src, src_end, src_base, multibytep, consumed_chars, coding
        and must provide the label 'no_more_source'.
        NOTE(review): the trail byte is read without checking SRC_END;
        this presumably relies on multibyte sources being well formed.  */
 701
 702 #define ONE_MORE_BYTE(c)                                \
 703   do {                                                  \
 704     if (src == src_end)                                 \
 705       {                                                 \
 706         if (src_base < src)                             \
 707           record_conversion_result                      \
 708             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 709         goto no_more_source;                            \
 710       }                                                 \
 711     c = *src++;                                         \
 712     if (multibytep && (c & 0x80))                       \
 713       {                                                 \
 714         if ((c & 0xFE) == 0xC0)                         \
 715           c = ((c & 1) << 6) | *src++;                  \
 716         else                                            \
 717           {                                             \
 718             src--;                                      \
 719             c = - string_char (src, &src, NULL);        \
 720             record_conversion_result                    \
 721               (coding, CODING_RESULT_INVALID_SRC);      \
 722           }                                             \
 723       }                                                 \
 724     consumed_chars++;                                   \
 725   } while (0)
 726
 727 /* Safely get two bytes from the source text pointed to by SRC, which
 728    ends at SRC_END, and set C1 and C2 to those bytes while skipping the
 729    heading multibyte characters.  If there are not enough bytes in the
 730    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 731    a multibyte character is found for C2, set C2 to -1 (the character
 732    code itself is not computed).  The caller should declare and set
 733    these variables appropriately in advance:
 734         src, src_end, multibytep
 735    It is intended that this macro is used in detect_coding_utf_16.
        C1 and C2 must be signed, since -1 is used as a marker.  As in
        ONE_MORE_BYTE, a two-byte sequence led by 0xC0 or 0xC1 is folded
        into a single byte.  Unlike ONE_MORE_BYTE, this macro neither
        counts consumed characters nor records a conversion result.
        NOTE(review): when C2 becomes -1, SRC is left just after the lead
        byte of that multibyte character -- confirm callers expect this.  */
 736
 737 #define TWO_MORE_BYTES(c1, c2)                          \
 738   do {                                                  \
 739     do {                                                \
 740       if (src == src_end)                               \
 741         goto no_more_source;                            \
 742       c1 = *src++;                                      \
 743       if (multibytep && (c1 & 0x80))                    \
 744         {                                               \
 745           if ((c1 & 0xFE) == 0xC0)                      \
 746             c1 = ((c1 & 1) << 6) | *src++;              \
 747           else                                          \
 748             {                                           \
 749               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 750               c1 = -1;                                  \
 751             }                                           \
 752         }                                               \
 753     } while (c1 < 0);                                   \
 754     if (src == src_end)                                 \
 755       goto no_more_source;                              \
 756     c2 = *src++;                                        \
 757     if (multibytep && (c2 & 0x80))                      \
 758       {                                                 \
 759         if ((c2 & 0xFE) == 0xC0)                        \
 760           c2 = ((c2 & 1) << 6) | *src++;                \
 761         else                                            \
 762           c2 = -1;                                      \
 763       }                                                 \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed to by DST and increment DST to
 768    the next free point, and increment PRODUCED_CHARS.  The caller should
 769    assure that C is 0..127, and declare and set the variables `dst' and
 770    `produced_chars' appropriately in advance.  No bounds check is made.
 771 */
 772
 773
 774 #define EMIT_ONE_ASCII_BYTE(c)  \
 775   do {                          \
 776     produced_chars++;           \
 777     *dst++ = (c);               \
 778   } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
 879 /* These wrapper macros are used to preserve validity of pointers into
 880    buffer text across calls to decode_char, encode_char, etc, which
 881    could cause relocation of buffers if it loads a charset map,
 882    because loading a charset map allocates large structures.  */
 883
 884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 885   do {                                                                       \
 886     ptrdiff_t offset;                                                        \
 887                                                                              \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded                                                   \
 891         && (offset = coding_change_source (coding)))                         \
 892       {                                                                      \
 893         src += offset;                                                       \
 894         src_base += offset;                                                  \
 895         src_end += offset;                                                   \
 896       }                                                                      \
 897   } while (0)
 898
 899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 900   do {                                                                  \
 901     ptrdiff_t offset;                                                   \
 902                                                                         \
 903     charset_map_loaded = 0;                                             \
 904     code = ENCODE_CHAR (charset, c);                                    \
 905     if (charset_map_loaded                                              \
 906         && (offset = coding_change_destination (coding)))               \
 907       {                                                                 \
 908         dst += offset;                                                  \
 909         dst_end += offset;                                              \
 910       }                                                                 \
 911   } while (0)
 912
 913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 914   do {                                                                  \
 915     ptrdiff_t offset;                                                   \
 916                                                                         \
 917     charset_map_loaded = 0;                                             \
 918     charset = char_charset (c, charset_list, code_return);              \
 919     if (charset_map_loaded                                              \
 920         && (offset = coding_change_destination (coding)))               \
 921       {                                                                 \
 922         dst += offset;                                                  \
 923         dst_end += offset;                                              \
 924       }                                                                 \
 925   } while (0)
 926
 927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 928   do {                                                                  \
 929     ptrdiff_t offset;                                                   \
 930                                                                         \
 931     charset_map_loaded = 0;                                             \
 932     result = CHAR_CHARSET_P (c, charset);                               \
 933     if (charset_map_loaded                                              \
 934         && (offset = coding_change_destination (coding)))               \
 935       {                                                                 \
 936         dst += offset;                                                  \
 937         dst_end += offset;                                              \
 938       }                                                                 \
 939   } while (0)
 940
 941
 942 /* If there are at least BYTES length of room at dst, allocate memory
 943    for coding->destination and update dst and dst_end.  We don't have
 944    to take care of coding->source which will be relocated.  It is
 945    handled by calling coding_set_source in encode_coding.  */
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
   without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
   MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE, so this is now a plain
   alias for CHAR_STRING_ADVANCE.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This used to be
   like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
   nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR, so this
   is now a plain alias for STRING_CHAR_ADVANCE.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 {
1069   if (EQ (coding->src_object, coding->dst_object))
1070     {
1071       /* The gap may contain the produced data at the head and not-yet
1072          consumed data at the tail.  To preserve those data, we at
1073          first make the gap size to zero, then increase the gap
1074          size.  */
1075       ptrdiff_t add = GAP_SIZE;
1076
1077       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079       make_gap (bytes);
1080       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082     }
1083   else
1084     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1085 }
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
/** Macros for annotations.  */

/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kind of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */

/* Maximum length of the header of annotation data.  This matches the
   five elements (-LENGTH, ANNOTATION_MASK, NCHARS, NBYTES, METHOD)
   that ADD_COMPOSITION_DATA writes.  */
#define MAX_ANNOTATION_LENGTH 5
1140
/* Store the three common header elements of an annotation (-LEN,
   MASK and NCHARS) at BUF, advancing BUF past them, and mark the
   conversion as annotated.  The caller must have a variable `coding'
   in scope.

   The expansion deliberately does not end in a semicolon, so that
   the macro behaves as a single statement, e.g. between `if' and
   `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1148
/* Store a composition annotation header at BUF, advancing BUF past
   it: the common header (with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK) followed by NBYTES and METHOD.
   Arguments are parenthesized so that an expression such as `*pp'
   may be passed as BUF.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1155
1156
/* Store a charset annotation at BUF, advancing BUF past it: the
   common header (with length 4 and CODING_ANNOTATE_CHARSET_MASK)
   followed by the charset ID.  Arguments are parenthesized so that
   an expression such as `*pp' may be passed as BUF.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1162
1163
/* Bitmasks for coding->eol_seen.  The values are distinct bits so
   that several kinds of line ending can be recorded together with
   `|='.  */

#define EOL_SEEN_NONE   0       /* No line ending seen.  */
#define EOL_SEEN_LF     1       /* A '\n' was seen.  */
#define EOL_SEEN_CR     2       /* A '\r' not followed by '\n' was seen.  */
#define EOL_SEEN_CRLF   4       /* A '\r' followed by '\n' was seen.  */
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in UTF-8.  */

/* Classify the octet C by its high-order bits:
     1-octet (ASCII)      C < 0x80
     extra (trailing)     10xxxxxx
     2-octet leading      110xxxxx
     3-octet leading      1110xxxx
     4-octet leading      11110xxx
     5-octet leading      111110xx  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 byte order mark, in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1192
/* Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   Scan the source of CODING for well-formed UTF-8 sequences and
   update DETECT_INFO.  Return 0, rejecting CATEGORY_MASK_UTF_8, when
   an ill-formed sequence is found or the last block ends in the
   middle of a sequence.  Otherwise return 1, after recording the
   scanned byte and character counts in CODING->detected_utf8_bytes
   and CODING->detected_utf8_chars.

   NOTE(review): eol_seen is accumulated in a local variable and is
   not stored back into CODING within this function -- confirm that
   this is intended.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  ptrdiff_t nchars = coding->head_ascii;
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  /* So skip the leading ASCII part; it is already counted in NCHARS.  */
  src += nchars;

  /* Note the strict `<': a BOM is recognized only when at least one
     more byte follows it.  */
  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      nchars++;
    }

  /* Each iteration validates one character.  The loop is left by
     `break' on an ill-formed sequence, or through the label
     no_more_source (presumably via ONE_MORE_BYTE, defined earlier in
     this file, once the source is exhausted).  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          nchars++;
          if (c == '\r')
            {
              /* CR LF is counted as two characters but consumed in
                 one step.  */
              if (src < src_end && *src == '\n')
                {
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* C is not a 1-octet character, so it must lead a sequence of
         two to five octets.  Each further octet has to be an extra
         octet; after each one, test whether C announced exactly the
         length read so far.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      /* C is not a valid leading octet at all.  */
      break;
    }
  /* Reached only by `break': the text is not valid UTF-8.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means the source ended inside a sequence.  That is
     an error only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading EF BB BF doesn't necessarily mean a BOM, so the
         NOSIG category stays a candidate as well.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* Fewer characters were found than there are source bytes,
           which means that we found a valid non-ASCII character.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  /* Only complete characters are counted: the byte count stops at
     the start of the last, possibly incomplete, sequence.  */
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1307
1308
/* Decode the UTF-8 source of CODING, starting at CODING->consumed,
   into character codes appended to CODING->charbuf.  Decoding stops
   when the source is exhausted or charbuf is full.  A byte that does
   not start a valid sequence is stored by itself (as an ASCII
   character or via BYTE8_TO_CHAR) and counted in CODING->errors.
   On exit, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS, or -1 if there is none.  */
  int byte_after_cr = -1;

  /* Unless the coding system is known to have no BOM, try to consume
     one here.  Whenever the first three bytes are not EF BB BF, SRC
     is rewound to SRC_BASE so that they are decoded as ordinary
     text.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* The BOM check is done only once per coding system: this
     unconditional assignment (which makes the one just above
     redundant) prevents it from running on later calls.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember where this character starts, so that an incomplete
         or invalid sequence can be rewound to this point.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* charbuf is full.  A byte already read ahead after a CR
             has not been decoded yet; back up so that it counts as
             not consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy ASCII bytes four at a time (manually unrolled),
             leaving the loop at the first byte with the high bit
             set.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Use the byte read ahead after a CR, if any, before reading a
         new one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is stored negated.  NOTE(review):
             presumably ONE_MORE_BYTE reports a non-ASCII multibyte
             character this way -- confirm against that macro.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS-style EOL, read the byte following a CR now;
             it is decoded by the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          /* Multi-octet sequence: read the extra octets one by one
             and, after each, test whether C1 announced exactly the
             length read so far.  */
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Reject codes beyond MAX_CHAR as well as
                             overlong forms.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Rewind to the start of the bad sequence and emit only its
         first byte; the following bytes are decoded afresh by the
         next iterations.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report only completely decoded characters as consumed: use the
     *_base values recorded at the start of the last iteration.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1497
1498
1499 static bool
1500 encode_coding_utf_8 (struct coding_system *coding)
1501 {
1502   bool multibytep = coding->dst_multibyte;
1503   int *charbuf = coding->charbuf;
1504   int *charbuf_end = charbuf + coding->charbuf_used;
1505   unsigned char *dst = coding->destination + coding->produced;
1506   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1507   ptrdiff_t produced_chars = 0;
1508   int c;
1509
1510   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1511     {
1512       ASSURE_DESTINATION (3);
1513       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1514       CODING_UTF_8_BOM (coding) = utf_without_bom;
1515     }
1516
1517   if (multibytep)
1518     {
1519       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1520
1521       while (charbuf < charbuf_end)
1522         {
1523           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1524
1525           ASSURE_DESTINATION (safe_room);
1526           c = *charbuf++;
1527           if (CHAR_BYTE8_P (c))
1528             {
1529               c = CHAR_TO_BYTE8 (c);
1530               EMIT_ONE_BYTE (c);
1531             }
1532           else
1533             {
1534               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1535               for (p = str; p < pend; p++)
1536                 EMIT_ONE_BYTE (*p);
1537             }
1538         }
1539     }
1540   else
1541     {
1542       int safe_room = MAX_MULTIBYTE_LENGTH;
1543
1544       while (charbuf < charbuf_end)
1545         {
1546           ASSURE_DESTINATION (safe_room);
1547           c = *charbuf++;
1548           if (CHAR_BYTE8_P (c))
1549             *dst++ = CHAR_TO_BYTE8 (c);
1550           else
1551             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1552         }
1553       produced_chars = dst - (coding->destination + coding->produced);
1554     }
1555   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1556   coding->produced_char += produced_chars;
1557   coding->produced = dst - coding->destination;
1558   return 0;
1559 }
1560
1561
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in one of UTF-16 based coding systems.  */

/* True if the 16-bit value VAL is in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit value VAL is in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1570
1571
/* Examine the source of CODING for UTF-16 and update DETECT_INFO.
   A leading byte pair FF FE or FE FF marks the little-endian or
   big-endian category, respectively, as found.  Without such a pair,
   the signature-less categories are rejected only when the bytes at
   even or odd offsets look too varied.  Return 1 if the source ran
   out while reading (including right after a byte order mark),
   0 otherwise.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A complete text with an odd src_chars count is rejected
     outright.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* FF FE: little-endian with signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* FE FF: big-endian with signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      /* TWO_MORE_BYTES yields -1 for a second byte it cannot turn
         into a raw byte value.  */
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* E and O flag the byte values seen so far at even and odd
         offsets; E_NUM and O_NUM count the distinct values.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No signature was seen, so only the NOSIG categories remain
         candidates.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Keep scanning until every UTF-16 category is rejected.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          /* 128 or more distinct values at even offsets rule out
             big-endian; at odd offsets, little-endian.  */
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  /* Reached by falling through after a byte order mark, or by jump
     from TWO_MORE_BYTES when the source is exhausted.  */
  return 1;
}
1654
1655 static void
1656 decode_coding_utf_16 (struct coding_system *coding)
1657 {
1658   const unsigned char *src = coding->source + coding->consumed;
1659   const unsigned char *src_end = coding->source + coding->src_bytes;
1660   const unsigned char *src_base;
1661   int *charbuf = coding->charbuf + coding->charbuf_used;
1662   /* We may produces at most 3 chars in one loop.  */
1663   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1664   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1665   bool multibytep = coding->src_multibyte;
1666   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1667   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1668   int surrogate = CODING_UTF_16_SURROGATE (coding);
1669   bool eol_dos
1670     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1671   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1672
1673   if (bom == utf_with_bom)
1674     {
1675       int c, c1, c2;
1676
1677       src_base = src;
1678       ONE_MORE_BYTE (c1);
1679       ONE_MORE_BYTE (c2);
1680       c = (c1 << 8) | c2;
1681
1682       if (endian == utf_16_big_endian
1683           ? c != 0xFEFF : c != 0xFFFE)
1684         {
1685           /* The first two bytes are not BOM.  Treat them as bytes
1686              for a normal character.  */
1687           src = src_base;
1688           coding->errors++;
1689         }
1690       CODING_UTF_16_BOM (coding) = utf_without_bom;
1691     }
1692   else if (bom == utf_detect_bom)
1693     {
1694       /* We have already tried to detect BOM and failed in
1695          detect_coding.  */
1696       CODING_UTF_16_BOM (coding) = utf_without_bom;
1697     }
1698
1699   while (1)
1700     {
1701       int c, c1, c2;
1702
1703       src_base = src;
1704       consumed_chars_base = consumed_chars;
1705
1706       if (charbuf >= charbuf_end)
1707         {
1708           if (byte_after_cr1 >= 0)
1709             src_base -= 2;
1710           break;
1711         }
1712
1713       if (byte_after_cr1 >= 0)
1714         c1 = byte_after_cr1, byte_after_cr1 = -1;
1715       else
1716         ONE_MORE_BYTE (c1);
1717       if (c1 < 0)
1718         {
1719           *charbuf++ = -c1;
1720           continue;
1721         }
1722       if (byte_after_cr2 >= 0)
1723         c2 = byte_after_cr2, byte_after_cr2 = -1;
1724       else
1725         ONE_MORE_BYTE (c2);
1726       if (c2 < 0)
1727         {
1728           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1729           *charbuf++ = -c2;
1730           continue;
1731         }
1732       c = (endian == utf_16_big_endian
1733            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1734
1735       if (surrogate)
1736         {
1737           if (! UTF_16_LOW_SURROGATE_P (c))
1738             {
1739               if (endian == utf_16_big_endian)
1740                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1741               else
1742                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1743               *charbuf++ = c1;
1744               *charbuf++ = c2;
1745               coding->errors++;
1746               if (UTF_16_HIGH_SURROGATE_P (c))
1747                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1748               else
1749                 *charbuf++ = c;
1750             }
1751           else
1752             {
1753               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1754               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1755               *charbuf++ = 0x10000 + c;
1756             }
1757         }
1758       else
1759         {
1760           if (UTF_16_HIGH_SURROGATE_P (c))
1761             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1762           else
1763             {
1764               if (eol_dos && c == '\r')
1765                 {
1766                   ONE_MORE_BYTE (byte_after_cr1);
1767                   ONE_MORE_BYTE (byte_after_cr2);
1768                 }
1769               *charbuf++ = c;
1770             }
1771         }
1772     }
1773
1774  no_more_source:
1775   coding->consumed_char += consumed_chars_base;
1776   coding->consumed = src_base - coding->source;
1777   coding->charbuf_used = charbuf - coding->charbuf;
1778 }
1779
1780 static bool
1781 encode_coding_utf_16 (struct coding_system *coding)
1782 {
1783   bool multibytep = coding->dst_multibyte;
1784   int *charbuf = coding->charbuf;
1785   int *charbuf_end = charbuf + coding->charbuf_used;
1786   unsigned char *dst = coding->destination + coding->produced;
1787   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1788   int safe_room = 8;
1789   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1790   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1791   ptrdiff_t produced_chars = 0;
1792   int c;
1793
1794   if (bom != utf_without_bom)
1795     {
1796       ASSURE_DESTINATION (safe_room);
1797       if (big_endian)
1798         EMIT_TWO_BYTES (0xFE, 0xFF);
1799       else
1800         EMIT_TWO_BYTES (0xFF, 0xFE);
1801       CODING_UTF_16_BOM (coding) = utf_without_bom;
1802     }
1803
1804   while (charbuf < charbuf_end)
1805     {
1806       ASSURE_DESTINATION (safe_room);
1807       c = *charbuf++;
1808       if (c > MAX_UNICODE_CHAR)
1809         c = coding->default_char;
1810
1811       if (c < 0x10000)
1812         {
1813           if (big_endian)
1814             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1815           else
1816             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1817         }
1818       else
1819         {
1820           int c1, c2;
1821
1822           c -= 0x10000;
1823           c1 = (c >> 10) + 0xD800;
1824           c2 = (c & 0x3FF) + 0xDC00;
1825           if (big_endian)
1826             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1827           else
1828             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1829         }
1830     }
1831   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1832   coding->produced = dst - coding->destination;
1833   coding->produced_char += produced_chars;
1834   return 0;
1835 }
1836
1837 \f
1838 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1839
1840 /* Emacs' internal format for representation of multiple character
1841    sets is a kind of multi-byte encoding, i.e. characters are
1842    represented by variable-length sequences of one-byte codes.
1843
1844    ASCII characters and control characters (e.g. `tab', `newline') are
1845    represented by one-byte sequences which are their ASCII codes, in
1846    the range 0x00 through 0x7F.
1847
1848    8-bit characters of the range 0x80..0x9F are represented by
1849    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1850    code + 0x20).
1851
1852    8-bit characters of the range 0xA0..0xFF are represented by
1853    one-byte sequences which are their 8-bit code.
1854
1855    The other characters are represented by a sequence of `base
1856    leading-code', optional `extended leading-code', and one or two
1857    `position-code's.  The length of the sequence is determined by the
1858    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1859    whereas extended leading-code and position-code take the range 0xA0
1860    through 0xFF.  See `charset.h' for more details about leading-code
1861    and position-code.
1862
1863    --- CODE RANGE of Emacs' internal format ---
1864    character set        range
1865    -------------        -----
1866    ascii                0x00..0x7F
1867    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1868    eight-bit-graphic    0xA0..0xFF
1869    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1870    ---------------------------------------------
1871
1872    As this is the internal character representation, the format is
1873    usually not used externally (i.e. in a file or in a data sent to a
1874    process).  But, it is possible to have a text externally in this
1875    format (i.e. by encoding by the coding system `emacs-mule').
1876
1877    In that case, a sequence of one-byte codes has a slightly different
1878    form.
1879
1880    At first, all characters in eight-bit-control are represented by
1881    one-byte sequences which are their 8-bit code.
1882
1883    Next, character composition data are represented by the byte
1884    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1885    where,
1886         METHOD is 0xF2 plus one of composition method (enum
1887         composition_method),
1888
1889         BYTES is 0xA0 plus a byte length of this composition data,
1890
1891         CHARS is 0xA0 plus a number of characters composed by this
1892         data,
1893
1894         COMPONENTs are characters of multibyte form or composition
1895         rules encoded by two-byte of ASCII codes.
1896
1897    In addition, for backward compatibility, the following formats are
1898    also recognized as composition data on decoding.
1899
1900    0x80 MSEQ ...
1901    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1902
1903    Here,
1904         MSEQ is a multibyte form, but in one of these special formats:
1905           ASCII: 0xA0 ASCII_CODE+0x80,
1906           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1907         RULE is a one byte code of the range 0xA0..0xF0 that
1908         represents a composition rule.
1909   */
1910
/* Table indexed by a byte value.  The emacs-mule decoder switches on
   the entry for a leading byte and accepts only 1 through 4, and the
   detector reads `entry - 1' further bytes after a leading byte, so an
   entry is used as the total byte length of the sequence which that
   byte starts.
   NOTE(review): the table is filled in outside this part of the
   file; confirm the contents there.  */
1911 char emacs_mule_bytes[256];
1912
1913
1914 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1915    Return true if a text is encoded in 'emacs-mule'.  */
1916
/* Scan CODING's source, past its leading ASCII part, and judge whether
   it can be emacs-mule text.

   CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   Return 0, after adding the category to DETECT_INFO->rejected, when a
   disqualifying byte sequence is met or when the last block ends in
   the middle of a sequence.  Otherwise return 1 once the source is
   exhausted; DETECT_INFO->found gains the category only if at least
   one composition or multibyte sequence was seen.

   ONE_MORE_BYTE is a macro defined elsewhere; SRC, SRC_END, SRC_BASE,
   MULTIBYTEP, CONSUMED_CHARS and the label `no_more_source' exist for
   its benefit.  */
1917 static bool
1918 detect_coding_emacs_mule (struct coding_system *coding,
1919                           struct coding_detection_info *detect_info)
1920 {
1921   const unsigned char *src = coding->source, *src_base;
1922   const unsigned char *src_end = coding->source + coding->src_bytes;
1923   bool multibytep = coding->src_multibyte;
1924   ptrdiff_t consumed_chars = 0;
1925   int c;
       /* Set to CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
1926   int found = 0;
1927
1928   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1929   /* A coding system of this category is always ASCII compatible.  */
1930   src += coding->head_ascii;
1931
1932   while (1)
1933     {
           /* SRC_BASE marks the start of the sequence being examined;
              it is compared with SRC at `no_more_source' to tell
              whether the source ended inside a sequence.  */
1934       src_base = src;
1935       ONE_MORE_BYTE (c);
           /* A negative value is skipped without being judged.  */
1936       if (c < 0)
1937         continue;
1938       if (c == 0x80)
1939         {
1940           /* Perhaps the start of composite character.  We simply skip
1941              it because analyzing it is too heavy for detecting.  But,
1942              at least, we check that the composite character
1943              consists of more than 4 bytes.  */
1944           const unsigned char *src_start;
1945
1946         repeat:
1947           src_start = src;
               /* Consume bytes up to and including the first one below
                  0xA0; that terminating byte is left in C.  */
1948           do
1949             {
1950               ONE_MORE_BYTE (c);
1951             }
1952           while (c >= 0xA0);
1953
               /* The count includes the terminating byte.  Leaving the
                  loop here leads to rejection below.  */
1954           if (src - src_start <= 4)
1955             break;
1956           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition follows immediately.  */
1957           if (c == 0x80)
1958             goto repeat;
               /* Otherwise fall through so that the terminating byte
                  in C is judged like any other byte.  */
1959         }
1960
1961       if (c < 0x80)
1962         {
               /* ESC, SI and SO disqualify the text.  */
1963           if (c < 0x20
1964               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1965             break;
1966         }
1967       else
1968         {
               /* C is a leading byte; the table gives the length of
                  the whole sequence, so this many bytes must follow.  */
1969           int more_bytes = emacs_mule_bytes[c] - 1;
1970
1971           while (more_bytes > 0)
1972             {
1973               ONE_MORE_BYTE (c);
1974               if (c < 0xA0)
1975                 {
1976                   src--;        /* Unread the last byte.  */
1977                   break;
1978                 }
1979               more_bytes--;
1980             }
               /* A sequence cut short by a byte below 0xA0 leaves the
                  outer loop as well, which means rejection.  */
1981           if (more_bytes != 0)
1982             break;
1983           found = CATEGORY_MASK_EMACS_MULE;
1984         }
1985     }
       /* Reached only through one of the `break's above.  */
1986   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1987   return 0;
1988
1989  no_more_source:
       /* The source ended.  If that happened inside a sequence and no
          further block will follow, the text is rejected.  */
1990   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1991     {
1992       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1993       return 0;
1994     }
1995   detect_info->found |= found;
1996   return 1;
1997 }
1998
1999
2000 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2001    character.  If CMP_STATUS indicates that we must expect MSEQ or
2002    RULE described above, decode it and return the negative value of
2003    the decoded character or rule.  If an invalid byte is found, return
2004    -1.  If SRC is too short, return -2.  */
2005
/* Arguments and results, as far as this function itself shows:

   SRC is where parsing starts; the end of input is taken to be
   CODING->source + CODING->src_bytes.
   On a normal return *NBYTES receives the number of source bytes
   parsed (SRC advanced minus the original SRC) and *NCHARS the value
   of the local CONSUMED_CHARS counter (presumably advanced by
   ONE_MORE_BYTE -- confirm against that macro).
   *ID, when ID is non-null, receives the charset id of the character;
   it is not stored on the early return that hands back an undecoded
   rule byte.
   CMP_STATUS is only read: its `state' and `old_form' decide whether
   a byte of 0xA0 or above may start a sequence.

   The labels `no_more_source' and `invalid_code' yield -2 and -1
   respectively; ONE_MORE_BYTE is expected to jump to the former.  */
2006 static int
2007 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2008                  int *nbytes, int *nchars, int *id,
2009                  struct composition_status *cmp_status)
2010 {
2011   const unsigned char *src_end = coding->source + coding->src_bytes;
2012   const unsigned char *src_base = src;
2013   bool multibytep = coding->src_multibyte;
2014   int charset_ID;
2015   unsigned code;
2016   int c;
2017   ptrdiff_t consumed_chars = 0;
       /* True when the character was read in the old-form MSEQ
          encoding; the result is then returned negated.  */
2018   bool mseq_found = 0;
2019
2020   ONE_MORE_BYTE (c);
2021   if (c < 0)
2022     {
           /* A negative value is taken as a ready-made character: use
              its magnitude unchanged, with the charset registered at
              index 0 of emacs_mule_charset.  */
2023       c = -c;
2024       charset_ID = emacs_mule_charset[0];
2025     }
2026   else
2027     {
2028       if (c >= 0xA0)
2029         {
               /* A first byte of 0xA0 or above is acceptable only
                  inside an old-form composition; anywhere else it is
                  invalid.  */
2030           if (cmp_status->state != COMPOSING_NO
2031               && cmp_status->old_form)
2032             {
2033               if (cmp_status->state == COMPOSING_CHAR)
2034                 {
                       /* An MSEQ is expected.  0xA0 announces an ASCII
                          character stored with 0x80 added; any other
                          byte is a leading code stored with 0x20
                          added.  */
2035                   if (c == 0xA0)
2036                     {
2037                       ONE_MORE_BYTE (c);
2038                       c -= 0x80;
2039                       if (c < 0)
2040                         goto invalid_code;
2041                     }
2042                   else
2043                     c -= 0x20;
2044                   mseq_found = 1;
2045                 }
2046               else
2047                 {
                       /* Some other composing state: hand the byte
                          back negated and undecoded, for the caller
                          to treat as a rule.  */
2048                   *nbytes = src - src_base;
2049                   *nchars = consumed_chars;
2050                   return -c;
2051                 }
2052             }
2053           else
2054             goto invalid_code;
2055         }
2056
           /* Dispatch on the total sequence length recorded for the
              leading byte now in C.  */
2057       switch (emacs_mule_bytes[c])
2058         {
2059         case 2:
               /* Leading code, then one position code.  */
2060           if ((charset_ID = emacs_mule_charset[c]) < 0)
2061             goto invalid_code;
2062           ONE_MORE_BYTE (c);
2063           if (c < 0xA0)
2064             goto invalid_code;
2065           code = c & 0x7F;
2066           break;
2067
2068         case 3:
2069           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2070               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2071             {
                   /* Private leading code: the next byte selects the
                      charset and one position code follows.  */
2072               ONE_MORE_BYTE (c);
2073               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2074                 goto invalid_code;
2075               ONE_MORE_BYTE (c);
2076               if (c < 0xA0)
2077                 goto invalid_code;
2078               code = c & 0x7F;
2079             }
2080           else
2081             {
                   /* Leading code, then two position codes which form
                      a 14-bit code, 7 bits from each.  */
2082               if ((charset_ID = emacs_mule_charset[c]) < 0)
2083                 goto invalid_code;
2084               ONE_MORE_BYTE (c);
2085               if (c < 0xA0)
2086                 goto invalid_code;
2087               code = (c & 0x7F) << 8;
2088               ONE_MORE_BYTE (c);
2089               if (c < 0xA0)
2090                 goto invalid_code;
2091               code |= c & 0x7F;
2092             }
2093           break;
2094
2095         case 4:
               /* The byte after the leading code selects the charset;
                  two position codes follow.  */
2096           ONE_MORE_BYTE (c);
2097           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2098             goto invalid_code;
2099           ONE_MORE_BYTE (c);
2100           if (c < 0xA0)
2101             goto invalid_code;
2102           code = (c & 0x7F) << 8;
2103           ONE_MORE_BYTE (c);
2104           if (c < 0xA0)
2105             goto invalid_code;
2106           code |= c & 0x7F;
2107           break;
2108
2109         case 1:
               /* A single byte stands for itself.  */
2110           code = c;
2111           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2112           break;
2113
2114         default:
               /* Any other table entry is treated as a fatal internal
                  error.  */
2115           emacs_abort ();
2116         }
           /* Turn (charset, code) into a character stored in C; a
              negative result is treated as an invalid sequence.  */
2117       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2118                           CHARSET_FROM_ID (charset_ID), code, c);
2119       if (c < 0)
2120         goto invalid_code;
2121     }
2122   *nbytes = src - src_base;
2123   *nchars = consumed_chars;
2124   if (id)
2125     *id = charset_ID;
2126   return (mseq_found ? -c : c);
2127
2128  no_more_source:
2129   return -2;
2130
2131  invalid_code:
2132   return -1;
2133 }
2134
2135
2136 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2137
2138 /* Handle these composition sequences ('|': the end of header elements,
2139    BYTES and CHARS >= 0xA0):
2140
2141    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2142    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2143    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2144
2145    and these old form:
2146
2147    (4) relative composition: 0x80 | MSEQ ... MSEQ
2148    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2149
2150    When the starter 0x80 and the following header elements are found,
2151    this annotation header is produced.
2152
2153         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2154
2155    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157
2158    Then, upon reading the following elements, these codes are produced
2159    until the composition end is found:
2160
2161    (1) CHAR ... CHAR
2162    (2) ALT ... ALT CHAR ... CHAR
2163    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2164    (4) CHAR ... CHAR
2165    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2166
2167    When the composition end is found, LENGTH and NCHARS in the
2168    annotation header is updated as below:
2169
2170    (1) LENGTH: unchanged, NCHARS: unchanged
2171    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2174    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2175
2176    If an error is found while composing, the annotation header is
2177    changed to the original composition header (plus filler -1s) as
2178    below:
2179
2180    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2181    (5)          [ 0x80 0xFF -1 -1 -1 ]
2182
2183    and the sequence [ -2 DECODED-RULE ] is changed to the original
2184    byte sequence as below:
2185         o the original byte sequence is B: [ B -1 ]
2186         o the original byte sequence is B1 B2: [ B1 B2 ]
2187
2188    Most of the routines are implemented by macros because many
2189    variables and labels in the caller decode_coding_emacs_mule must be
2190    accessible, and they are usually called just once (thus doesn't
2191    increase the size of compiled object).  */
2192
2193 /* Decode a composition rule represented by C as a component of
2194    composition sequence of Emacs 20 style.  Set RULE to the decoded
2195    rule. */
2196
/* This macro expands in the caller's scope and needs a label
   `invalid_code' there; that label is the jump target when C - 0xA0
   falls outside 0..80.  C must be a modifiable lvalue, because 0xA0
   is subtracted from it in place.  The reduced value is split into
   GREF = C / 9 and NREF = C % 9, a value of 4 in either one is
   replaced by 10, and the two are combined with
   COMPOSITION_ENCODE_RULE into RULE.  */
2197 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2198   do {                                                  \
2199     int gref, nref;                                     \
2200                                                         \
2201     c -= 0xA0;                                          \
2202     if (c < 0 || c >= 81)                               \
2203       goto invalid_code;                                \
2204     gref = c / 9, nref = c % 9;                         \
2205     if (gref == 4) gref = 10;                           \
2206     if (nref == 4) nref = 10;                           \
2207     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2208   } while (0)
2209
2210
2211 /* Decode a composition rule represented by C and the following byte
2212    at SRC as a component of composition sequence of Emacs 21 style.
2213    Set RULE to the decoded rule.  */
2214
/* This macro expands in the caller's scope.  It needs a label
   `invalid_code' there, plus whatever ONE_MORE_BYTE itself requires.
   GREF is C - 0x20; the next byte is then read into C (so C must be a
   modifiable lvalue) and NREF is that byte - 0x20.  Either value
   outside 0..80 jumps to `invalid_code'.  */
2215 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2216   do {                                                  \
2217     int gref, nref;                                     \
2218                                                         \
2219     gref = c - 0x20;                                    \
2220     if (gref < 0 || gref >= 81)                         \
2221       goto invalid_code;                                \
2222     ONE_MORE_BYTE (c);                                  \
2223     nref = c - 0x20;                                    \
2224     if (nref < 0 || nref >= 81)                         \
2225       goto invalid_code;                                \
2226     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2227   } while (0)
2228
2229
2230 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2231    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2232    byte length of this composition information, CHARS is the number of
2233    characters composed by this composition.  */
2234
/* This macro expands in the caller's scope and uses the caller's C,
   CMP_STATUS and CHARBUF, the label `invalid_code', and whatever
   ONE_MORE_BYTE requires.  On entry C already holds the METHOD byte.
   The BYTES and CHARS bytes are read next and validated: BYTES - 0xA0
   must be at least 3, and exactly 4 for a relative composition;
   CHARS - 0xA0 must be positive and below MAX_COMPOSITION_COMPONENTS.
   On success CMP_STATUS is set up for a new-form composition, with
   `ncomps' set to BYTES - 0xA0 - 4, and ADD_COMPOSITION_DATA is
   invoked on CHARBUF.  */
2235 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2236   do {                                                                  \
2237     enum composition_method method = c - 0xF2;                          \
2238     int nbytes, nchars;                                                 \
2239                                                                         \
2240     ONE_MORE_BYTE (c);                                                  \
2241     if (c < 0)                                                          \
2242       goto invalid_code;                                                \
2243     nbytes = c - 0xA0;                                                  \
2244     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2245       goto invalid_code;                                                \
2246     ONE_MORE_BYTE (c);                                                  \
2247     nchars = c - 0xA0;                                                  \
2248     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2249       goto invalid_code;                                                \
2250     cmp_status->old_form = 0;                                           \
2251     cmp_status->method = method;                                        \
2252     if (method == COMPOSITION_RELATIVE)                                 \
2253       cmp_status->state = COMPOSING_CHAR;                               \
2254     else                                                                \
2255       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2256     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2257     cmp_status->nchars = nchars;                                        \
2258     cmp_status->ncomps = nbytes - 4;                                    \
2259     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2260   } while (0)
2261
2262
2263 /* Start of Emacs 20 style format for relative composition.  */
2264
/* This macro expands in the caller's scope and uses the caller's
   CMP_STATUS and CHARBUF.  It marks CMP_STATUS as an old-form relative
   composition in state COMPOSING_CHAR with no characters or components
   counted yet, and invokes ADD_COMPOSITION_DATA with zero for both the
   character and byte counts.  No source byte is read.  */
2265 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2266   do {                                                          \
2267     cmp_status->old_form = 1;                                   \
2268     cmp_status->method = COMPOSITION_RELATIVE;                  \
2269     cmp_status->state = COMPOSING_CHAR;                         \
2270     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2271     cmp_status->nchars = cmp_status->ncomps = 0;                \
2272     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2273   } while (0)
2274
2275
2276 /* Start of Emacs 20 style format for rule-base composition.  */
2277
/* This macro expands in the caller's scope and uses the caller's
   CMP_STATUS and CHARBUF.  It does the same as the old-form relative
   variant, except that the method recorded is COMPOSITION_WITH_RULE.
   No source byte is read.  */
2278 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2279   do {                                                          \
2280     cmp_status->old_form = 1;                                   \
2281     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2282     cmp_status->state = COMPOSING_CHAR;                         \
2283     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2284     cmp_status->nchars = cmp_status->ncomps = 0;                \
2285     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2286   } while (0)
2287
2288
/* Begin a composition.  This macro expands in the caller's scope and
   uses the caller's SRC and C, the label `invalid_code', and whatever
   ONE_MORE_BYTE and the macros invoked below require.

   One byte is read and the composition format is chosen from it:
     - 0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS:
       new-form header;
     - 0xA0..0xBF: old-form relative composition; SRC is moved back so
       that the byte is read again as the first component;
     - 0xFF: old-form rule-base composition;
     - a negative value or any other byte: jump to `invalid_code'.  */
2289 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2290   do {                                                  \
2291     const unsigned char *current_src = src;             \
2292                                                         \
2293     ONE_MORE_BYTE (c);                                  \
2294     if (c < 0)                                          \
2295       goto invalid_code;                                \
2296     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2297         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2298       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2299     else if (c < 0xA0)                                  \
2300       goto invalid_code;                                \
2301     else if (c < 0xC0)                                  \
2302       {                                                 \
2303         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2304         /* Re-read C as a composition component.  */    \
2305         src = current_src;                              \
2306       }                                                 \
2307     else if (c == 0xFF)                                 \
2308       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2309     else                                                \
2310       goto invalid_code;                                \
2311   } while (0)
2312
/* Close a successfully read composition.  This macro expands in the
   caller's scope and uses the caller's CHARBUF and CMP_STATUS.  The
   annotation header is addressed CMP_STATUS->length elements before
   CHARBUF.  For an old-form composition its element 2 is set to the
   number of characters counted in CMP_STATUS.  For a new-form
   composition whose method is beyond COMPOSITION_RELATIVE, element 0
   is set to element 2 minus CMP_STATUS->length.  In every case the
   state is reset to COMPOSING_NO.  */
2313 #define EMACS_MULE_COMPOSITION_END()                            \
2314   do {                                                          \
2315     int idx = - cmp_status->length;                             \
2316                                                                 \
2317     if (cmp_status->old_form)                                   \
2318       charbuf[idx + 2] = cmp_status->nchars;                    \
2319     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2320       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2321     cmp_status->state = COMPOSING_NO;                           \
2322   } while (0)
2323
2324
2325 static int
2326 emacs_mule_finish_composition (int *charbuf,
2327                                struct composition_status *cmp_status)
2328 {
2329   int idx = - cmp_status->length;
2330   int new_chars;
2331
2332   if (cmp_status->old_form && cmp_status->nchars > 0)
2333     {
2334       charbuf[idx + 2] = cmp_status->nchars;
2335       new_chars = 0;
2336       if (cmp_status->method == COMPOSITION_WITH_RULE
2337           && cmp_status->state == COMPOSING_CHAR)
2338         {
2339           /* The last rule was invalid.  */
2340           int rule = charbuf[-1] + 0xA0;
2341
2342           charbuf[-2] = BYTE8_TO_CHAR (rule);
2343           charbuf[-1] = -1;
2344           new_chars = 1;
2345         }
2346     }
2347   else
2348     {
2349       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2350
2351       if (cmp_status->method == COMPOSITION_WITH_RULE)
2352         {
2353           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2354           charbuf[idx++] = -3;
2355           charbuf[idx++] = 0;
2356           new_chars = 1;
2357         }
2358       else
2359         {
2360           int nchars = charbuf[idx + 1] + 0xA0;
2361           int nbytes = charbuf[idx + 2] + 0xA0;
2362
2363           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2364           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2365           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2366           charbuf[idx++] = -1;
2367           new_chars = 4;
2368         }
2369     }
2370   cmp_status->state = COMPOSING_NO;
2371   return new_chars;
2372 }
2373
/* If a composition is still open in the caller's CMP_STATUS, wind it
   up with emacs_mule_finish_composition and add the value it returns
   to the caller's CHAR_OFFSET.  This macro expands in the caller's
   scope and uses the caller's CHARBUF, CMP_STATUS and CHAR_OFFSET.  */
2374 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2375   do {                                                                    \
2376     if (cmp_status->state != COMPOSING_NO)                                \
2377       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2378   } while (0)
2379
2380
     /* Decode the emacs-mule encoded bytes of CODING into CODING->charbuf.

        Source bytes are taken from CODING->source, from offset
        CODING->consumed up to offset CODING->src_bytes.  Decoded elements
        are appended to CODING->charbuf starting at index
        CODING->charbuf_used.  The main loop ends when the output buffer
        has no more guaranteed room, when emacs_mule_char returns -2, or
        (presumably through ONE_MORE_BYTE, whose definition is outside
        this function) when control reaches the `no_more_source' label.

        A composition that is still open at that point is closed if this
        is the last block (CODING_MODE_LAST_BLOCK); otherwise its elements
        are moved out of the output into the carryover array of the
        composition status and are put back at the start of the next call.

        A byte sequence rejected as invalid is decoded one byte at a time:
        the byte is stored as itself when ASCII, else as BYTE8_TO_CHAR of
        it, and CODING->errors is incremented.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used reflect what was read and written.  */
2381 static void
2382 decode_coding_emacs_mule (struct coding_system *coding)
2383 {
2384   const unsigned char *src = coding->source + coding->consumed;
2385   const unsigned char *src_end = coding->source + coding->src_bytes;
2386   const unsigned char *src_base;
2387   int *charbuf = coding->charbuf + coding->charbuf_used;
2388   /* We may produce two annotations (charset and composition) in one
2389      loop and one more charset annotation at the end.  */
2390   int *charbuf_end
2391     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2392       /* We can produce up to 2 characters in a loop.  */
2393       - 1;
2394   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2395   bool multibytep = coding->src_multibyte;
2396   ptrdiff_t char_offset = coding->produced_char;
2397   ptrdiff_t last_offset = char_offset;
2398   int last_id = charset_ascii;
2399   bool eol_dos
2400     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte fetched ahead of time after a CR when DOS-style EOL handling
          is active, or -1 when there is no such pending byte.  */
2401   int byte_after_cr = -1;
2402   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2403
       /* A composition was left open by the previous call: put its
          carried-over elements back in front of the new output.  */
2404   if (cmp_status->state != COMPOSING_NO)
2405     {
2406       int i;
2407
2408       if (charbuf_end - charbuf < cmp_status->length)
2409         emacs_abort ();
2410       for (i = 0; i < cmp_status->length; i++)
2411         *charbuf++ = cmp_status->carryover[i];
2412       coding->annotated = 1;
2413     }
2414
2415   while (1)
2416     {
2417       int c, id IF_LINT (= 0);
2418
           /* Remember where this iteration started; only input before
              these marks is reported as consumed when the loop is left.  */
2419       src_base = src;
2420       consumed_chars_base = consumed_chars;
2421
2422       if (charbuf >= charbuf_end)
2423         {
               /* Output is full.  Step SRC_BASE back over the byte already
                  held in BYTE_AFTER_CR so that it is not counted in
                  CODING->consumed.  */
2424           if (byte_after_cr >= 0)
2425             src_base--;
2426           break;
2427         }
2428
2429       if (byte_after_cr >= 0)
2430         c = byte_after_cr, byte_after_cr = -1;
2431       else
2432         ONE_MORE_BYTE (c);
2433
           /* A negative C is stored negated as one output element
              (presumably this is how ONE_MORE_BYTE reports a byte that is
              not part of the emacs-mule text -- confirm against the macro).
              The byte 0x80 is handed to
              DECODE_EMACS_MULE_COMPOSITION_START.  */
2434       if (c < 0 || c == 0x80)
2435         {
2436           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2437           if (c < 0)
2438             {
2439               *charbuf++ = -c;
2440               char_offset++;
2441             }
2442           else
2443             DECODE_EMACS_MULE_COMPOSITION_START ();
2444           continue;
2445         }
2446
2447       if (c < 0x80)
2448         {
               /* ASCII.  With DOS-style EOL, fetch the byte after a CR now;
                  it is processed by the next iteration.  */
2449           if (eol_dos && c == '\r')
2450             ONE_MORE_BYTE (byte_after_cr);
2451           id = charset_ascii;
2452           if (cmp_status->state != COMPOSING_NO)
2453             {
2454               if (cmp_status->old_form)
2455                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2457                 cmp_status->ncomps--;
2458             }
2459         }
2460       else
2461         {
2462           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2463           /* emacs_mule_char can load a charset map from a file, which
2464              allocates a large structure and might cause buffer text
2465              to be relocated as result.  Thus, we need to remember the
2466              original pointer to buffer text, and fix up all related
2467              pointers after the call.  */
2468           const unsigned char *orig = coding->source;
2469           ptrdiff_t offset;
2470
2471           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2472                                cmp_status);
2473           offset = coding->source - orig;
2474           if (offset)
2475             {
2476               src += offset;
2477               src_base += offset;
2478               src_end += offset;
2479             }
               /* -1 is treated as an invalid sequence; -2 ends decoding
                  without consuming the sequence that starts at SRC_BASE.
                  Other negative values fall through, see below.  */
2480           if (c < 0)
2481             {
2482               if (c == -1)
2483                 goto invalid_code;
2484               if (c == -2)
2485                 break;
2486             }
2487           src = src_base + nbytes;
2488           consumed_chars = consumed_chars_base + nchars;
2489           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2490             cmp_status->ncomps -= nchars;
2491         }
2492
2493       /* Now if C >= 0, we found a normally encoded character, if C <
2494          0, we found an old-style composition component character or
2495          rule.  */
2496
           /* Dispatch on the composition state.  Outside a composition the
              character is stored directly, preceded by a charset
              annotation for the previous run whenever the charset changes
              away from a non-ASCII one.  */
2497       if (cmp_status->state == COMPOSING_NO)
2498         {
2499           if (last_id != id)
2500             {
2501               if (last_id != charset_ascii)
2502                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2503                                   last_id);
2504               last_id = id;
2505               last_offset = char_offset;
2506             }
2507           *charbuf++ = c;
2508           char_offset++;
2509         }
2510       else if (cmp_status->state == COMPOSING_CHAR)
2511         {
2512           if (cmp_status->old_form)
2513             {
                   /* Old form: a normally encoded character (C >= 0) ends
                      the composition; a negative C is one more component,
                      and NCHARS counts up to MAX_COMPOSITION_COMPONENTS.  */
2514               if (c >= 0)
2515                 {
2516                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517                   *charbuf++ = c;
2518                   char_offset++;
2519                 }
2520               else
2521                 {
2522                   *charbuf++ = -c;
2523                   cmp_status->nchars++;
2524                   cmp_status->length++;
2525                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2526                     EMACS_MULE_COMPOSITION_END ();
2527                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2528                     cmp_status->state = COMPOSING_RULE;
2529                 }
2530             }
2531           else
2532             {
                   /* New form: NCHARS counts down the characters still
                      expected; the composition ends when it reaches 0.  */
2533               *charbuf++ = c;
2534               cmp_status->length++;
2535               cmp_status->nchars--;
2536               if (cmp_status->nchars == 0)
2537                 EMACS_MULE_COMPOSITION_END ();
2538             }
2539         }
2540       else if (cmp_status->state == COMPOSING_RULE)
2541         {
2542           int rule;
2543
2544           if (c >= 0)
2545             {
2546               EMACS_MULE_COMPOSITION_END ();
2547               *charbuf++ = c;
2548               char_offset++;
2549             }
2550           else
2551             {
                   /* A rule is stored as the marker -2 followed by the
                      decoded rule value.  */
2552               c = -c;
2553               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2554               if (rule < 0)
2555                 goto invalid_code;
2556               *charbuf++ = -2;
2557               *charbuf++ = rule;
2558               cmp_status->length += 2;
2559               cmp_status->state = COMPOSING_CHAR;
2560             }
2561         }
2562       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2563         {
2564           *charbuf++ = c;
2565           cmp_status->length++;
2566           if (cmp_status->ncomps == 0)
2567             cmp_status->state = COMPOSING_CHAR;
2568           else if (cmp_status->ncomps > 0)
2569             {
2570               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2571                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2572             }
2573           else
2574             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2575         }
2576       else                      /* COMPOSING_COMPONENT_RULE */
2577         {
2578           int rule;
2579
2580           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2581           if (rule < 0)
2582             goto invalid_code;
2583           *charbuf++ = -2;
2584           *charbuf++ = rule;
2585           cmp_status->length += 2;
2586           cmp_status->ncomps--;
2587           if (cmp_status->ncomps > 0)
2588             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2589           else
2590             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2591         }
2592       continue;
2593
           /* Recovery: rewind to the start of this iteration, then consume
              and emit exactly one byte, counting it as an error.  */
2594     invalid_code:
2595       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2596       src = src_base;
2597       consumed_chars = consumed_chars_base;
2598       ONE_MORE_BYTE (c);
2599       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2600       char_offset++;
2601       coding->errors++;
2602     }
2603
       /* Reached by leaving the loop above, and presumably also jumped to
          by ONE_MORE_BYTE when the source is exhausted (confirm against
          the macro).  */
2604  no_more_source:
2605   if (cmp_status->state != COMPOSING_NO)
2606     {
2607       if (coding->mode & CODING_MODE_LAST_BLOCK)
2608         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2609       else
2610         {
2611           int i;
2612
               /* Take the unfinished composition back out of the output
                  and save it for the next call.  */
2613           charbuf -= cmp_status->length;
2614           for (i = 0; i < cmp_status->length; i++)
2615             cmp_status->carryover[i] = charbuf[i];
2616         }
2617     }
2618   if (last_id != charset_ascii)
2619     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report only input up to the start of the last, unfinished
          iteration.  */
2620   coding->consumed_char += consumed_chars_base;
2621   coding->consumed = src_base - coding->source;
2622   coding->charbuf_used = charbuf - coding->charbuf;
2623 }
2624
2625
/* Store in CODES[0] and CODES[1] the leading code(s) that precede the
   position codes of a character whose charset has emacs-mule id ID.

   An ID below 0xA0 is itself the only leading code, and CODES[1] is
   set to 0 to say that there is no second one.  Otherwise CODES[0] is
   0x9A, 0x9B, 0x9C or 0x9D for ID in [0xA0, 0xE0), [0xE0, 0xF0),
   [0xF0, 0xF5) and [0xF5, ...) respectively, and CODES[1] is ID.

   CODES must be an array of at least two elements.  ID is evaluated
   more than once, so it must not have side effects.

   The expansion deliberately has no trailing semicolon: the caller
   supplies it, which keeps the macro usable as the body of an `if'
   that is followed by `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2639
2640
     /* Encode the elements of CODING->charbuf (the first
        CODING->charbuf_used of them) as emacs-mule bytes, appended to
        CODING->destination at offset CODING->produced.

        A negative element starts an annotation: a charset annotation may
        select a preferred charset for the following characters, and the
        whole annotation is then skipped.  An ASCII character is emitted
        as one byte, a raw-byte character (CHAR_BYTE8_P) as the byte it
        stands for.  Any other character is emitted as the leading
        code(s) of its charset followed by one or two position bytes with
        the high bit set, depending on the charset's dimension.  A
        character for which no charset is found is replaced by
        CODING->default_char.

        On completion the result is recorded as CODING_RESULT_SUCCESS and
        CODING->produced_char and CODING->produced are updated.  The
        function always returns 0.  */
2641 static bool
2642 encode_coding_emacs_mule (struct coding_system *coding)
2643 {
2644   bool multibytep = coding->dst_multibyte;
2645   int *charbuf = coding->charbuf;
2646   int *charbuf_end = charbuf + coding->charbuf_used;
2647   unsigned char *dst = coding->destination + coding->produced;
2648   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each element
          (presumably the room, in bytes, that must be available for one
          iteration -- confirm against the macro).  */
2649   int safe_room = 8;
2650   ptrdiff_t produced_chars = 0;
2651   Lisp_Object attrs, charset_list;
2652   int c;
       /* Charset id requested by the most recent charset annotation, or
          -1 when there is none or it is not in CHARSET_LIST.  */
2653   int preferred_charset_id = -1;
2654
2655   CODING_GET_INFO (coding, attrs, charset_list);
       /* This coding system always works with the global emacs-mule
          charset list; store it in ATTRS if something else is there.  */
2656   if (! EQ (charset_list, Vemacs_mule_charset_list))
2657     {
2658       charset_list = Vemacs_mule_charset_list;
2659       ASET (attrs, coding_attr_charset_list, charset_list);
2660     }
2661
2662   while (charbuf < charbuf_end)
2663     {
2664       ASSURE_DESTINATION (safe_room);
2665       c = *charbuf++;
2666
2667       if (c < 0)
2668         {
2669           /* Handle an annotation.  */
               /* CHARBUF now points at the element after the negative
                  header C; that element identifies the annotation kind.  */
2670           switch (*charbuf)
2671             {
2672             case CODING_ANNOTATE_COMPOSITION_MASK:
2673               /* Not yet implemented.  */
2674               break;
2675             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): the index here is relative to the
                      element after the header, and the skip below is
                      -C - 1 elements from the same position.  If a
                      charset annotation is four elements long (C == -4),
                      charbuf[3] lies just past it -- confirm against the
                      annotation layout.  */
2676               preferred_charset_id = charbuf[3];
2677               if (preferred_charset_id >= 0
2678                   && NILP (Fmemq (make_number (preferred_charset_id),
2679                                   charset_list)))
2680                 preferred_charset_id = -1;
2681               break;
2682             default:
2683               emacs_abort ();
2684             }
               /* Skip the rest of the annotation; -C is its total length
                  and the header has already been consumed.  */
2685           charbuf += -c - 1;
2686           continue;
2687         }
2688
2689       if (ASCII_CHAR_P (c))
2690         EMIT_ONE_ASCII_BYTE (c);
2691       else if (CHAR_BYTE8_P (c))
2692         {
2693           c = CHAR_TO_BYTE8 (c);
2694           EMIT_ONE_BYTE (c);
2695         }
2696       else
2697         {
2698           struct charset *charset;
2699           unsigned code;
2700           int dimension;
2701           int emacs_mule_id;
2702           unsigned char leading_codes[2];
2703
               /* Try the preferred charset first, and fall back to a
                  search through CHARSET_LIST when it cannot encode C.  */
2704           if (preferred_charset_id >= 0)
2705             {
2706               bool result;
2707
2708               charset = CHARSET_FROM_ID (preferred_charset_id);
2709               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2710               if (result)
2711                 code = ENCODE_CHAR (charset, c);
2712               else
2713                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2714                                      &code, charset);
2715             }
2716           else
2717             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2718                                  &code, charset);
2719           if (! charset)
2720             {
                   /* No charset can encode C: substitute the default
                      character.  NOTE(review): CHARSET is used below
                      without a second null check; presumably the default
                      character is always encodable -- confirm.  */
2721               c = coding->default_char;
2722               if (ASCII_CHAR_P (c))
2723                 {
2724                   EMIT_ONE_ASCII_BYTE (c);
2725                   continue;
2726                 }
2727               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2728                                    &code, charset);
2729             }
2730           dimension = CHARSET_DIMENSION (charset);
2731           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2732           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2733           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is none.  */
2734           if (leading_codes[1])
2735             EMIT_ONE_BYTE (leading_codes[1]);
               /* Position bytes are emitted with the high bit set, most
                  significant byte first for a two-byte code.  */
2736           if (dimension == 1)
2737             EMIT_ONE_BYTE (code | 0x80);
2738           else
2739             {
2740               code |= 0x8080;
2741               EMIT_ONE_BYTE (code >> 8);
2742               EMIT_ONE_BYTE (code & 0xFF);
2743             }
2744         }
2745     }
2746   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2747   coding->produced_char += produced_chars;
2748   coding->produced = dst - coding->destination;
2749   return 0;
2750 }
2751
2752 \f
2753 /*** 7. ISO2022 handlers ***/
2754
2755 /* The following note describes the coding system ISO2022 briefly.
2756    Since the intention of this note is to help understand the
2757    functions in this file, some parts are NOT ACCURATE or are OVERLY
2758    SIMPLIFIED.  For thorough understanding, please refer to the
2759    original document of ISO2022.  This is equivalent to the standard
2760    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2761
2762    ISO2022 provides many mechanisms to encode several character sets
2763    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2764    is encoded using bytes less than 128.  This may make the encoded
2765    text a little bit longer, but the text passes more easily through
2766    several types of gateway, some of which strip off the MSB (Most
2767    Significant Bit).
2768
2769    There are two kinds of character sets: control character sets and
2770    graphic character sets.  The former contain control characters such
2771    as `newline' and `escape' to provide control functions (control
2772    functions are also provided by escape sequences).  The latter
2773    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2774    two control character sets and many graphic character sets.
2775
2776    Graphic character sets are classified into one of the following
2777    four classes, according to the number of bytes (DIMENSION) and
2778    number of characters in one dimension (CHARS) of the set:
2779    - DIMENSION1_CHARS94
2780    - DIMENSION1_CHARS96
2781    - DIMENSION2_CHARS94
2782    - DIMENSION2_CHARS96
2783
2784    In addition, each character set is assigned an identification tag,
2785    unique for each set, called the "final character" (denoted as <F>
2786    hereafter).  The <F> of each character set is decided by ECMA(*)
2787    when it is registered in ISO.  The code range of <F> is 0x30..0x7E
2788    (0x30..0x3F are for private use only).
2789
2790    Note (*): ECMA = European Computer Manufacturers Association
2791
2792    Here are examples of graphic character sets [NAME(<F>)]:
2793         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2794         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2795         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2796         o DIMENSION2_CHARS96 -- none for the moment
2797
2798    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2799         C0 [0x00..0x1F] -- control character plane 0
2800         GL [0x20..0x7F] -- graphic character plane 0
2801         C1 [0x80..0x9F] -- control character plane 1
2802         GR [0xA0..0xFF] -- graphic character plane 1
2803
2804    A control character set is directly designated and invoked to C0 or
2805    C1 by an escape sequence.  The most common case is that:
2806    - ISO646's  control character set is designated/invoked to C0, and
2807    - ISO6429's control character set is designated/invoked to C1,
2808    and usually these designations/invocations are omitted in encoded
2809    text.  In a 7-bit environment, only C0 can be used, and a control
2810    character for C1 is encoded by an appropriate escape sequence to
2811    fit into the environment.  All control characters for C1 are
2812    defined to have corresponding escape sequences.
2813
2814    A graphic character set is at first designated to one of four
2815    graphic registers (G0 through G3), then these graphic registers are
2816    invoked to GL or GR.  These designations and invocations can be
2817    done independently.  The most common case is that G0 is invoked to
2818    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2819    these invocations and designations are omitted in encoded text.
2820    In a 7-bit environment, only GL can be used.
2821
2822    When a graphic character set of CHARS94 is invoked to GL, codes
2823    0x20 and 0x7F of the GL area work as control characters SPACE and
2824    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2825    be used.
2826
2827    There are two ways of invocation: locking-shift and single-shift.
2828    With locking-shift, the invocation lasts until the next different
2829    invocation, whereas with single-shift, the invocation affects the
2830    following character only and doesn't affect the locking-shift
2831    state.  Invocations are done by the following control characters or
2832    escape sequences:
2833
2834    ----------------------------------------------------------------------
2835    abbrev  function                  cntrl escape seq   description
2836    ----------------------------------------------------------------------
2837    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2838    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2839    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2840    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2841    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2842    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2843    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2844    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2845    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2846    ----------------------------------------------------------------------
2847    (*) These are not used by any known coding system.
2848
2849    Control characters for these functions are defined by macros
2850    ISO_CODE_XXX in `coding.h'.
2851
2852    Designations are done by the following escape sequences:
2853    ----------------------------------------------------------------------
2854    escape sequence      description
2855    ----------------------------------------------------------------------
2856    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2857    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2858    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2859    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2860    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2861    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2862    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2863    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2864    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2865    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2866    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2867    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2868    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2869    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2870    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2871    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2872    ----------------------------------------------------------------------
2873
2874    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2875    of dimension 1, chars 94, and final character <F>, etc...
2876
2877    Note (*): Although these designations are not allowed in ISO2022,
2878    Emacs accepts them on decoding, and produces them on encoding
2879    CHARS96 character sets in a coding system which is characterized as
2880    7-bit environment, non-locking-shift, and non-single-shift.
2881
2882    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2883    '(' must be omitted.  We refer to this as "short-form" hereafter.
2884
2885    Now you may notice that there are a lot of ways of encoding the
2886    same multilingual text in ISO2022.  Actually, there exist many
2887    coding systems such as Compound Text (used in X11's inter client
2888    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2889    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2890    localized platforms), and all of these are variants of ISO2022.
2891
2892    In addition to the above, Emacs handles two more kinds of escape
2893    sequences: ISO6429's direction specification and Emacs' private
2894    sequence for specifying character composition.
2895
2896    ISO6429's direction specification takes the following form:
2897         o CSI ']'      -- end of the current direction
2898         o CSI '0' ']'  -- end of the current direction
2899         o CSI '1' ']'  -- start of left-to-right text
2900         o CSI '2' ']'  -- start of right-to-left text
2901    The control character CSI (0x9B: control sequence introducer) is
2902    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2903
2904    Character composition specification takes the following form:
2905         o ESC '0' -- start relative composition
2906         o ESC '1' -- end composition
2907         o ESC '2' -- start rule-base composition (*)
2908         o ESC '3' -- start relative composition with alternate chars  (**)
2909         o ESC '4' -- start rule-base composition with alternate chars  (**)
2910   Since these are not standard escape sequences of any ISO standard,
2911   the use of them with these meanings is restricted to Emacs only.
2912
2913   (*) This form is used only in Emacs 20.7 and older versions,
2914   but newer versions can safely decode it.
2915   (**) This form is used only in Emacs 21.1 and newer versions,
2916   and older versions can't decode it.
2917
2918   Here's a list of example usages of these composition escape
2919   sequences (categorized by `enum composition_method').
2920
2921   COMPOSITION_RELATIVE:
2922         ESC 0 CHAR [ CHAR ] ESC 1
2923   COMPOSITION_WITH_RULE:
2924         ESC 2 CHAR [ RULE CHAR ] ESC 1
2925   COMPOSITION_WITH_ALTCHARS:
2926         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2927   COMPOSITION_WITH_RULE_ALTCHARS:
2928         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2929
     /* Table with one entry for each possible byte value (0..255).
        Presumably it gives the ISO-2022 code class of each byte; it is
        neither filled nor read in this part of the file -- confirm
        against its users.  */
2930 static enum iso_code_class_type iso_code_class[256];
2931
/* Return nonzero if the charset whose id is ID is marked as safe in
   CODING, i.e. ID lies within 0..CODING->max_charset_id and its entry
   in CODING->safe_charsets is not 255 (setup_iso_safe_charsets fills
   the table with 255 before registering the usable charsets).

   A negative ID is never safe.  It must be rejected here, before the
   table is indexed: charset lookups report an unknown charset with a
   negative id, and such a value would otherwise pass the upper-bound
   test and read in front of the table.

   Both arguments are evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2935
2936 static void
2937 setup_iso_safe_charsets (Lisp_Object attrs)
2938 {
2939   Lisp_Object charset_list, safe_charsets;
2940   Lisp_Object request;
2941   Lisp_Object reg_usage;
2942   Lisp_Object tail;
2943   EMACS_INT reg94, reg96;
2944   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2945   int max_charset_id;
2946
2947   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2948   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2949       && ! EQ (charset_list, Viso_2022_charset_list))
2950     {
2951       charset_list = Viso_2022_charset_list;
2952       ASET (attrs, coding_attr_charset_list, charset_list);
2953       ASET (attrs, coding_attr_safe_charsets, Qnil);
2954     }
2955
2956   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2957     return;
2958
2959   max_charset_id = 0;
2960   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2961     {
2962       int id = XINT (XCAR (tail));
2963       if (max_charset_id < id)
2964         max_charset_id = id;
2965     }
2966
2967   safe_charsets = make_uninit_string (max_charset_id + 1);
2968   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2969   request = AREF (attrs, coding_attr_iso_request);
2970   reg_usage = AREF (attrs, coding_attr_iso_usage);
2971   reg94 = XINT (XCAR (reg_usage));
2972   reg96 = XINT (XCDR (reg_usage));
2973
2974   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2975     {
2976       Lisp_Object id;
2977       Lisp_Object reg;
2978       struct charset *charset;
2979
2980       id = XCAR (tail);
2981       charset = CHARSET_FROM_ID (XINT (id));
2982       reg = Fcdr (Fassq (id, request));
2983       if (! NILP (reg))
2984         SSET (safe_charsets, XINT (id), XINT (reg));
2985       else if (charset->iso_chars_96)
2986         {
2987           if (reg96 < 4)
2988             SSET (safe_charsets, XINT (id), reg96);
2989         }
2990       else
2991         {
2992           if (reg94 < 4)
2993             SSET (safe_charsets, XINT (id), reg94);
2994         }
2995     }
2996   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2997 }
2998
2999
3000 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3001    Return true if a text is encoded in one of ISO-2022 based coding
3002    systems.

        The source bytes of CODING are scanned, starting after the
        leading run of CODING->head_ascii bytes.  While scanning, the
        ISO categories that the text is evidence for are collected in
        `found' and those it rules out in `rejected'.

        The function returns 0, after adding CATEGORY_MASK_ISO to
        DETECT_INFO->rejected, as soon as every ISO category has been
        rejected.  Otherwise it ends at the `no_more_source' label
        (presumably reached through ONE_MORE_BYTE when the source is
        exhausted -- confirm against the macro), where `rejected' and the
        not-rejected part of `found' are merged into DETECT_INFO and 1 is
        returned.  DETECT_INFO->checked gets CATEGORY_MASK_ISO in all
        cases.  */
3003
3004 static bool
3005 detect_coding_iso_2022 (struct coding_system *coding,
3006                         struct coding_detection_info *detect_info)
3007 {
3008   const unsigned char *src = coding->source, *src_base = src;
3009   const unsigned char *src_end = coding->source + coding->src_bytes;
3010   bool multibytep = coding->src_multibyte;
3011   bool single_shifting = 0;
3012   int id;
3013   int c, c1;
3014   ptrdiff_t consumed_chars = 0;
3015   int i;
3016   int rejected = 0;
3017   int found = 0;
       /* Number of characters seen since a composition start sequence,
          or -1 when not inside a composition.  */
3018   int composition_count = -1;
3019
3020   detect_info->checked |= CATEGORY_MASK_ISO;
3021
       /* Refresh the safe-charsets table of every ISO category that is
          in use, so that SAFE_CHARSET_P can be applied to it below.  */
3022   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3023     {
3024       struct coding_system *this = &(coding_categories[i]);
3025       Lisp_Object attrs, val;
3026
3027       if (this->id < 0)
3028         continue;
3029       attrs = CODING_ID_ATTRS (this->id);
3030       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3031           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3032         setup_iso_safe_charsets (attrs);
3033       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3034       this->max_charset_id = SCHARS (val) - 1;
3035       this->safe_charsets = SDATA (val);
3036     }
3037
3038   /* A coding system of this category is always ASCII compatible.  */
3039   src += coding->head_ascii;
3040
3041   while (rejected != CATEGORY_MASK_ISO)
3042     {
3043       src_base = src;
3044       ONE_MORE_BYTE (c);
3045       switch (c)
3046         {
3047         case ISO_CODE_ESC:
3048           if (inhibit_iso_escape_detection)
3049             break;
3050           single_shifting = 0;
3051           ONE_MORE_BYTE (c);
3052           if (c == 'N' || c == 'O')
3053             {
3054               /* ESC <Fe> for SS2 or SS3.  */
3055               single_shifting = 1;
3056               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3057             }
3058           else if (c == '1')
3059             {
3060               /* End of composition.  */
3061               if (composition_count < 0
3062                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3063                 /* Invalid */
3064                 break;
3065               composition_count = -1;
3066               found |= CATEGORY_MASK_ISO;
3067             }
3068           else if (c >= '0' && c <= '4')
3069             {
3070               /* ESC <Fp> for start/end composition.  */
3071               composition_count = 0;
3072             }
3073           else
3074             {
3075               if (c >= '(' && c <= '/')
3076                 {
3077                   /* Designation sequence for a charset of dimension 1.  */
3078                   ONE_MORE_BYTE (c1);
3079                   if (c1 < ' ' || c1 >= 0x80
3080                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3081                     /* Invalid designation sequence.  Just ignore.  */
3082                     break;
3083                 }
3084               else if (c == '$')
3085                 {
3086                   /* Designation sequence for a charset of dimension 2.  */
3087                   ONE_MORE_BYTE (c);
3088                   if (c >= '@' && c <= 'B')
3089                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other two lookups in
                            iso_charset_table, the id obtained here is not
                            checked for being negative before it is passed
                            to SAFE_CHARSET_P below -- confirm that these
                            three entries can never be negative.  */
3090                     id = iso_charset_table[1][0][c];
3091                   else if (c >= '(' && c <= '/')
3092                     {
3093                       ONE_MORE_BYTE (c1);
3094                       if (c1 < ' ' || c1 >= 0x80
3095                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3096                         /* Invalid designation sequence.  Just ignore.  */
3097                         break;
3098                     }
3099                   else
3100                     /* Invalid designation sequence.  Just ignore it.  */
3101                     break;
3102                 }
3103               else
3104                 {
3105                   /* Invalid escape sequence.  Just ignore it.  */
3106                   break;
3107                 }
3108
3109               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories below is marked as found
                      when the designated charset is safe for it, and as
                      rejected otherwise.  */
3110               rejected |= CATEGORY_MASK_ISO_8BIT;
3111               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3112                                   id))
3113                 found |= CATEGORY_MASK_ISO_7;
3114               else
3115                 rejected |= CATEGORY_MASK_ISO_7;
3116               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3117                                   id))
3118                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3119               else
3120                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3121               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3122                                   id))
3123                 found |= CATEGORY_MASK_ISO_7_ELSE;
3124               else
3125                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3126               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3127                                   id))
3128                 found |= CATEGORY_MASK_ISO_8_ELSE;
3129               else
3130                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3131             }
3132           break;
3133
3134         case ISO_CODE_SO:
3135         case ISO_CODE_SI:
3136           /* Locking shift out/in.  */
3137           if (inhibit_iso_escape_detection)
3138             break;
3139           single_shifting = 0;
3140           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3141           break;
3142
3143         case ISO_CODE_CSI:
3144           /* Control sequence introducer.  */
3145           single_shifting = 0;
3146           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3147           found |= CATEGORY_MASK_ISO_8_ELSE;
3148           goto check_extra_latin;
3149
3150         case ISO_CODE_SS2:
3151         case ISO_CODE_SS3:
3152           /* Single shift.   */
3153           if (inhibit_iso_escape_detection)
3154             break;
3155           single_shifting = 0;
3156           rejected |= CATEGORY_MASK_ISO_7BIT;
3157           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3158               & CODING_ISO_FLAG_SINGLE_SHIFT)
3159             {
3160               found |= CATEGORY_MASK_ISO_8_1;
3161               single_shifting = 1;
3162             }
3163           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3164               & CODING_ISO_FLAG_SINGLE_SHIFT)
3165             {
3166               found |= CATEGORY_MASK_ISO_8_2;
3167               single_shifting = 1;
3168             }
3169           if (single_shifting)
3170             break;
               /* Neither 8-bit category uses single shifts, so the byte
                  is only acceptable as an extra Latin code.  */
3171           goto check_extra_latin;
3172
3173         default:
3174           if (c < 0)
3175             continue;
3176           if (c < 0x80)
3177             {
3178               if (composition_count >= 0)
3179                 composition_count++;
3180               single_shifting = 0;
3181               break;
3182             }
3183           if (c >= 0xA0)
3184             {
3185               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3186               found |= CATEGORY_MASK_ISO_8_1;
3187               /* Check the length of succeeding codes of the range
3188                  0xA0..0xFF.  If the byte length is even, we include
3189                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3190                  only when we are not single shifting.  */
3191               if (! single_shifting
3192                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3193                 {
3194                   ptrdiff_t len = 1;
3195                   while (src < src_end)
3196                     {
3197                       src_base = src;
3198                       ONE_MORE_BYTE (c);
3199                       if (c < 0xA0)
3200                         {
                               /* Not part of the run: put it back for the
                                  main loop.  */
3201                           src = src_base;
3202                           break;
3203                         }
3204                       len++;
3205                     }
3206
                       /* An odd-length run rules out the two-byte
                          category, but only when the run ended before
                          the end of the source.  */
3207                   if (len & 1 && src < src_end)
3208                     {
3209                       rejected |= CATEGORY_MASK_ISO_8_2;
3210                       if (composition_count >= 0)
3211                         composition_count += len;
3212                     }
3213                   else
3214                     {
3215                       found |= CATEGORY_MASK_ISO_8_2;
3216                       if (composition_count >= 0)
3217                         composition_count += len / 2;
3218                     }
3219                 }
3220               break;
3221             }
             /* Reached for bytes in 0x80..0x9F, either by falling through
                above or by an explicit jump.  Such a byte is acceptable
                only if Vlatin_extra_code_table has a non-nil entry for
                it; otherwise every ISO category is rejected.  */
3222         check_extra_latin:
3223           if (! VECTORP (Vlatin_extra_code_table)
3224               || NILP (AREF (Vlatin_extra_code_table, c)))
3225             {
3226               rejected = CATEGORY_MASK_ISO;
3227               break;
3228             }
3229           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3230               & CODING_ISO_FLAG_LATIN_EXTRA)
3231             found |= CATEGORY_MASK_ISO_8_1;
3232           else
3233             rejected |= CATEGORY_MASK_ISO_8_1;
3234           rejected |= CATEGORY_MASK_ISO_8_2;
3235           break;
3236         }
3237     }
       /* The loop ended because every ISO category was rejected.  */
3238   detect_info->rejected |= CATEGORY_MASK_ISO;
3239   return 0;
3240
3241  no_more_source:
3242   detect_info->rejected |= rejected;
3243   detect_info->found |= (found & ~rejected);
3244   return 1;
3245 }
3246
3247
/* Designate to graphic register REG of `coding' the charset that
   ISO_CHARSET_TABLE yields for DIM, CHARS_96 and the final byte FINAL.
   A variable named `coding' must be in scope where this expands.

   FINAL is evaluated exactly once, so it may be any expression.
   CHARS_96 must be an lvalue; it is set to -1 if the escape sequence
   should be kept, which happens

     o when the designation is rejected (FINAL is outside '0'..127,
       ISO_CHARSET_TABLE yields a negative id, or the charset fails
       SAFE_CHARSET_P); the designation of REG is then set to -2, or

     o when REG held such a rejected designation (-2) and ASCII is
       designated to it now.

   If CODING_ISO_FLAG_USE_ROMAN is set, JISX0201-Roman is designated
   as ASCII; if CODING_ISO_FLAG_USE_OLDJIS is set, JISX0208.1978 is
   designated as JISX0208.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    /* Read FINAL once; the comparisons below must not re-expand it.  */ \
    int final_byte_ = (final);                                          \
    int id, prev;                                                       \
                                                                        \
    if (final_byte_ < '0' || final_byte_ >= 128                         \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final_byte_)) < 0)  \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        (chars_96) = -1;                                                \
        /* Leave the macro; nothing else is designated.  */             \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      (chars_96) = -1;                                                  \
  } while (0)
3280
3281
/* Handle these composition sequences (ALT: alternate char):
3283
3284    (1) relative composition: ESC 0 CHAR ... ESC 1
3285    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3286    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3287    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3288
3289    When the start sequence (ESC 0/2/3/4) is found, this annotation
3290    header is produced.
3291
3292         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3293
3294    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3295    produced until the end sequence (ESC 1) is found:
3296
3297    (1) CHAR ... CHAR
3298    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3299    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3300    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3301
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3304
3305    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3306    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3307    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3308    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3309
3310    If an error is found while composing, the annotation header is
3311    changed to:
3312
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3314
3315    and the sequence [ -2 DECODED-RULE ] is changed to the original
3316    byte sequence as below:
3317         o the original byte sequence is B: [ B -1 ]
3318         o the original byte sequence is B1 B2: [ B1 B2 ]
3319    and the sequence [ -1 -1 ] is changed to the original byte
3320    sequence:
3321         [ ESC '0' ]
3322 */
3323
/* Decode the composition rule that starts with the byte C1 (a
   variable of the expansion site) and store the result in RULE, which
   must be an lvalue.

   If C1 - 32 is below 81 the rule is in the one-byte old format
   (before ver.21): it is split into two base-9 digits, a digit 4 is
   mapped to 10, and the pair is given to COMPOSITION_ENCODE_RULE.
   Otherwise the rule is in the two-byte new format (after ver.21):
   one more byte is read with ONE_MORE_BYTE, and 0x100 is added to the
   encoded rule to distinguish it from the old format.

   Jump to invalid_code if C1 is below 32, or if the new-format pair
   fails COMPOSITION_ENCODE_RULE_VALID.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    (rule) = c1 - 32;                                                   \
    if ((rule) < 0)                                                     \
      goto invalid_code;                                                \
    if ((rule) >= 81)                                                   \
      {                                                                 \
        /* New format (after ver.21): a second byte follows.  */        \
        int next_byte;                                                  \
                                                                        \
        ONE_MORE_BYTE (next_byte);                                      \
        if (! COMPOSITION_ENCODE_RULE_VALID ((rule) - 81, next_byte - 32)) \
          goto invalid_code;                                            \
        (rule) = COMPOSITION_ENCODE_RULE ((rule) - 81, next_byte - 32); \
        /* Distinguish it from the old format.  */                      \
        (rule) += 0x100;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Old format (before ver.21): two base-9 digits.  */           \
        int gref_val = (rule) / 9;                                      \
        int nref_val = (rule) % 9;                                      \
                                                                        \
        gref_val = (gref_val == 4 ? 10 : gref_val);                     \
        nref_val = (nref_val == 4 ? 10 : nref_val);                     \
        (rule) = COMPOSITION_ENCODE_RULE (gref_val, nref_val);          \
      }                                                                 \
  } while (0)
3352
/* Turn a decoded composition rule back into the byte(s) it was read
   from; used while a composition is being finished as invalid.

   RULE is the decoded rule.  It is evaluated exactly once, before
   anything is stored, so it may be any expression (including one
   that reads charbuf[idx + 1]).  The variables `charbuf', `idx' and
   `new_chars' must be in scope where this expands: the two elements
   charbuf[idx] and charbuf[idx + 1] are overwritten, and new_chars is
   advanced by the number of bytes recovered.

     o RULE < 0x100 (old format): one byte B, stored as [ B -1 ].
     o otherwise (new format): two bytes, stored as [ B1 B2 ].  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int enc_rule_ = (rule);                                     \
    int gref = (enc_rule_ % 0x100) / 12;                        \
    int nref = (enc_rule_ % 0x100) % 12;                        \
                                                                \
    if (enc_rule_ < 0x100)      /* old format */                \
      {                                                         \
        /* The old format has no digit 10; it was read as 4.  */ \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3372
3373 /* Finish the current composition as invalid.  */
3374
3375 static int
3376 finish_composition (int *charbuf, struct composition_status *cmp_status)
3377 {
3378   int idx = - cmp_status->length;
3379   int new_chars;
3380
3381   /* Recover the original ESC sequence */
3382   charbuf[idx++] = ISO_CODE_ESC;
3383   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3384                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3385                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3386                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3387                     : '4');
3388   charbuf[idx++] = -2;
3389   charbuf[idx++] = 0;
3390   charbuf[idx++] = -1;
3391   new_chars = cmp_status->nchars;
3392   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3393     for (; idx < 0; idx++)
3394       {
3395         int elt = charbuf[idx];
3396
3397         if (elt == -2)
3398           {
3399             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3400             idx++;
3401           }
3402         else if (elt == -1)
3403           {
3404             charbuf[idx++] = ISO_CODE_ESC;
3405             charbuf[idx] = '0';
3406             new_chars += 2;
3407           }
3408       }
3409   cmp_status->state = COMPOSING_NO;
3410   return new_chars;
3411 }
3412
/* Do nothing unless characters are under composition.  If they are,
   finish the composition with finish_composition and add the value
   it returns to `char_offset'.  The variables `cmp_status', `charbuf'
   and `char_offset' must be in scope where this expands.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;            /* Not composing; leave the macro.  */  \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3419
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that follows ESC; it is evaluated exactly once.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   If C1 is '0' while the components of an altchar or alt&rule
   composition are being read, ESC 0 only separates the components
   from the CHARs: produce [ -1 -1 ] and switch to COMPOSING_CHAR.

   Otherwise finish any composition in progress (as invalid) and
   produce a fresh annotation header with ADD_COMPOSITION_DATA.  Per
   the composition overview comment earlier in this file the header is

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   and cmp_status->length is reset to MAX_ANNOTATION_LENGTH.
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    /* Read C1 once; the comparisons below must not re-expand it.  */      \
    int start_final_ = (c1);                                               \
                                                                           \
    if (start_final_ == '0'                                                \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method                                                 \
          = (start_final_ == '0' ? COMPOSITION_RELATIVE                    \
             : start_final_ == '2' ? COMPOSITION_WITH_RULE                 \
             : start_final_ == '3' ? COMPOSITION_WITH_ALTCHARS             \
             : COMPOSITION_WITH_RULE_ALTCHARS);                            \
        /* ESC 0 and ESC 2 start with CHARs, ESC 3 and ESC 4 with the      \
           components.  */                                                 \
        cmp_status->state                                                  \
          = (start_final_ <= '2'                                           \
             ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);                 \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3460
3461
/* Handle composition end sequence ESC 1.

   The composition is finished as invalid, followed by a jump to
   invalid_code, if no CHAR has been stored yet or if the sequence
   ends at the wrong point: with COMPOSITION_WITH_RULE the state must
   not be COMPOSING_CHAR, with any other method it must be.

   Otherwise the annotation header, which starts cmp_status->length
   elements before `charbuf', is completed: its first element is
   decreased by ncomps + 2 (COMPOSITION_WITH_ALTCHARS) or by
   ncomps * 3 (COMPOSITION_WITH_RULE_ALTCHARS), its third element is
   set to nchars, `char_offset' is advanced by nchars, and the state
   becomes COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int ends_on_char_ = (cmp_status->state == COMPOSING_CHAR);          \
    int rule_method_ = (cmp_status->method == COMPOSITION_WITH_RULE);   \
    int header_idx_ = - cmp_status->length;                             \
                                                                        \
    if (cmp_status->nchars == 0 || ends_on_char_ == rule_method_)       \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    /* Account for the component part in the header's length.  */       \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      charbuf[header_idx_] -= cmp_status->ncomps + 2;                   \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      charbuf[header_idx_] -= cmp_status->ncomps * 3;                   \
    charbuf[header_idx_ + 2] = cmp_status->nchars;                      \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3481
/* Append the pair [ -2 RULE ] to `charbuf', add 2 to
   cmp_status->length, and step cmp_status->state back by one.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    *charbuf = -2;                      \
    charbuf++;                          \
    *charbuf = (rule);                  \
    charbuf++;                          \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3491
/* Append C, a composed char or a component char, to `charbuf' and
   update cmp_status: length grows by one; nchars is counted in the
   state COMPOSING_CHAR and ncomps in any other state.  The state is
   then stepped forward by one if the method is COMPOSITION_WITH_RULE,
   or if it is COMPOSITION_WITH_RULE_ALTCHARS and the state is
   COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3508
3509
3510 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3511
3512 static void
3513 decode_coding_iso_2022 (struct coding_system *coding)
3514 {
3515   const unsigned char *src = coding->source + coding->consumed;
3516   const unsigned char *src_end = coding->source + coding->src_bytes;
3517   const unsigned char *src_base;
3518   int *charbuf = coding->charbuf + coding->charbuf_used;
3519   /* We may produce two annotations (charset and composition) in one
3520      loop and one more charset annotation at the end.  */
3521   int *charbuf_end
3522     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3523   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3524   bool multibytep = coding->src_multibyte;
3525   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3526   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3527   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3528   int charset_id_2, charset_id_3;
3529   struct charset *charset;
3530   int c;
3531   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3532   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3533   ptrdiff_t char_offset = coding->produced_char;
3534   ptrdiff_t last_offset = char_offset;
3535   int last_id = charset_ascii;
3536   bool eol_dos
3537     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3538   int byte_after_cr = -1;
3539   int i;
3540
3541   setup_iso_safe_charsets (attrs);
3542   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3543
3544   if (cmp_status->state != COMPOSING_NO)
3545     {
3546       if (charbuf_end - charbuf < cmp_status->length)
3547         emacs_abort ();
3548       for (i = 0; i < cmp_status->length; i++)
3549         *charbuf++ = cmp_status->carryover[i];
3550       coding->annotated = 1;
3551     }
3552
3553   while (1)
3554     {
3555       int c1, c2, c3;
3556
3557       src_base = src;
3558       consumed_chars_base = consumed_chars;
3559
3560       if (charbuf >= charbuf_end)
3561         {
3562           if (byte_after_cr >= 0)
3563             src_base--;
3564           break;
3565         }
3566
3567       if (byte_after_cr >= 0)
3568         c1 = byte_after_cr, byte_after_cr = -1;
3569       else
3570         ONE_MORE_BYTE (c1);
3571       if (c1 < 0)
3572         goto invalid_code;
3573
3574       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3575         {
3576           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3577           char_offset++;
3578           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3579           continue;
3580         }
3581
3582       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3583         {
3584           if (c1 == ISO_CODE_ESC)
3585             {
3586               if (src + 1 >= src_end)
3587                 goto no_more_source;
3588               *charbuf++ = ISO_CODE_ESC;
3589               char_offset++;
3590               if (src[0] == '%' && src[1] == '@')
3591                 {
3592                   src += 2;
3593                   consumed_chars += 2;
3594                   char_offset += 2;
3595                   /* We are sure charbuf can contain two more chars. */
3596                   *charbuf++ = '%';
3597                   *charbuf++ = '@';
3598                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3599                 }
3600             }
3601           else
3602             {
3603               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3604               char_offset++;
3605             }
3606           continue;
3607         }
3608
3609       if ((cmp_status->state == COMPOSING_RULE
3610            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3611           && c1 != ISO_CODE_ESC)
3612         {
3613           int rule;
3614
3615           DECODE_COMPOSITION_RULE (rule);
3616           STORE_COMPOSITION_RULE (rule);
3617           continue;
3618         }
3619
3620       /* We produce at most one character.  */
3621       switch (iso_code_class [c1])
3622         {
3623         case ISO_0x20_or_0x7F:
3624           if (charset_id_0 < 0
3625               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3626             /* This is SPACE or DEL.  */
3627             charset = CHARSET_FROM_ID (charset_ascii);
3628           else
3629             charset = CHARSET_FROM_ID (charset_id_0);
3630           break;
3631
3632         case ISO_graphic_plane_0:
3633           if (charset_id_0 < 0)
3634             charset = CHARSET_FROM_ID (charset_ascii);
3635           else
3636             charset = CHARSET_FROM_ID (charset_id_0);
3637           break;
3638
3639         case ISO_0xA0_or_0xFF:
3640           if (charset_id_1 < 0
3641               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3642               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3643             goto invalid_code;
3644           /* This is a graphic character, we fall down ... */
3645
3646         case ISO_graphic_plane_1:
3647           if (charset_id_1 < 0)
3648             goto invalid_code;
3649           charset = CHARSET_FROM_ID (charset_id_1);
3650           break;
3651
3652         case ISO_control_0:
3653           if (eol_dos && c1 == '\r')
3654             ONE_MORE_BYTE (byte_after_cr);
3655           MAYBE_FINISH_COMPOSITION ();
3656           charset = CHARSET_FROM_ID (charset_ascii);
3657           break;
3658
3659         case ISO_control_1:
3660           goto invalid_code;
3661
3662         case ISO_shift_out:
3663           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3664               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3665             goto invalid_code;
3666           CODING_ISO_INVOCATION (coding, 0) = 1;
3667           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3668           continue;
3669
3670         case ISO_shift_in:
3671           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3672             goto invalid_code;
3673           CODING_ISO_INVOCATION (coding, 0) = 0;
3674           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3675           continue;
3676
3677         case ISO_single_shift_2_7:
3678           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3679             goto invalid_code;
3680         case ISO_single_shift_2:
3681           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3682             goto invalid_code;
3683           /* SS2 is handled as an escape sequence of ESC 'N' */
3684           c1 = 'N';
3685           goto label_escape_sequence;
3686
3687         case ISO_single_shift_3:
3688           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3689             goto invalid_code;
3690           /* SS2 is handled as an escape sequence of ESC 'O' */
3691           c1 = 'O';
3692           goto label_escape_sequence;
3693
3694         case ISO_control_sequence_introducer:
3695           /* CSI is handled as an escape sequence of ESC '[' ...  */
3696           c1 = '[';
3697           goto label_escape_sequence;
3698
3699         case ISO_escape:
3700           ONE_MORE_BYTE (c1);
3701         label_escape_sequence:
3702           /* Escape sequences handled here are invocation,
3703              designation, direction specification, and character
3704              composition specification.  */
3705           switch (c1)
3706             {
3707             case '&':           /* revision of following character set */
3708               ONE_MORE_BYTE (c1);
3709               if (!(c1 >= '@' && c1 <= '~'))
3710                 goto invalid_code;
3711               ONE_MORE_BYTE (c1);
3712               if (c1 != ISO_CODE_ESC)
3713                 goto invalid_code;
3714               ONE_MORE_BYTE (c1);
3715               goto label_escape_sequence;
3716
3717             case '$':           /* designation of 2-byte character set */
3718               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3719                 goto invalid_code;
3720               {
3721                 int reg, chars96;
3722
3723                 ONE_MORE_BYTE (c1);
3724                 if (c1 >= '@' && c1 <= 'B')
3725                   {     /* designation of JISX0208.1978, GB2312.1980,
3726                            or JISX0208.1980 */
3727                     reg = 0, chars96 = 0;
3728                   }
3729                 else if (c1 >= 0x28 && c1 <= 0x2B)
3730                   { /* designation of DIMENSION2_CHARS94 character set */
3731                     reg = c1 - 0x28, chars96 = 0;
3732                     ONE_MORE_BYTE (c1);
3733                   }
3734                 else if (c1 >= 0x2C && c1 <= 0x2F)
3735                   { /* designation of DIMENSION2_CHARS96 character set */
3736                     reg = c1 - 0x2C, chars96 = 1;
3737                     ONE_MORE_BYTE (c1);
3738                   }
3739                 else
3740                   goto invalid_code;
3741                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3742                 /* We must update these variables now.  */
3743                 if (reg == 0)
3744                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3745                 else if (reg == 1)
3746                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3747                 if (chars96 < 0)
3748                   goto invalid_code;
3749               }
3750               continue;
3751
3752             case 'n':           /* invocation of locking-shift-2 */
3753               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3754                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3755                 goto invalid_code;
3756               CODING_ISO_INVOCATION (coding, 0) = 2;
3757               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3758               continue;
3759
3760             case 'o':           /* invocation of locking-shift-3 */
3761               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3762                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3763                 goto invalid_code;
3764               CODING_ISO_INVOCATION (coding, 0) = 3;
3765               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3766               continue;
3767
3768             case 'N':           /* invocation of single-shift-2 */
3769               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3770                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3771                 goto invalid_code;
3772               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3773               if (charset_id_2 < 0)
3774                 charset = CHARSET_FROM_ID (charset_ascii);
3775               else
3776                 charset = CHARSET_FROM_ID (charset_id_2);
3777               ONE_MORE_BYTE (c1);
3778               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3779                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3780                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3781                           ? c1 >= 0x80 : c1 < 0x80)))
3782                 goto invalid_code;
3783               break;
3784
3785             case 'O':           /* invocation of single-shift-3 */
3786               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3787                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3788                 goto invalid_code;
3789               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3790               if (charset_id_3 < 0)
3791                 charset = CHARSET_FROM_ID (charset_ascii);
3792               else
3793                 charset = CHARSET_FROM_ID (charset_id_3);
3794               ONE_MORE_BYTE (c1);
3795               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3796                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3797                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3798                           ? c1 >= 0x80 : c1 < 0x80)))
3799                 goto invalid_code;
3800               break;
3801
3802             case '0': case '2': case '3': case '4': /* start composition */
3803               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3804                 goto invalid_code;
3805               if (last_id != charset_ascii)
3806                 {
3807                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3808                   last_id = charset_ascii;
3809                   last_offset = char_offset;
3810                 }
3811               DECODE_COMPOSITION_START (c1);
3812               continue;
3813
3814             case '1':           /* end composition */
3815               if (cmp_status->state == COMPOSING_NO)
3816                 goto invalid_code;
3817               DECODE_COMPOSITION_END ();
3818               continue;
3819
3820             case '[':           /* specification of direction */
3821               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3822                 goto invalid_code;
3823               /* For the moment, nested direction is not supported.
3824                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3825                  left-to-right, and nonzero means right-to-left.  */
3826               ONE_MORE_BYTE (c1);
3827               switch (c1)
3828                 {
3829                 case ']':       /* end of the current direction */
3830                   coding->mode &= ~CODING_MODE_DIRECTION;
3831
3832                 case '0':       /* end of the current direction */
3833                 case '1':       /* start of left-to-right direction */
3834                   ONE_MORE_BYTE (c1);
3835                   if (c1 == ']')
3836                     coding->mode &= ~CODING_MODE_DIRECTION;
3837                   else
3838                     goto invalid_code;
3839                   break;
3840
3841                 case '2':       /* start of right-to-left direction */
3842                   ONE_MORE_BYTE (c1);
3843                   if (c1 == ']')
3844                     coding->mode |= CODING_MODE_DIRECTION;
3845                   else
3846                     goto invalid_code;
3847                   break;
3848
3849                 default:
3850                   goto invalid_code;
3851                 }
3852               continue;
3853
3854             case '%':
3855               ONE_MORE_BYTE (c1);
3856               if (c1 == '/')
3857                 {
3858                   /* CTEXT extended segment:
3859                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3860                      We keep these bytes as is for the moment.
3861                      They may be decoded by post-read-conversion.  */
3862                   int dim, M, L;
3863                   int size;
3864
3865                   ONE_MORE_BYTE (dim);
3866                   if (dim < '0' || dim > '4')
3867                     goto invalid_code;
3868                   ONE_MORE_BYTE (M);
3869                   if (M < 128)
3870                     goto invalid_code;
3871                   ONE_MORE_BYTE (L);
3872                   if (L < 128)
3873                     goto invalid_code;
3874                   size = ((M - 128) * 128) + (L - 128);
3875                   if (charbuf + 6 > charbuf_end)
3876                     goto break_loop;
3877                   *charbuf++ = ISO_CODE_ESC;
3878                   *charbuf++ = '%';
3879                   *charbuf++ = '/';
3880                   *charbuf++ = dim;
3881                   *charbuf++ = BYTE8_TO_CHAR (M);
3882                   *charbuf++ = BYTE8_TO_CHAR (L);
3883                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3884                 }
3885               else if (c1 == 'G')
3886                 {
3887                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3888                      ESC % G --UTF-8-BYTES-- ESC % @
3889                      We keep these bytes as is for the moment.
3890                      They may be decoded by post-read-conversion.  */
3891                   if (charbuf + 3 > charbuf_end)
3892                     goto break_loop;
3893                   *charbuf++ = ISO_CODE_ESC;
3894                   *charbuf++ = '%';
3895                   *charbuf++ = 'G';
3896                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3897                 }
3898               else
3899                 goto invalid_code;
3900               continue;
3901               break;
3902
3903             default:
3904               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3905                 goto invalid_code;
3906               {
3907                 int reg, chars96;
3908
3909                 if (c1 >= 0x28 && c1 <= 0x2B)
3910                   { /* designation of DIMENSION1_CHARS94 character set */
3911                     reg = c1 - 0x28, chars96 = 0;
3912                     ONE_MORE_BYTE (c1);
3913                   }
3914                 else if (c1 >= 0x2C && c1 <= 0x2F)
3915                   { /* designation of DIMENSION1_CHARS96 character set */
3916                     reg = c1 - 0x2C, chars96 = 1;
3917                     ONE_MORE_BYTE (c1);
3918                   }
3919                 else
3920                   goto invalid_code;
3921                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3922                 /* We must update these variables now.  */
3923                 if (reg == 0)
3924                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3925                 else if (reg == 1)
3926                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3927                 if (chars96 < 0)
3928                   goto invalid_code;
3929               }
3930               continue;
3931             }
3932           break;
3933
3934         default:
3935           emacs_abort ();
3936         }
3937
3938       if (cmp_status->state == COMPOSING_NO
3939           && charset->id != charset_ascii
3940           && last_id != charset->id)
3941         {
3942           if (last_id != charset_ascii)
3943             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3944           last_id = charset->id;
3945           last_offset = char_offset;
3946         }
3947
3948       /* Now we know CHARSET and 1st position code C1 of a character.
3949          Produce a decoded character while getting 2nd and 3rd
3950          position codes C2, C3 if necessary.  */
3951       if (CHARSET_DIMENSION (charset) > 1)
3952         {
3953           ONE_MORE_BYTE (c2);
3954           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3955               || ((c1 & 0x80) != (c2 & 0x80)))
3956             /* C2 is not in a valid range.  */
3957             goto invalid_code;
3958           if (CHARSET_DIMENSION (charset) == 2)
3959             c1 = (c1 << 8) | c2;
3960           else
3961             {
3962               ONE_MORE_BYTE (c3);
3963               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3964                   || ((c1 & 0x80) != (c3 & 0x80)))
3965                 /* C3 is not in a valid range.  */
3966                 goto invalid_code;
3967               c1 = (c1 << 16) | (c2 << 8) | c2;
3968             }
3969         }
3970       c1 &= 0x7F7F7F;
3971       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3972       if (c < 0)
3973         {
3974           MAYBE_FINISH_COMPOSITION ();
3975           for (; src_base < src; src_base++, char_offset++)
3976             {
3977               if (ASCII_CHAR_P (*src_base))
3978                 *charbuf++ = *src_base;
3979               else
3980                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3981             }
3982         }
3983       else if (cmp_status->state == COMPOSING_NO)
3984         {
3985           *charbuf++ = c;
3986           char_offset++;
3987         }
3988       else if ((cmp_status->state == COMPOSING_CHAR
3989                 ? cmp_status->nchars
3990                 : cmp_status->ncomps)
3991                >= MAX_COMPOSITION_COMPONENTS)
3992         {
3993           /* Too long composition.  */
3994           MAYBE_FINISH_COMPOSITION ();
3995           *charbuf++ = c;
3996           char_offset++;
3997         }
3998       else
3999         STORE_COMPOSITION_CHAR (c);
4000       continue;
4001
4002     invalid_code:
4003       MAYBE_FINISH_COMPOSITION ();
4004       src = src_base;
4005       consumed_chars = consumed_chars_base;
4006       ONE_MORE_BYTE (c);
4007       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
4008       char_offset++;
4009       coding->errors++;
4010       /* Reset the invocation and designation status to the safest
4011          one; i.e. designate ASCII to the graphic register 0, and
4012          invoke that register to the graphic plane 0.  This typically
4013          helps the case that an designation sequence for ASCII "ESC (
4014          B" is somehow broken (e.g. broken by a newline).  */
4015       CODING_ISO_INVOCATION (coding, 0) = 0;
4016       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4017       charset_id_0 = charset_ascii;
4018       continue;
4019
4020     break_loop:
4021       break;
4022     }
4023
4024  no_more_source:
4025   if (cmp_status->state != COMPOSING_NO)
4026     {
4027       if (coding->mode & CODING_MODE_LAST_BLOCK)
4028         MAYBE_FINISH_COMPOSITION ();
4029       else
4030         {
4031           charbuf -= cmp_status->length;
4032           for (i = 0; i < cmp_status->length; i++)
4033             cmp_status->carryover[i] = charbuf[i];
4034         }
4035     }
4036   else if (last_id != charset_ascii)
4037     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4038   coding->consumed_char += consumed_chars_base;
4039   coding->consumed = src_base - coding->source;
4040   coding->charbuf_used = charbuf - coding->charbuf;
4041 }
4042
4043
4044 /* ISO2022 encoding stuff.  */
4045
4046 /*
4047    It is not enough to say just "ISO2022" on encoding, we have to
4048    specify more details.  In Emacs, each coding system of ISO2022
4049    variant has the following specifications:
4050         1. Initial designation to G0 thru G3.
4051         2. Allows short-form designation?
4052         3. ASCII should be designated to G0 before control characters?
4053         4. ASCII should be designated to G0 at end of line?
4054         5. 7-bit environment or 8-bit environment?
4055         6. Use locking-shift?
4056         7. Use Single-shift?
4057    And the following two are only for Japanese:
4058         8. Use ASCII in place of JIS0201-1976-Roman?
4059         9. Use JISX0208-1983 in place of JISX0208-1978?
4060    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4061    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4062    details.
4063 */
4064
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.

   The sequence has the shape
     [ESC '&' <'@' + revision>] ESC ['$'] [<intermediate>] <final-char>
   The revision prefix is produced only when CODING has
   CODING_ISO_FLAG_REVISION set and CHARSET has a non-negative
   revision.  '$' is produced for charsets of dimension other than 1.
   <intermediate> is one of "()*+" (94-sets) or ",-./" (96-sets),
   selected by REG; it is dropped (short form) only for a
   multi-dimension 94-set designated to register 0 whose <final-char>
   is '@', 'A', or 'B', and only when CODING does not have
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    /* Intermediate bytes, indexed by graphic register number.  */      \
    const char *desig_inter                                             \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int desig_rev                                                       \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    if (desig_rev >= 0)                                                 \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_rev);                                \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
      }                                                                 \
    /* Only the multi-dimension 94-set short form omits the             \
       intermediate byte.  */                                           \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || (reg) != 0                                                   \
        || desig_final < '@' || desig_final > 'B')                      \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                         \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4112
4113
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 produce the code
   (a control character, or an escape sequence in a 7-bit
   environment) for the ISO2022 single-shift-2 and single-shift-3
   functions.  Both use the variable `coding' of the expansion site.  */

/* Emit single-shift-2: ESC 'N' when CODING_ISO_FLAG_SEVEN_BITS is
   set, the single byte ISO_CODE_SS2 otherwise.  Then flag CODING as
   being in single-shift state.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4126
4127
/* Emit single-shift-3: ESC 'O' when CODING_ISO_FLAG_SEVEN_BITS is
   set, the single byte ISO_CODE_SS3 otherwise.  Then flag CODING as
   being in single-shift state.  Uses the variable `coding' of the
   expansion site.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4136
4137
/* ENCODE_SHIFT_IN, ENCODE_SHIFT_OUT, ENCODE_LOCKING_SHIFT_2 and
   ENCODE_LOCKING_SHIFT_3 produce the code (a control character or an
   escape sequence) for the ISO2022 locking-shift functions shift-in,
   shift-out, locking-shift-2 and locking-shift-3.  Each one also
   records which graphic register is now invoked to graphic plane 0
   of the variable `coding' of the expansion site.  */

/* Shift-in: invoke graphic register 0 to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4147
4148
/* Shift-out: emit ISO_CODE_SO and record that graphic register 1 is
   invoked to graphic plane 0 of `coding'.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4154
4155
/* Locking-shift-2: emit ESC 'n' and record that graphic register 2
   is invoked to graphic plane 0 of `coding'.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4161
4162
/* Locking-shift-3: emit ESC 'o' and record that graphic register 3
   is invoked to graphic plane 0 of `coding'.  The final byte must be
   'o': ESC 'n' is locking-shift-2, so emitting it here would leave
   the recorded invocation (G3) out of step with what a reader of the
   byte stream invokes (G2).  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4168
4169
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when `coding' has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.  The macro uses the variables
   `coding', `dst' and `produced_chars' of the expansion site.

   The body is a loop: each pass either emits the byte and leaves via
   `break', or calls encode_invocation_designation to make CHARSET
   reachable and tries again.  C1 is parenthesized at every use so
   that an expression argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; it covers exactly this      \
           one character.  */                                           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: 7-bit form.  */                  \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: high bit set.  */                \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4212
4213
/* Emit a two-byte (DIMENSION2) character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed to make CHARSET reachable.

   CHARSET must be a modifiable lvalue: when `coding' has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by JISX0208-1978.  The macro uses the variables `coding',
   `dst' and `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    /* Each successful case below emits the two bytes and leaves the    \
       loop; falling through to the bottom means CHARSET is not         \
       reachable yet.  */                                               \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to neither graphic plane.  Invoke it          \
       (designating it to a graphic register first if need be), then    \
       go round again to emit the character itself.  */                 \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4256
4257
/* Encode character C as a member of CHARSET.  The code point is
   obtained with CODING_ENCODE_CHAR and handed to the one-byte or
   two-byte emitter according to the dimension of CHARSET; for the
   two-byte case the high and low bytes of the code point are the
   two position-codes.  Uses the variables `coding', `dst' and
   `dst_end' of the expansion site.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned position_code;                                                \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c),              \
                        position_code);                                    \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,    \
                                         position_code & 0xFF);            \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);        \
      }                                                                    \
  } while (0)
4268
4269
4270 /* Produce designation and invocation codes at a place pointed by DST
4271    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4272    Return new DST.  */
4273
4274 static unsigned char *
4275 encode_invocation_designation (struct charset *charset,
4276                                struct coding_system *coding,
4277                                unsigned char *dst, ptrdiff_t *p_nchars)
4278 {
4279   bool multibytep = coding->dst_multibyte;
4280   ptrdiff_t produced_chars = *p_nchars;
4281   int reg;                      /* graphic register number */
4282   int id = CHARSET_ID (charset);
4283
4284   /* At first, check designations.  */
4285   for (reg = 0; reg < 4; reg++)
4286     if (id == CODING_ISO_DESIGNATION (coding, reg))
4287       break;
4288
4289   if (reg >= 4)
4290     {
4291       /* CHARSET is not yet designated to any graphic registers.  */
4292       /* At first check the requested designation.  */
4293       reg = CODING_ISO_REQUEST (coding, id);
4294       if (reg < 0)
4295         /* Since CHARSET requests no special designation, designate it
4296            to graphic register 0.  */
4297         reg = 0;
4298
4299       ENCODE_DESIGNATION (charset, reg, coding);
4300     }
4301
4302   if (CODING_ISO_INVOCATION (coding, 0) != reg
4303       && CODING_ISO_INVOCATION (coding, 1) != reg)
4304     {
4305       /* Since the graphic register REG is not invoked to any graphic
4306          planes, invoke it to graphic plane 0.  */
4307       switch (reg)
4308         {
4309         case 0:                 /* graphic register 0 */
4310           ENCODE_SHIFT_IN;
4311           break;
4312
4313         case 1:                 /* graphic register 1 */
4314           ENCODE_SHIFT_OUT;
4315           break;
4316
4317         case 2:                 /* graphic register 2 */
4318           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4319             ENCODE_SINGLE_SHIFT_2;
4320           else
4321             ENCODE_LOCKING_SHIFT_2;
4322           break;
4323
4324         case 3:                 /* graphic register 3 */
4325           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4326             ENCODE_SINGLE_SHIFT_3;
4327           else
4328             ENCODE_LOCKING_SHIFT_3;
4329           break;
4330         }
4331     }
4332
4333   *p_nchars = produced_chars;
4334   return dst;
4335 }
4336
4337
/* Bring the graphic planes and registers of `coding' back to their
   initial state: shift in if graphic plane 0 does not hold register
   0, then re-designate every register that has an initial charset
   (CODING_ISO_INITIAL >= 0) differing from its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
                                                                        \
        /* Nothing to do for a register without an initial charset      \
           or one that still holds it.  */                              \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reg) == initial_id)      \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial_id);                         \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4356
4357
4358 /* Produce designation sequences of charsets in the line started from
4359    CHARBUF to a place pointed by DST, and return the number of
4360    produced bytes.  DST should not directly point a buffer text area
4361    which may be relocated by char_charset call.
4362
4363    If the current block ends before any end-of-line, we may fail to
4364    find all the necessary designations.  */
4365
4366 static ptrdiff_t
4367 encode_designation_at_bol (struct coding_system *coding,
4368                            int *charbuf, int *charbuf_end,
4369                            unsigned char *dst)
4370 {
4371   unsigned char *orig = dst;
4372   struct charset *charset;
4373   /* Table of charsets to be designated to each graphic register.  */
4374   int r[4];
4375   int c, found = 0, reg;
4376   ptrdiff_t produced_chars = 0;
4377   bool multibytep = coding->dst_multibyte;
4378   Lisp_Object attrs;
4379   Lisp_Object charset_list;
4380
4381   attrs = CODING_ID_ATTRS (coding->id);
4382   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4383   if (EQ (charset_list, Qiso_2022))
4384     charset_list = Viso_2022_charset_list;
4385
4386   for (reg = 0; reg < 4; reg++)
4387     r[reg] = -1;
4388
4389   while (charbuf < charbuf_end && found < 4)
4390     {
4391       int id;
4392
4393       c = *charbuf++;
4394       if (c == '\n')
4395         break;
4396       charset = char_charset (c, charset_list, NULL);
4397       id = CHARSET_ID (charset);
4398       reg = CODING_ISO_REQUEST (coding, id);
4399       if (reg >= 0 && r[reg] < 0)
4400         {
4401           found++;
4402           r[reg] = id;
4403         }
4404     }
4405
4406   if (found)
4407     {
4408       for (reg = 0; reg < 4; reg++)
4409         if (r[reg] >= 0
4410             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4411           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4412     }
4413
4414   return dst - orig;
4415 }
4416
4417 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4418
4419 static bool
4420 encode_coding_iso_2022 (struct coding_system *coding)
4421 {
4422   bool multibytep = coding->dst_multibyte;
4423   int *charbuf = coding->charbuf;
4424   int *charbuf_end = charbuf + coding->charbuf_used;
4425   unsigned char *dst = coding->destination + coding->produced;
4426   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4427   int safe_room = 16;
4428   bool bol_designation
4429     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4430        && CODING_ISO_BOL (coding));
4431   ptrdiff_t produced_chars = 0;
4432   Lisp_Object attrs, eol_type, charset_list;
4433   bool ascii_compatible;
4434   int c;
4435   int preferred_charset_id = -1;
4436
4437   CODING_GET_INFO (coding, attrs, charset_list);
4438   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4439   if (VECTORP (eol_type))
4440     eol_type = Qunix;
4441
4442   setup_iso_safe_charsets (attrs);
4443   /* Charset list may have been changed.  */
4444   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4445   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4446
4447   ascii_compatible
4448     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4449        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4450                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4451
4452   while (charbuf < charbuf_end)
4453     {
4454       ASSURE_DESTINATION (safe_room);
4455
4456       if (bol_designation)
4457         {
4458           /* We have to produce designation sequences if any now.  */
4459           unsigned char desig_buf[16];
4460           ptrdiff_t nbytes;
4461           ptrdiff_t offset;
4462
4463           charset_map_loaded = 0;
4464           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4465                                               desig_buf);
4466           if (charset_map_loaded
4467               && (offset = coding_change_destination (coding)))
4468             {
4469               dst += offset;
4470               dst_end += offset;
4471             }
4472           memcpy (dst, desig_buf, nbytes);
4473           dst += nbytes;
4474           /* We are sure that designation sequences are all ASCII bytes.  */
4475           produced_chars += nbytes;
4476           bol_designation = 0;
4477           ASSURE_DESTINATION (safe_room);
4478         }
4479
4480       c = *charbuf++;
4481
4482       if (c < 0)
4483         {
4484           /* Handle an annotation.  */
4485           switch (*charbuf)
4486             {
4487             case CODING_ANNOTATE_COMPOSITION_MASK:
4488               /* Not yet implemented.  */
4489               break;
4490             case CODING_ANNOTATE_CHARSET_MASK:
4491               preferred_charset_id = charbuf[2];
4492               if (preferred_charset_id >= 0
4493                   && NILP (Fmemq (make_number (preferred_charset_id),
4494                                   charset_list)))
4495                 preferred_charset_id = -1;
4496               break;
4497             default:
4498               emacs_abort ();
4499             }
4500           charbuf += -c - 1;
4501           continue;
4502         }
4503
4504       /* Now encode the character C.  */
4505       if (c < 0x20 || c == 0x7F)
4506         {
4507           if (c == '\n'
4508               || (c == '\r' && EQ (eol_type, Qmac)))
4509             {
4510               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4511                 ENCODE_RESET_PLANE_AND_REGISTER ();
4512               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4513                 {
4514                   int i;
4515
4516                   for (i = 0; i < 4; i++)
4517                     CODING_ISO_DESIGNATION (coding, i)
4518                       = CODING_ISO_INITIAL (coding, i);
4519                 }
4520               bol_designation = ((CODING_ISO_FLAGS (coding)
4521                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4522                                  != 0);
4523             }
4524           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4525             ENCODE_RESET_PLANE_AND_REGISTER ();
4526           EMIT_ONE_ASCII_BYTE (c);
4527         }
4528       else if (ASCII_CHAR_P (c))
4529         {
4530           if (ascii_compatible)
4531             EMIT_ONE_ASCII_BYTE (c);
4532           else
4533             {
4534               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4535               ENCODE_ISO_CHARACTER (charset, c);
4536             }
4537         }
4538       else if (CHAR_BYTE8_P (c))
4539         {
4540           c = CHAR_TO_BYTE8 (c);
4541           EMIT_ONE_BYTE (c);
4542         }
4543       else
4544         {
4545           struct charset *charset;
4546
4547           if (preferred_charset_id >= 0)
4548             {
4549               bool result;
4550
4551               charset = CHARSET_FROM_ID (preferred_charset_id);
4552               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4553               if (! result)
4554                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4555                                      NULL, charset);
4556             }
4557           else
4558             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4559                                  NULL, charset);
4560           if (!charset)
4561             {
4562               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4563                 {
4564                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4565                   charset = CHARSET_FROM_ID (charset_ascii);
4566                 }
4567               else
4568                 {
4569                   c = coding->default_char;
4570                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4571                                        charset_list, NULL, charset);
4572                 }
4573             }
4574           ENCODE_ISO_CHARACTER (charset, c);
4575         }
4576     }
4577
4578   if (coding->mode & CODING_MODE_LAST_BLOCK
4579       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4580     {
4581       ASSURE_DESTINATION (safe_room);
4582       ENCODE_RESET_PLANE_AND_REGISTER ();
4583     }
4584   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4585   CODING_ISO_BOL (coding) = bol_designation;
4586   coding->produced_char += produced_chars;
4587   coding->produced = dst - coding->destination;
4588   return 0;
4589 }
4590
4591 \f
4592 /*** 8. Shift-JIS and BIG5 handlers ***/
4593
4594 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4595    quite widely.  So, for the moment, Emacs supports them in the bare
4596    C code.  But, in the future, they may be supported only by CCL.  */
4597
4598 /* SJIS is a coding system encoding three character sets: ASCII, right
4599    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4600    as is.  A character of charset katakana-jisx0201 is encoded by
4601    "position-code + 0x80".  A character of charset japanese-jisx0208
4602    is encoded in two bytes, but its two position-codes are divided and
4603    shifted so that they fit in the ranges below.
4604
4605    --- CODE RANGE of SJIS ---
4606    (character set)      (range)
4607    ASCII                0x00 .. 0x7F
4608    KATAKANA-JISX0201    0xA0 .. 0xDF
4609    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4610             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4611    -------------------------------
4612
4613 */
4614
4615 /* BIG5 is a coding system encoding two character sets: ASCII and
4616    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4617    character set and is encoded in two bytes.
4618
4619    --- CODE RANGE of BIG5 ---
4620    (character set)      (range)
4621    ASCII                0x00 .. 0x7F
4622    Big5 (1st byte)      0xA1 .. 0xFE
4623         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4624    --------------------------
4625
4626   */
4627
4628 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4629    Return true if a text is encoded in SJIS.  */
4630
4631 static bool
4632 detect_coding_sjis (struct coding_system *coding,
4633                     struct coding_detection_info *detect_info)
4634 {
4635   const unsigned char *src = coding->source, *src_base;
4636   const unsigned char *src_end = coding->source + coding->src_bytes;
4637   bool multibytep = coding->src_multibyte;
4638   ptrdiff_t consumed_chars = 0;
4639   int found = 0;
4640   int c;
4641   Lisp_Object attrs, charset_list;
4642   int max_first_byte_of_2_byte_code;
4643
4644   CODING_GET_INFO (coding, attrs, charset_list);
4645   max_first_byte_of_2_byte_code
4646     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4647
4648   detect_info->checked |= CATEGORY_MASK_SJIS;
4649   /* A coding system of this category is always ASCII compatible.  */
4650   src += coding->head_ascii;
4651
4652   while (1)
4653     {
4654       src_base = src;
4655       ONE_MORE_BYTE (c);
4656       if (c < 0x80)
4657         continue;
4658       if ((c >= 0x81 && c <= 0x9F)
4659           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4660         {
4661           ONE_MORE_BYTE (c);
4662           if (c < 0x40 || c == 0x7F || c > 0xFC)
4663             break;
4664           found = CATEGORY_MASK_SJIS;
4665         }
4666       else if (c >= 0xA0 && c < 0xE0)
4667         found = CATEGORY_MASK_SJIS;
4668       else
4669         break;
4670     }
4671   detect_info->rejected |= CATEGORY_MASK_SJIS;
4672   return 0;
4673
4674  no_more_source:
4675   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4676     {
4677       detect_info->rejected |= CATEGORY_MASK_SJIS;
4678       return 0;
4679     }
4680   detect_info->found |= found;
4681   return 1;
4682 }
4683
4684 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4685    Return true if a text is encoded in BIG5.  */
4686
4687 static bool
4688 detect_coding_big5 (struct coding_system *coding,
4689                     struct coding_detection_info *detect_info)
4690 {
4691   const unsigned char *src = coding->source, *src_base;
4692   const unsigned char *src_end = coding->source + coding->src_bytes;
4693   bool multibytep = coding->src_multibyte;
4694   ptrdiff_t consumed_chars = 0;
4695   int found = 0;
4696   int c;
4697
4698   detect_info->checked |= CATEGORY_MASK_BIG5;
4699   /* A coding system of this category is always ASCII compatible.  */
4700   src += coding->head_ascii;
4701
4702   while (1)
4703     {
4704       src_base = src;
4705       ONE_MORE_BYTE (c);
4706       if (c < 0x80)
4707         continue;
4708       if (c >= 0xA1)
4709         {
4710           ONE_MORE_BYTE (c);
4711           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4712             return 0;
4713           found = CATEGORY_MASK_BIG5;
4714         }
4715       else
4716         break;
4717     }
4718   detect_info->rejected |= CATEGORY_MASK_BIG5;
4719   return 0;
4720
4721  no_more_source:
4722   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4723     {
4724       detect_info->rejected |= CATEGORY_MASK_BIG5;
4725       return 0;
4726     }
4727   detect_info->found |= found;
4728   return 1;
4729 }
4730
4731 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4732
4733 static void
4734 decode_coding_sjis (struct coding_system *coding)
4735 {
4736   const unsigned char *src = coding->source + coding->consumed;
4737   const unsigned char *src_end = coding->source + coding->src_bytes;
4738   const unsigned char *src_base;
4739   int *charbuf = coding->charbuf + coding->charbuf_used;
4740   /* We may produce one charset annotation in one loop and one more at
4741      the end.  */
4742   int *charbuf_end
4743     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4744   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4745   bool multibytep = coding->src_multibyte;
4746   struct charset *charset_roman, *charset_kanji, *charset_kana;
4747   struct charset *charset_kanji2;
4748   Lisp_Object attrs, charset_list, val;
4749   ptrdiff_t char_offset = coding->produced_char;
4750   ptrdiff_t last_offset = char_offset;
4751   int last_id = charset_ascii;
4752   bool eol_dos
4753     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4754   int byte_after_cr = -1;
4755
4756   CODING_GET_INFO (coding, attrs, charset_list);
4757
4758   val = charset_list;
4759   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4760   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4761   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4762   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4763
4764   while (1)
4765     {
4766       int c, c1;
4767       struct charset *charset;
4768
4769       src_base = src;
4770       consumed_chars_base = consumed_chars;
4771
4772       if (charbuf >= charbuf_end)
4773         {
4774           if (byte_after_cr >= 0)
4775             src_base--;
4776           break;
4777         }
4778
4779       if (byte_after_cr >= 0)
4780         c = byte_after_cr, byte_after_cr = -1;
4781       else
4782         ONE_MORE_BYTE (c);
4783       if (c < 0)
4784         goto invalid_code;
4785       if (c < 0x80)
4786         {
4787           if (eol_dos && c == '\r')
4788             ONE_MORE_BYTE (byte_after_cr);
4789           charset = charset_roman;
4790         }
4791       else if (c == 0x80 || c == 0xA0)
4792         goto invalid_code;
4793       else if (c >= 0xA1 && c <= 0xDF)
4794         {
4795           /* SJIS -> JISX0201-Kana */
4796           c &= 0x7F;
4797           charset = charset_kana;
4798         }
4799       else if (c <= 0xEF)
4800         {
4801           /* SJIS -> JISX0208 */
4802           ONE_MORE_BYTE (c1);
4803           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4804             goto invalid_code;
4805           c = (c << 8) | c1;
4806           SJIS_TO_JIS (c);
4807           charset = charset_kanji;
4808         }
4809       else if (c <= 0xFC && charset_kanji2)
4810         {
4811           /* SJIS -> JISX0213-2 */
4812           ONE_MORE_BYTE (c1);
4813           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4814             goto invalid_code;
4815           c = (c << 8) | c1;
4816           SJIS_TO_JIS2 (c);
4817           charset = charset_kanji2;
4818         }
4819       else
4820         goto invalid_code;
4821       if (charset->id != charset_ascii
4822           && last_id != charset->id)
4823         {
4824           if (last_id != charset_ascii)
4825             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4826           last_id = charset->id;
4827           last_offset = char_offset;
4828         }
4829       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4830       *charbuf++ = c;
4831       char_offset++;
4832       continue;
4833
4834     invalid_code:
4835       src = src_base;
4836       consumed_chars = consumed_chars_base;
4837       ONE_MORE_BYTE (c);
4838       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4839       char_offset++;
4840       coding->errors++;
4841     }
4842
4843  no_more_source:
4844   if (last_id != charset_ascii)
4845     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4846   coding->consumed_char += consumed_chars_base;
4847   coding->consumed = src_base - coding->source;
4848   coding->charbuf_used = charbuf - coding->charbuf;
4849 }
4850
4851 static void
4852 decode_coding_big5 (struct coding_system *coding)
4853 {
4854   const unsigned char *src = coding->source + coding->consumed;
4855   const unsigned char *src_end = coding->source + coding->src_bytes;
4856   const unsigned char *src_base;
4857   int *charbuf = coding->charbuf + coding->charbuf_used;
4858   /* We may produce one charset annotation in one loop and one more at
4859      the end.  */
4860   int *charbuf_end
4861     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4862   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4863   bool multibytep = coding->src_multibyte;
4864   struct charset *charset_roman, *charset_big5;
4865   Lisp_Object attrs, charset_list, val;
4866   ptrdiff_t char_offset = coding->produced_char;
4867   ptrdiff_t last_offset = char_offset;
4868   int last_id = charset_ascii;
4869   bool eol_dos
4870     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4871   int byte_after_cr = -1;
4872
4873   CODING_GET_INFO (coding, attrs, charset_list);
4874   val = charset_list;
4875   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4876   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4877
4878   while (1)
4879     {
4880       int c, c1;
4881       struct charset *charset;
4882
4883       src_base = src;
4884       consumed_chars_base = consumed_chars;
4885
4886       if (charbuf >= charbuf_end)
4887         {
4888           if (byte_after_cr >= 0)
4889             src_base--;
4890           break;
4891         }
4892
4893       if (byte_after_cr >= 0)
4894         c = byte_after_cr, byte_after_cr = -1;
4895       else
4896         ONE_MORE_BYTE (c);
4897
4898       if (c < 0)
4899         goto invalid_code;
4900       if (c < 0x80)
4901         {
4902           if (eol_dos && c == '\r')
4903             ONE_MORE_BYTE (byte_after_cr);
4904           charset = charset_roman;
4905         }
4906       else
4907         {
4908           /* BIG5 -> Big5 */
4909           if (c < 0xA1 || c > 0xFE)
4910             goto invalid_code;
4911           ONE_MORE_BYTE (c1);
4912           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4913             goto invalid_code;
4914           c = c << 8 | c1;
4915           charset = charset_big5;
4916         }
4917       if (charset->id != charset_ascii
4918           && last_id != charset->id)
4919         {
4920           if (last_id != charset_ascii)
4921             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4922           last_id = charset->id;
4923           last_offset = char_offset;
4924         }
4925       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4926       *charbuf++ = c;
4927       char_offset++;
4928       continue;
4929
4930     invalid_code:
4931       src = src_base;
4932       consumed_chars = consumed_chars_base;
4933       ONE_MORE_BYTE (c);
4934       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4935       char_offset++;
4936       coding->errors++;
4937     }
4938
4939  no_more_source:
4940   if (last_id != charset_ascii)
4941     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4942   coding->consumed_char += consumed_chars_base;
4943   coding->consumed = src_base - coding->source;
4944   coding->charbuf_used = charbuf - coding->charbuf;
4945 }
4946
4947 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4948    This function can encode charsets `ascii', `katakana-jisx0201',
4949    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4950    are sure that all these charsets are registered as official charset
4951    (i.e. do not have extended leading-codes).  Characters of other
4952    charsets are produced without any encoding.  */
4953
4954 static bool
4955 encode_coding_sjis (struct coding_system *coding)
4956 {
4957   bool multibytep = coding->dst_multibyte;
4958   int *charbuf = coding->charbuf;
4959   int *charbuf_end = charbuf + coding->charbuf_used;
4960   unsigned char *dst = coding->destination + coding->produced;
4961   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4962   int safe_room = 4;
4963   ptrdiff_t produced_chars = 0;
4964   Lisp_Object attrs, charset_list, val;
4965   bool ascii_compatible;
4966   struct charset *charset_kanji, *charset_kana;
4967   struct charset *charset_kanji2;
4968   int c;
4969
4970   CODING_GET_INFO (coding, attrs, charset_list);
4971   val = XCDR (charset_list);
4972   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4973   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4974   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4975
4976   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4977
4978   while (charbuf < charbuf_end)
4979     {
4980       ASSURE_DESTINATION (safe_room);
4981       c = *charbuf++;
4982       /* Now encode the character C.  */
4983       if (ASCII_CHAR_P (c) && ascii_compatible)
4984         EMIT_ONE_ASCII_BYTE (c);
4985       else if (CHAR_BYTE8_P (c))
4986         {
4987           c = CHAR_TO_BYTE8 (c);
4988           EMIT_ONE_BYTE (c);
4989         }
4990       else
4991         {
4992           unsigned code;
4993           struct charset *charset;
4994           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4995                                &code, charset);
4996
4997           if (!charset)
4998             {
4999               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5000                 {
5001                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5002                   charset = CHARSET_FROM_ID (charset_ascii);
5003                 }
5004               else
5005                 {
5006                   c = coding->default_char;
5007                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5008                                        charset_list, &code, charset);
5009                 }
5010             }
5011           if (code == CHARSET_INVALID_CODE (charset))
5012             emacs_abort ();
5013           if (charset == charset_kanji)
5014             {
5015               int c1, c2;
5016               JIS_TO_SJIS (code);
5017               c1 = code >> 8, c2 = code & 0xFF;
5018               EMIT_TWO_BYTES (c1, c2);
5019             }
5020           else if (charset == charset_kana)
5021             EMIT_ONE_BYTE (code | 0x80);
5022           else if (charset_kanji2 && charset == charset_kanji2)
5023             {
5024               int c1, c2;
5025
5026               c1 = code >> 8;
5027               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5028                   || c1 == 0x28
5029                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5030                 {
5031                   JIS_TO_SJIS2 (code);
5032                   c1 = code >> 8, c2 = code & 0xFF;
5033                   EMIT_TWO_BYTES (c1, c2);
5034                 }
5035               else
5036                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5037             }
5038           else
5039             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5040         }
5041     }
5042   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5043   coding->produced_char += produced_chars;
5044   coding->produced = dst - coding->destination;
5045   return 0;
5046 }
5047
5048 static bool
5049 encode_coding_big5 (struct coding_system *coding)
5050 {
5051   bool multibytep = coding->dst_multibyte;
5052   int *charbuf = coding->charbuf;
5053   int *charbuf_end = charbuf + coding->charbuf_used;
5054   unsigned char *dst = coding->destination + coding->produced;
5055   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5056   int safe_room = 4;
5057   ptrdiff_t produced_chars = 0;
5058   Lisp_Object attrs, charset_list, val;
5059   bool ascii_compatible;
5060   struct charset *charset_big5;
5061   int c;
5062
5063   CODING_GET_INFO (coding, attrs, charset_list);
5064   val = XCDR (charset_list);
5065   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5066   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5067
5068   while (charbuf < charbuf_end)
5069     {
5070       ASSURE_DESTINATION (safe_room);
5071       c = *charbuf++;
5072       /* Now encode the character C.  */
5073       if (ASCII_CHAR_P (c) && ascii_compatible)
5074         EMIT_ONE_ASCII_BYTE (c);
5075       else if (CHAR_BYTE8_P (c))
5076         {
5077           c = CHAR_TO_BYTE8 (c);
5078           EMIT_ONE_BYTE (c);
5079         }
5080       else
5081         {
5082           unsigned code;
5083           struct charset *charset;
5084           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5085                                &code, charset);
5086
5087           if (! charset)
5088             {
5089               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5090                 {
5091                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5092                   charset = CHARSET_FROM_ID (charset_ascii);
5093                 }
5094               else
5095                 {
5096                   c = coding->default_char;
5097                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5098                                        charset_list, &code, charset);
5099                 }
5100             }
5101           if (code == CHARSET_INVALID_CODE (charset))
5102             emacs_abort ();
5103           if (charset == charset_big5)
5104             {
5105               int c1, c2;
5106
5107               c1 = code >> 8, c2 = code & 0xFF;
5108               EMIT_TWO_BYTES (c1, c2);
5109             }
5110           else
5111             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5112         }
5113     }
5114   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5115   coding->produced_char += produced_chars;
5116   coding->produced = dst - coding->destination;
5117   return 0;
5118 }
5119
5120 \f
5121 /*** 9. CCL handlers ***/
5122
5123 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5124    Return true if a text is encoded in a coding system of which
5125    encoder/decoder are written in CCL program.  */
5126
5127 static bool
5128 detect_coding_ccl (struct coding_system *coding,
5129                    struct coding_detection_info *detect_info)
5130 {
5131   const unsigned char *src = coding->source, *src_base;
5132   const unsigned char *src_end = coding->source + coding->src_bytes;
5133   bool multibytep = coding->src_multibyte;
5134   ptrdiff_t consumed_chars = 0;
5135   int found = 0;
5136   unsigned char *valids;
5137   ptrdiff_t head_ascii = coding->head_ascii;
5138   Lisp_Object attrs;
5139
5140   detect_info->checked |= CATEGORY_MASK_CCL;
5141
5142   coding = &coding_categories[coding_category_ccl];
5143   valids = CODING_CCL_VALIDS (coding);
5144   attrs = CODING_ID_ATTRS (coding->id);
5145   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5146     src += head_ascii;
5147
5148   while (1)
5149     {
5150       int c;
5151
5152       src_base = src;
5153       ONE_MORE_BYTE (c);
5154       if (c < 0 || ! valids[c])
5155         break;
5156       if ((valids[c] > 1))
5157         found = CATEGORY_MASK_CCL;
5158     }
5159   detect_info->rejected |= CATEGORY_MASK_CCL;
5160   return 0;
5161
5162  no_more_source:
5163   detect_info->found |= found;
5164   return 1;
5165 }
5166
5167 static void
5168 decode_coding_ccl (struct coding_system *coding)
5169 {
5170   const unsigned char *src = coding->source + coding->consumed;
5171   const unsigned char *src_end = coding->source + coding->src_bytes;
5172   int *charbuf = coding->charbuf + coding->charbuf_used;
5173   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5174   ptrdiff_t consumed_chars = 0;
5175   bool multibytep = coding->src_multibyte;
5176   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5177   int source_charbuf[1024];
5178   int source_byteidx[1025];
5179   Lisp_Object attrs, charset_list;
5180
5181   CODING_GET_INFO (coding, attrs, charset_list);
5182
5183   while (1)
5184     {
5185       const unsigned char *p = src;
5186       ptrdiff_t offset;
5187       int i = 0;
5188
5189       if (multibytep)
5190         {
5191           while (i < 1024 && p < src_end)
5192             {
5193               source_byteidx[i] = p - src;
5194               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5195             }
5196           source_byteidx[i] = p - src;
5197         }
5198       else
5199         while (i < 1024 && p < src_end)
5200           source_charbuf[i++] = *p++;
5201
5202       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5203         ccl->last_block = true;
5204       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5205       charset_map_loaded = 0;
5206       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5207                   charset_list);
5208       if (charset_map_loaded
5209           && (offset = coding_change_source (coding)))
5210         {
5211           p += offset;
5212           src += offset;
5213           src_end += offset;
5214         }
5215       charbuf += ccl->produced;
5216       if (multibytep)
5217         src += source_byteidx[ccl->consumed];
5218       else
5219         src += ccl->consumed;
5220       consumed_chars += ccl->consumed;
5221       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5222         break;
5223     }
5224
5225   switch (ccl->status)
5226     {
5227     case CCL_STAT_SUSPEND_BY_SRC:
5228       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5229       break;
5230     case CCL_STAT_SUSPEND_BY_DST:
5231       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5232       break;
5233     case CCL_STAT_QUIT:
5234     case CCL_STAT_INVALID_CMD:
5235       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5236       break;
5237     default:
5238       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5239       break;
5240     }
5241   coding->consumed_char += consumed_chars;
5242   coding->consumed = src - coding->source;
5243   coding->charbuf_used = charbuf - coding->charbuf;
5244 }
5245
5246 static bool
5247 encode_coding_ccl (struct coding_system *coding)
5248 {
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   bool multibytep = coding->dst_multibyte;
5251   int *charbuf = coding->charbuf;
5252   int *charbuf_end = charbuf + coding->charbuf_used;
5253   unsigned char *dst = coding->destination + coding->produced;
5254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5255   int destination_charbuf[1024];
5256   ptrdiff_t produced_chars = 0;
5257   int i;
5258   Lisp_Object attrs, charset_list;
5259
5260   CODING_GET_INFO (coding, attrs, charset_list);
5261   if (coding->consumed_char == coding->src_chars
5262       && coding->mode & CODING_MODE_LAST_BLOCK)
5263     ccl->last_block = true;
5264
5265   do
5266     {
5267       ptrdiff_t offset;
5268
5269       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5270       charset_map_loaded = 0;
5271       ccl_driver (ccl, charbuf, destination_charbuf,
5272                   charbuf_end - charbuf, 1024, charset_list);
5273       if (charset_map_loaded
5274           && (offset = coding_change_destination (coding)))
5275         dst += offset;
5276       if (multibytep)
5277         {
5278           ASSURE_DESTINATION (ccl->produced * 2);
5279           for (i = 0; i < ccl->produced; i++)
5280             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5281         }
5282       else
5283         {
5284           ASSURE_DESTINATION (ccl->produced);
5285           for (i = 0; i < ccl->produced; i++)
5286             *dst++ = destination_charbuf[i] & 0xFF;
5287           produced_chars += ccl->produced;
5288         }
5289       charbuf += ccl->consumed;
5290       if (ccl->status == CCL_STAT_QUIT
5291           || ccl->status == CCL_STAT_INVALID_CMD)
5292         break;
5293     }
5294   while (charbuf < charbuf_end);
5295
5296   switch (ccl->status)
5297     {
5298     case CCL_STAT_SUSPEND_BY_SRC:
5299       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5300       break;
5301     case CCL_STAT_SUSPEND_BY_DST:
5302       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5303       break;
5304     case CCL_STAT_QUIT:
5305     case CCL_STAT_INVALID_CMD:
5306       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5307       break;
5308     default:
5309       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5310       break;
5311     }
5312
5313   coding->produced_char += produced_chars;
5314   coding->produced = dst - coding->destination;
5315   return 0;
5316 }
5317
5318 \f
5319 /*** 10, 11. no-conversion handlers ***/
5320
5321 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5322
5323 static void
5324 decode_coding_raw_text (struct coding_system *coding)
5325 {
5326   bool eol_dos
5327     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5328
5329   coding->chars_at_source = 1;
5330   coding->consumed_char = coding->src_chars;
5331   coding->consumed = coding->src_bytes;
5332   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5333     {
5334       coding->consumed_char--;
5335       coding->consumed--;
5336       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5337     }
5338   else
5339     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5340 }
5341
5342 static bool
5343 encode_coding_raw_text (struct coding_system *coding)
5344 {
5345   bool multibytep = coding->dst_multibyte;
5346   int *charbuf = coding->charbuf;
5347   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5348   unsigned char *dst = coding->destination + coding->produced;
5349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5350   ptrdiff_t produced_chars = 0;
5351   int c;
5352
5353   if (multibytep)
5354     {
5355       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5356
5357       if (coding->src_multibyte)
5358         while (charbuf < charbuf_end)
5359           {
5360             ASSURE_DESTINATION (safe_room);
5361             c = *charbuf++;
5362             if (ASCII_CHAR_P (c))
5363               EMIT_ONE_ASCII_BYTE (c);
5364             else if (CHAR_BYTE8_P (c))
5365               {
5366                 c = CHAR_TO_BYTE8 (c);
5367                 EMIT_ONE_BYTE (c);
5368               }
5369             else
5370               {
5371                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5372
5373                 CHAR_STRING_ADVANCE (c, p1);
5374                 do
5375                   {
5376                     EMIT_ONE_BYTE (*p0);
5377                     p0++;
5378                   }
5379                 while (p0 < p1);
5380               }
5381           }
5382       else
5383         while (charbuf < charbuf_end)
5384           {
5385             ASSURE_DESTINATION (safe_room);
5386             c = *charbuf++;
5387             EMIT_ONE_BYTE (c);
5388           }
5389     }
5390   else
5391     {
5392       if (coding->src_multibyte)
5393         {
5394           int safe_room = MAX_MULTIBYTE_LENGTH;
5395
5396           while (charbuf < charbuf_end)
5397             {
5398               ASSURE_DESTINATION (safe_room);
5399               c = *charbuf++;
5400               if (ASCII_CHAR_P (c))
5401                 *dst++ = c;
5402               else if (CHAR_BYTE8_P (c))
5403                 *dst++ = CHAR_TO_BYTE8 (c);
5404               else
5405                 CHAR_STRING_ADVANCE (c, dst);
5406             }
5407         }
5408       else
5409         {
5410           ASSURE_DESTINATION (charbuf_end - charbuf);
5411           while (charbuf < charbuf_end && dst < dst_end)
5412             *dst++ = *charbuf++;
5413         }
5414       produced_chars = dst - (coding->destination + coding->produced);
5415     }
5416   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5417   coding->produced_char += produced_chars;
5418   coding->produced = dst - coding->destination;
5419   return 0;
5420 }
5421
5422 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5423    Return true if a text is encoded in a charset-based coding system.

        The source text and head_ascii are taken from the CODING passed
        in, but the table of valid bytes (`valids') comes from the coding
        system stored in coding_categories[coding_category_charset].

        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.
        On the `no_more_source' exit the function returns 1 and ORs FOUND
        into DETECT_INFO->found; FOUND is CATEGORY_MASK_CHARSET only if
        some accepted leading byte was >= 0x80, otherwise it is 0.  On
        every other exit CATEGORY_MASK_CHARSET is added to
        DETECT_INFO->rejected and 0 is returned.  */
5424
5425 static bool
5426 detect_coding_charset (struct coding_system *coding,
5427                        struct coding_detection_info *detect_info)
5428 {
       /* NOTE(review): src_base, src_end, multibytep and consumed_chars
          are not read by any statement visible here; presumably they are
          referenced by name inside ONE_MORE_BYTE -- confirm against the
          macro definition.  */
5429   const unsigned char *src = coding->source, *src_base;
5430   const unsigned char *src_end = coding->source + coding->src_bytes;
5431   bool multibytep = coding->src_multibyte;
5432   ptrdiff_t consumed_chars = 0;
5433   Lisp_Object attrs, valids, name;
5434   int found = 0;
5435   ptrdiff_t head_ascii = coding->head_ascii;
5436   bool check_latin_extra = 0;
5437
5438   detect_info->checked |= CATEGORY_MASK_CHARSET;
5439
       /* From here on CODING is the coding system of the charset
          category, not the caller's coding system.  */
5440   coding = &coding_categories[coding_category_charset];
5441   attrs = CODING_ID_ATTRS (coding->id);
5442   valids = AREF (attrs, coding_attr_charset_valids);
5443   name = CODING_ID_NAME (coding->id);
       /* Coding systems whose name starts with "iso-8859-" or
          "iso-latin-" get the extra test on bytes 0x80..0x9F below.  */
5444   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5445                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5446       || strncmp (SSDATA (SYMBOL_NAME (name)),
5447                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5448     check_latin_extra = 1;
5449
       /* For an ASCII compatible coding system, start after the first
          HEAD_ASCII bytes of the source.  */
5450   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5451     src += head_ascii;
5452
5453   while (1)
5454     {
5455       int c;
5456       Lisp_Object val;
5457       struct charset *charset;
5458       int dim, idx;
5459
5460       src_base = src;
5461       ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C is skipped without being
              validated; presumably ONE_MORE_BYTE yields a negative
              value for some multibyte input -- confirm.  */
5462       if (c < 0)
5463         continue;
           /* VALIDS is indexed by the leading byte.  A nil entry means
              no charset accepts C, so leave the loop and reject.  */
5464       val = AREF (valids, c);
5465       if (NILP (val))
5466         break;
5467       if (c >= 0x80)
5468         {
               /* For the Latin coding systems flagged above, a byte in
                  0x80..0x9F is accepted only if Vlatin_extra_code_table
                  is a vector with a non-nil entry for it.  */
5469           if (c < 0xA0
5470               && check_latin_extra
5471               && (!VECTORP (Vlatin_extra_code_table)
5472                   || NILP (AREF (Vlatin_extra_code_table, c))))
5473             break;
5474           found = CATEGORY_MASK_CHARSET;
5475         }
5476       if (INTEGERP (val))
5477         {
               /* VAL is a single charset ID.  Each of the remaining
                  DIM - 1 bytes must lie within the two bounds read from
                  that charset's code_space.  */
5478           charset = CHARSET_FROM_ID (XFASTINT (val));
5479           dim = CHARSET_DIMENSION (charset);
5480           for (idx = 1; idx < dim; idx++)
5481             {
5482               if (src == src_end)
5483                 goto too_short;
5484               ONE_MORE_BYTE (c);
5485               if (c < charset->code_space[(dim - 1 - idx) * 4]
5486                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5487                 break;
5488             }
               /* The inner loop stopped early: a byte was out of range.  */
5489           if (idx < dim)
5490             break;
5491         }
5492       else
5493         {
               /* VAL is a list of charset IDs; try them in order.  IDX is
                  initialized once, so it carries over from one candidate
                  to the next and bytes already consumed are not re-read.  */
5494           idx = 1;
5495           for (; CONSP (val); val = XCDR (val))
5496             {
5497               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5498               dim = CHARSET_DIMENSION (charset);
5499               while (idx < dim)
5500                 {
5501                   if (src == src_end)
5502                     goto too_short;
5503                   ONE_MORE_BYTE (c);
5504                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5505                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5506                     break;
5507                   idx++;
5508                 }
                   /* All bytes of this candidate matched.  */
5509               if (idx == dim)
5510                 {
5511                   val = Qnil;
5512                   break;
5513                 }
5514             }
               /* NOTE(review): when the loop above ends, VAL is either
                  Qnil (a candidate matched) or the non-cons tail of the
                  list (every candidate failed), so this test is never
                  true and a list with no matching charset is not
                  rejected here.  Confirm whether that is intended.  */
5515           if (CONSP (val))
5516             break;
5517         }
5518     }
       /* Rejection path: reached by `goto too_short' and by every `break'
          out of the main loop.  */
5519  too_short:
5520   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5521   return 0;
5522
       /* NOTE(review): nothing visible in this function jumps here;
          presumably ONE_MORE_BYTE does so when the source is exhausted --
          confirm against the macro definition.  */
5523  no_more_source:
5524   detect_info->found |= found;
5525   return 1;
5526 }
5527
/* Decode the source bytes of CODING, starting at offset
   CODING->consumed, using the `charset_valids' table found in the
   attributes of CODING->id.  Decoded characters are appended to
   CODING->charbuf after the first CODING->charbuf_used elements.

   Before returning, the function adds to CODING->consumed_char and
   sets CODING->consumed and CODING->charbuf_used.  Every byte that
   cannot be decoded is stored as a single character and counted in
   CODING->errors.  */
5528 static void
5529 decode_coding_charset (struct coding_system *coding)
5530 {
5531   const unsigned char *src = coding->source + coding->consumed;
5532   const unsigned char *src_end = coding->source + coding->src_bytes;
5533   const unsigned char *src_base;
5534   int *charbuf = coding->charbuf + coding->charbuf_used;
5535   /* We may produce one charset annotation in one loop and one more at
5536      the end.  */
5537   int *charbuf_end
5538     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5539   ptrdiff_t consumed_chars = 0, consumed_chars_base;
       /* NOTE(review): multibytep is not read by any statement visible
          here; presumably ONE_MORE_BYTE refers to it by name -- confirm.  */
5540   bool multibytep = coding->src_multibyte;
5541   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5542   Lisp_Object valids;
5543   ptrdiff_t char_offset = coding->produced_char;
       /* LAST_ID is the charset of the current run of decoded characters
          and LAST_OFFSET the value of CHAR_OFFSET where that run began.  */
5544   ptrdiff_t last_offset = char_offset;
5545   int last_id = charset_ascii;
5546   bool eol_dos
5547     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* When EOL_DOS, the byte following a '\r' is read ahead into
          BYTE_AFTER_CR and used as the input of the next iteration;
          -1 means no byte is pending.  */
5548   int byte_after_cr = -1;
5549
5550   valids = AREF (attrs, coding_attr_charset_valids);
5551
5552   while (1)
5553     {
5554       int c;
5555       Lisp_Object val;
5556       struct charset *charset;
5557       int dim;
5558       int len = 1;
5559       unsigned code;
5560
           /* Remember where this sequence starts so that the error path
              and the final bookkeeping can refer back to it.  */
5561       src_base = src;
5562       consumed_chars_base = consumed_chars;
5563
5564       if (charbuf >= charbuf_end)
5565         {
               /* Output is full.  A pending read-ahead byte has already
                  advanced SRC, so step SRC_BASE back over it; SRC_BASE
                  is what CODING->consumed is computed from below.  */
5566           if (byte_after_cr >= 0)
5567             src_base--;
5568           break;
5569         }
5570
5571       if (byte_after_cr >= 0)
5572         {
5573           c = byte_after_cr;
5574           byte_after_cr = -1;
5575         }
5576       else
5577         {
5578           ONE_MORE_BYTE (c);
5579           if (eol_dos && c == '\r')
5580             ONE_MORE_BYTE (byte_after_cr);
5581         }
5582       if (c < 0)
5583         goto invalid_code;
5584       code = c;
5585
           /* The entry for the leading byte must be an integer charset ID
              or a list of charset IDs; anything else is invalid.  */
5586       val = AREF (valids, c);
5587       if (! INTEGERP (val) && ! CONSP (val))
5588         goto invalid_code;
5589       if (INTEGERP (val))
5590         {
5591           charset = CHARSET_FROM_ID (XFASTINT (val));
5592           dim = CHARSET_DIMENSION (charset);
               /* Collect the remaining bytes into CODE, most significant
                  byte first.  */
5593           while (len < dim)
5594             {
5595               ONE_MORE_BYTE (c);
5596               code = (code << 8) | c;
5597               len++;
5598             }
               /* NOTE(review): C is tested for < 0 afterwards, so
                  CODING_DECODE_CHAR presumably stores the decoded
                  character (or a negative value on failure) in its last
                  argument -- confirm against the macro.  */
5599           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5600                               charset, code, c);
5601         }
5602       else
5603         {
5604           /* VAL is a list of charset IDs.  It is assured that the
5605              list is sorted by charset dimensions (smaller one
5606              comes first).  */
5607           while (CONSP (val))
5608             {
5609               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5610               dim = CHARSET_DIMENSION (charset);
                   /* LEN and CODE persist across candidates, so only the
                      bytes a larger dimension additionally needs are
                      read.  */
5611               while (len < dim)
5612                 {
5613                   ONE_MORE_BYTE (c);
5614                   code = (code << 8) | c;
5615                   len++;
5616                 }
5617               CODING_DECODE_CHAR (coding, src, src_base,
5618                                   src_end, charset, code, c);
                   /* Stop at the first charset that yields C >= 0.  */
5619               if (c >= 0)
5620                 break;
5621               val = XCDR (val);
5622             }
5623         }
5624       if (c < 0)
5625         goto invalid_code;
           /* A non-ASCII charset different from the current run starts a
              new run; the previous run is recorded first unless it was
              ASCII.  */
5626       if (charset->id != charset_ascii
5627           && last_id != charset->id)
5628         {
5629           if (last_id != charset_ascii)
5630             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5631           last_id = charset->id;
5632           last_offset = char_offset;
5633         }
5634
5635       *charbuf++ = c;
5636       char_offset++;
5637       continue;
5638
5639     invalid_code:
           /* Rewind to the start of the failed sequence, re-read just its
              first byte and store that alone: -C when C is negative, C
              itself when ASCII, BYTE8_TO_CHAR (C) otherwise.  */
5640       src = src_base;
5641       consumed_chars = consumed_chars_base;
5642       ONE_MORE_BYTE (c);
5643       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5644       char_offset++;
5645       coding->errors++;
5646     }
5647
       /* Reached by falling out of the loop when the output is full.
          NOTE(review): presumably also the jump target of ONE_MORE_BYTE
          when the source is exhausted -- confirm against the macro.  */
5648  no_more_source:
5649   if (last_id != charset_ascii)
5650     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Only input up to the start of the last, unfinished sequence is
          reported as consumed.  */
5651   coding->consumed_char += consumed_chars_base;
5652   coding->consumed = src_base - coding->source;
5653   coding->charbuf_used = charbuf - coding->charbuf;
5654 }
5655
/* Encode the CODING->charbuf_used characters held in CODING->charbuf
   into bytes stored at CODING->destination after the first
   CODING->produced bytes, using the charset list of CODING.

   On the path visible here the function records
   CODING_RESULT_SUCCESS, updates CODING->produced_char and
   CODING->produced, and returns 0.
   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   elsewhere and may leave the function early -- confirm.  */
5656 static bool
5657 encode_coding_charset (struct coding_system *coding)
5658 {
       /* NOTE(review): multibytep, dst_end and produced_chars are not
          modified by any statement visible here; presumably the EMIT_*
          and ASSURE_DESTINATION macros refer to them by name -- confirm.  */
5659   bool multibytep = coding->dst_multibyte;
5660   int *charbuf = coding->charbuf;
5661   int *charbuf_end = charbuf + coding->charbuf_used;
5662   unsigned char *dst = coding->destination + coding->produced;
5663   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each character.  */
5664   int safe_room = MAX_MULTIBYTE_LENGTH;
5665   ptrdiff_t produced_chars = 0;
5666   Lisp_Object attrs, charset_list;
5667   bool ascii_compatible;
5668   int c;
5669
5670   CODING_GET_INFO (coding, attrs, charset_list);
5671   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5672
5673   while (charbuf < charbuf_end)
5674     {
5675       struct charset *charset;
5676       unsigned code;
5677
5678       ASSURE_DESTINATION (safe_room);
5679       c = *charbuf++;
           /* ASCII characters are emitted as they are when the coding
              system is ASCII compatible.  */
5680       if (ascii_compatible && ASCII_CHAR_P (c))
5681         EMIT_ONE_ASCII_BYTE (c);
           /* A raw 8-bit byte character is emitted as that byte.  */
5682       else if (CHAR_BYTE8_P (c))
5683         {
5684           c = CHAR_TO_BYTE8 (c);
5685           EMIT_ONE_BYTE (c);
5686         }
5687       else
5688         {
               /* NOTE(review): CHARSET and CODE are read right after this
                  call, so the macro presumably sets both, leaving CHARSET
                  null when no charset in CHARSET_LIST can encode C --
                  confirm against the macro.  */
5689           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5690                                &code, charset);
5691
5692           if (charset)
5693             {
                   /* Emit CODE as 1 to 4 bytes according to the charset
                      dimension, most significant byte first.  */
5694               if (CHARSET_DIMENSION (charset) == 1)
5695                 EMIT_ONE_BYTE (code);
5696               else if (CHARSET_DIMENSION (charset) == 2)
5697                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5698               else if (CHARSET_DIMENSION (charset) == 3)
5699                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5700               else
5701                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5702                                  (code >> 8) & 0xFF, code & 0xFF);
5703             }
5704           else
5705             {
                   /* No charset: emit one substitute byte, which is
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe
                      encoding mode and CODING->default_char otherwise.  */
5706               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5707                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5708               else
5709                 c = coding->default_char;
5710               EMIT_ONE_BYTE (c);
5711             }
5712         }
5713     }
5714
5715   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5716   coding->produced_char += produced_chars;
5717   coding->produced = dst - coding->destination;
5718   return 0;
5719 }
5720
5721 \f
5722 /*** 10. C library functions ***/
5723
5724 /* Setup coding context CODING from information about CODING_SYSTEM.
5725    If CODING_SYSTEM is nil, Qundecided is used in its place.  If
5726    CODING_SYSTEM is invalid, signal an error.

        The function fills in CODING->id, mode, common_flags,
        max_charset_id, safe_charsets, default_char, carryover_bytes and
        raw_destination, then selects the detector, decoder and encoder
        and initializes the type-specific state according to the type
        attribute of the coding system.  */
5727
5728 void
5729 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5730 {
5731   Lisp_Object attrs;
5732   Lisp_Object eol_type;
5733   Lisp_Object coding_type;
5734   Lisp_Object val;
5735
5736   if (NILP (coding_system))
5737     coding_system = Qundecided;
5738
       /* NOTE(review): presumably this is where an invalid CODING_SYSTEM
          signals the error mentioned above; the macro is defined
          elsewhere -- confirm.  */
5739   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5740
5741   attrs = CODING_ID_ATTRS (coding->id);
       /* With inhibit_eol_conversion set, the EOL type is treated as
          Qunix whatever the coding system says.  */
5742   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5743
5744   coding->mode = 0;
       /* Initial flags derived from the EOL type: a vector requests
          decoding and detection, any value other than Qunix requests
          decoding and encoding, and Qunix requests nothing.
          NOTE(review): a vector EOL type presumably means the EOL format
          is not decided yet -- confirm.  */
5745   if (VECTORP (eol_type))
5746     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5747                             | CODING_REQUIRE_DETECTION_MASK);
5748   else if (! EQ (eol_type, Qunix))
5749     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5750                             | CODING_REQUIRE_ENCODING_MASK);
5751   else
5752     coding->common_flags = 0;
       /* A post-read attribute forces decoding, a pre-write attribute
          forces encoding.  */
5753   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5754     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5755   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5756     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5757   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5758     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5759
       /* The safe-charsets attribute is a string; CODING keeps a pointer
          to its data and its length minus one as the maximum charset
          ID.  */
5760   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5761   coding->max_charset_id = SCHARS (val) - 1;
5762   coding->safe_charsets = SDATA (val);
5763   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5764   coding->carryover_bytes = 0;
5765   coding->raw_destination = 0;
5766
       /* Dispatch on the type of the coding system.  */
5767   coding_type = CODING_ATTR_TYPE (attrs);
5768   if (EQ (coding_type, Qundecided))
5769     {
           /* No detector function; the raw-text decoder and encoder are
              installed and detection is requested through the flags.  */
5770       coding->detector = NULL;
5771       coding->decoder = decode_coding_raw_text;
5772       coding->encoder = encode_coding_raw_text;
5773       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5774       coding->spec.undecided.inhibit_nbd
5775         = (encode_inhibit_flag
5776            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5777       coding->spec.undecided.inhibit_ied
5778         = (encode_inhibit_flag
5779            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5780       coding->spec.undecided.prefer_utf_8
5781         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5782     }
5783   else if (EQ (coding_type, Qiso_2022))
5784     {
5785       int i;
5786       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5787
5788       /* Invoke graphic register 0 to plane 0.  */
5789       CODING_ISO_INVOCATION (coding, 0) = 0;
5790       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5791       CODING_ISO_INVOCATION (coding, 1)
5792         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5793       /* Setup the initial status of designation.  */
5794       for (i = 0; i < 4; i++)
5795         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5796       /* Not single shifting initially.  */
5797       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5798       /* Beginning of buffer should also be regarded as bol. */
5799       CODING_ISO_BOL (coding) = 1;
5800       coding->detector = detect_coding_iso_2022;
5801       coding->decoder = decode_coding_iso_2022;
5802       coding->encoder = encode_coding_iso_2022;
5803       if (flags & CODING_ISO_FLAG_SAFE)
5804         coding->mode |= CODING_MODE_SAFE_ENCODING;
5805       coding->common_flags
5806         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5807             | CODING_REQUIRE_FLUSHING_MASK);
5808       if (flags & CODING_ISO_FLAG_COMPOSITION)
5809         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5810       if (flags & CODING_ISO_FLAG_DESIGNATION)
5811         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5812       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5813         {
               /* Re-read the safe-charsets attribute after
                  setup_iso_safe_charsets and refresh the values cached
                  in CODING above.  */
5814           setup_iso_safe_charsets (attrs);
5815           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5816           coding->max_charset_id = SCHARS (val) - 1;
5817           coding->safe_charsets = SDATA (val);
5818         }
5819       CODING_ISO_FLAGS (coding) = flags;
5820       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5821       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5822       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5823       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5824     }
5825   else if (EQ (coding_type, Qcharset))
5826     {
5827       coding->detector = detect_coding_charset;
5828       coding->decoder = decode_coding_charset;
5829       coding->encoder = encode_coding_charset;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832     }
5833   else if (EQ (coding_type, Qutf_8))
5834     {
           /* BOM attribute: a cons selects utf_detect_bom, t selects
              utf_with_bom, anything else utf_without_bom.  */
5835       val = AREF (attrs, coding_attr_utf_bom);
5836       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5837                                    : EQ (val, Qt) ? utf_with_bom
5838                                    : utf_without_bom);
5839       coding->detector = detect_coding_utf_8;
5840       coding->decoder = decode_coding_utf_8;
5841       coding->encoder = encode_coding_utf_8;
5842       coding->common_flags
5843         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5844       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5845         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5846     }
5847   else if (EQ (coding_type, Qutf_16))
5848     {
           /* Same BOM mapping as for utf-8 above.  */
5849       val = AREF (attrs, coding_attr_utf_bom);
5850       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5851                                     : EQ (val, Qt) ? utf_with_bom
5852                                     : utf_without_bom);
           /* Any endian attribute other than Qbig selects little
              endian.  */
5853       val = AREF (attrs, coding_attr_utf_16_endian);
5854       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5855                                        : utf_16_little_endian);
5856       CODING_UTF_16_SURROGATE (coding) = 0;
5857       coding->detector = detect_coding_utf_16;
5858       coding->decoder = decode_coding_utf_16;
5859       coding->encoder = encode_coding_utf_16;
5860       coding->common_flags
5861         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5862       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5863         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5864     }
5865   else if (EQ (coding_type, Qccl))
5866     {
5867       coding->detector = detect_coding_ccl;
5868       coding->decoder = decode_coding_ccl;
5869       coding->encoder = encode_coding_ccl;
5870       coding->common_flags
5871         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5872             | CODING_REQUIRE_FLUSHING_MASK);
5873     }
5874   else if (EQ (coding_type, Qemacs_mule))
5875     {
5876       coding->detector = detect_coding_emacs_mule;
5877       coding->decoder = decode_coding_emacs_mule;
5878       coding->encoder = encode_coding_emacs_mule;
5879       coding->common_flags
5880         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5881       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5882           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5883         {
               /* Build a fresh safe-charsets string covering every ID in
                  Vemacs_mule_charset_list: all entries start as 255 and
                  the entry of each listed charset is set to 0.  */
5884           Lisp_Object tail, safe_charsets;
5885           int max_charset_id = 0;
5886
5887           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5888                tail = XCDR (tail))
5889             if (max_charset_id < XFASTINT (XCAR (tail)))
5890               max_charset_id = XFASTINT (XCAR (tail));
5891           safe_charsets = make_uninit_string (max_charset_id + 1);
5892           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5893           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5894                tail = XCDR (tail))
5895             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5896           coding->max_charset_id = max_charset_id;
               /* NOTE(review): after this block only CODING holds a
                  pointer into the new string's data; verify that the
                  string stays alive for as long as CODING is used.  */
5897           coding->safe_charsets = SDATA (safe_charsets);
5898         }
5899       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5900       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5901     }
5902   else if (EQ (coding_type, Qshift_jis))
5903     {
5904       coding->detector = detect_coding_sjis;
5905       coding->decoder = decode_coding_sjis;
5906       coding->encoder = encode_coding_sjis;
5907       coding->common_flags
5908         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5909     }
5910   else if (EQ (coding_type, Qbig5))
5911     {
5912       coding->detector = detect_coding_big5;
5913       coding->decoder = decode_coding_big5;
5914       coding->encoder = encode_coding_big5;
5915       coding->common_flags
5916         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5917     }
5918   else                          /* EQ (coding_type, Qraw_text) */
5919     {
5920       coding->detector = NULL;
5921       coding->decoder = decode_coding_raw_text;
5922       coding->encoder = encode_coding_raw_text;
           /* Conversion is requested only for a non-unix EOL type:
              decoding always, encoding only when the EOL type is not a
              vector.  */
5923       if (! EQ (eol_type, Qunix))
5924         {
5925           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5926           if (! VECTORP (eol_type))
5927             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5928         }
5929
5930     }
5931
5932   return;
5933 }
5934
5935 /* Return a list of charsets supported by CODING.  */
5936
5937 Lisp_Object
5938 coding_charset_list (struct coding_system *coding)
5939 {
5940   Lisp_Object attrs, charset_list;
5941
5942   CODING_GET_INFO (coding, attrs, charset_list);
5943   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5944     {
5945       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5946
5947       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5948         charset_list = Viso_2022_charset_list;
5949     }
5950   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5951     {
5952       charset_list = Vemacs_mule_charset_list;
5953     }
5954   return charset_list;
5955 }
5956
5957
5958 /* Return a list of charsets supported by CODING-SYSTEM.  */
5959
5960 Lisp_Object
5961 coding_system_charset_list (Lisp_Object coding_system)
5962 {
5963   ptrdiff_t id;
5964   Lisp_Object attrs, charset_list;
5965
5966   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5967   attrs = CODING_ID_ATTRS (id);
5968
5969   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5970     {
5971       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5972
5973       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5974         charset_list = Viso_2022_charset_list;
5975       else
5976         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5977     }
5978   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5979     {
5980       charset_list = Vemacs_mule_charset_list;
5981     }
5982   else
5983     {
5984       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5985     }
5986   return charset_list;
5987 }
5988
5989
5990 /* Return raw-text or one of its subsidiaries that has the same
5991    eol_type as CODING-SYSTEM.  */
5992
5993 Lisp_Object
5994 raw_text_coding_system (Lisp_Object coding_system)
5995 {
5996   Lisp_Object spec, attrs;
5997   Lisp_Object eol_type, raw_text_eol_type;
5998
5999   if (NILP (coding_system))
6000     return Qraw_text;
6001   spec = CODING_SYSTEM_SPEC (coding_system);
6002   attrs = AREF (spec, 0);
6003
6004   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6005     return coding_system;
6006
6007   eol_type = AREF (spec, 2);
6008   if (VECTORP (eol_type))
6009     return Qraw_text;
6010   spec = CODING_SYSTEM_SPEC (Qraw_text);
6011   raw_text_eol_type = AREF (spec, 2);
6012   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6013           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6014           : AREF (raw_text_eol_type, 2));
6015 }
6016
6017
6018 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6019    the subsidiary that has the same eol-spec as PARENT (if it is not
6020    nil and specifies end-of-line format) or the system's setting
6021    (system_eol_type).  */
6022
6023 Lisp_Object
6024 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6025 {
6026   Lisp_Object spec, eol_type;
6027
6028   if (NILP (coding_system))
6029     coding_system = Qraw_text;
6030   spec = CODING_SYSTEM_SPEC (coding_system);
6031   eol_type = AREF (spec, 2);
6032   if (VECTORP (eol_type))
6033     {
6034       Lisp_Object parent_eol_type;
6035
6036       if (! NILP (parent))
6037         {
6038           Lisp_Object parent_spec;
6039
6040           parent_spec = CODING_SYSTEM_SPEC (parent);
6041           parent_eol_type = AREF (parent_spec, 2);
6042           if (VECTORP (parent_eol_type))
6043             parent_eol_type = system_eol_type;
6044         }
6045       else
6046         parent_eol_type = system_eol_type;
6047       if (EQ (parent_eol_type, Qunix))
6048         coding_system = AREF (eol_type, 0);
6049       else if (EQ (parent_eol_type, Qdos))
6050         coding_system = AREF (eol_type, 1);
6051       else if (EQ (parent_eol_type, Qmac))
6052         coding_system = AREF (eol_type, 2);
6053     }
6054   return coding_system;
6055 }
6056
6057
6058 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6059    decided for writing to a process.  If not, complement them, and
6060    return a new coding system.  */
6061
6062 Lisp_Object
6063 complement_process_encoding_system (Lisp_Object coding_system)
6064 {
6065   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6066   Lisp_Object spec, attrs;
6067   int i;
6068
6069   for (i = 0; i < 3; i++)
6070     {
6071       if (i == 1)
6072         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6073       else if (i == 2)
6074         coding_system = preferred_coding_system ();
6075       spec = CODING_SYSTEM_SPEC (coding_system);
6076       if (NILP (spec))
6077         continue;
6078       attrs = AREF (spec, 0);
6079       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6080         coding_base = CODING_ATTR_BASE_NAME (attrs);
6081       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6082         eol_base = coding_system;
6083       if (! NILP (coding_base) && ! NILP (eol_base))
6084         break;
6085     }
6086
6087   if (i > 0)
6088     /* The original CODING_SYSTEM didn't specify text-conversion or
6089        eol-conversion.  Be sure that we return a fully complemented
6090        coding system.  */
6091     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6092   return coding_system;
6093 }
6094
6095
6096 /* Emacs has a mechanism to automatically detect a coding system if it
6097    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6098    it's impossible to distinguish some coding systems accurately
6099    because they use the same range of codes.  So, at first, coding
6100    systems are grouped into the following categories:
6101
6102    o coding-category-emacs-mule
6103
6104         The category for a coding system which has the same code range
6105         as Emacs' internal format.  Assigned the coding-system (Lisp
6106         symbol) `emacs-mule' by default.
6107
6108    o coding-category-sjis
6109
6110         The category for a coding system which has the same code range
6111         as SJIS.  Assigned the coding-system (Lisp
6112         symbol) `japanese-shift-jis' by default.
6113
6114    o coding-category-iso-7
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 7-bit environment.  This doesn't use any locking
6118         shift and single shift functions.  This can encode/decode all
6119         charsets.  Assigned the coding-system (Lisp symbol)
6120         `iso-2022-7bit' by default.
6121
6122    o coding-category-iso-7-tight
6123
6124         Same as coding-category-iso-7 except that this can
6125         encode/decode only the specified charsets.
6126
6127    o coding-category-iso-8-1
6128
6129         The category for a coding system which has the same code range
6130         as ISO2022 of 8-bit environment and graphic plane 1 used only
6131         for DIMENSION1 charset.  This doesn't use any locking shift
6132         and single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-latin-1' by default.
6134
6135    o coding-category-iso-8-2
6136
6137         The category for a coding system which has the same code range
6138         as ISO2022 of 8-bit environment and graphic plane 1 used only
6139         for DIMENSION2 charset.  This doesn't use any locking shift
6140         and single shift functions.  Assigned the coding-system (Lisp
6141         symbol) `japanese-iso-8bit' by default.
6142
6143    o coding-category-iso-7-else
6144
6145         The category for a coding system which has the same code range
6146         as ISO2022 of 7-bit environment but uses locking shift or
6147         single shift functions.  Assigned the coding-system (Lisp
6148         symbol) `iso-2022-7bit-lock' by default.
6149
6150    o coding-category-iso-8-else
6151
6152         The category for a coding system which has the same code range
6153         as ISO2022 of 8-bit environment but uses locking shift or
6154         single shift functions.  Assigned the coding-system (Lisp
6155         symbol) `iso-2022-8bit-ss2' by default.
6156
6157    o coding-category-big5
6158
6159         The category for a coding system which has the same code range
6160         as BIG5.  Assigned the coding-system (Lisp symbol)
6161         `cn-big5' by default.
6162
6163    o coding-category-utf-8
6164
6165         The category for a coding system which has the same code range
6166         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6167         symbol) `utf-8' by default.
6168
6169    o coding-category-utf-16-be
6170
6171         The category for a coding system in which a text has a
6172         Unicode signature (cf. Unicode Standard) in the order of BIG
6173         endian at the head.  Assigned the coding-system (Lisp symbol)
6174         `utf-16-be' by default.
6175
6176    o coding-category-utf-16-le
6177
6178         The category for a coding system in which a text has a
6179         Unicode signature (cf. Unicode Standard) in the order of
6180         LITTLE endian at the head.  Assigned the coding-system (Lisp
6181         symbol) `utf-16-le' by default.
6182
6183    o coding-category-ccl
6184
6185         The category for a coding system of which encoder/decoder is
6186         written in CCL programs.  The default value is nil, i.e., no
6187         coding system is assigned.
6188
6189    o coding-category-binary
6190
6191         The category for a coding system not categorized in any of the
6192         above.  Assigned the coding-system (Lisp symbol)
6193         `no-conversion' by default.
6194
6195    Each of them is a Lisp symbol whose value is an actual
6196    `coding-system' (also a Lisp symbol) assigned by a user.
6197    What Emacs actually does is detect the category of a coding
6198    system and then use the `coding-system' assigned to that category.
6199    If Emacs can't narrow the candidates down to a single category, it
6200    selects the category of the highest priority.  Priorities of
6201    categories are also specified by a user in a Lisp variable `coding-category-list'.
6202
6203 */
6204
/* Forward declaration of a static function; its definition is
   elsewhere in this file.  */
6205 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6206                                            int eol_seen);
6207
6208
6209 /* Return the number of ASCII characters at the head of the source.
6210    By side effects, set coding->head_ascii and update
6211    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6212    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6213    reliable only when all the source bytes are ASCII.  */
6214
6215 static ptrdiff_t
6216 check_ascii (struct coding_system *coding)
6217 {
6218   const unsigned char *src, *end;
6219   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6220   int eol_seen = coding->eol_seen;
6221
6222   coding_set_source (coding);
6223   src = coding->source;
6224   end = src + coding->src_bytes;
6225
6226   if (inhibit_eol_conversion
6227       || SYMBOLP (eol_type))
6228     {
6229       /* We don't have to check EOL format.  */
6230       while (src < end && !( *src & 0x80))
6231         {
6232           if (*src++ == '\n')
6233             eol_seen |= EOL_SEEN_LF;
6234         }
6235     }
6236   else
6237     {
6238       end--;                /* We look ahead one byte for "CR LF".  */
6239       while (src < end)
6240         {
6241           int c = *src;
6242
6243           if (c & 0x80)
6244             break;
6245           src++;
6246           if (c == '\r')
6247             {
6248               if (*src == '\n')
6249                 {
6250                   eol_seen |= EOL_SEEN_CRLF;
6251                   src++;
6252                 }
6253               else
6254                 eol_seen |= EOL_SEEN_CR;
6255             }
6256           else if (c == '\n')
6257             eol_seen |= EOL_SEEN_LF;
6258         }
6259       if (src == end)
6260         {
6261           int c = *src;
6262
6263           /* All bytes but the last one C are ASCII.  */
6264           if (! (c & 0x80))
6265             {
6266               if (c == '\r')
6267                 eol_seen |= EOL_SEEN_CR;
6268               else if (c  == '\n')
6269                 eol_seen |= EOL_SEEN_LF;
6270               src++;
6271             }
6272         }
6273     }
6274   coding->head_ascii = src - coding->source;
6275   coding->eol_seen = eol_seen;
6276   return (coding->head_ascii);
6277 }
6278
6279
6280 /* Return the number of characters at the source if all the bytes are
6281    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6282    effects, update coding->eol_seen.  The value of coding->eol_seen is
6283    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6284    the value is reliable only when all the source bytes are valid
6285    UTF-8.  */
6286
6287 static ptrdiff_t
6288 check_utf_8 (struct coding_system *coding)
6289 {
6290   const unsigned char *src, *end;
6291   int eol_seen;
6292   ptrdiff_t nchars = coding->head_ascii;
6293
6294   if (coding->head_ascii < 0)
6295     check_ascii (coding);
6296   else
6297     coding_set_source (coding);
6298   src = coding->source + coding->head_ascii;
6299   /* We look ahead one byte for CR LF.  */
6300   end = coding->source + coding->src_bytes - 1;
6301   eol_seen = coding->eol_seen;
6302   while (src < end)
6303     {
6304       int c = *src;
6305
6306       if (UTF_8_1_OCTET_P (*src))
6307         {
6308           src++;
6309           if (c < 0x20)
6310             {
6311               if (c == '\r')
6312                 {
6313                   if (*src == '\n')
6314                     {
6315                       eol_seen |= EOL_SEEN_CRLF;
6316                       src++;
6317                       nchars++;
6318                     }
6319                   else
6320                     eol_seen |= EOL_SEEN_CR;
6321                 }
6322               else if (c == '\n')
6323                 eol_seen |= EOL_SEEN_LF;
6324             }
6325         }
6326       else if (UTF_8_2_OCTET_LEADING_P (c))
6327         {
6328           if (c < 0xC2          /* overlong sequence */
6329               || src + 1 >= end
6330               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6331             return -1;
6332           src += 2;
6333         }
6334       else if (UTF_8_3_OCTET_LEADING_P (c))
6335         {
6336           if (src + 2 >= end
6337               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6338                     && UTF_8_EXTRA_OCTET_P (src[2])))
6339             return -1;
6340           c = (((c & 0xF) << 12)
6341                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6342           if (c < 0x800                       /* overlong sequence */
6343               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6344             return -1;
6345           src += 3;
6346         }
6347       else if (UTF_8_4_OCTET_LEADING_P (c))
6348         {
6349           if (src + 3 >= end
6350               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6351                     && UTF_8_EXTRA_OCTET_P (src[2])
6352                     && UTF_8_EXTRA_OCTET_P (src[3])))
6353             return -1;
6354           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6355                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6356           if (c < 0x10000       /* overlong sequence */
6357               || c >= 0x110000) /* non-Unicode character  */
6358             return -1;
6359           src += 4;
6360         }
6361       else
6362         return -1;
6363       nchars++;
6364     }
6365
6366   if (src == end)
6367     {
6368       if (! UTF_8_1_OCTET_P (*src))
6369         return -1;
6370       nchars++;
6371       if (*src == '\r')
6372         eol_seen |= EOL_SEEN_CR;
6373       else if (*src  == '\n')
6374         eol_seen |= EOL_SEEN_LF;
6375     }
6376   coding->eol_seen = eol_seen;
6377   return nchars;
6378 }
6379
6380
6381 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6382    SOURCE is encoded.  If CATEGORY is one of
6383    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6384    two-byte, else they are encoded by one-byte.
6385
6386    Return one of EOL_SEEN_XXX.  */
6387
6388 #define MAX_EOL_CHECK_COUNT 3
6389
6390 static int
6391 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6392             enum coding_category category)
6393 {
6394   const unsigned char *src = source, *src_end = src + src_bytes;
6395   unsigned char c;
6396   int total  = 0;
6397   int eol_seen = EOL_SEEN_NONE;
6398
6399   if ((1 << category) & CATEGORY_MASK_UTF_16)
6400     {
6401       bool msb = category == (coding_category_utf_16_le
6402                               | coding_category_utf_16_le_nosig);
6403       bool lsb = !msb;
6404
6405       while (src + 1 < src_end)
6406         {
6407           c = src[lsb];
6408           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6409             {
6410               int this_eol;
6411
6412               if (c == '\n')
6413                 this_eol = EOL_SEEN_LF;
6414               else if (src + 3 >= src_end
6415                        || src[msb + 2] != 0
6416                        || src[lsb + 2] != '\n')
6417                 this_eol = EOL_SEEN_CR;
6418               else
6419                 {
6420                   this_eol = EOL_SEEN_CRLF;
6421                   src += 2;
6422                 }
6423
6424               if (eol_seen == EOL_SEEN_NONE)
6425                 /* This is the first end-of-line.  */
6426                 eol_seen = this_eol;
6427               else if (eol_seen != this_eol)
6428                 {
6429                   /* The found type is different from what found before.
6430                      Allow for stray ^M characters in DOS EOL files.  */
6431                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6432                       || (eol_seen == EOL_SEEN_CRLF
6433                           && this_eol == EOL_SEEN_CR))
6434                     eol_seen = EOL_SEEN_CRLF;
6435                   else
6436                     {
6437                       eol_seen = EOL_SEEN_LF;
6438                       break;
6439                     }
6440                 }
6441               if (++total == MAX_EOL_CHECK_COUNT)
6442                 break;
6443             }
6444           src += 2;
6445         }
6446     }
6447   else
6448     while (src < src_end)
6449       {
6450         c = *src++;
6451         if (c == '\n' || c == '\r')
6452           {
6453             int this_eol;
6454
6455             if (c == '\n')
6456               this_eol = EOL_SEEN_LF;
6457             else if (src >= src_end || *src != '\n')
6458               this_eol = EOL_SEEN_CR;
6459             else
6460               this_eol = EOL_SEEN_CRLF, src++;
6461
6462             if (eol_seen == EOL_SEEN_NONE)
6463               /* This is the first end-of-line.  */
6464               eol_seen = this_eol;
6465             else if (eol_seen != this_eol)
6466               {
6467                 /* The found type is different from what found before.
6468                    Allow for stray ^M characters in DOS EOL files.  */
6469                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6470                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6471                   eol_seen = EOL_SEEN_CRLF;
6472                 else
6473                   {
6474                     eol_seen = EOL_SEEN_LF;
6475                     break;
6476                   }
6477               }
6478             if (++total == MAX_EOL_CHECK_COUNT)
6479               break;
6480           }
6481       }
6482   return eol_seen;
6483 }
6484
6485
6486 static Lisp_Object
6487 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6488 {
6489   Lisp_Object eol_type;
6490
6491   eol_type = CODING_ID_EOL_TYPE (coding->id);
6492   if (! VECTORP (eol_type))
6493     /* Already adjusted.  */
6494     return eol_type;
6495   if (eol_seen & EOL_SEEN_LF)
6496     {
6497       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6498       eol_type = Qunix;
6499     }
6500   else if (eol_seen & EOL_SEEN_CRLF)
6501     {
6502       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6503       eol_type = Qdos;
6504     }
6505   else if (eol_seen & EOL_SEEN_CR)
6506     {
6507       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6508       eol_type = Qmac;
6509     }
6510   return eol_type;
6511 }
6512
6513 /* Detect how a text specified in CODING is encoded.  If a coding
6514    system is detected, update fields of CODING by the detected coding
6515    system.  */
6516
6517 static void
6518 detect_coding (struct coding_system *coding)
6519 {
6520   const unsigned char *src, *src_end;
6521   unsigned int saved_mode = coding->mode;
6522   Lisp_Object found = Qnil;
6523   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6524
6525   coding->consumed = coding->consumed_char = 0;
6526   coding->produced = coding->produced_char = 0;
6527   coding_set_source (coding);
6528
6529   src_end = coding->source + coding->src_bytes;
6530
6531   coding->eol_seen = EOL_SEEN_NONE;
6532   /* If we have not yet decided the text encoding type, detect it
6533      now.  */
6534   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6535     {
6536       int c, i;
6537       struct coding_detection_info detect_info;
6538       bool null_byte_found = 0, eight_bit_found = 0;
6539       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6540                                        inhibit_null_byte_detection);
6541       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6542                                        inhibit_iso_escape_detection);
6543       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6544
6545       coding->head_ascii = 0;
6546       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6547       for (src = coding->source; src < src_end; src++)
6548         {
6549           c = *src;
6550           if (c & 0x80)
6551             {
6552               eight_bit_found = 1;
6553               if (null_byte_found)
6554                 break;
6555             }
6556           else if (c < 0x20)
6557             {
6558               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6559                   && ! inhibit_ied
6560                   && ! detect_info.checked)
6561                 {
6562                   if (detect_coding_iso_2022 (coding, &detect_info))
6563                     {
6564                       /* We have scanned the whole data.  */
6565                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6566                         {
6567                           /* We didn't find an 8-bit code.  We may
6568                              have found a null-byte, but it's very
6569                              rare that a binary file conforms to
6570                              ISO-2022.  */
6571                           src = src_end;
6572                           coding->head_ascii = src - coding->source;
6573                         }
6574                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6575                       break;
6576                     }
6577                 }
6578               else if (! c && !inhibit_nbd)
6579                 {
6580                   null_byte_found = 1;
6581                   if (eight_bit_found)
6582                     break;
6583                 }
6584               else if (! disable_ascii_optimization
6585                        && ! inhibit_eol_conversion)
6586                 {
6587                   if (c == '\r')
6588                     {
6589                       if (src < src_end && src[1] == '\n')
6590                         {
6591                           coding->eol_seen |= EOL_SEEN_CRLF;
6592                           src++;
6593                           if (! eight_bit_found)
6594                             coding->head_ascii++;
6595                         }
6596                       else
6597                         coding->eol_seen |= EOL_SEEN_CR;
6598                     }
6599                   else if (c == '\n')
6600                     {
6601                       coding->eol_seen |= EOL_SEEN_LF;
6602                     }
6603                 }
6604
6605               if (! eight_bit_found)
6606                 coding->head_ascii++;
6607             }
6608           else if (! eight_bit_found)
6609             coding->head_ascii++;
6610         }
6611
6612       if (null_byte_found || eight_bit_found
6613           || coding->head_ascii < coding->src_bytes
6614           || detect_info.found)
6615         {
6616           enum coding_category category;
6617           struct coding_system *this;
6618
6619           if (coding->head_ascii == coding->src_bytes)
6620             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6621             for (i = 0; i < coding_category_raw_text; i++)
6622               {
6623                 category = coding_priorities[i];
6624                 this = coding_categories + category;
6625                 if (detect_info.found & (1 << category))
6626                   break;
6627               }
6628           else
6629             {
6630               if (null_byte_found)
6631                 {
6632                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6633                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6634                 }
6635               else if (prefer_utf_8
6636                        && detect_coding_utf_8 (coding, &detect_info))
6637                 {
6638                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6639                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6640                 }
6641               for (i = 0; i < coding_category_raw_text; i++)
6642                 {
6643                   category = coding_priorities[i];
6644                   this = coding_categories + category;
6645                   /* Some of this->detector (e.g. detect_coding_sjis)
6646                      require this information.  */
6647                   coding->id = this->id;
6648                   if (this->id < 0)
6649                     {
6650                       /* No coding system of this category is defined.  */
6651                       detect_info.rejected |= (1 << category);
6652                     }
6653                   else if (category >= coding_category_raw_text)
6654                     continue;
6655                   else if (detect_info.checked & (1 << category))
6656                     {
6657                       if (detect_info.found & (1 << category))
6658                         break;
6659                     }
6660                   else if ((*(this->detector)) (coding, &detect_info)
6661                            && detect_info.found & (1 << category))
6662                     break;
6663                 }
6664             }
6665
6666           if (i < coding_category_raw_text)
6667             {
6668               if (category == coding_category_utf_8_auto)
6669                 {
6670                   Lisp_Object coding_systems;
6671
6672                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6673                                          coding_attr_utf_bom);
6674                   if (CONSP (coding_systems))
6675                     {
6676                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6677                         found = XCAR (coding_systems);
6678                       else
6679                         found = XCDR (coding_systems);
6680                     }
6681                   else
6682                     found = CODING_ID_NAME (this->id);
6683                 }
6684               else if (category == coding_category_utf_16_auto)
6685                 {
6686                   Lisp_Object coding_systems;
6687
6688                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6689                                          coding_attr_utf_bom);
6690                   if (CONSP (coding_systems))
6691                     {
6692                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6693                         found = XCAR (coding_systems);
6694                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6695                         found = XCDR (coding_systems);
6696                     }
6697                   else
6698                     found = CODING_ID_NAME (this->id);
6699                 }
6700               else
6701                 found = CODING_ID_NAME (this->id);
6702             }
6703           else if (null_byte_found)
6704             found = Qno_conversion;
6705           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6706                    == CATEGORY_MASK_ANY)
6707             found = Qraw_text;
6708           else if (detect_info.rejected)
6709             for (i = 0; i < coding_category_raw_text; i++)
6710               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6711                 {
6712                   this = coding_categories + coding_priorities[i];
6713                   found = CODING_ID_NAME (this->id);
6714                   break;
6715                 }
6716         }
6717     }
6718   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6719            == coding_category_utf_8_auto)
6720     {
6721       Lisp_Object coding_systems;
6722       struct coding_detection_info detect_info;
6723
6724       coding_systems
6725         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6726       detect_info.found = detect_info.rejected = 0;
6727       if (check_ascii (coding) == coding->src_bytes)
6728         {
6729           if (CONSP (coding_systems))
6730             found = XCDR (coding_systems);
6731         }
6732       else
6733         {
6734           if (CONSP (coding_systems)
6735               && detect_coding_utf_8 (coding, &detect_info))
6736             {
6737               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6738                 found = XCAR (coding_systems);
6739               else
6740                 found = XCDR (coding_systems);
6741             }
6742         }
6743     }
6744   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6745            == coding_category_utf_16_auto)
6746     {
6747       Lisp_Object coding_systems;
6748       struct coding_detection_info detect_info;
6749
6750       coding_systems
6751         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6752       detect_info.found = detect_info.rejected = 0;
6753       coding->head_ascii = 0;
6754       if (CONSP (coding_systems)
6755           && detect_coding_utf_16 (coding, &detect_info))
6756         {
6757           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6758             found = XCAR (coding_systems);
6759           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6760             found = XCDR (coding_systems);
6761         }
6762     }
6763
6764   if (! NILP (found))
6765     {
6766       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6767                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6768                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6769                            : EOL_SEEN_LF);
6770
6771       setup_coding_system (found, coding);
6772       if (specified_eol != EOL_SEEN_NONE)
6773         adjust_coding_eol_type (coding, specified_eol);
6774     }
6775
6776   coding->mode = saved_mode;
6777 }
6778
6779
6780 static void
6781 decode_eol (struct coding_system *coding)
6782 {
6783   Lisp_Object eol_type;
6784   unsigned char *p, *pbeg, *pend;
6785
6786   eol_type = CODING_ID_EOL_TYPE (coding->id);
6787   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6788     return;
6789
6790   if (NILP (coding->dst_object))
6791     pbeg = coding->destination;
6792   else
6793     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6794   pend = pbeg + coding->produced;
6795
6796   if (VECTORP (eol_type))
6797     {
6798       int eol_seen = EOL_SEEN_NONE;
6799
6800       for (p = pbeg; p < pend; p++)
6801         {
6802           if (*p == '\n')
6803             eol_seen |= EOL_SEEN_LF;
6804           else if (*p == '\r')
6805             {
6806               if (p + 1 < pend && *(p + 1) == '\n')
6807                 {
6808                   eol_seen |= EOL_SEEN_CRLF;
6809                   p++;
6810                 }
6811               else
6812                 eol_seen |= EOL_SEEN_CR;
6813             }
6814         }
6815       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6816       if ((eol_seen & EOL_SEEN_CRLF) != 0
6817           && (eol_seen & EOL_SEEN_CR) != 0
6818           && (eol_seen & EOL_SEEN_LF) == 0)
6819         eol_seen = EOL_SEEN_CRLF;
6820       else if (eol_seen != EOL_SEEN_NONE
6821           && eol_seen != EOL_SEEN_LF
6822           && eol_seen != EOL_SEEN_CRLF
6823           && eol_seen != EOL_SEEN_CR)
6824         eol_seen = EOL_SEEN_LF;
6825       if (eol_seen != EOL_SEEN_NONE)
6826         eol_type = adjust_coding_eol_type (coding, eol_seen);
6827     }
6828
6829   if (EQ (eol_type, Qmac))
6830     {
6831       for (p = pbeg; p < pend; p++)
6832         if (*p == '\r')
6833           *p = '\n';
6834     }
6835   else if (EQ (eol_type, Qdos))
6836     {
6837       ptrdiff_t n = 0;
6838
6839       if (NILP (coding->dst_object))
6840         {
6841           /* Start deleting '\r' from the tail to minimize the memory
6842              movement.  */
6843           for (p = pend - 2; p >= pbeg; p--)
6844             if (*p == '\r')
6845               {
6846                 memmove (p, p + 1, pend-- - p - 1);
6847                 n++;
6848               }
6849         }
6850       else
6851         {
6852           ptrdiff_t pos_byte = coding->dst_pos_byte;
6853           ptrdiff_t pos = coding->dst_pos;
6854           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6855
6856           while (pos < pos_end)
6857             {
6858               p = BYTE_POS_ADDR (pos_byte);
6859               if (*p == '\r' && p[1] == '\n')
6860                 {
6861                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6862                   n++;
6863                   pos_end--;
6864                 }
6865               pos++;
6866               if (coding->dst_multibyte)
6867                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6868               else
6869                 pos_byte++;
6870             }
6871         }
6872       coding->produced -= n;
6873       coding->produced_char -= n;
6874     }
6875 }
6876
6877
6878 /* Return a translation table (or list of them) from coding system
6879    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6880    not ENCODEP). */
6881
6882 static Lisp_Object
6883 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6884 {
6885   Lisp_Object standard, translation_table;
6886   Lisp_Object val;
6887
6888   if (NILP (Venable_character_translation))
6889     {
6890       if (max_lookup)
6891         *max_lookup = 0;
6892       return Qnil;
6893     }
6894   if (encodep)
6895     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6896       standard = Vstandard_translation_table_for_encode;
6897   else
6898     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6899       standard = Vstandard_translation_table_for_decode;
6900   if (NILP (translation_table))
6901     translation_table = standard;
6902   else
6903     {
6904       if (SYMBOLP (translation_table))
6905         translation_table = Fget (translation_table, Qtranslation_table);
6906       else if (CONSP (translation_table))
6907         {
6908           translation_table = Fcopy_sequence (translation_table);
6909           for (val = translation_table; CONSP (val); val = XCDR (val))
6910             if (SYMBOLP (XCAR (val)))
6911               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6912         }
6913       if (CHAR_TABLE_P (standard))
6914         {
6915           if (CONSP (translation_table))
6916             translation_table = nconc2 (translation_table, list1 (standard));
6917           else
6918             translation_table = list2 (translation_table, standard);
6919         }
6920     }
6921
6922   if (max_lookup)
6923     {
6924       *max_lookup = 1;
6925       if (CHAR_TABLE_P (translation_table)
6926           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6927         {
6928           val = XCHAR_TABLE (translation_table)->extras[1];
6929           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6930             *max_lookup = XFASTINT (val);
6931         }
6932       else if (CONSP (translation_table))
6933         {
6934           Lisp_Object tail;
6935
6936           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6937             if (CHAR_TABLE_P (XCAR (tail))
6938                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6939               {
6940                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6941                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6942                   *max_lookup = XFASTINT (tailval);
6943               }
6944         }
6945     }
6946   return translation_table;
6947 }
6948
/* Look up character C in TABLE, which is a char-table or a list of
   them, and set TRANS to the result.

   When an entry is a character, C is replaced by it and TRANS is
   reset to Qnil; in a list, the lookup then goes on with the next
   table using the new C.  When an entry is any other non-nil value
   found in a list, TRANS keeps it and the search stops.  C and TRANS
   must be lvalues, and the arguments are evaluated more than
   once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          {                                                     \
            if (! CHAR_TABLE_P (XCAR (tail)))                   \
              continue;                                         \
            trans = CHAR_TABLE_REF (XCAR (tail), c);            \
            if (CHARACTERP (trans))                             \
              {                                                 \
                c = XFASTINT (trans);                           \
                trans = Qnil;                                   \
              }                                                 \
            else if (! NILP (trans))                            \
              break;                                            \
          }                                                     \
      }                                                         \
  } while (0)
6973
6974
6975 /* Return a translation of character(s) at BUF according to TRANS.
6976    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6977    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6978    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6979    translation is found, and Qnil if not found..
6980    If BUF is too short to lookup characters in FROM, return Qt.  */
6981
6982 static Lisp_Object
6983 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6984 {
6985
6986   if (INTEGERP (trans))
6987     return trans;
6988   for (; CONSP (trans); trans = XCDR (trans))
6989     {
6990       Lisp_Object val = XCAR (trans);
6991       Lisp_Object from = XCAR (val);
6992       ptrdiff_t len = ASIZE (from);
6993       ptrdiff_t i;
6994
6995       for (i = 0; i < len; i++)
6996         {
6997           if (buf + i == buf_end)
6998             return Qt;
6999           if (XINT (AREF (from, i)) != buf[i])
7000             break;
7001         }
7002       if (i == len)
7003         return val;
7004     }
7005   return Qnil;
7006 }
7007
7008
7009 static int
7010 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7011                bool last_block)
7012 {
7013   unsigned char *dst = coding->destination + coding->produced;
7014   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7015   ptrdiff_t produced;
7016   ptrdiff_t produced_chars = 0;
7017   int carryover = 0;
7018
7019   if (! coding->chars_at_source)
7020     {
7021       /* Source characters are in coding->charbuf.  */
7022       int *buf = coding->charbuf;
7023       int *buf_end = buf + coding->charbuf_used;
7024
7025       if (EQ (coding->src_object, coding->dst_object))
7026         {
7027           coding_set_source (coding);
7028           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7029         }
7030
7031       while (buf < buf_end)
7032         {
7033           int c = *buf;
7034           ptrdiff_t i;
7035
7036           if (c >= 0)
7037             {
7038               ptrdiff_t from_nchars = 1, to_nchars = 1;
7039               Lisp_Object trans = Qnil;
7040
7041               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7042               if (! NILP (trans))
7043                 {
7044                   trans = get_translation (trans, buf, buf_end);
7045                   if (INTEGERP (trans))
7046                     c = XINT (trans);
7047                   else if (CONSP (trans))
7048                     {
7049                       from_nchars = ASIZE (XCAR (trans));
7050                       trans = XCDR (trans);
7051                       if (INTEGERP (trans))
7052                         c = XINT (trans);
7053                       else
7054                         {
7055                           to_nchars = ASIZE (trans);
7056                           c = XINT (AREF (trans, 0));
7057                         }
7058                     }
7059                   else if (EQ (trans, Qt) && ! last_block)
7060                     break;
7061                 }
7062
7063               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7064                 {
7065                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7066                        / MAX_MULTIBYTE_LENGTH)
7067                       < to_nchars)
7068                     memory_full (SIZE_MAX);
7069                   dst = alloc_destination (coding,
7070                                            buf_end - buf
7071                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7072                                            dst);
7073                   if (EQ (coding->src_object, coding->dst_object))
7074                     {
7075                       coding_set_source (coding);
7076                       dst_end = (((unsigned char *) coding->source)
7077                                  + coding->consumed);
7078                     }
7079                   else
7080                     dst_end = coding->destination + coding->dst_bytes;
7081                 }
7082
7083               for (i = 0; i < to_nchars; i++)
7084                 {
7085                   if (i > 0)
7086                     c = XINT (AREF (trans, i));
7087                   if (coding->dst_multibyte
7088                       || ! CHAR_BYTE8_P (c))
7089                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7090                   else
7091                     *dst++ = CHAR_TO_BYTE8 (c);
7092                 }
7093               produced_chars += to_nchars;
7094               buf += from_nchars;
7095             }
7096           else
7097             /* This is an annotation datum.  (-C) is the length.  */
7098             buf += -c;
7099         }
7100       carryover = buf_end - buf;
7101     }
7102   else
7103     {
7104       /* Source characters are at coding->source.  */
7105       const unsigned char *src = coding->source;
7106       const unsigned char *src_end = src + coding->consumed;
7107
7108       if (EQ (coding->dst_object, coding->src_object))
7109         dst_end = (unsigned char *) src;
7110       if (coding->src_multibyte != coding->dst_multibyte)
7111         {
7112           if (coding->src_multibyte)
7113             {
7114               bool multibytep = 1;
7115               ptrdiff_t consumed_chars = 0;
7116
7117               while (1)
7118                 {
7119                   const unsigned char *src_base = src;
7120                   int c;
7121
7122                   ONE_MORE_BYTE (c);
7123                   if (dst == dst_end)
7124                     {
7125                       if (EQ (coding->src_object, coding->dst_object))
7126                         dst_end = (unsigned char *) src;
7127                       if (dst == dst_end)
7128                         {
7129                           ptrdiff_t offset = src - coding->source;
7130
7131                           dst = alloc_destination (coding, src_end - src + 1,
7132                                                    dst);
7133                           dst_end = coding->destination + coding->dst_bytes;
7134                           coding_set_source (coding);
7135                           src = coding->source + offset;
7136                           src_end = coding->source + coding->consumed;
7137                           if (EQ (coding->src_object, coding->dst_object))
7138                             dst_end = (unsigned char *) src;
7139                         }
7140                     }
7141                   *dst++ = c;
7142                   produced_chars++;
7143                 }
7144             no_more_source:
7145               ;
7146             }
7147           else
7148             while (src < src_end)
7149               {
7150                 bool multibytep = 1;
7151                 int c = *src++;
7152
7153                 if (dst >= dst_end - 1)
7154                   {
7155                     if (EQ (coding->src_object, coding->dst_object))
7156                       dst_end = (unsigned char *) src;
7157                     if (dst >= dst_end - 1)
7158                       {
7159                         ptrdiff_t offset = src - coding->source;
7160                         ptrdiff_t more_bytes;
7161
7162                         if (EQ (coding->src_object, coding->dst_object))
7163                           more_bytes = ((src_end - src) / 2) + 2;
7164                         else
7165                           more_bytes = src_end - src + 2;
7166                         dst = alloc_destination (coding, more_bytes, dst);
7167                         dst_end = coding->destination + coding->dst_bytes;
7168                         coding_set_source (coding);
7169                         src = coding->source + offset;
7170                         src_end = coding->source + coding->consumed;
7171                         if (EQ (coding->src_object, coding->dst_object))
7172                           dst_end = (unsigned char *) src;
7173                       }
7174                   }
7175                 EMIT_ONE_BYTE (c);
7176               }
7177         }
7178       else
7179         {
7180           if (!EQ (coding->src_object, coding->dst_object))
7181             {
7182               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7183
7184               if (require > 0)
7185                 {
7186                   ptrdiff_t offset = src - coding->source;
7187
7188                   dst = alloc_destination (coding, require, dst);
7189                   coding_set_source (coding);
7190                   src = coding->source + offset;
7191                   src_end = coding->source + coding->consumed;
7192                 }
7193             }
7194           produced_chars = coding->consumed_char;
7195           while (src < src_end)
7196             *dst++ = *src++;
7197         }
7198     }
7199
7200   produced = dst - (coding->destination + coding->produced);
7201   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7202     insert_from_gap (produced_chars, produced, 0);
7203   coding->produced += produced;
7204   coding->produced_char += produced_chars;
7205   return carryover;
7206 }
7207
7208 /* Compose text in CODING->object according to the annotation data at
7209    CHARBUF.  CHARBUF is an array:
7210      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7211  */
7212
7213 static void
7214 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7215 {
7216   int len;
7217   ptrdiff_t to;
7218   enum composition_method method;
7219   Lisp_Object components;
7220
7221   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7222   to = pos + charbuf[2];
7223   method = (enum composition_method) (charbuf[4]);
7224
7225   if (method == COMPOSITION_RELATIVE)
7226     components = Qnil;
7227   else
7228     {
7229       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7230       int i, j;
7231
7232       if (method == COMPOSITION_WITH_RULE)
7233         len = charbuf[2] * 3 - 2;
7234       charbuf += MAX_ANNOTATION_LENGTH;
7235       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7236       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7237         {
7238           if (charbuf[i] >= 0)
7239             args[j] = make_number (charbuf[i]);
7240           else
7241             {
7242               i++;
7243               args[j] = make_number (charbuf[i] % 0x100);
7244             }
7245         }
7246       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7247     }
7248   compose_text (pos, to, components, Qnil, coding->dst_object);
7249 }
7250
7251
7252 /* Put `charset' property on text in CODING->object according to
7253    the annotation data at CHARBUF.  CHARBUF is an array:
7254      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7255  */
7256
7257 static void
7258 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7259 {
7260   ptrdiff_t from = pos - charbuf[2];
7261   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7262
7263   Fput_text_property (make_number (from), make_number (pos),
7264                       Qcharset, CHARSET_NAME (charset),
7265                       coding->dst_object);
7266 }
7267
/* Upper bound, in `int' units, of the work area allocated by
   ALLOC_CONVERSION_WORK_AREA.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING with SAFE_ALLOCA and
   record its size in CODING->charbuf_size.  The area holds SIZE plus
   MAX_CHARBUF_EXTRA_SIZE `int' units, capped at MAX_CHARBUF_SIZE.

   SIZE is typically a ptrdiff_t byte or character count, so the
   computation is done in ptrdiff_t and the cap is applied before the
   extra units are added.  Computing in `int' would truncate sizes
   that do not fit in `int' (possibly to a negative value) before the
   cap could take effect.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                        \
  do {                                                                  \
    ptrdiff_t units = (size);                                           \
                                                                        \
    if (units > MAX_CHARBUF_SIZE - MAX_CHARBUF_EXTRA_SIZE)              \
      units = MAX_CHARBUF_SIZE;                                         \
    else                                                                \
      units += MAX_CHARBUF_EXTRA_SIZE;                                  \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));             \
    (coding)->charbuf_size = units;                                     \
  } while (0)
7283
7284
7285 static void
7286 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7287 {
7288   int *charbuf = coding->charbuf;
7289   int *charbuf_end = charbuf + coding->charbuf_used;
7290
7291   if (NILP (coding->dst_object))
7292     return;
7293
7294   while (charbuf < charbuf_end)
7295     {
7296       if (*charbuf >= 0)
7297         pos++, charbuf++;
7298       else
7299         {
7300           int len = -*charbuf;
7301
7302           if (len > 2)
7303             switch (charbuf[1])
7304               {
7305               case CODING_ANNOTATE_COMPOSITION_MASK:
7306                 produce_composition (coding, charbuf, pos);
7307                 break;
7308               case CODING_ANNOTATE_CHARSET_MASK:
7309                 produce_charset (coding, charbuf, pos);
7310                 break;
7311               }
7312           charbuf += len;
7313         }
7314     }
7315 }
7316
7317 /* Decode the data at CODING->src_object into CODING->dst_object.
7318    CODING->src_object is a buffer, a string, or nil.
7319    CODING->dst_object is a buffer.
7320
7321    If CODING->src_object is a buffer, it must be the current buffer.
7322    In this case, if CODING->src_pos is positive, it is a position of
7323    the source text in the buffer, otherwise, the source text is in the
7324    gap area of the buffer, and CODING->src_pos specifies the offset of
7325    the text from GPT (which must be the same as PT).  If this is the
7326    same buffer as CODING->dst_object, CODING->src_pos must be
7327    negative.
7328
7329    If CODING->src_object is a string, CODING->src_pos is an index to
7330    that string.
7331
7332    If CODING->src_object is nil, CODING->source must already point to
7333    the non-relocatable memory area.  In this case, CODING->src_pos is
7334    an offset from CODING->source.
7335
7336    The decoded data is inserted at the current point of the buffer
7337    CODING->dst_object.
7338 */
7339
7340 static void
7341 decode_coding (struct coding_system *coding)
7342 {
7343   Lisp_Object attrs;
7344   Lisp_Object undo_list;
7345   Lisp_Object translation_table;
7346   struct ccl_spec cclspec;
7347   int carryover;
7348   int i;
7349
7350   USE_SAFE_ALLOCA;
7351
7352   if (BUFFERP (coding->src_object)
7353       && coding->src_pos > 0
7354       && coding->src_pos < GPT
7355       && coding->src_pos + coding->src_chars > GPT)
7356     move_gap_both (coding->src_pos, coding->src_pos_byte);
7357
7358   undo_list = Qt;
7359   if (BUFFERP (coding->dst_object))
7360     {
7361       set_buffer_internal (XBUFFER (coding->dst_object));
7362       if (GPT != PT)
7363         move_gap_both (PT, PT_BYTE);
7364
7365       /* We must disable undo_list in order to record the whole insert
7366          transaction via record_insert at the end.  But doing so also
7367          disables the recording of the first change to the undo_list.
7368          Therefore we check for first change here and record it via
7369          record_first_change if needed.  */
7370       if (MODIFF <= SAVE_MODIFF)
7371         record_first_change ();
7372
7373       undo_list = BVAR (current_buffer, undo_list);
7374       bset_undo_list (current_buffer, Qt);
7375     }
7376
7377   coding->consumed = coding->consumed_char = 0;
7378   coding->produced = coding->produced_char = 0;
7379   coding->chars_at_source = 0;
7380   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7381   coding->errors = 0;
7382
7383   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7384
7385   attrs = CODING_ID_ATTRS (coding->id);
7386   translation_table = get_translation_table (attrs, 0, NULL);
7387
7388   carryover = 0;
7389   if (coding->decoder == decode_coding_ccl)
7390     {
7391       coding->spec.ccl = &cclspec;
7392       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7393     }
7394   do
7395     {
7396       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7397
7398       coding_set_source (coding);
7399       coding->annotated = 0;
7400       coding->charbuf_used = carryover;
7401       (*(coding->decoder)) (coding);
7402       coding_set_destination (coding);
7403       carryover = produce_chars (coding, translation_table, 0);
7404       if (coding->annotated)
7405         produce_annotation (coding, pos);
7406       for (i = 0; i < carryover; i++)
7407         coding->charbuf[i]
7408           = coding->charbuf[coding->charbuf_used - carryover + i];
7409     }
7410   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7411          || (coding->consumed < coding->src_bytes
7412              && (coding->result == CODING_RESULT_SUCCESS
7413                  || coding->result == CODING_RESULT_INVALID_SRC)));
7414
7415   if (carryover > 0)
7416     {
7417       coding_set_destination (coding);
7418       coding->charbuf_used = carryover;
7419       produce_chars (coding, translation_table, 1);
7420     }
7421
7422   coding->carryover_bytes = 0;
7423   if (coding->consumed < coding->src_bytes)
7424     {
7425       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7426       const unsigned char *src;
7427
7428       coding_set_source (coding);
7429       coding_set_destination (coding);
7430       src = coding->source + coding->consumed;
7431
7432       if (coding->mode & CODING_MODE_LAST_BLOCK)
7433         {
7434           /* Flush out unprocessed data as binary chars.  We are sure
7435              that the number of data is less than the size of
7436              coding->charbuf.  */
7437           coding->charbuf_used = 0;
7438           coding->chars_at_source = 0;
7439
7440           while (nbytes-- > 0)
7441             {
7442               int c = *src++;
7443
7444               if (c & 0x80)
7445                 c = BYTE8_TO_CHAR (c);
7446               coding->charbuf[coding->charbuf_used++] = c;
7447             }
7448           produce_chars (coding, Qnil, 1);
7449         }
7450       else
7451         {
7452           /* Record unprocessed bytes in coding->carryover.  We are
7453              sure that the number of data is less than the size of
7454              coding->carryover.  */
7455           unsigned char *p = coding->carryover;
7456
7457           if (nbytes > sizeof coding->carryover)
7458             nbytes = sizeof coding->carryover;
7459           coding->carryover_bytes = nbytes;
7460           while (nbytes-- > 0)
7461             *p++ = *src++;
7462         }
7463       coding->consumed = coding->src_bytes;
7464     }
7465
7466   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7467       && !inhibit_eol_conversion)
7468     decode_eol (coding);
7469   if (BUFFERP (coding->dst_object))
7470     {
7471       bset_undo_list (current_buffer, undo_list);
7472       record_insert (coding->dst_pos, coding->produced_char);
7473     }
7474
7475   SAFE_FREE ();
7476 }
7477
7478
7479 /* Extract an annotation datum from a composition starting at POS and
7480    ending before LIMIT of CODING->src_object (buffer or string), store
7481    the data in BUF, set *STOP to a starting position of the next
7482    composition (if any) or to LIMIT, and return the address of the
7483    next element of BUF.
7484
7485    If such an annotation is not found, set *STOP to a starting
7486    position of a composition after POS (if any) or to LIMIT, and
7487    return BUF.  */
7488
7489 static int *
7490 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7491                                struct coding_system *coding, int *buf,
7492                                ptrdiff_t *stop)
7493 {
7494   ptrdiff_t start, end;
7495   Lisp_Object prop;
7496
7497   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7498       || end > limit)
7499     *stop = limit;
7500   else if (start > pos)
7501     *stop = start;
7502   else
7503     {
7504       if (start == pos)
7505         {
7506           /* We found a composition.  Store the corresponding
7507              annotation data in BUF.  */
7508           int *head = buf;
7509           enum composition_method method = composition_method (prop);
7510           int nchars = COMPOSITION_LENGTH (prop);
7511
7512           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7513           if (method != COMPOSITION_RELATIVE)
7514             {
7515               Lisp_Object components;
7516               ptrdiff_t i, len, i_byte;
7517
7518               components = COMPOSITION_COMPONENTS (prop);
7519               if (VECTORP (components))
7520                 {
7521                   len = ASIZE (components);
7522                   for (i = 0; i < len; i++)
7523                     *buf++ = XINT (AREF (components, i));
7524                 }
7525               else if (STRINGP (components))
7526                 {
7527                   len = SCHARS (components);
7528                   i = i_byte = 0;
7529                   while (i < len)
7530                     {
7531                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7532                       buf++;
7533                     }
7534                 }
7535               else if (INTEGERP (components))
7536                 {
7537                   len = 1;
7538                   *buf++ = XINT (components);
7539                 }
7540               else if (CONSP (components))
7541                 {
7542                   for (len = 0; CONSP (components);
7543                        len++, components = XCDR (components))
7544                     *buf++ = XINT (XCAR (components));
7545                 }
7546               else
7547                 emacs_abort ();
7548               *head -= len;
7549             }
7550         }
7551
7552       if (find_composition (end, limit, &start, &end, &prop,
7553                             coding->src_object)
7554           && end <= limit)
7555         *stop = start;
7556       else
7557         *stop = limit;
7558     }
7559   return buf;
7560 }
7561
7562
7563 /* Extract an annotation datum from a text property `charset' at POS of
7564    CODING->src_object (buffer of string), store the data in BUF, set
7565    *STOP to the position where the value of `charset' property changes
7566    (limiting by LIMIT), and return the address of the next element of
7567    BUF.
7568
7569    If the property value is nil, set *STOP to the position where the
7570    property value is non-nil (limiting by LIMIT), and return BUF.  */
7571
7572 static int *
7573 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7574                            struct coding_system *coding, int *buf,
7575                            ptrdiff_t *stop)
7576 {
7577   Lisp_Object val, next;
7578   int id;
7579
7580   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7581   if (! NILP (val) && CHARSETP (val))
7582     id = XINT (CHARSET_SYMBOL_ID (val));
7583   else
7584     id = -1;
7585   ADD_CHARSET_DATA (buf, 0, id);
7586   next = Fnext_single_property_change (make_number (pos), Qcharset,
7587                                        coding->src_object,
7588                                        make_number (limit));
7589   *stop = XINT (next);
7590   return buf;
7591 }
7592
7593
7594 static void
7595 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7596                int max_lookup)
7597 {
7598   int *buf = coding->charbuf;
7599   int *buf_end = coding->charbuf + coding->charbuf_size;
7600   const unsigned char *src = coding->source + coding->consumed;
7601   const unsigned char *src_end = coding->source + coding->src_bytes;
7602   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7603   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7604   bool multibytep = coding->src_multibyte;
7605   Lisp_Object eol_type;
7606   int c;
7607   ptrdiff_t stop, stop_composition, stop_charset;
7608   int *lookup_buf = NULL;
7609
7610   if (! NILP (translation_table))
7611     lookup_buf = alloca (sizeof (int) * max_lookup);
7612
7613   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7614   if (VECTORP (eol_type))
7615     eol_type = Qunix;
7616
7617   /* Note: composition handling is not yet implemented.  */
7618   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7619
7620   if (NILP (coding->src_object))
7621     stop = stop_composition = stop_charset = end_pos;
7622   else
7623     {
7624       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7625         stop = stop_composition = pos;
7626       else
7627         stop = stop_composition = end_pos;
7628       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7629         stop = stop_charset = pos;
7630       else
7631         stop_charset = end_pos;
7632     }
7633
7634   /* Compensate for CRLF and conversion.  */
7635   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7636   while (buf < buf_end)
7637     {
7638       Lisp_Object trans;
7639
7640       if (pos == stop)
7641         {
7642           if (pos == end_pos)
7643             break;
7644           if (pos == stop_composition)
7645             buf = handle_composition_annotation (pos, end_pos, coding,
7646                                                  buf, &stop_composition);
7647           if (pos == stop_charset)
7648             buf = handle_charset_annotation (pos, end_pos, coding,
7649                                              buf, &stop_charset);
7650           stop = (stop_composition < stop_charset
7651                   ? stop_composition : stop_charset);
7652         }
7653
7654       if (! multibytep)
7655         {
7656           int bytes;
7657
7658           if (coding->encoder == encode_coding_raw_text
7659               || coding->encoder == encode_coding_ccl)
7660             c = *src++, pos++;
7661           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7662             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7663           else
7664             c = BYTE8_TO_CHAR (*src), src++, pos++;
7665         }
7666       else
7667         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7668       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7669         c = '\n';
7670       if (! EQ (eol_type, Qunix))
7671         {
7672           if (c == '\n')
7673             {
7674               if (EQ (eol_type, Qdos))
7675                 *buf++ = '\r';
7676               else
7677                 c = '\r';
7678             }
7679         }
7680
7681       trans = Qnil;
7682       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7683       if (NILP (trans))
7684         *buf++ = c;
7685       else
7686         {
7687           ptrdiff_t from_nchars = 1, to_nchars = 1;
7688           int *lookup_buf_end;
7689           const unsigned char *p = src;
7690           int i;
7691
7692           lookup_buf[0] = c;
7693           for (i = 1; i < max_lookup && p < src_end; i++)
7694             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7695           lookup_buf_end = lookup_buf + i;
7696           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7697           if (INTEGERP (trans))
7698             c = XINT (trans);
7699           else if (CONSP (trans))
7700             {
7701               from_nchars = ASIZE (XCAR (trans));
7702               trans = XCDR (trans);
7703               if (INTEGERP (trans))
7704                 c = XINT (trans);
7705               else
7706                 {
7707                   to_nchars = ASIZE (trans);
7708                   if (buf_end - buf < to_nchars)
7709                     break;
7710                   c = XINT (AREF (trans, 0));
7711                 }
7712             }
7713           else
7714             break;
7715           *buf++ = c;
7716           for (i = 1; i < to_nchars; i++)
7717             *buf++ = XINT (AREF (trans, i));
7718           for (i = 1; i < from_nchars; i++, pos++)
7719             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7720         }
7721     }
7722
7723   coding->consumed = src - coding->source;
7724   coding->consumed_char = pos - coding->src_pos;
7725   coding->charbuf_used = buf - coding->charbuf;
7726   coding->chars_at_source = 0;
7727 }
7728
7729
7730 /* Encode the text at CODING->src_object into CODING->dst_object.
7731    CODING->src_object is a buffer or a string.
7732    CODING->dst_object is a buffer or nil.
7733
7734    If CODING->src_object is a buffer, it must be the current buffer.
7735    In this case, if CODING->src_pos is positive, it is a position of
7736    the source text in the buffer, otherwise. the source text is in the
7737    gap area of the buffer, and coding->src_pos specifies the offset of
7738    the text from GPT (which must be the same as PT).  If this is the
7739    same buffer as CODING->dst_object, CODING->src_pos must be
7740    negative and CODING should not have `pre-write-conversion'.
7741
7742    If CODING->src_object is a string, CODING should not have
7743    `pre-write-conversion'.
7744
7745    If CODING->dst_object is a buffer, the encoded data is inserted at
7746    the current point of that buffer.
7747
7748    If CODING->dst_object is nil, the encoded data is placed at the
7749    memory area specified by CODING->destination.  */
7750
7751 static void
7752 encode_coding (struct coding_system *coding)
7753 {
7754   Lisp_Object attrs;
7755   Lisp_Object translation_table;
7756   int max_lookup;
7757   struct ccl_spec cclspec;
7758
7759   USE_SAFE_ALLOCA;
7760
7761   attrs = CODING_ID_ATTRS (coding->id);
7762   if (coding->encoder == encode_coding_raw_text)
7763     translation_table = Qnil, max_lookup = 0;
7764   else
7765     translation_table = get_translation_table (attrs, 1, &max_lookup);
7766
7767   if (BUFFERP (coding->dst_object))
7768     {
7769       set_buffer_internal (XBUFFER (coding->dst_object));
7770       coding->dst_multibyte
7771         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7772     }
7773
7774   coding->consumed = coding->consumed_char = 0;
7775   coding->produced = coding->produced_char = 0;
7776   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7777   coding->errors = 0;
7778
7779   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7780
7781   if (coding->encoder == encode_coding_ccl)
7782     {
7783       coding->spec.ccl = &cclspec;
7784       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7785     }
7786   do {
7787     coding_set_source (coding);
7788     consume_chars (coding, translation_table, max_lookup);
7789     coding_set_destination (coding);
7790     (*(coding->encoder)) (coding);
7791   } while (coding->consumed_char < coding->src_chars);
7792
7793   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7794     insert_from_gap (coding->produced_char, coding->produced, 0);
7795
7796   SAFE_FREE ();
7797 }
7798
7799
7800 /* Name (or base name) of work buffer for code conversion.  */
7801 static Lisp_Object Vcode_conversion_workbuf_name;
7802
7803 /* A working buffer used by the top level conversion.  Once it is
7804    created, it is never destroyed.  It has the name
7805    Vcode_conversion_workbuf_name.  The other working buffers are
7806    destroyed after the use is finished, and their names are modified
7807    versions of Vcode_conversion_workbuf_name.  */
7808 static Lisp_Object Vcode_conversion_reused_workbuf;
7809
7810 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7811 static bool reused_workbuf_in_use;
7812
7813
7814 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7815    multibyteness of returning buffer.  */
7816
7817 static Lisp_Object
7818 make_conversion_work_buffer (bool multibyte)
7819 {
7820   Lisp_Object name, workbuf;
7821   struct buffer *current;
7822
7823   if (reused_workbuf_in_use)
7824     {
7825       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7826       workbuf = Fget_buffer_create (name);
7827     }
7828   else
7829     {
7830       reused_workbuf_in_use = 1;
7831       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7832         Vcode_conversion_reused_workbuf
7833           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7834       workbuf = Vcode_conversion_reused_workbuf;
7835     }
7836   current = current_buffer;
7837   set_buffer_internal (XBUFFER (workbuf));
7838   /* We can't allow modification hooks to run in the work buffer.  For
7839      instance, directory_files_internal assumes that file decoding
7840      doesn't compile new regexps.  */
7841   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7842   Ferase_buffer ();
7843   bset_undo_list (current_buffer, Qt);
7844   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7845   set_buffer_internal (current);
7846   return workbuf;
7847 }
7848
7849
7850 static void
7851 code_conversion_restore (Lisp_Object arg)
7852 {
7853   Lisp_Object current, workbuf;
7854   struct gcpro gcpro1;
7855
7856   GCPRO1 (arg);
7857   current = XCAR (arg);
7858   workbuf = XCDR (arg);
7859   if (! NILP (workbuf))
7860     {
7861       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7862         reused_workbuf_in_use = 0;
7863       else
7864         Fkill_buffer (workbuf);
7865     }
7866   set_buffer_internal (XBUFFER (current));
7867   UNGCPRO;
7868 }
7869
7870 Lisp_Object
7871 code_conversion_save (bool with_work_buf, bool multibyte)
7872 {
7873   Lisp_Object workbuf = Qnil;
7874
7875   if (with_work_buf)
7876     workbuf = make_conversion_work_buffer (multibyte);
7877   record_unwind_protect (code_conversion_restore,
7878                          Fcons (Fcurrent_buffer (), workbuf));
7879   return workbuf;
7880 }
7881
7882 void
7883 decode_coding_gap (struct coding_system *coding,
7884                    ptrdiff_t chars, ptrdiff_t bytes)
7885 {
7886   ptrdiff_t count = SPECPDL_INDEX ();
7887   Lisp_Object attrs;
7888
7889   coding->src_object = Fcurrent_buffer ();
7890   coding->src_chars = chars;
7891   coding->src_bytes = bytes;
7892   coding->src_pos = -chars;
7893   coding->src_pos_byte = -bytes;
7894   coding->src_multibyte = chars < bytes;
7895   coding->dst_object = coding->src_object;
7896   coding->dst_pos = PT;
7897   coding->dst_pos_byte = PT_BYTE;
7898   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7899
7900   coding->head_ascii = -1;
7901   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7902   coding->eol_seen = EOL_SEEN_NONE;
7903   if (CODING_REQUIRE_DETECTION (coding))
7904     detect_coding (coding);
7905   attrs = CODING_ID_ATTRS (coding->id);
7906   if (! disable_ascii_optimization
7907       && ! coding->src_multibyte
7908       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7909       && NILP (CODING_ATTR_POST_READ (attrs))
7910       && NILP (get_translation_table (attrs, 0, NULL)))
7911     {
7912       chars = coding->head_ascii;
7913       if (chars < 0)
7914         chars = check_ascii (coding);
7915       if (chars != bytes)
7916         {
7917           /* There exists a non-ASCII byte.  */
7918           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7919               && coding->detected_utf8_bytes == coding->src_bytes)
7920             {
7921               if (coding->detected_utf8_chars >= 0)
7922                 chars = coding->detected_utf8_chars;
7923               else
7924                 chars = check_utf_8 (coding);
7925               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7926                   && coding->head_ascii == 0
7927                   && coding->source[0] == UTF_8_BOM_1
7928                   && coding->source[1] == UTF_8_BOM_2
7929                   && coding->source[2] == UTF_8_BOM_3)
7930                 {
7931                   chars--;
7932                   bytes -= 3;
7933                   coding->src_bytes -= 3;
7934                 }
7935             }
7936           else
7937             chars = -1;
7938         }
7939       if (chars >= 0)
7940         {
7941           Lisp_Object eol_type;
7942
7943           eol_type = CODING_ID_EOL_TYPE (coding->id);
7944           if (VECTORP (eol_type))
7945             {
7946               if (coding->eol_seen != EOL_SEEN_NONE)
7947                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7948             }
7949           if (EQ (eol_type, Qmac))
7950             {
7951               unsigned char *src_end = GAP_END_ADDR;
7952               unsigned char *src = src_end - coding->src_bytes;
7953
7954               while (src < src_end)
7955                 {
7956                   if (*src++ == '\r')
7957                     src[-1] = '\n';
7958                 }
7959             }
7960           else if (EQ (eol_type, Qdos))
7961             {
7962               unsigned char *src = GAP_END_ADDR;
7963               unsigned char *src_beg = src - coding->src_bytes;
7964               unsigned char *dst = src;
7965               ptrdiff_t diff;
7966
7967               while (src_beg < src)
7968                 {
7969                   *--dst = *--src;
7970                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7971                     src--;
7972                 }
7973               diff = dst - src;
7974               bytes -= diff;
7975               chars -= diff;
7976             }
7977           coding->produced = bytes;
7978           coding->produced_char = chars;
7979           insert_from_gap (chars, bytes, 1);
7980           return;
7981         }
7982     }
7983   code_conversion_save (0, 0);
7984
7985   coding->mode |= CODING_MODE_LAST_BLOCK;
7986   current_buffer->text->inhibit_shrinking = 1;
7987   decode_coding (coding);
7988   current_buffer->text->inhibit_shrinking = 0;
7989
7990   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7991     {
7992       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7993       Lisp_Object val;
7994
7995       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7996       val = call1 (CODING_ATTR_POST_READ (attrs),
7997                    make_number (coding->produced_char));
7998       CHECK_NATNUM (val);
7999       coding->produced_char += Z - prev_Z;
8000       coding->produced += Z_BYTE - prev_Z_BYTE;
8001     }
8002
8003   unbind_to (count, Qnil);
8004 }
8005
8006
8007 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8008    SRC_OBJECT into DST_OBJECT by coding context CODING.
8009
8010    SRC_OBJECT is a buffer, a string, or Qnil.
8011
8012    If it is a buffer, the text is at point of the buffer.  FROM and TO
8013    are positions in the buffer.
8014
8015    If it is a string, the text is at the beginning of the string.
8016    FROM and TO are indices to the string.
8017
8018    If it is nil, the text is at coding->source.  FROM and TO are
8019    indices to coding->source.
8020
8021    DST_OBJECT is a buffer, Qt, or Qnil.
8022
8023    If it is a buffer, the decoded text is inserted at point of the
8024    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8025    is deleted.
8026
8027    If it is Qt, a string is made from the decoded text, and
8028    set in CODING->dst_object.
8029
8030    If it is Qnil, the decoded text is stored at CODING->destination.
8031    The caller must allocate CODING->dst_bytes bytes at
8032    CODING->destination by xmalloc.  If the decoded text is longer than
8033    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8034  */
8035
8036 void
8037 decode_coding_object (struct coding_system *coding,
8038                       Lisp_Object src_object,
8039                       ptrdiff_t from, ptrdiff_t from_byte,
8040                       ptrdiff_t to, ptrdiff_t to_byte,
8041                       Lisp_Object dst_object)
8042 {
8043   ptrdiff_t count = SPECPDL_INDEX ();
8044   unsigned char *destination IF_LINT (= NULL);
8045   ptrdiff_t dst_bytes IF_LINT (= 0);
8046   ptrdiff_t chars = to - from;
8047   ptrdiff_t bytes = to_byte - from_byte;
8048   Lisp_Object attrs;
8049   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8050   bool need_marker_adjustment = 0;
8051   Lisp_Object old_deactivate_mark;
8052
8053   old_deactivate_mark = Vdeactivate_mark;
8054
8055   if (NILP (dst_object))
8056     {
8057       destination = coding->destination;
8058       dst_bytes = coding->dst_bytes;
8059     }
8060
8061   coding->src_object = src_object;
8062   coding->src_chars = chars;
8063   coding->src_bytes = bytes;
8064   coding->src_multibyte = chars < bytes;
8065
8066   if (STRINGP (src_object))
8067     {
8068       coding->src_pos = from;
8069       coding->src_pos_byte = from_byte;
8070     }
8071   else if (BUFFERP (src_object))
8072     {
8073       set_buffer_internal (XBUFFER (src_object));
8074       if (from != GPT)
8075         move_gap_both (from, from_byte);
8076       if (EQ (src_object, dst_object))
8077         {
8078           struct Lisp_Marker *tail;
8079
8080           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8081             {
8082               tail->need_adjustment
8083                 = tail->charpos == (tail->insertion_type ? from : to);
8084               need_marker_adjustment |= tail->need_adjustment;
8085             }
8086           saved_pt = PT, saved_pt_byte = PT_BYTE;
8087           TEMP_SET_PT_BOTH (from, from_byte);
8088           current_buffer->text->inhibit_shrinking = 1;
8089           del_range_both (from, from_byte, to, to_byte, 1);
8090           coding->src_pos = -chars;
8091           coding->src_pos_byte = -bytes;
8092         }
8093       else
8094         {
8095           coding->src_pos = from;
8096           coding->src_pos_byte = from_byte;
8097         }
8098     }
8099
8100   if (CODING_REQUIRE_DETECTION (coding))
8101     detect_coding (coding);
8102   attrs = CODING_ID_ATTRS (coding->id);
8103
8104   if (EQ (dst_object, Qt)
8105       || (! NILP (CODING_ATTR_POST_READ (attrs))
8106           && NILP (dst_object)))
8107     {
8108       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8109       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8110       coding->dst_pos = BEG;
8111       coding->dst_pos_byte = BEG_BYTE;
8112     }
8113   else if (BUFFERP (dst_object))
8114     {
8115       code_conversion_save (0, 0);
8116       coding->dst_object = dst_object;
8117       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8118       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8119       coding->dst_multibyte
8120         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8121     }
8122   else
8123     {
8124       code_conversion_save (0, 0);
8125       coding->dst_object = Qnil;
8126       /* Most callers presume this will return a multibyte result, and they
8127          won't use `binary' or `raw-text' anyway, so let's not worry about
8128          CODING_FOR_UNIBYTE.  */
8129       coding->dst_multibyte = 1;
8130     }
8131
8132   decode_coding (coding);
8133
8134   if (BUFFERP (coding->dst_object))
8135     set_buffer_internal (XBUFFER (coding->dst_object));
8136
8137   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8138     {
8139       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8140       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8141       Lisp_Object val;
8142
8143       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8144       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8145               old_deactivate_mark);
8146       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8147                         make_number (coding->produced_char));
8148       UNGCPRO;
8149       CHECK_NATNUM (val);
8150       coding->produced_char += Z - prev_Z;
8151       coding->produced += Z_BYTE - prev_Z_BYTE;
8152     }
8153
8154   if (EQ (dst_object, Qt))
8155     {
8156       coding->dst_object = Fbuffer_string ();
8157     }
8158   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8159     {
8160       set_buffer_internal (XBUFFER (coding->dst_object));
8161       if (dst_bytes < coding->produced)
8162         {
8163           eassert (coding->produced > 0);
8164           destination = xrealloc (destination, coding->produced);
8165           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8166             move_gap_both (BEGV, BEGV_BYTE);
8167           memcpy (destination, BEGV_ADDR, coding->produced);
8168           coding->destination = destination;
8169         }
8170     }
8171
8172   if (saved_pt >= 0)
8173     {
8174       /* This is the case of:
8175          (BUFFERP (src_object) && EQ (src_object, dst_object))
8176          As we have moved PT while replacing the original buffer
8177          contents, we must recover it now.  */
8178       set_buffer_internal (XBUFFER (src_object));
8179       current_buffer->text->inhibit_shrinking = 0;
8180       if (saved_pt < from)
8181         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8182       else if (saved_pt < from + chars)
8183         TEMP_SET_PT_BOTH (from, from_byte);
8184       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8185         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8186                           saved_pt_byte + (coding->produced - bytes));
8187       else
8188         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8189                           saved_pt_byte + (coding->produced - bytes));
8190
8191       if (need_marker_adjustment)
8192         {
8193           struct Lisp_Marker *tail;
8194
8195           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8196             if (tail->need_adjustment)
8197               {
8198                 tail->need_adjustment = 0;
8199                 if (tail->insertion_type)
8200                   {
8201                     tail->bytepos = from_byte;
8202                     tail->charpos = from;
8203                   }
8204                 else
8205                   {
8206                     tail->bytepos = from_byte + coding->produced;
8207                     tail->charpos
8208                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8209                          ? tail->bytepos : from + coding->produced_char);
8210                   }
8211               }
8212         }
8213     }
8214
8215   Vdeactivate_mark = old_deactivate_mark;
8216   unbind_to (count, coding->dst_object);
8217 }
8218
8219
8220 void
8221 encode_coding_object (struct coding_system *coding,
8222                       Lisp_Object src_object,
8223                       ptrdiff_t from, ptrdiff_t from_byte,
8224                       ptrdiff_t to, ptrdiff_t to_byte,
8225                       Lisp_Object dst_object)
8226 {
8227   ptrdiff_t count = SPECPDL_INDEX ();
8228   ptrdiff_t chars = to - from;
8229   ptrdiff_t bytes = to_byte - from_byte;
8230   Lisp_Object attrs;
8231   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8232   bool need_marker_adjustment = 0;
8233   bool kill_src_buffer = 0;
8234   Lisp_Object old_deactivate_mark;
8235
8236   old_deactivate_mark = Vdeactivate_mark;
8237
8238   coding->src_object = src_object;
8239   coding->src_chars = chars;
8240   coding->src_bytes = bytes;
8241   coding->src_multibyte = chars < bytes;
8242
8243   attrs = CODING_ID_ATTRS (coding->id);
8244
8245   if (EQ (src_object, dst_object))
8246     {
8247       struct Lisp_Marker *tail;
8248
8249       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8250         {
8251           tail->need_adjustment
8252             = tail->charpos == (tail->insertion_type ? from : to);
8253           need_marker_adjustment |= tail->need_adjustment;
8254         }
8255     }
8256
8257   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8258     {
8259       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8260       set_buffer_internal (XBUFFER (coding->src_object));
8261       if (STRINGP (src_object))
8262         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8263       else if (BUFFERP (src_object))
8264         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8265       else
8266         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8267
8268       if (EQ (src_object, dst_object))
8269         {
8270           set_buffer_internal (XBUFFER (src_object));
8271           saved_pt = PT, saved_pt_byte = PT_BYTE;
8272           del_range_both (from, from_byte, to, to_byte, 1);
8273           set_buffer_internal (XBUFFER (coding->src_object));
8274         }
8275
8276       {
8277         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8278
8279         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8280                 old_deactivate_mark);
8281         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8282                     make_number (BEG), make_number (Z));
8283         UNGCPRO;
8284       }
8285       if (XBUFFER (coding->src_object) != current_buffer)
8286         kill_src_buffer = 1;
8287       coding->src_object = Fcurrent_buffer ();
8288       if (BEG != GPT)
8289         move_gap_both (BEG, BEG_BYTE);
8290       coding->src_chars = Z - BEG;
8291       coding->src_bytes = Z_BYTE - BEG_BYTE;
8292       coding->src_pos = BEG;
8293       coding->src_pos_byte = BEG_BYTE;
8294       coding->src_multibyte = Z < Z_BYTE;
8295     }
8296   else if (STRINGP (src_object))
8297     {
8298       code_conversion_save (0, 0);
8299       coding->src_pos = from;
8300       coding->src_pos_byte = from_byte;
8301     }
8302   else if (BUFFERP (src_object))
8303     {
8304       code_conversion_save (0, 0);
8305       set_buffer_internal (XBUFFER (src_object));
8306       if (EQ (src_object, dst_object))
8307         {
8308           saved_pt = PT, saved_pt_byte = PT_BYTE;
8309           coding->src_object = del_range_1 (from, to, 1, 1);
8310           coding->src_pos = 0;
8311           coding->src_pos_byte = 0;
8312         }
8313       else
8314         {
8315           if (from < GPT && to >= GPT)
8316             move_gap_both (from, from_byte);
8317           coding->src_pos = from;
8318           coding->src_pos_byte = from_byte;
8319         }
8320     }
8321   else
8322     code_conversion_save (0, 0);
8323
8324   if (BUFFERP (dst_object))
8325     {
8326       coding->dst_object = dst_object;
8327       if (EQ (src_object, dst_object))
8328         {
8329           coding->dst_pos = from;
8330           coding->dst_pos_byte = from_byte;
8331         }
8332       else
8333         {
8334           struct buffer *current = current_buffer;
8335
8336           set_buffer_temp (XBUFFER (dst_object));
8337           coding->dst_pos = PT;
8338           coding->dst_pos_byte = PT_BYTE;
8339           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8340           set_buffer_temp (current);
8341         }
8342       coding->dst_multibyte
8343         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8344     }
8345   else if (EQ (dst_object, Qt))
8346     {
8347       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8348       coding->dst_object = Qnil;
8349       coding->destination = xmalloc (dst_bytes);
8350       coding->dst_bytes = dst_bytes;
8351       coding->dst_multibyte = 0;
8352     }
8353   else
8354     {
8355       coding->dst_object = Qnil;
8356       coding->dst_multibyte = 0;
8357     }
8358
8359   encode_coding (coding);
8360
8361   if (EQ (dst_object, Qt))
8362     {
8363       if (BUFFERP (coding->dst_object))
8364         coding->dst_object = Fbuffer_string ();
8365       else if (coding->raw_destination)
8366         /* This is used to avoid creating huge Lisp string.
8367            NOTE: caller who sets `raw_destination' is also
8368            responsible for freeing `destination' buffer.  */
8369         coding->dst_object = Qnil;
8370       else
8371         {
8372           coding->dst_object
8373             = make_unibyte_string ((char *) coding->destination,
8374                                    coding->produced);
8375           xfree (coding->destination);
8376         }
8377     }
8378
8379   if (saved_pt >= 0)
8380     {
8381       /* This is the case of:
8382          (BUFFERP (src_object) && EQ (src_object, dst_object))
8383          As we have moved PT while replacing the original buffer
8384          contents, we must recover it now.  */
8385       set_buffer_internal (XBUFFER (src_object));
8386       if (saved_pt < from)
8387         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8388       else if (saved_pt < from + chars)
8389         TEMP_SET_PT_BOTH (from, from_byte);
8390       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8391         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8392                           saved_pt_byte + (coding->produced - bytes));
8393       else
8394         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8395                           saved_pt_byte + (coding->produced - bytes));
8396
8397       if (need_marker_adjustment)
8398         {
8399           struct Lisp_Marker *tail;
8400
8401           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8402             if (tail->need_adjustment)
8403               {
8404                 tail->need_adjustment = 0;
8405                 if (tail->insertion_type)
8406                   {
8407                     tail->bytepos = from_byte;
8408                     tail->charpos = from;
8409                   }
8410                 else
8411                   {
8412                     tail->bytepos = from_byte + coding->produced;
8413                     tail->charpos
8414                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8415                          ? tail->bytepos : from + coding->produced_char);
8416                   }
8417               }
8418         }
8419     }
8420
8421   if (kill_src_buffer)
8422     Fkill_buffer (coding->src_object);
8423
8424   Vdeactivate_mark = old_deactivate_mark;
8425   unbind_to (count, Qnil);
8426 }
8427
8428
8429 Lisp_Object
8430 preferred_coding_system (void)
8431 {
8432   int id = coding_categories[coding_priorities[0]].id;
8433
8434   return CODING_ID_NAME (id);
8435 }
8436
8437 #if defined (WINDOWSNT) || defined (CYGWIN)
8438
8439 Lisp_Object
8440 from_unicode (Lisp_Object str)
8441 {
8442   CHECK_STRING (str);
8443   if (!STRING_MULTIBYTE (str) &&
8444       SBYTES (str) & 1)
8445     {
8446       str = Fsubstring (str, make_number (0), make_number (-1));
8447     }
8448
8449   return code_convert_string_norecord (str, Qutf_16le, 0);
8450 }
8451
8452 Lisp_Object
8453 from_unicode_buffer (const wchar_t *wstr)
8454 {
8455     return from_unicode (
8456         make_unibyte_string (
8457             (char *) wstr,
8458             /* we get one of the two final 0 bytes for free. */
8459             1 + sizeof (wchar_t) * wcslen (wstr)));
8460 }
8461
8462 wchar_t *
8463 to_unicode (Lisp_Object str, Lisp_Object *buf)
8464 {
8465   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8466   /* We need to make another copy (in addition to the one made by
8467      code_convert_string_norecord) to ensure that the final string is
8468      _doubly_ zero terminated --- that is, that the string is
8469      terminated by two zero bytes and one utf-16le null character.
8470      Because strings are already terminated with a single zero byte,
8471      we just add one additional zero. */
8472   str = make_uninit_string (SBYTES (*buf) + 1);
8473   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8474   SDATA (str) [SBYTES (*buf)] = '\0';
8475   *buf = str;
8476   return WCSDATA (*buf);
8477 }
8478
8479 #endif /* WINDOWSNT || CYGWIN */
8480
8481 \f
8482 #ifdef emacs
8483 /*** 11. Emacs Lisp library functions ***/
8484
8485 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8486        doc: /* Return t if OBJECT is nil or a coding-system.
8487 See the documentation of `define-coding-system' for information
8488 about coding-system objects.  */)
8489   (Lisp_Object object)
8490 {
8491   if (NILP (object)
8492       || CODING_SYSTEM_ID (object) >= 0)
8493     return Qt;
8494   if (! SYMBOLP (object)
8495       || NILP (Fget (object, Qcoding_system_define_form)))
8496     return Qnil;
8497   return Qt;
8498 }
8499
8500 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8501        Sread_non_nil_coding_system, 1, 1, 0,
8502        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8503   (Lisp_Object prompt)
8504 {
8505   Lisp_Object val;
8506   do
8507     {
8508       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8509                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8510     }
8511   while (SCHARS (val) == 0);
8512   return (Fintern (val, Qnil));
8513 }
8514
8515 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8516        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8517 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8518 Ignores case when completing coding systems (all Emacs coding systems
8519 are lower-case).  */)
8520   (Lisp_Object prompt, Lisp_Object default_coding_system)
8521 {
8522   Lisp_Object val;
8523   ptrdiff_t count = SPECPDL_INDEX ();
8524
8525   if (SYMBOLP (default_coding_system))
8526     default_coding_system = SYMBOL_NAME (default_coding_system);
8527   specbind (Qcompletion_ignore_case, Qt);
8528   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8529                           Qt, Qnil, Qcoding_system_history,
8530                           default_coding_system, Qnil);
8531   unbind_to (count, Qnil);
8532   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8533 }
8534
8535 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8536        1, 1, 0,
8537        doc: /* Check validity of CODING-SYSTEM.
8538 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8539 It is valid if it is nil or a symbol defined as a coding system by the
8540 function `define-coding-system'.  */)
8541   (Lisp_Object coding_system)
8542 {
8543   Lisp_Object define_form;
8544
8545   define_form = Fget (coding_system, Qcoding_system_define_form);
8546   if (! NILP (define_form))
8547     {
8548       Fput (coding_system, Qcoding_system_define_form, Qnil);
8549       safe_eval (define_form);
8550     }
8551   if (!NILP (Fcoding_system_p (coding_system)))
8552     return coding_system;
8553   xsignal1 (Qcoding_system_error, coding_system);
8554 }
8555
8556 \f
8557 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8558    HIGHEST, return the coding system of the highest
8559    priority among the detected coding systems.  Otherwise return a
8560    list of detected coding systems sorted by their priorities.  If
8561    MULTIBYTEP, it is assumed that the bytes are in correct
8562    multibyte form but contains only ASCII and eight-bit chars.
8563    Otherwise, the bytes are raw bytes.
8564
8565    CODING-SYSTEM controls the detection as below:
8566
8567    If it is nil, detect both text-format and eol-format.  If the
8568    text-format part of CODING-SYSTEM is already specified
8569    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8570    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8571    detect only text-format.  */
8572
8573 Lisp_Object
8574 detect_coding_system (const unsigned char *src,
8575                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8576                       bool highest, bool multibytep,
8577                       Lisp_Object coding_system)
8578 {
8579   const unsigned char *src_end = src + src_bytes;
8580   Lisp_Object attrs, eol_type;
8581   Lisp_Object val = Qnil;
8582   struct coding_system coding;
8583   ptrdiff_t id;
8584   struct coding_detection_info detect_info;
8585   enum coding_category base_category;
8586   bool null_byte_found = 0, eight_bit_found = 0;
8587
8588   if (NILP (coding_system))
8589     coding_system = Qundecided;
8590   setup_coding_system (coding_system, &coding);
8591   attrs = CODING_ID_ATTRS (coding.id);
8592   eol_type = CODING_ID_EOL_TYPE (coding.id);
8593   coding_system = CODING_ATTR_BASE_NAME (attrs);
8594
8595   coding.source = src;
8596   coding.src_chars = src_chars;
8597   coding.src_bytes = src_bytes;
8598   coding.src_multibyte = multibytep;
8599   coding.consumed = 0;
8600   coding.mode |= CODING_MODE_LAST_BLOCK;
8601   coding.head_ascii = 0;
8602
8603   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8604
8605   /* At first, detect text-format if necessary.  */
8606   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8607   if (base_category == coding_category_undecided)
8608     {
8609       enum coding_category category IF_LINT (= 0);
8610       struct coding_system *this IF_LINT (= NULL);
8611       int c, i;
8612       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8613                                        inhibit_null_byte_detection);
8614       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8615                                        inhibit_iso_escape_detection);
8616       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8617
8618       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8619       for (; src < src_end; src++)
8620         {
8621           c = *src;
8622           if (c & 0x80)
8623             {
8624               eight_bit_found = 1;
8625               if (null_byte_found)
8626                 break;
8627             }
8628           else if (c < 0x20)
8629             {
8630               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8631                   && ! inhibit_ied
8632                   && ! detect_info.checked)
8633                 {
8634                   if (detect_coding_iso_2022 (&coding, &detect_info))
8635                     {
8636                       /* We have scanned the whole data.  */
8637                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8638                         {
8639                           /* We didn't find an 8-bit code.  We may
8640                              have found a null-byte, but it's very
8641                              rare that a binary file confirm to
8642                              ISO-2022.  */
8643                           src = src_end;
8644                           coding.head_ascii = src - coding.source;
8645                         }
8646                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8647                       break;
8648                     }
8649                 }
8650               else if (! c && !inhibit_nbd)
8651                 {
8652                   null_byte_found = 1;
8653                   if (eight_bit_found)
8654                     break;
8655                 }
8656               if (! eight_bit_found)
8657                 coding.head_ascii++;
8658             }
8659           else if (! eight_bit_found)
8660             coding.head_ascii++;
8661         }
8662
8663       if (null_byte_found || eight_bit_found
8664           || coding.head_ascii < coding.src_bytes
8665           || detect_info.found)
8666         {
8667           if (coding.head_ascii == coding.src_bytes)
8668             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8669             for (i = 0; i < coding_category_raw_text; i++)
8670               {
8671                 category = coding_priorities[i];
8672                 this = coding_categories + category;
8673                 if (detect_info.found & (1 << category))
8674                   break;
8675               }
8676           else
8677             {
8678               if (null_byte_found)
8679                 {
8680                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8681                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8682                 }
8683               else if (prefer_utf_8
8684                        && detect_coding_utf_8 (&coding, &detect_info))
8685                 {
8686                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8687                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8688                 }
8689               for (i = 0; i < coding_category_raw_text; i++)
8690                 {
8691                   category = coding_priorities[i];
8692                   this = coding_categories + category;
8693
8694                   if (this->id < 0)
8695                     {
8696                       /* No coding system of this category is defined.  */
8697                       detect_info.rejected |= (1 << category);
8698                     }
8699                   else if (category >= coding_category_raw_text)
8700                     continue;
8701                   else if (detect_info.checked & (1 << category))
8702                     {
8703                       if (highest
8704                           && (detect_info.found & (1 << category)))
8705                         break;
8706                     }
8707                   else if ((*(this->detector)) (&coding, &detect_info)
8708                            && highest
8709                            && (detect_info.found & (1 << category)))
8710                     {
8711                       if (category == coding_category_utf_16_auto)
8712                         {
8713                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8714                             category = coding_category_utf_16_le;
8715                           else
8716                             category = coding_category_utf_16_be;
8717                         }
8718                       break;
8719                     }
8720                 }
8721             }
8722         }
8723
8724       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8725           || null_byte_found)
8726         {
8727           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8728           id = CODING_SYSTEM_ID (Qno_conversion);
8729           val = list1 (make_number (id));
8730         }
8731       else if (! detect_info.rejected && ! detect_info.found)
8732         {
8733           detect_info.found = CATEGORY_MASK_ANY;
8734           id = coding_categories[coding_category_undecided].id;
8735           val = list1 (make_number (id));
8736         }
8737       else if (highest)
8738         {
8739           if (detect_info.found)
8740             {
8741               detect_info.found = 1 << category;
8742               val = list1 (make_number (this->id));
8743             }
8744           else
8745             for (i = 0; i < coding_category_raw_text; i++)
8746               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8747                 {
8748                   detect_info.found = 1 << coding_priorities[i];
8749                   id = coding_categories[coding_priorities[i]].id;
8750                   val = list1 (make_number (id));
8751                   break;
8752                 }
8753         }
8754       else
8755         {
8756           int mask = detect_info.rejected | detect_info.found;
8757           int found = 0;
8758
8759           for (i = coding_category_raw_text - 1; i >= 0; i--)
8760             {
8761               category = coding_priorities[i];
8762               if (! (mask & (1 << category)))
8763                 {
8764                   found |= 1 << category;
8765                   id = coding_categories[category].id;
8766                   if (id >= 0)
8767                     val = list1 (make_number (id));
8768                 }
8769             }
8770           for (i = coding_category_raw_text - 1; i >= 0; i--)
8771             {
8772               category = coding_priorities[i];
8773               if (detect_info.found & (1 << category))
8774                 {
8775                   id = coding_categories[category].id;
8776                   val = Fcons (make_number (id), val);
8777                 }
8778             }
8779           detect_info.found |= found;
8780         }
8781     }
8782   else if (base_category == coding_category_utf_8_auto)
8783     {
8784       if (detect_coding_utf_8 (&coding, &detect_info))
8785         {
8786           struct coding_system *this;
8787
8788           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8789             this = coding_categories + coding_category_utf_8_sig;
8790           else
8791             this = coding_categories + coding_category_utf_8_nosig;
8792           val = list1 (make_number (this->id));
8793         }
8794     }
8795   else if (base_category == coding_category_utf_16_auto)
8796     {
8797       if (detect_coding_utf_16 (&coding, &detect_info))
8798         {
8799           struct coding_system *this;
8800
8801           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8802             this = coding_categories + coding_category_utf_16_le;
8803           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8804             this = coding_categories + coding_category_utf_16_be;
8805           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8806             this = coding_categories + coding_category_utf_16_be_nosig;
8807           else
8808             this = coding_categories + coding_category_utf_16_le_nosig;
8809           val = list1 (make_number (this->id));
8810         }
8811     }
8812   else
8813     {
8814       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8815       val = list1 (make_number (coding.id));
8816     }
8817
8818   /* Then, detect eol-format if necessary.  */
8819   {
8820     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8821     Lisp_Object tail;
8822
8823     if (VECTORP (eol_type))
8824       {
8825         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8826           {
8827             if (null_byte_found)
8828               normal_eol = EOL_SEEN_LF;
8829             else
8830               normal_eol = detect_eol (coding.source, src_bytes,
8831                                        coding_category_raw_text);
8832           }
8833         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8834                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8835           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8836                                       coding_category_utf_16_be);
8837         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8838                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8839           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8840                                       coding_category_utf_16_le);
8841       }
8842     else
8843       {
8844         if (EQ (eol_type, Qunix))
8845           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8846         else if (EQ (eol_type, Qdos))
8847           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8848         else
8849           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8850       }
8851
8852     for (tail = val; CONSP (tail); tail = XCDR (tail))
8853       {
8854         enum coding_category category;
8855         int this_eol;
8856
8857         id = XINT (XCAR (tail));
8858         attrs = CODING_ID_ATTRS (id);
8859         category = XINT (CODING_ATTR_CATEGORY (attrs));
8860         eol_type = CODING_ID_EOL_TYPE (id);
8861         if (VECTORP (eol_type))
8862           {
8863             if (category == coding_category_utf_16_be
8864                 || category == coding_category_utf_16_be_nosig)
8865               this_eol = utf_16_be_eol;
8866             else if (category == coding_category_utf_16_le
8867                      || category == coding_category_utf_16_le_nosig)
8868               this_eol = utf_16_le_eol;
8869             else
8870               this_eol = normal_eol;
8871
8872             if (this_eol == EOL_SEEN_LF)
8873               XSETCAR (tail, AREF (eol_type, 0));
8874             else if (this_eol == EOL_SEEN_CRLF)
8875               XSETCAR (tail, AREF (eol_type, 1));
8876             else if (this_eol == EOL_SEEN_CR)
8877               XSETCAR (tail, AREF (eol_type, 2));
8878             else
8879               XSETCAR (tail, CODING_ID_NAME (id));
8880           }
8881         else
8882           XSETCAR (tail, CODING_ID_NAME (id));
8883       }
8884   }
8885
8886   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8887 }
8888
8889
8890 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8891        2, 3, 0,
8892        doc: /* Detect coding system of the text in the region between START and END.
8893 Return a list of possible coding systems ordered by priority.
8894 The coding systems to try and their priorities follows what
8895 the function `coding-system-priority-list' (which see) returns.
8896
8897 If only ASCII characters are found (except for such ISO-2022 control
8898 characters as ESC), it returns a list of single element `undecided'
8899 or its subsidiary coding system according to a detected end-of-line
8900 format.
8901
8902 If optional argument HIGHEST is non-nil, return the coding system of
8903 highest priority.  */)
8904   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8905 {
8906   ptrdiff_t from, to;
8907   ptrdiff_t from_byte, to_byte;
8908
8909   validate_region (&start, &end);
8910   from = XINT (start), to = XINT (end);
8911   from_byte = CHAR_TO_BYTE (from);
8912   to_byte = CHAR_TO_BYTE (to);
8913
8914   if (from < GPT && to >= GPT)
8915     move_gap_both (to, to_byte);
8916
8917   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8918                                to - from, to_byte - from_byte,
8919                                !NILP (highest),
8920                                !NILP (BVAR (current_buffer
8921                                       , enable_multibyte_characters)),
8922                                Qnil);
8923 }
8924
8925 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8926        1, 2, 0,
8927        doc: /* Detect coding system of the text in STRING.
8928 Return a list of possible coding systems ordered by priority.
8929 The coding systems to try and their priorities follows what
8930 the function `coding-system-priority-list' (which see) returns.
8931
8932 If only ASCII characters are found (except for such ISO-2022 control
8933 characters as ESC), it returns a list of single element `undecided'
8934 or its subsidiary coding system according to a detected end-of-line
8935 format.
8936
8937 If optional argument HIGHEST is non-nil, return the coding system of
8938 highest priority.  */)
8939   (Lisp_Object string, Lisp_Object highest)
8940 {
8941   CHECK_STRING (string);
8942
8943   return detect_coding_system (SDATA (string),
8944                                SCHARS (string), SBYTES (string),
8945                                !NILP (highest), STRING_MULTIBYTE (string),
8946                                Qnil);
8947 }
8948
8949
8950 static bool
8951 char_encodable_p (int c, Lisp_Object attrs)
8952 {
8953   Lisp_Object tail;
8954   struct charset *charset;
8955   Lisp_Object translation_table;
8956
8957   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8958   if (! NILP (translation_table))
8959     c = translate_char (translation_table, c);
8960   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8961        CONSP (tail); tail = XCDR (tail))
8962     {
8963       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8964       if (CHAR_CHARSET_P (c, charset))
8965         break;
8966     }
8967   return (! NILP (tail));
8968 }
8969
8970
8971 /* Return a list of coding systems that safely encode the text between
8972    START and END.  If EXCLUDE is non-nil, it is a list of coding
8973    systems not to check.  The returned list doesn't contain any such
8974    coding systems.  In any case, if the text contains only ASCII or is
8975    unibyte, return t.  */
8976
8977 DEFUN ("find-coding-systems-region-internal",
8978        Ffind_coding_systems_region_internal,
8979        Sfind_coding_systems_region_internal, 2, 3, 0,
8980        doc: /* Internal use only.  */)
8981   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8982 {
8983   Lisp_Object coding_attrs_list, safe_codings;
8984   ptrdiff_t start_byte, end_byte;
8985   const unsigned char *p, *pbeg, *pend;
8986   int c;
8987   Lisp_Object tail, elt, work_table;
8988
8989   if (STRINGP (start))
8990     {
8991       if (!STRING_MULTIBYTE (start)
8992           || SCHARS (start) == SBYTES (start))
8993         return Qt;
8994       start_byte = 0;
8995       end_byte = SBYTES (start);
8996     }
8997   else
8998     {
8999       CHECK_NUMBER_COERCE_MARKER (start);
9000       CHECK_NUMBER_COERCE_MARKER (end);
9001       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9002         args_out_of_range (start, end);
9003       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9004         return Qt;
9005       start_byte = CHAR_TO_BYTE (XINT (start));
9006       end_byte = CHAR_TO_BYTE (XINT (end));
9007       if (XINT (end) - XINT (start) == end_byte - start_byte)
9008         return Qt;
9009
9010       if (XINT (start) < GPT && XINT (end) > GPT)
9011         {
9012           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9013             move_gap_both (XINT (start), start_byte);
9014           else
9015             move_gap_both (XINT (end), end_byte);
9016         }
9017     }
9018
9019   coding_attrs_list = Qnil;
9020   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9021     if (NILP (exclude)
9022         || NILP (Fmemq (XCAR (tail), exclude)))
9023       {
9024         Lisp_Object attrs;
9025
9026         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9027         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9028           {
9029             ASET (attrs, coding_attr_trans_tbl,
9030                   get_translation_table (attrs, 1, NULL));
9031             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9032           }
9033       }
9034
9035   if (STRINGP (start))
9036     p = pbeg = SDATA (start);
9037   else
9038     p = pbeg = BYTE_POS_ADDR (start_byte);
9039   pend = p + (end_byte - start_byte);
9040
9041   while (p < pend && ASCII_CHAR_P (*p)) p++;
9042   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9043
9044   work_table = Fmake_char_table (Qnil, Qnil);
9045   while (p < pend)
9046     {
9047       if (ASCII_CHAR_P (*p))
9048         p++;
9049       else
9050         {
9051           c = STRING_CHAR_ADVANCE (p);
9052           if (!NILP (char_table_ref (work_table, c)))
9053             /* This character was already checked.  Ignore it.  */
9054             continue;
9055
9056           charset_map_loaded = 0;
9057           for (tail = coding_attrs_list; CONSP (tail);)
9058             {
9059               elt = XCAR (tail);
9060               if (NILP (elt))
9061                 tail = XCDR (tail);
9062               else if (char_encodable_p (c, elt))
9063                 tail = XCDR (tail);
9064               else if (CONSP (XCDR (tail)))
9065                 {
9066                   XSETCAR (tail, XCAR (XCDR (tail)));
9067                   XSETCDR (tail, XCDR (XCDR (tail)));
9068                 }
9069               else
9070                 {
9071                   XSETCAR (tail, Qnil);
9072                   tail = XCDR (tail);
9073                 }
9074             }
9075           if (charset_map_loaded)
9076             {
9077               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9078
9079               if (STRINGP (start))
9080                 pbeg = SDATA (start);
9081               else
9082                 pbeg = BYTE_POS_ADDR (start_byte);
9083               p = pbeg + p_offset;
9084               pend = pbeg + pend_offset;
9085             }
9086           char_table_set (work_table, c, Qt);
9087         }
9088     }
9089
9090   safe_codings = list2 (Qraw_text, Qno_conversion);
9091   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9092     if (! NILP (XCAR (tail)))
9093       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9094
9095   return safe_codings;
9096 }
9097
9098
9099 DEFUN ("unencodable-char-position", Funencodable_char_position,
9100        Sunencodable_char_position, 3, 5, 0,
9101        doc: /* Return position of first un-encodable character in a region.
9102 START and END specify the region and CODING-SYSTEM specifies the
9103 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9104
9105 If optional 4th argument COUNT is non-nil, it specifies at most how
9106 many un-encodable characters to search.  In this case, the value is a
9107 list of positions.
9108
9109 If optional 5th argument STRING is non-nil, it is a string to search
9110 for un-encodable characters.  In that case, START and END are indexes
9111 to the string and treated as in `substring'.  */)
9112   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9113    Lisp_Object count, Lisp_Object string)
9114 {
9115   EMACS_INT n;
9116   struct coding_system coding;
9117   Lisp_Object attrs, charset_list, translation_table;
9118   Lisp_Object positions;
9119   ptrdiff_t from, to;
9120   const unsigned char *p, *stop, *pend;
9121   bool ascii_compatible;
9122
9123   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9124   attrs = CODING_ID_ATTRS (coding.id);
9125   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9126     return Qnil;
9127   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9128   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9129   translation_table = get_translation_table (attrs, 1, NULL);
9130
9131   if (NILP (string))
9132     {
9133       validate_region (&start, &end);
9134       from = XINT (start);
9135       to = XINT (end);
9136       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9137           || (ascii_compatible
9138               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9139         return Qnil;
9140       p = CHAR_POS_ADDR (from);
9141       pend = CHAR_POS_ADDR (to);
9142       if (from < GPT && to >= GPT)
9143         stop = GPT_ADDR;
9144       else
9145         stop = pend;
9146     }
9147   else
9148     {
9149       CHECK_STRING (string);
9150       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9151       if (! STRING_MULTIBYTE (string))
9152         return Qnil;
9153       p = SDATA (string) + string_char_to_byte (string, from);
9154       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9155       if (ascii_compatible && (to - from) == (pend - p))
9156         return Qnil;
9157     }
9158
9159   if (NILP (count))
9160     n = 1;
9161   else
9162     {
9163       CHECK_NATNUM (count);
9164       n = XINT (count);
9165     }
9166
9167   positions = Qnil;
9168   charset_map_loaded = 0;
9169   while (1)
9170     {
9171       int c;
9172
9173       if (ascii_compatible)
9174         while (p < stop && ASCII_CHAR_P (*p))
9175           p++, from++;
9176       if (p >= stop)
9177         {
9178           if (p >= pend)
9179             break;
9180           stop = pend;
9181           p = GAP_END_ADDR;
9182         }
9183
9184       c = STRING_CHAR_ADVANCE (p);
9185       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9186           && ! char_charset (translate_char (translation_table, c),
9187                              charset_list, NULL))
9188         {
9189           positions = Fcons (make_number (from), positions);
9190           n--;
9191           if (n == 0)
9192             break;
9193         }
9194
9195       from++;
9196       if (charset_map_loaded && NILP (string))
9197         {
9198           p = CHAR_POS_ADDR (from);
9199           pend = CHAR_POS_ADDR (to);
9200           if (from < GPT && to >= GPT)
9201             stop = GPT_ADDR;
9202           else
9203             stop = pend;
9204           charset_map_loaded = 0;
9205         }
9206     }
9207
9208   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9209 }
9210
9211
9212 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9213        Scheck_coding_systems_region, 3, 3, 0,
9214        doc: /* Check if the region is encodable by coding systems.
9215
9216 START and END are buffer positions specifying the region.
9217 CODING-SYSTEM-LIST is a list of coding systems to check.
9218
9219 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9220 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9221 whole region, POS0, POS1, ... are buffer positions where non-encodable
9222 characters are found.
9223
9224 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9225 value is nil.
9226
9227 START may be a string.  In that case, check if the string is
9228 encodable, and the value contains indices to the string instead of
9229 buffer positions.  END is ignored.
9230
9231 If the current buffer (or START if it is a string) is unibyte, the value
9232 is nil.  */)
9233   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9234 {
9235   Lisp_Object list;
9236   ptrdiff_t start_byte, end_byte;
9237   ptrdiff_t pos;
9238   const unsigned char *p, *pbeg, *pend;
9239   int c;
9240   Lisp_Object tail, elt, attrs;
9241
9242   if (STRINGP (start))
9243     {
9244       if (!STRING_MULTIBYTE (start)
9245           || SCHARS (start) == SBYTES (start))
9246         return Qnil;
9247       start_byte = 0;
9248       end_byte = SBYTES (start);
9249       pos = 0;
9250     }
9251   else
9252     {
9253       CHECK_NUMBER_COERCE_MARKER (start);
9254       CHECK_NUMBER_COERCE_MARKER (end);
9255       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9256         args_out_of_range (start, end);
9257       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9258         return Qnil;
9259       start_byte = CHAR_TO_BYTE (XINT (start));
9260       end_byte = CHAR_TO_BYTE (XINT (end));
9261       if (XINT (end) - XINT (start) == end_byte - start_byte)
9262         return Qnil;
9263
9264       if (XINT (start) < GPT && XINT (end) > GPT)
9265         {
9266           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9267             move_gap_both (XINT (start), start_byte);
9268           else
9269             move_gap_both (XINT (end), end_byte);
9270         }
9271       pos = XINT (start);
9272     }
9273
9274   list = Qnil;
9275   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9276     {
9277       elt = XCAR (tail);
9278       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9279       ASET (attrs, coding_attr_trans_tbl,
9280             get_translation_table (attrs, 1, NULL));
9281       list = Fcons (list2 (elt, attrs), list);
9282     }
9283
9284   if (STRINGP (start))
9285     p = pbeg = SDATA (start);
9286   else
9287     p = pbeg = BYTE_POS_ADDR (start_byte);
9288   pend = p + (end_byte - start_byte);
9289
9290   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9291   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9292
9293   while (p < pend)
9294     {
9295       if (ASCII_CHAR_P (*p))
9296         p++;
9297       else
9298         {
9299           c = STRING_CHAR_ADVANCE (p);
9300
9301           charset_map_loaded = 0;
9302           for (tail = list; CONSP (tail); tail = XCDR (tail))
9303             {
9304               elt = XCDR (XCAR (tail));
9305               if (! char_encodable_p (c, XCAR (elt)))
9306                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9307             }
9308           if (charset_map_loaded)
9309             {
9310               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9311
9312               if (STRINGP (start))
9313                 pbeg = SDATA (start);
9314               else
9315                 pbeg = BYTE_POS_ADDR (start_byte);
9316               p = pbeg + p_offset;
9317               pend = pbeg + pend_offset;
9318             }
9319         }
9320       pos++;
9321     }
9322
9323   tail = list;
9324   list = Qnil;
9325   for (; CONSP (tail); tail = XCDR (tail))
9326     {
9327       elt = XCAR (tail);
9328       if (CONSP (XCDR (XCDR (elt))))
9329         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9330                       list);
9331     }
9332
9333   return list;
9334 }
9335
9336
9337 static Lisp_Object
9338 code_convert_region (Lisp_Object start, Lisp_Object end,
9339                      Lisp_Object coding_system, Lisp_Object dst_object,
9340                      bool encodep, bool norecord)
9341 {
9342   struct coding_system coding;
9343   ptrdiff_t from, from_byte, to, to_byte;
9344   Lisp_Object src_object;
9345
9346   if (NILP (coding_system))
9347     coding_system = Qno_conversion;
9348   else
9349     CHECK_CODING_SYSTEM (coding_system);
9350   src_object = Fcurrent_buffer ();
9351   if (NILP (dst_object))
9352     dst_object = src_object;
9353   else if (! EQ (dst_object, Qt))
9354     CHECK_BUFFER (dst_object);
9355
9356   validate_region (&start, &end);
9357   from = XFASTINT (start);
9358   from_byte = CHAR_TO_BYTE (from);
9359   to = XFASTINT (end);
9360   to_byte = CHAR_TO_BYTE (to);
9361
9362   setup_coding_system (coding_system, &coding);
9363   coding.mode |= CODING_MODE_LAST_BLOCK;
9364
9365   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9366     {
9367       struct buffer *buf = XBUFFER (dst_object);
9368       ptrdiff_t buf_pt = BUF_PT (buf);
9369
9370       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9371     }
9372
9373   if (encodep)
9374     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9375                           dst_object);
9376   else
9377     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9378                           dst_object);
9379   if (! norecord)
9380     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9381
9382   return (BUFFERP (dst_object)
9383           ? make_number (coding.produced_char)
9384           : coding.dst_object);
9385 }
9386
9387
9388 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9389        3, 4, "r\nzCoding system: ",
9390        doc: /* Decode the current region from the specified coding system.
9391 When called from a program, takes four arguments:
9392         START, END, CODING-SYSTEM, and DESTINATION.
9393 START and END are buffer positions.
9394
9395 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9396 If nil, the region between START and END is replaced by the decoded text.
9397 If buffer, the decoded text is inserted in that buffer after point (point
9398 does not move).
9399 In those cases, the length of the decoded text is returned.
9400 If DESTINATION is t, the decoded text is returned.
9401
9402 This function sets `last-coding-system-used' to the precise coding system
9403 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9404 not fully specified.)  */)
9405   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9406 {
9407   return code_convert_region (start, end, coding_system, destination, 0, 0);
9408 }
9409
9410 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9411        3, 4, "r\nzCoding system: ",
9412        doc: /* Encode the current region by specified coding system.
9413 When called from a program, takes four arguments:
9414         START, END, CODING-SYSTEM and DESTINATION.
9415 START and END are buffer positions.
9416
9417 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9418 If nil, the region between START and END is replace by the encoded text.
9419 If buffer, the encoded text is inserted in that buffer after point (point
9420 does not move).
9421 In those cases, the length of the encoded text is returned.
9422 If DESTINATION is t, the encoded text is returned.
9423
9424 This function sets `last-coding-system-used' to the precise coding system
9425 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9426 not fully specified.)  */)
9427   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9428 {
9429   return code_convert_region (start, end, coding_system, destination, 1, 0);
9430 }
9431
9432 Lisp_Object
9433 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9434                      Lisp_Object dst_object, bool encodep, bool nocopy,
9435                      bool norecord)
9436 {
9437   struct coding_system coding;
9438   ptrdiff_t chars, bytes;
9439
9440   CHECK_STRING (string);
9441   if (NILP (coding_system))
9442     {
9443       if (! norecord)
9444         Vlast_coding_system_used = Qno_conversion;
9445       if (NILP (dst_object))
9446         return (nocopy ? Fcopy_sequence (string) : string);
9447     }
9448
9449   if (NILP (coding_system))
9450     coding_system = Qno_conversion;
9451   else
9452     CHECK_CODING_SYSTEM (coding_system);
9453   if (NILP (dst_object))
9454     dst_object = Qt;
9455   else if (! EQ (dst_object, Qt))
9456     CHECK_BUFFER (dst_object);
9457
9458   setup_coding_system (coding_system, &coding);
9459   coding.mode |= CODING_MODE_LAST_BLOCK;
9460   chars = SCHARS (string);
9461   bytes = SBYTES (string);
9462
9463   if (BUFFERP (dst_object))
9464     {
9465       struct buffer *buf = XBUFFER (dst_object);
9466       ptrdiff_t buf_pt = BUF_PT (buf);
9467
9468       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9469     }
9470
9471   if (encodep)
9472     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9473   else
9474     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9475   if (! norecord)
9476     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9477
9478   return (BUFFERP (dst_object)
9479           ? make_number (coding.produced_char)
9480           : coding.dst_object);
9481 }
9482
9483
9484 /* Encode or decode STRING according to CODING_SYSTEM.
9485    Do not set Vlast_coding_system_used.
9486
9487    This function is called only from macros DECODE_FILE and
9488    ENCODE_FILE, thus we ignore character composition.  */
9489
9490 Lisp_Object
9491 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9492                               bool encodep)
9493 {
9494   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9495 }
9496
9497 /* Encode or decode a file name, to or from a unibyte string suitable
9498    for passing to C library functions.  */
9499 Lisp_Object
9500 decode_file_name (Lisp_Object fname)
9501 {
9502 #ifdef WINDOWSNT
9503   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9504      converts the file names either to UTF-16LE or to the system ANSI
9505      codepage internally, depending on the underlying OS; see w32.c.  */
9506   if (! NILP (Fcoding_system_p (Qutf_8)))
9507     return code_convert_string_norecord (fname, Qutf_8, 0);
9508   return fname;
9509 #else  /* !WINDOWSNT */
9510   if (! NILP (Vfile_name_coding_system))
9511     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9512   else if (! NILP (Vdefault_file_name_coding_system))
9513     return code_convert_string_norecord (fname,
9514                                          Vdefault_file_name_coding_system, 0);
9515   else
9516     return fname;
9517 #endif
9518 }
9519
9520 Lisp_Object
9521 encode_file_name (Lisp_Object fname)
9522 {
9523   /* This is especially important during bootstrap and dumping, when
9524      file-name encoding is not yet known, and therefore any non-ASCII
9525      file names are unibyte strings, and could only be thrashed if we
9526      try to encode them.  */
9527   if (!STRING_MULTIBYTE (fname))
9528     return fname;
9529 #ifdef WINDOWSNT
9530   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9531      converts the file names either to UTF-16LE or to the system ANSI
9532      codepage internally, depending on the underlying OS; see w32.c.  */
9533   if (! NILP (Fcoding_system_p (Qutf_8)))
9534     return code_convert_string_norecord (fname, Qutf_8, 1);
9535   return fname;
9536 #else  /* !WINDOWSNT */
9537   if (! NILP (Vfile_name_coding_system))
9538     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9539   else if (! NILP (Vdefault_file_name_coding_system))
9540     return code_convert_string_norecord (fname,
9541                                          Vdefault_file_name_coding_system, 1);
9542   else
9543     return fname;
9544 #endif
9545 }
9546
9547 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9548        2, 4, 0,
9549        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9550
9551 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9552 if the decoding operation is trivial.
9553
9554 Optional fourth arg BUFFER non-nil means that the decoded text is
9555 inserted in that buffer after point (point does not move).  In this
9556 case, the return value is the length of the decoded text.
9557
9558 This function sets `last-coding-system-used' to the precise coding system
9559 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9560 not fully specified.)  */)
9561   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9562 {
9563   return code_convert_string (string, coding_system, buffer,
9564                               0, ! NILP (nocopy), 0);
9565 }
9566
9567 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9568        2, 4, 0,
9569        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9570
9571 Optional third arg NOCOPY non-nil means it is OK to return STRING
9572 itself if the encoding operation is trivial.
9573
9574 Optional fourth arg BUFFER non-nil means that the encoded text is
9575 inserted in that buffer after point (point does not move).  In this
9576 case, the return value is the length of the encoded text.
9577
9578 This function sets `last-coding-system-used' to the precise coding system
9579 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9580 not fully specified.)  */)
9581   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9582 {
9583   return code_convert_string (string, coding_system, buffer,
9584                               1, ! NILP (nocopy), 0);
9585 }
9586
9587 \f
9588 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9589        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9590 Return the corresponding character.  */)
9591   (Lisp_Object code)
9592 {
9593   Lisp_Object spec, attrs, val;
9594   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9595   EMACS_INT ch;
9596   int c;
9597
9598   CHECK_NATNUM (code);
9599   ch = XFASTINT (code);
9600   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9601   attrs = AREF (spec, 0);
9602
9603   if (ASCII_CHAR_P (ch)
9604       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9605     return code;
9606
9607   val = CODING_ATTR_CHARSET_LIST (attrs);
9608   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9609   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9610   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9611
9612   if (ch <= 0x7F)
9613     {
9614       c = ch;
9615       charset = charset_roman;
9616     }
9617   else if (ch >= 0xA0 && ch < 0xDF)
9618     {
9619       c = ch - 0x80;
9620       charset = charset_kana;
9621     }
9622   else
9623     {
9624       EMACS_INT c1 = ch >> 8;
9625       int c2 = ch & 0xFF;
9626
9627       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9628           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9629         error ("Invalid code: %"pI"d", ch);
9630       c = ch;
9631       SJIS_TO_JIS (c);
9632       charset = charset_kanji;
9633     }
9634   c = DECODE_CHAR (charset, c);
9635   if (c < 0)
9636     error ("Invalid code: %"pI"d", ch);
9637   return make_number (c);
9638 }
9639
9640
9641 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9642        doc: /* Encode a Japanese character CH to shift_jis encoding.
9643 Return the corresponding code in SJIS.  */)
9644   (Lisp_Object ch)
9645 {
9646   Lisp_Object spec, attrs, charset_list;
9647   int c;
9648   struct charset *charset;
9649   unsigned code;
9650
9651   CHECK_CHARACTER (ch);
9652   c = XFASTINT (ch);
9653   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9654   attrs = AREF (spec, 0);
9655
9656   if (ASCII_CHAR_P (c)
9657       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9658     return ch;
9659
9660   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9661   charset = char_charset (c, charset_list, &code);
9662   if (code == CHARSET_INVALID_CODE (charset))
9663     error ("Can't encode by shift_jis encoding: %c", c);
9664   JIS_TO_SJIS (code);
9665
9666   return make_number (code);
9667 }
9668
9669 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9670        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9671 Return the corresponding character.  */)
9672   (Lisp_Object code)
9673 {
9674   Lisp_Object spec, attrs, val;
9675   struct charset *charset_roman, *charset_big5, *charset;
9676   EMACS_INT ch;
9677   int c;
9678
9679   CHECK_NATNUM (code);
9680   ch = XFASTINT (code);
9681   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9682   attrs = AREF (spec, 0);
9683
9684   if (ASCII_CHAR_P (ch)
9685       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9686     return code;
9687
9688   val = CODING_ATTR_CHARSET_LIST (attrs);
9689   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9690   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9691
9692   if (ch <= 0x7F)
9693     {
9694       c = ch;
9695       charset = charset_roman;
9696     }
9697   else
9698     {
9699       EMACS_INT b1 = ch >> 8;
9700       int b2 = ch & 0x7F;
9701       if (b1 < 0xA1 || b1 > 0xFE
9702           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9703         error ("Invalid code: %"pI"d", ch);
9704       c = ch;
9705       charset = charset_big5;
9706     }
9707   c = DECODE_CHAR (charset, c);
9708   if (c < 0)
9709     error ("Invalid code: %"pI"d", ch);
9710   return make_number (c);
9711 }
9712
9713 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9714        doc: /* Encode the Big5 character CH to BIG5 coding system.
9715 Return the corresponding character code in Big5.  */)
9716   (Lisp_Object ch)
9717 {
9718   Lisp_Object spec, attrs, charset_list;
9719   struct charset *charset;
9720   int c;
9721   unsigned code;
9722
9723   CHECK_CHARACTER (ch);
9724   c = XFASTINT (ch);
9725   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9726   attrs = AREF (spec, 0);
9727   if (ASCII_CHAR_P (c)
9728       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9729     return ch;
9730
9731   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9732   charset = char_charset (c, charset_list, &code);
9733   if (code == CHARSET_INVALID_CODE (charset))
9734     error ("Can't encode by Big5 encoding: %c", c);
9735
9736   return make_number (code);
9737 }
9738
9739 \f
9740 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9741        Sset_terminal_coding_system_internal, 1, 2, 0,
9742        doc: /* Internal use only.  */)
9743   (Lisp_Object coding_system, Lisp_Object terminal)
9744 {
9745   struct terminal *term = get_terminal (terminal, 1);
9746   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9747   CHECK_SYMBOL (coding_system);
9748   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9749   /* We had better not send unsafe characters to terminal.  */
9750   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9751   /* Character composition should be disabled.  */
9752   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9753   terminal_coding->src_multibyte = 1;
9754   terminal_coding->dst_multibyte = 0;
9755   tset_charset_list
9756     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9757             ? coding_charset_list (terminal_coding)
9758             : list1 (make_number (charset_ascii))));
9759   return Qnil;
9760 }
9761
9762 DEFUN ("set-safe-terminal-coding-system-internal",
9763        Fset_safe_terminal_coding_system_internal,
9764        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9765        doc: /* Internal use only.  */)
9766   (Lisp_Object coding_system)
9767 {
9768   CHECK_SYMBOL (coding_system);
9769   setup_coding_system (Fcheck_coding_system (coding_system),
9770                        &safe_terminal_coding);
9771   /* Character composition should be disabled.  */
9772   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9773   safe_terminal_coding.src_multibyte = 1;
9774   safe_terminal_coding.dst_multibyte = 0;
9775   return Qnil;
9776 }
9777
9778 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9779        Sterminal_coding_system, 0, 1, 0,
9780        doc: /* Return coding system specified for terminal output on the given terminal.
9781 TERMINAL may be a terminal object, a frame, or nil for the selected
9782 frame's terminal device.  */)
9783   (Lisp_Object terminal)
9784 {
9785   struct coding_system *terminal_coding
9786     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9787   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9788
9789   /* For backward compatibility, return nil if it is `undecided'.  */
9790   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9791 }
9792
9793 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9794        Sset_keyboard_coding_system_internal, 1, 2, 0,
9795        doc: /* Internal use only.  */)
9796   (Lisp_Object coding_system, Lisp_Object terminal)
9797 {
9798   struct terminal *t = get_terminal (terminal, 1);
9799   CHECK_SYMBOL (coding_system);
9800   if (NILP (coding_system))
9801     coding_system = Qno_conversion;
9802   else
9803     Fcheck_coding_system (coding_system);
9804   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9805   /* Character composition should be disabled.  */
9806   TERMINAL_KEYBOARD_CODING (t)->common_flags
9807     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9808   return Qnil;
9809 }
9810
9811 DEFUN ("keyboard-coding-system",
9812        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9813        doc: /* Return coding system specified for decoding keyboard input.  */)
9814   (Lisp_Object terminal)
9815 {
9816   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9817                          (get_terminal (terminal, 1))->id);
9818 }
9819
9820 \f
9821 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9822        Sfind_operation_coding_system,  1, MANY, 0,
9823        doc: /* Choose a coding system for an operation based on the target name.
9824 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9825 DECODING-SYSTEM is the coding system to use for decoding
9826 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9827 for encoding (in case OPERATION does encoding).
9828
9829 The first argument OPERATION specifies an I/O primitive:
9830   For file I/O, `insert-file-contents' or `write-region'.
9831   For process I/O, `call-process', `call-process-region', or `start-process'.
9832   For network I/O, `open-network-stream'.
9833
9834 The remaining arguments should be the same arguments that were passed
9835 to the primitive.  Depending on which primitive, one of those arguments
9836 is selected as the TARGET.  For example, if OPERATION does file I/O,
9837 whichever argument specifies the file name is TARGET.
9838
9839 TARGET has a meaning which depends on OPERATION:
9840   For file I/O, TARGET is a file name (except for the special case below).
9841   For process I/O, TARGET is a process name.
9842   For network I/O, TARGET is a service name or a port number.
9843
9844 This function looks up what is specified for TARGET in
9845 `file-coding-system-alist', `process-coding-system-alist',
9846 or `network-coding-system-alist' depending on OPERATION.
9847 They may specify a coding system, a cons of coding systems,
9848 or a function symbol to call.
9849 In the last case, we call the function with one argument,
9850 which is a list of all the arguments given to this function.
9851 If the function can't decide a coding system, it can return
9852 `undecided' so that the normal code-detection is performed.
9853
9854 If OPERATION is `insert-file-contents', the argument corresponding to
9855 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9856 file name to look up, and BUFFER is a buffer that contains the file's
9857 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9858 function to call for FILENAME, that function should examine the
9859 contents of BUFFER instead of reading the file.
9860
9861 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9862   (ptrdiff_t nargs, Lisp_Object *args)
9863 {
9864   Lisp_Object operation, target_idx, target, val;
9865   register Lisp_Object chain;
9866
9867   if (nargs < 2)
9868     error ("Too few arguments");
9869   operation = args[0];
9870   if (!SYMBOLP (operation)
9871       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9872     error ("Invalid first argument");
9873   if (nargs <= 1 + XFASTINT (target_idx))
9874     error ("Too few arguments for operation `%s'",
9875            SDATA (SYMBOL_NAME (operation)));
9876   target = args[XFASTINT (target_idx) + 1];
9877   if (!(STRINGP (target)
9878         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9879             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9880         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9881     error ("Invalid argument %"pI"d of operation `%s'",
9882            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9883   if (CONSP (target))
9884     target = XCAR (target);
9885
9886   chain = ((EQ (operation, Qinsert_file_contents)
9887             || EQ (operation, Qwrite_region))
9888            ? Vfile_coding_system_alist
9889            : (EQ (operation, Qopen_network_stream)
9890               ? Vnetwork_coding_system_alist
9891               : Vprocess_coding_system_alist));
9892   if (NILP (chain))
9893     return Qnil;
9894
9895   for (; CONSP (chain); chain = XCDR (chain))
9896     {
9897       Lisp_Object elt;
9898
9899       elt = XCAR (chain);
9900       if (CONSP (elt)
9901           && ((STRINGP (target)
9902                && STRINGP (XCAR (elt))
9903                && fast_string_match (XCAR (elt), target) >= 0)
9904               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9905         {
9906           val = XCDR (elt);
9907           /* Here, if VAL is both a valid coding system and a valid
9908              function symbol, we return VAL as a coding system.  */
9909           if (CONSP (val))
9910             return val;
9911           if (! SYMBOLP (val))
9912             return Qnil;
9913           if (! NILP (Fcoding_system_p (val)))
9914             return Fcons (val, val);
9915           if (! NILP (Ffboundp (val)))
9916             {
9917               /* We use call1 rather than safe_call1
9918                  so as to get bug reports about functions called here
9919                  which don't handle the current interface.  */
9920               val = call1 (val, Flist (nargs, args));
9921               if (CONSP (val))
9922                 return val;
9923               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9924                 return Fcons (val, val);
9925             }
9926           return Qnil;
9927         }
9928     }
9929   return Qnil;
9930 }
9931
9932 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9933        Sset_coding_system_priority, 0, MANY, 0,
9934        doc: /* Assign higher priority to the coding systems given as arguments.
9935 If multiple coding systems belong to the same category,
9936 all but the first one are ignored.
9937
9938 usage: (set-coding-system-priority &rest coding-systems)  */)
9939   (ptrdiff_t nargs, Lisp_Object *args)
9940 {
9941   ptrdiff_t i, j;
9942   bool changed[coding_category_max];
9943   enum coding_category priorities[coding_category_max];
9944
9945   memset (changed, 0, sizeof changed);
9946
9947   for (i = j = 0; i < nargs; i++)
9948     {
9949       enum coding_category category;
9950       Lisp_Object spec, attrs;
9951
9952       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9953       attrs = AREF (spec, 0);
9954       category = XINT (CODING_ATTR_CATEGORY (attrs));
9955       if (changed[category])
9956         /* Ignore this coding system because a coding system of the
9957            same category already had a higher priority.  */
9958         continue;
9959       changed[category] = 1;
9960       priorities[j++] = category;
9961       if (coding_categories[category].id >= 0
9962           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9963         setup_coding_system (args[i], &coding_categories[category]);
9964       Fset (AREF (Vcoding_category_table, category), args[i]);
9965     }
9966
9967   /* Now we have decided top J priorities.  Reflect the order of the
9968      original priorities to the remaining priorities.  */
9969
9970   for (i = j, j = 0; i < coding_category_max; i++, j++)
9971     {
9972       while (j < coding_category_max
9973              && changed[coding_priorities[j]])
9974         j++;
9975       if (j == coding_category_max)
9976         emacs_abort ();
9977       priorities[i] = coding_priorities[j];
9978     }
9979
9980   memcpy (coding_priorities, priorities, sizeof priorities);
9981
9982   /* Update `coding-category-list'.  */
9983   Vcoding_category_list = Qnil;
9984   for (i = coding_category_max; i-- > 0; )
9985     Vcoding_category_list
9986       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9987                Vcoding_category_list);
9988
9989   return Qnil;
9990 }
9991
9992 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9993        Scoding_system_priority_list, 0, 1, 0,
9994        doc: /* Return a list of coding systems ordered by their priorities.
9995 The list contains a subset of coding systems; i.e. coding systems
9996 assigned to each coding category (see `coding-category-list').
9997
9998 HIGHESTP non-nil means just return the highest priority one.  */)
9999   (Lisp_Object highestp)
10000 {
10001   int i;
10002   Lisp_Object val;
10003
10004   for (i = 0, val = Qnil; i < coding_category_max; i++)
10005     {
10006       enum coding_category category = coding_priorities[i];
10007       int id = coding_categories[category].id;
10008       Lisp_Object attrs;
10009
10010       if (id < 0)
10011         continue;
10012       attrs = CODING_ID_ATTRS (id);
10013       if (! NILP (highestp))
10014         return CODING_ATTR_BASE_NAME (attrs);
10015       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10016     }
10017   return Fnreverse (val);
10018 }
10019
10020 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10021
10022 static Lisp_Object
10023 make_subsidiaries (Lisp_Object base)
10024 {
10025   Lisp_Object subsidiaries;
10026   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10027   char *buf = alloca (base_name_len + 6);
10028   int i;
10029
10030   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10031   subsidiaries = make_uninit_vector (3);
10032   for (i = 0; i < 3; i++)
10033     {
10034       strcpy (buf + base_name_len, suffixes[i]);
10035       ASET (subsidiaries, i, intern (buf));
10036     }
10037   return subsidiaries;
10038 }
10039
10040
10041 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10042        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10043        doc: /* For internal use only.
10044 usage: (define-coding-system-internal ...)  */)
10045   (ptrdiff_t nargs, Lisp_Object *args)
10046 {
10047   Lisp_Object name;
10048   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10049   Lisp_Object attrs;            /* Vector of attributes.  */
10050   Lisp_Object eol_type;
10051   Lisp_Object aliases;
10052   Lisp_Object coding_type, charset_list, safe_charsets;
10053   enum coding_category category;
10054   Lisp_Object tail, val;
10055   int max_charset_id = 0;
10056   int i;
10057
10058   if (nargs < coding_arg_max)
10059     goto short_args;
10060
10061   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10062
10063   name = args[coding_arg_name];
10064   CHECK_SYMBOL (name);
10065   ASET (attrs, coding_attr_base_name, name);
10066
10067   val = args[coding_arg_mnemonic];
10068   if (! STRINGP (val))
10069     CHECK_CHARACTER (val);
10070   ASET (attrs, coding_attr_mnemonic, val);
10071
10072   coding_type = args[coding_arg_coding_type];
10073   CHECK_SYMBOL (coding_type);
10074   ASET (attrs, coding_attr_type, coding_type);
10075
10076   charset_list = args[coding_arg_charset_list];
10077   if (SYMBOLP (charset_list))
10078     {
10079       if (EQ (charset_list, Qiso_2022))
10080         {
10081           if (! EQ (coding_type, Qiso_2022))
10082             error ("Invalid charset-list");
10083           charset_list = Viso_2022_charset_list;
10084         }
10085       else if (EQ (charset_list, Qemacs_mule))
10086         {
10087           if (! EQ (coding_type, Qemacs_mule))
10088             error ("Invalid charset-list");
10089           charset_list = Vemacs_mule_charset_list;
10090         }
10091       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10092         {
10093           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10094             error ("Invalid charset-list");
10095           if (max_charset_id < XFASTINT (XCAR (tail)))
10096             max_charset_id = XFASTINT (XCAR (tail));
10097         }
10098     }
10099   else
10100     {
10101       charset_list = Fcopy_sequence (charset_list);
10102       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10103         {
10104           struct charset *charset;
10105
10106           val = XCAR (tail);
10107           CHECK_CHARSET_GET_CHARSET (val, charset);
10108           if (EQ (coding_type, Qiso_2022)
10109               ? CHARSET_ISO_FINAL (charset) < 0
10110               : EQ (coding_type, Qemacs_mule)
10111               ? CHARSET_EMACS_MULE_ID (charset) < 0
10112               : 0)
10113             error ("Can't handle charset `%s'",
10114                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10115
10116           XSETCAR (tail, make_number (charset->id));
10117           if (max_charset_id < charset->id)
10118             max_charset_id = charset->id;
10119         }
10120     }
10121   ASET (attrs, coding_attr_charset_list, charset_list);
10122
10123   safe_charsets = make_uninit_string (max_charset_id + 1);
10124   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10125   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10126     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10127   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10128
10129   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10130
10131   val = args[coding_arg_decode_translation_table];
10132   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10133     CHECK_SYMBOL (val);
10134   ASET (attrs, coding_attr_decode_tbl, val);
10135
10136   val = args[coding_arg_encode_translation_table];
10137   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10138     CHECK_SYMBOL (val);
10139   ASET (attrs, coding_attr_encode_tbl, val);
10140
10141   val = args[coding_arg_post_read_conversion];
10142   CHECK_SYMBOL (val);
10143   ASET (attrs, coding_attr_post_read, val);
10144
10145   val = args[coding_arg_pre_write_conversion];
10146   CHECK_SYMBOL (val);
10147   ASET (attrs, coding_attr_pre_write, val);
10148
10149   val = args[coding_arg_default_char];
10150   if (NILP (val))
10151     ASET (attrs, coding_attr_default_char, make_number (' '));
10152   else
10153     {
10154       CHECK_CHARACTER (val);
10155       ASET (attrs, coding_attr_default_char, val);
10156     }
10157
10158   val = args[coding_arg_for_unibyte];
10159   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10160
10161   val = args[coding_arg_plist];
10162   CHECK_LIST (val);
10163   ASET (attrs, coding_attr_plist, val);
10164
10165   if (EQ (coding_type, Qcharset))
10166     {
10167       /* Generate a lisp vector of 256 elements.  Each element is nil,
10168          integer, or a list of charset IDs.
10169
10170          If Nth element is nil, the byte code N is invalid in this
10171          coding system.
10172
10173          If Nth element is a number NUM, N is the first byte of a
10174          charset whose ID is NUM.
10175
10176          If Nth element is a list of charset IDs, N is the first byte
10177          of one of them.  The list is sorted by dimensions of the
10178          charsets.  A charset of smaller dimension comes first. */
10179       val = Fmake_vector (make_number (256), Qnil);
10180
10181       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10182         {
10183           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10184           int dim = CHARSET_DIMENSION (charset);
10185           int idx = (dim - 1) * 4;
10186
10187           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10188             ASET (attrs, coding_attr_ascii_compat, Qt);
10189
10190           for (i = charset->code_space[idx];
10191                i <= charset->code_space[idx + 1]; i++)
10192             {
10193               Lisp_Object tmp, tmp2;
10194               int dim2;
10195
10196               tmp = AREF (val, i);
10197               if (NILP (tmp))
10198                 tmp = XCAR (tail);
10199               else if (NUMBERP (tmp))
10200                 {
10201                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10202                   if (dim < dim2)
10203                     tmp = list2 (XCAR (tail), tmp);
10204                   else
10205                     tmp = list2 (tmp, XCAR (tail));
10206                 }
10207               else
10208                 {
10209                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10210                     {
10211                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10212                       if (dim < dim2)
10213                         break;
10214                     }
10215                   if (NILP (tmp2))
10216                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10217                   else
10218                     {
10219                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10220                       XSETCAR (tmp2, XCAR (tail));
10221                     }
10222                 }
10223               ASET (val, i, tmp);
10224             }
10225         }
10226       ASET (attrs, coding_attr_charset_valids, val);
10227       category = coding_category_charset;
10228     }
10229   else if (EQ (coding_type, Qccl))
10230     {
10231       Lisp_Object valids;
10232
10233       if (nargs < coding_arg_ccl_max)
10234         goto short_args;
10235
10236       val = args[coding_arg_ccl_decoder];
10237       CHECK_CCL_PROGRAM (val);
10238       if (VECTORP (val))
10239         val = Fcopy_sequence (val);
10240       ASET (attrs, coding_attr_ccl_decoder, val);
10241
10242       val = args[coding_arg_ccl_encoder];
10243       CHECK_CCL_PROGRAM (val);
10244       if (VECTORP (val))
10245         val = Fcopy_sequence (val);
10246       ASET (attrs, coding_attr_ccl_encoder, val);
10247
10248       val = args[coding_arg_ccl_valids];
10249       valids = Fmake_string (make_number (256), make_number (0));
10250       for (tail = val; CONSP (tail); tail = XCDR (tail))
10251         {
10252           int from, to;
10253
10254           val = XCAR (tail);
10255           if (INTEGERP (val))
10256             {
10257               if (! (0 <= XINT (val) && XINT (val) <= 255))
10258                 args_out_of_range_3 (val, make_number (0), make_number (255));
10259               from = to = XINT (val);
10260             }
10261           else
10262             {
10263               CHECK_CONS (val);
10264               CHECK_NATNUM_CAR (val);
10265               CHECK_NUMBER_CDR (val);
10266               if (XINT (XCAR (val)) > 255)
10267                 args_out_of_range_3 (XCAR (val),
10268                                      make_number (0), make_number (255));
10269               from = XINT (XCAR (val));
10270               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10271                 args_out_of_range_3 (XCDR (val),
10272                                      XCAR (val), make_number (255));
10273               to = XINT (XCDR (val));
10274             }
10275           for (i = from; i <= to; i++)
10276             SSET (valids, i, 1);
10277         }
10278       ASET (attrs, coding_attr_ccl_valids, valids);
10279
10280       category = coding_category_ccl;
10281     }
10282   else if (EQ (coding_type, Qutf_16))
10283     {
10284       Lisp_Object bom, endian;
10285
10286       ASET (attrs, coding_attr_ascii_compat, Qnil);
10287
10288       if (nargs < coding_arg_utf16_max)
10289         goto short_args;
10290
10291       bom = args[coding_arg_utf16_bom];
10292       if (! NILP (bom) && ! EQ (bom, Qt))
10293         {
10294           CHECK_CONS (bom);
10295           val = XCAR (bom);
10296           CHECK_CODING_SYSTEM (val);
10297           val = XCDR (bom);
10298           CHECK_CODING_SYSTEM (val);
10299         }
10300       ASET (attrs, coding_attr_utf_bom, bom);
10301
10302       endian = args[coding_arg_utf16_endian];
10303       CHECK_SYMBOL (endian);
10304       if (NILP (endian))
10305         endian = Qbig;
10306       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10307         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10308       ASET (attrs, coding_attr_utf_16_endian, endian);
10309
10310       category = (CONSP (bom)
10311                   ? coding_category_utf_16_auto
10312                   : NILP (bom)
10313                   ? (EQ (endian, Qbig)
10314                      ? coding_category_utf_16_be_nosig
10315                      : coding_category_utf_16_le_nosig)
10316                   : (EQ (endian, Qbig)
10317                      ? coding_category_utf_16_be
10318                      : coding_category_utf_16_le));
10319     }
10320   else if (EQ (coding_type, Qiso_2022))
10321     {
10322       Lisp_Object initial, reg_usage, request, flags;
10323
10324       if (nargs < coding_arg_iso2022_max)
10325         goto short_args;
10326
10327       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10328       CHECK_VECTOR (initial);
10329       for (i = 0; i < 4; i++)
10330         {
10331           val = AREF (initial, i);
10332           if (! NILP (val))
10333             {
10334               struct charset *charset;
10335
10336               CHECK_CHARSET_GET_CHARSET (val, charset);
10337               ASET (initial, i, make_number (CHARSET_ID (charset)));
10338               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10339                 ASET (attrs, coding_attr_ascii_compat, Qt);
10340             }
10341           else
10342             ASET (initial, i, make_number (-1));
10343         }
10344
10345       reg_usage = args[coding_arg_iso2022_reg_usage];
10346       CHECK_CONS (reg_usage);
10347       CHECK_NUMBER_CAR (reg_usage);
10348       CHECK_NUMBER_CDR (reg_usage);
10349
10350       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10351       for (tail = request; CONSP (tail); tail = XCDR (tail))
10352         {
10353           int id;
10354           Lisp_Object tmp1;
10355
10356           val = XCAR (tail);
10357           CHECK_CONS (val);
10358           tmp1 = XCAR (val);
10359           CHECK_CHARSET_GET_ID (tmp1, id);
10360           CHECK_NATNUM_CDR (val);
10361           if (XINT (XCDR (val)) >= 4)
10362             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10363           XSETCAR (val, make_number (id));
10364         }
10365
10366       flags = args[coding_arg_iso2022_flags];
10367       CHECK_NATNUM (flags);
10368       i = XINT (flags) & INT_MAX;
10369       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10370         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10371       flags = make_number (i);
10372
10373       ASET (attrs, coding_attr_iso_initial, initial);
10374       ASET (attrs, coding_attr_iso_usage, reg_usage);
10375       ASET (attrs, coding_attr_iso_request, request);
10376       ASET (attrs, coding_attr_iso_flags, flags);
10377       setup_iso_safe_charsets (attrs);
10378
10379       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10380         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10381                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10382                     ? coding_category_iso_7_else
10383                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10384                     ? coding_category_iso_7
10385                     : coding_category_iso_7_tight);
10386       else
10387         {
10388           int id = XINT (AREF (initial, 1));
10389
10390           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10391                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10392                        || id < 0)
10393                       ? coding_category_iso_8_else
10394                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10395                       ? coding_category_iso_8_1
10396                       : coding_category_iso_8_2);
10397         }
10398       if (category != coding_category_iso_8_1
10399           && category != coding_category_iso_8_2)
10400         ASET (attrs, coding_attr_ascii_compat, Qnil);
10401     }
10402   else if (EQ (coding_type, Qemacs_mule))
10403     {
10404       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10405         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10406       ASET (attrs, coding_attr_ascii_compat, Qt);
10407       category = coding_category_emacs_mule;
10408     }
10409   else if (EQ (coding_type, Qshift_jis))
10410     {
10411
10412       struct charset *charset;
10413
10414       if (XINT (Flength (charset_list)) != 3
10415           && XINT (Flength (charset_list)) != 4)
10416         error ("There should be three or four charsets");
10417
10418       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10419       if (CHARSET_DIMENSION (charset) != 1)
10420         error ("Dimension of charset %s is not one",
10421                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10422       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10423         ASET (attrs, coding_attr_ascii_compat, Qt);
10424
10425       charset_list = XCDR (charset_list);
10426       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10427       if (CHARSET_DIMENSION (charset) != 1)
10428         error ("Dimension of charset %s is not one",
10429                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10430
10431       charset_list = XCDR (charset_list);
10432       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10433       if (CHARSET_DIMENSION (charset) != 2)
10434         error ("Dimension of charset %s is not two",
10435                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10436
10437       charset_list = XCDR (charset_list);
10438       if (! NILP (charset_list))
10439         {
10440           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10441           if (CHARSET_DIMENSION (charset) != 2)
10442             error ("Dimension of charset %s is not two",
10443                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10444         }
10445
10446       category = coding_category_sjis;
10447       Vsjis_coding_system = name;
10448     }
10449   else if (EQ (coding_type, Qbig5))
10450     {
10451       struct charset *charset;
10452
10453       if (XINT (Flength (charset_list)) != 2)
10454         error ("There should be just two charsets");
10455
10456       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10457       if (CHARSET_DIMENSION (charset) != 1)
10458         error ("Dimension of charset %s is not one",
10459                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10460       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10461         ASET (attrs, coding_attr_ascii_compat, Qt);
10462
10463       charset_list = XCDR (charset_list);
10464       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10465       if (CHARSET_DIMENSION (charset) != 2)
10466         error ("Dimension of charset %s is not two",
10467                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10468
10469       category = coding_category_big5;
10470       Vbig5_coding_system = name;
10471     }
10472   else if (EQ (coding_type, Qraw_text))
10473     {
10474       category = coding_category_raw_text;
10475       ASET (attrs, coding_attr_ascii_compat, Qt);
10476     }
10477   else if (EQ (coding_type, Qutf_8))
10478     {
10479       Lisp_Object bom;
10480
10481       if (nargs < coding_arg_utf8_max)
10482         goto short_args;
10483
10484       bom = args[coding_arg_utf8_bom];
10485       if (! NILP (bom) && ! EQ (bom, Qt))
10486         {
10487           CHECK_CONS (bom);
10488           val = XCAR (bom);
10489           CHECK_CODING_SYSTEM (val);
10490           val = XCDR (bom);
10491           CHECK_CODING_SYSTEM (val);
10492         }
10493       ASET (attrs, coding_attr_utf_bom, bom);
10494       if (NILP (bom))
10495         ASET (attrs, coding_attr_ascii_compat, Qt);
10496
10497       category = (CONSP (bom) ? coding_category_utf_8_auto
10498                   : NILP (bom) ? coding_category_utf_8_nosig
10499                   : coding_category_utf_8_sig);
10500     }
10501   else if (EQ (coding_type, Qundecided))
10502     {
10503       if (nargs < coding_arg_undecided_max)
10504         goto short_args;
10505       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10506             args[coding_arg_undecided_inhibit_null_byte_detection]);
10507       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10508             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10509       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10510             args[coding_arg_undecided_prefer_utf_8]);
10511       category = coding_category_undecided;
10512     }
10513   else
10514     error ("Invalid coding system type: %s",
10515            SDATA (SYMBOL_NAME (coding_type)));
10516
10517   ASET (attrs, coding_attr_category, make_number (category));
10518   ASET (attrs, coding_attr_plist,
10519         Fcons (QCcategory,
10520                Fcons (AREF (Vcoding_category_table, category),
10521                       CODING_ATTR_PLIST (attrs))));
10522   ASET (attrs, coding_attr_plist,
10523         Fcons (QCascii_compatible_p,
10524                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10525                       CODING_ATTR_PLIST (attrs))));
10526
10527   eol_type = args[coding_arg_eol_type];
10528   if (! NILP (eol_type)
10529       && ! EQ (eol_type, Qunix)
10530       && ! EQ (eol_type, Qdos)
10531       && ! EQ (eol_type, Qmac))
10532     error ("Invalid eol-type");
10533
10534   aliases = list1 (name);
10535
10536   if (NILP (eol_type))
10537     {
10538       eol_type = make_subsidiaries (name);
10539       for (i = 0; i < 3; i++)
10540         {
10541           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10542
10543           this_name = AREF (eol_type, i);
10544           this_aliases = list1 (this_name);
10545           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10546           this_spec = make_uninit_vector (3);
10547           ASET (this_spec, 0, attrs);
10548           ASET (this_spec, 1, this_aliases);
10549           ASET (this_spec, 2, this_eol_type);
10550           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10551           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10552           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10553           if (NILP (val))
10554             Vcoding_system_alist
10555               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10556                        Vcoding_system_alist);
10557         }
10558     }
10559
10560   spec_vec = make_uninit_vector (3);
10561   ASET (spec_vec, 0, attrs);
10562   ASET (spec_vec, 1, aliases);
10563   ASET (spec_vec, 2, eol_type);
10564
10565   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10566   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10567   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10568   if (NILP (val))
10569     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10570                                   Vcoding_system_alist);
10571
10572   {
10573     int id = coding_categories[category].id;
10574
10575     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10576       setup_coding_system (name, &coding_categories[category]);
10577   }
10578
10579   return Qnil;
10580
10581  short_args:
10582   return Fsignal (Qwrong_number_of_arguments,
10583                   Fcons (intern ("define-coding-system-internal"),
10584                          make_number (nargs)));
10585 }
10586
10587
10588 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10589        3, 3, 0,
10590        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10591   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10592 {
10593   Lisp_Object spec, attrs;
10594
10595   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10596   attrs = AREF (spec, 0);
10597   if (EQ (prop, QCmnemonic))
10598     {
10599       if (! STRINGP (val))
10600         CHECK_CHARACTER (val);
10601       ASET (attrs, coding_attr_mnemonic, val);
10602     }
10603   else if (EQ (prop, QCdefault_char))
10604     {
10605       if (NILP (val))
10606         val = make_number (' ');
10607       else
10608         CHECK_CHARACTER (val);
10609       ASET (attrs, coding_attr_default_char, val);
10610     }
10611   else if (EQ (prop, QCdecode_translation_table))
10612     {
10613       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10614         CHECK_SYMBOL (val);
10615       ASET (attrs, coding_attr_decode_tbl, val);
10616     }
10617   else if (EQ (prop, QCencode_translation_table))
10618     {
10619       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10620         CHECK_SYMBOL (val);
10621       ASET (attrs, coding_attr_encode_tbl, val);
10622     }
10623   else if (EQ (prop, QCpost_read_conversion))
10624     {
10625       CHECK_SYMBOL (val);
10626       ASET (attrs, coding_attr_post_read, val);
10627     }
10628   else if (EQ (prop, QCpre_write_conversion))
10629     {
10630       CHECK_SYMBOL (val);
10631       ASET (attrs, coding_attr_pre_write, val);
10632     }
10633   else if (EQ (prop, QCascii_compatible_p))
10634     {
10635       ASET (attrs, coding_attr_ascii_compat, val);
10636     }
10637
10638   ASET (attrs, coding_attr_plist,
10639         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10640   return val;
10641 }
10642
10643
10644 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10645        Sdefine_coding_system_alias, 2, 2, 0,
10646        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10647   (Lisp_Object alias, Lisp_Object coding_system)
10648 {
10649   Lisp_Object spec, aliases, eol_type, val;
10650
10651   CHECK_SYMBOL (alias);
10652   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10653   aliases = AREF (spec, 1);
10654   /* ALIASES should be a list of length more than zero, and the first
10655      element is a base coding system.  Append ALIAS at the tail of the
10656      list.  */
10657   while (!NILP (XCDR (aliases)))
10658     aliases = XCDR (aliases);
10659   XSETCDR (aliases, list1 (alias));
10660
10661   eol_type = AREF (spec, 2);
10662   if (VECTORP (eol_type))
10663     {
10664       Lisp_Object subsidiaries;
10665       int i;
10666
10667       subsidiaries = make_subsidiaries (alias);
10668       for (i = 0; i < 3; i++)
10669         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10670                                      AREF (eol_type, i));
10671     }
10672
10673   Fputhash (alias, spec, Vcoding_system_hash_table);
10674   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10675   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10676   if (NILP (val))
10677     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10678                                   Vcoding_system_alist);
10679
10680   return Qnil;
10681 }
10682
10683 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10684        1, 1, 0,
10685        doc: /* Return the base of CODING-SYSTEM.
10686 Any alias or subsidiary coding system is not a base coding system.  */)
10687   (Lisp_Object coding_system)
10688 {
10689   Lisp_Object spec, attrs;
10690
10691   if (NILP (coding_system))
10692     return (Qno_conversion);
10693   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10694   attrs = AREF (spec, 0);
10695   return CODING_ATTR_BASE_NAME (attrs);
10696 }
10697
10698 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10699        1, 1, 0,
10700        doc: "Return the property list of CODING-SYSTEM.")
10701   (Lisp_Object coding_system)
10702 {
10703   Lisp_Object spec, attrs;
10704
10705   if (NILP (coding_system))
10706     coding_system = Qno_conversion;
10707   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10708   attrs = AREF (spec, 0);
10709   return CODING_ATTR_PLIST (attrs);
10710 }
10711
10712
10713 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10714        1, 1, 0,
10715        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10716   (Lisp_Object coding_system)
10717 {
10718   Lisp_Object spec;
10719
10720   if (NILP (coding_system))
10721     coding_system = Qno_conversion;
10722   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10723   return AREF (spec, 1);
10724 }
10725
10726 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10727        Scoding_system_eol_type, 1, 1, 0,
10728        doc: /* Return eol-type of CODING-SYSTEM.
10729 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10730
10731 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10732 and CR respectively.
10733
10734 A vector value indicates that a format of end-of-line should be
10735 detected automatically.  Nth element of the vector is the subsidiary
10736 coding system whose eol-type is N.  */)
10737   (Lisp_Object coding_system)
10738 {
10739   Lisp_Object spec, eol_type;
10740   int n;
10741
10742   if (NILP (coding_system))
10743     coding_system = Qno_conversion;
10744   if (! CODING_SYSTEM_P (coding_system))
10745     return Qnil;
10746   spec = CODING_SYSTEM_SPEC (coding_system);
10747   eol_type = AREF (spec, 2);
10748   if (VECTORP (eol_type))
10749     return Fcopy_sequence (eol_type);
10750   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10751   return make_number (n);
10752 }
10753
10754 #endif /* emacs */
10755
10756 \f
10757 /*** 12. Post-amble ***/
10758
10759 void
10760 init_coding_once (void)
10761 {
10762   int i;
10763
10764   for (i = 0; i < coding_category_max; i++)
10765     {
10766       coding_categories[i].id = -1;
10767       coding_priorities[i] = i;
10768     }
10769
10770   /* ISO2022 specific initialize routine.  */
10771   for (i = 0; i < 0x20; i++)
10772     iso_code_class[i] = ISO_control_0;
10773   for (i = 0x21; i < 0x7F; i++)
10774     iso_code_class[i] = ISO_graphic_plane_0;
10775   for (i = 0x80; i < 0xA0; i++)
10776     iso_code_class[i] = ISO_control_1;
10777   for (i = 0xA1; i < 0xFF; i++)
10778     iso_code_class[i] = ISO_graphic_plane_1;
10779   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10780   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10781   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10782   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10783   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10784   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10785   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10786   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10787   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10788
10789   for (i = 0; i < 256; i++)
10790     {
10791       emacs_mule_bytes[i] = 1;
10792     }
10793   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10794   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10795   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10796   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10797 }
10798
10799 #ifdef emacs
10800
10801 void
10802 syms_of_coding (void)
10803 {
10804   staticpro (&Vcoding_system_hash_table);
10805   {
10806     Lisp_Object args[2];
10807     args[0] = QCtest;
10808     args[1] = Qeq;
10809     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10810   }
10811
10812   staticpro (&Vsjis_coding_system);
10813   Vsjis_coding_system = Qnil;
10814
10815   staticpro (&Vbig5_coding_system);
10816   Vbig5_coding_system = Qnil;
10817
10818   staticpro (&Vcode_conversion_reused_workbuf);
10819   Vcode_conversion_reused_workbuf = Qnil;
10820
10821   staticpro (&Vcode_conversion_workbuf_name);
10822   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10823
10824   reused_workbuf_in_use = 0;
10825
10826   DEFSYM (Qcharset, "charset");
10827   DEFSYM (Qtarget_idx, "target-idx");
10828   DEFSYM (Qcoding_system_history, "coding-system-history");
10829   Fset (Qcoding_system_history, Qnil);
10830
10831   /* Target FILENAME is the first argument.  */
10832   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10833   /* Target FILENAME is the third argument.  */
10834   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10835
10836   DEFSYM (Qcall_process, "call-process");
10837   /* Target PROGRAM is the first argument.  */
10838   Fput (Qcall_process, Qtarget_idx, make_number (0));
10839
10840   DEFSYM (Qcall_process_region, "call-process-region");
10841   /* Target PROGRAM is the third argument.  */
10842   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10843
10844   DEFSYM (Qstart_process, "start-process");
10845   /* Target PROGRAM is the third argument.  */
10846   Fput (Qstart_process, Qtarget_idx, make_number (2));
10847
10848   DEFSYM (Qopen_network_stream, "open-network-stream");
10849   /* Target SERVICE is the fourth argument.  */
10850   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10851
10852   DEFSYM (Qcoding_system, "coding-system");
10853   DEFSYM (Qcoding_aliases, "coding-aliases");
10854
10855   DEFSYM (Qeol_type, "eol-type");
10856   DEFSYM (Qunix, "unix");
10857   DEFSYM (Qdos, "dos");
10858   DEFSYM (Qmac, "mac");
10859
10860   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10861   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10862   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10863   DEFSYM (Qdefault_char, "default-char");
10864   DEFSYM (Qundecided, "undecided");
10865   DEFSYM (Qno_conversion, "no-conversion");
10866   DEFSYM (Qraw_text, "raw-text");
10867
10868   DEFSYM (Qiso_2022, "iso-2022");
10869
10870   DEFSYM (Qutf_8, "utf-8");
10871   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10872
10873 #if defined (WINDOWSNT) || defined (CYGWIN)
10874   /* No, not utf-16-le: that one has a BOM.  */
10875   DEFSYM (Qutf_16le, "utf-16le");
10876 #endif
10877
10878   DEFSYM (Qutf_16, "utf-16");
10879   DEFSYM (Qbig, "big");
10880   DEFSYM (Qlittle, "little");
10881
10882   DEFSYM (Qshift_jis, "shift-jis");
10883   DEFSYM (Qbig5, "big5");
10884
10885   DEFSYM (Qcoding_system_p, "coding-system-p");
10886
10887   DEFSYM (Qcoding_system_error, "coding-system-error");
10888   Fput (Qcoding_system_error, Qerror_conditions,
10889         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10890   Fput (Qcoding_system_error, Qerror_message,
10891         build_pure_c_string ("Invalid coding system"));
10892
10893   DEFSYM (Qtranslation_table, "translation-table");
10894   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10895   DEFSYM (Qtranslation_table_id, "translation-table-id");
10896   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10897   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10898
10899   DEFSYM (Qvalid_codes, "valid-codes");
10900
10901   DEFSYM (Qemacs_mule, "emacs-mule");
10902
10903   DEFSYM (QCcategory, ":category");
10904   DEFSYM (QCmnemonic, ":mnemonic");
10905   DEFSYM (QCdefault_char, ":default-char");
10906   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10907   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10908   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10909   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10910   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10911
10912   Vcoding_category_table
10913     = Fmake_vector (make_number (coding_category_max), Qnil);
10914   staticpro (&Vcoding_category_table);
10915   /* Followings are target of code detection.  */
10916   ASET (Vcoding_category_table, coding_category_iso_7,
10917         intern_c_string ("coding-category-iso-7"));
10918   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10919         intern_c_string ("coding-category-iso-7-tight"));
10920   ASET (Vcoding_category_table, coding_category_iso_8_1,
10921         intern_c_string ("coding-category-iso-8-1"));
10922   ASET (Vcoding_category_table, coding_category_iso_8_2,
10923         intern_c_string ("coding-category-iso-8-2"));
10924   ASET (Vcoding_category_table, coding_category_iso_7_else,
10925         intern_c_string ("coding-category-iso-7-else"));
10926   ASET (Vcoding_category_table, coding_category_iso_8_else,
10927         intern_c_string ("coding-category-iso-8-else"));
10928   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10929         intern_c_string ("coding-category-utf-8-auto"));
10930   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10931         intern_c_string ("coding-category-utf-8"));
10932   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10933         intern_c_string ("coding-category-utf-8-sig"));
10934   ASET (Vcoding_category_table, coding_category_utf_16_be,
10935         intern_c_string ("coding-category-utf-16-be"));
10936   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10937         intern_c_string ("coding-category-utf-16-auto"));
10938   ASET (Vcoding_category_table, coding_category_utf_16_le,
10939         intern_c_string ("coding-category-utf-16-le"));
10940   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10941         intern_c_string ("coding-category-utf-16-be-nosig"));
10942   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10943         intern_c_string ("coding-category-utf-16-le-nosig"));
10944   ASET (Vcoding_category_table, coding_category_charset,
10945         intern_c_string ("coding-category-charset"));
10946   ASET (Vcoding_category_table, coding_category_sjis,
10947         intern_c_string ("coding-category-sjis"));
10948   ASET (Vcoding_category_table, coding_category_big5,
10949         intern_c_string ("coding-category-big5"));
10950   ASET (Vcoding_category_table, coding_category_ccl,
10951         intern_c_string ("coding-category-ccl"));
10952   ASET (Vcoding_category_table, coding_category_emacs_mule,
10953         intern_c_string ("coding-category-emacs-mule"));
10954   /* Followings are NOT target of code detection.  */
10955   ASET (Vcoding_category_table, coding_category_raw_text,
10956         intern_c_string ("coding-category-raw-text"));
10957   ASET (Vcoding_category_table, coding_category_undecided,
10958         intern_c_string ("coding-category-undecided"));
10959
10960   DEFSYM (Qinsufficient_source, "insufficient-source");
10961   DEFSYM (Qinvalid_source, "invalid-source");
10962   DEFSYM (Qinterrupted, "interrupted");
10963   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10964
10965   defsubr (&Scoding_system_p);
10966   defsubr (&Sread_coding_system);
10967   defsubr (&Sread_non_nil_coding_system);
10968   defsubr (&Scheck_coding_system);
10969   defsubr (&Sdetect_coding_region);
10970   defsubr (&Sdetect_coding_string);
10971   defsubr (&Sfind_coding_systems_region_internal);
10972   defsubr (&Sunencodable_char_position);
10973   defsubr (&Scheck_coding_systems_region);
10974   defsubr (&Sdecode_coding_region);
10975   defsubr (&Sencode_coding_region);
10976   defsubr (&Sdecode_coding_string);
10977   defsubr (&Sencode_coding_string);
10978   defsubr (&Sdecode_sjis_char);
10979   defsubr (&Sencode_sjis_char);
10980   defsubr (&Sdecode_big5_char);
10981   defsubr (&Sencode_big5_char);
10982   defsubr (&Sset_terminal_coding_system_internal);
10983   defsubr (&Sset_safe_terminal_coding_system_internal);
10984   defsubr (&Sterminal_coding_system);
10985   defsubr (&Sset_keyboard_coding_system_internal);
10986   defsubr (&Skeyboard_coding_system);
10987   defsubr (&Sfind_operation_coding_system);
10988   defsubr (&Sset_coding_system_priority);
10989   defsubr (&Sdefine_coding_system_internal);
10990   defsubr (&Sdefine_coding_system_alias);
10991   defsubr (&Scoding_system_put);
10992   defsubr (&Scoding_system_base);
10993   defsubr (&Scoding_system_plist);
10994   defsubr (&Scoding_system_aliases);
10995   defsubr (&Scoding_system_eol_type);
10996   defsubr (&Scoding_system_priority_list);
10997
10998   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10999                doc: /* List of coding systems.
11000
11001 Do not alter the value of this variable manually.  This variable should be
11002 updated by the functions `define-coding-system' and
11003 `define-coding-system-alias'.  */);
11004   Vcoding_system_list = Qnil;
11005
11006   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11007                doc: /* Alist of coding system names.
11008 Each element is one element list of coding system name.
11009 This variable is given to `completing-read' as COLLECTION argument.
11010
11011 Do not alter the value of this variable manually.  This variable should be
11012 updated by the functions `make-coding-system' and
11013 `define-coding-system-alias'.  */);
11014   Vcoding_system_alist = Qnil;
11015
11016   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11017                doc: /* List of coding-categories (symbols) ordered by priority.
11018
11019 On detecting a coding system, Emacs tries code detection algorithms
11020 associated with each coding-category one by one in this order.  When
11021 one algorithm agrees with a byte sequence of source text, the coding
11022 system bound to the corresponding coding-category is selected.
11023
11024 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11025   {
11026     int i;
11027
11028     Vcoding_category_list = Qnil;
11029     for (i = coding_category_max - 1; i >= 0; i--)
11030       Vcoding_category_list
11031         = Fcons (AREF (Vcoding_category_table, i),
11032                  Vcoding_category_list);
11033   }
11034
11035   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11036                doc: /* Specify the coding system for read operations.
11037 It is useful to bind this variable with `let', but do not set it globally.
11038 If the value is a coding system, it is used for decoding on read operation.
11039 If not, an appropriate element is used from one of the coding system alists.
11040 There are three such tables: `file-coding-system-alist',
11041 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11042   Vcoding_system_for_read = Qnil;
11043
11044   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11045                doc: /* Specify the coding system for write operations.
11046 Programs bind this variable with `let', but you should not set it globally.
11047 If the value is a coding system, it is used for encoding of output,
11048 when writing it to a file and when sending it to a file or subprocess.
11049
11050 If this does not specify a coding system, an appropriate element
11051 is used from one of the coding system alists.
11052 There are three such tables: `file-coding-system-alist',
11053 `process-coding-system-alist', and `network-coding-system-alist'.
11054 For output to files, if the above procedure does not specify a coding system,
11055 the value of `buffer-file-coding-system' is used.  */);
11056   Vcoding_system_for_write = Qnil;
11057
11058   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11059                doc: /*
11060 Coding system used in the latest file or process I/O.  */);
11061   Vlast_coding_system_used = Qnil;
11062
11063   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11064                doc: /*
11065 Error status of the last code conversion.
11066
11067 When an error was detected in the last code conversion, this variable
11068 is set to one of the following symbols.
11069   `insufficient-source'
11070   `inconsistent-eol'
11071   `invalid-source'
11072   `interrupted'
11073   `insufficient-memory'
11074 When no error was detected, the value doesn't change.  So, to check
11075 the error status of a code conversion by this variable, you must
11076 explicitly set this variable to nil before performing code
11077 conversion.  */);
11078   Vlast_code_conversion_error = Qnil;
11079
11080   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11081                doc: /*
11082 *Non-nil means always inhibit code conversion of end-of-line format.
11083 See info node `Coding Systems' and info node `Text and Binary' concerning
11084 such conversion.  */);
11085   inhibit_eol_conversion = 0;
11086
11087   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11088                doc: /*
11089 Non-nil means process buffer inherits coding system of process output.
11090 Bind it to t if the process output is to be treated as if it were a file
11091 read from some filesystem.  */);
11092   inherit_process_coding_system = 0;
11093
11094   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11095                doc: /*
11096 Alist to decide a coding system to use for a file I/O operation.
11097 The format is ((PATTERN . VAL) ...),
11098 where PATTERN is a regular expression matching a file name,
11099 VAL is a coding system, a cons of coding systems, or a function symbol.
11100 If VAL is a coding system, it is used for both decoding and encoding
11101 the file contents.
11102 If VAL is a cons of coding systems, the car part is used for decoding,
11103 and the cdr part is used for encoding.
11104 If VAL is a function symbol, the function must return a coding system
11105 or a cons of coding systems which are used as above.  The function is
11106 called with an argument that is a list of the arguments with which
11107 `find-operation-coding-system' was called.  If the function can't decide
11108 a coding system, it can return `undecided' so that the normal
11109 code-detection is performed.
11110
11111 See also the function `find-operation-coding-system'
11112 and the variable `auto-coding-alist'.  */);
11113   Vfile_coding_system_alist = Qnil;
11114
11115   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11116                doc: /*
11117 Alist to decide a coding system to use for a process I/O operation.
11118 The format is ((PATTERN . VAL) ...),
11119 where PATTERN is a regular expression matching a program name,
11120 VAL is a coding system, a cons of coding systems, or a function symbol.
11121 If VAL is a coding system, it is used for both decoding what received
11122 from the program and encoding what sent to the program.
11123 If VAL is a cons of coding systems, the car part is used for decoding,
11124 and the cdr part is used for encoding.
11125 If VAL is a function symbol, the function must return a coding system
11126 or a cons of coding systems which are used as above.
11127
11128 See also the function `find-operation-coding-system'.  */);
11129   Vprocess_coding_system_alist = Qnil;
11130
11131   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11132                doc: /*
11133 Alist to decide a coding system to use for a network I/O operation.
11134 The format is ((PATTERN . VAL) ...),
11135 where PATTERN is a regular expression matching a network service name
11136 or is a port number to connect to,
11137 VAL is a coding system, a cons of coding systems, or a function symbol.
11138 If VAL is a coding system, it is used for both decoding what received
11139 from the network stream and encoding what sent to the network stream.
11140 If VAL is a cons of coding systems, the car part is used for decoding,
11141 and the cdr part is used for encoding.
11142 If VAL is a function symbol, the function must return a coding system
11143 or a cons of coding systems which are used as above.
11144
11145 See also the function `find-operation-coding-system'.  */);
11146   Vnetwork_coding_system_alist = Qnil;
11147
11148   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11149                doc: /* Coding system to use with system messages.
11150 Also used for decoding keyboard input on X Window system.  */);
11151   Vlocale_coding_system = Qnil;
11152
11153   /* The eol mnemonics are reset in startup.el system-dependently.  */
11154   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11155                doc: /*
11156 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11157   eol_mnemonic_unix = build_pure_c_string (":");
11158
11159   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11160                doc: /*
11161 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11162   eol_mnemonic_dos = build_pure_c_string ("\\");
11163
11164   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11165                doc: /*
11166 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11167   eol_mnemonic_mac = build_pure_c_string ("/");
11168
11169   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11170                doc: /*
11171 *String displayed in mode line when end-of-line format is not yet determined.  */);
11172   eol_mnemonic_undecided = build_pure_c_string (":");
11173
11174   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11175                doc: /*
11176 *Non-nil enables character translation while encoding and decoding.  */);
11177   Venable_character_translation = Qt;
11178
11179   DEFVAR_LISP ("standard-translation-table-for-decode",
11180                Vstandard_translation_table_for_decode,
11181                doc: /* Table for translating characters while decoding.  */);
11182   Vstandard_translation_table_for_decode = Qnil;
11183
11184   DEFVAR_LISP ("standard-translation-table-for-encode",
11185                Vstandard_translation_table_for_encode,
11186                doc: /* Table for translating characters while encoding.  */);
11187   Vstandard_translation_table_for_encode = Qnil;
11188
11189   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11190                doc: /* Alist of charsets vs revision numbers.
11191 While encoding, if a charset (car part of an element) is found,
11192 designate it with the escape sequence identifying revision (cdr part
11193 of the element).  */);
11194   Vcharset_revision_table = Qnil;
11195
11196   DEFVAR_LISP ("default-process-coding-system",
11197                Vdefault_process_coding_system,
11198                doc: /* Cons of coding systems used for process I/O by default.
11199 The car part is used for decoding a process output,
11200 the cdr part is used for encoding a text to be sent to a process.  */);
11201   Vdefault_process_coding_system = Qnil;
11202
11203   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11204                doc: /*
11205 Table of extra Latin codes in the range 128..159 (inclusive).
11206 This is a vector of length 256.
11207 If Nth element is non-nil, the existence of code N in a file
11208 \(or output of subprocess) doesn't prevent it to be detected as
11209 a coding system of ISO 2022 variant which has a flag
11210 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11211 or reading output of a subprocess.
11212 Only 128th through 159th elements have a meaning.  */);
11213   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11214
11215   DEFVAR_LISP ("select-safe-coding-system-function",
11216                Vselect_safe_coding_system_function,
11217                doc: /*
11218 Function to call to select safe coding system for encoding a text.
11219
11220 If set, this function is called to force a user to select a proper
11221 coding system which can encode the text in the case that a default
11222 coding system used in each operation can't encode the text.  The
11223 function should take care that the buffer is not modified while
11224 the coding system is being selected.
11225
11226 The default value is `select-safe-coding-system' (which see).  */);
11227   Vselect_safe_coding_system_function = Qnil;
11228
11229   DEFVAR_BOOL ("coding-system-require-warning",
11230                coding_system_require_warning,
11231                doc: /* Internal use only.
11232 If non-nil, on writing a file, `select-safe-coding-system-function' is
11233 called even if `coding-system-for-write' is non-nil.  The command
11234 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11235   coding_system_require_warning = 0;
11236
11237
11238   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11239                inhibit_iso_escape_detection,
11240                doc: /*
11241 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11242
11243 When Emacs reads text, it tries to detect how the text is encoded.
11244 This code detection is sensitive to escape sequences.  If Emacs sees
11245 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11246 of the ISO2022 encodings, and decodes text by the corresponding coding
11247 system (e.g. `iso-2022-7bit').
11248
11249 However, there may be a case that you want to read escape sequences in
11250 a file as is.  In such a case, you can set this variable to non-nil.
11251 Then the code detection will ignore any escape sequences, and no text is
11252 detected as encoded in some ISO-2022 encoding.  The result is that all
11253 escape sequences become visible in a buffer.
11254
11255 The default value is nil, and it is strongly recommended not to change
11256 it.  That is because many Emacs Lisp source files that contain
11257 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11258 in Emacs's distribution, and they won't be decoded correctly on
11259 reading if you suppress escape sequence detection.
11260
11261 The other way to read escape sequences in a file without decoding is
11262 to explicitly specify some coding system that doesn't use ISO-2022
11263 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11264   inhibit_iso_escape_detection = 0;
11265
11266   DEFVAR_BOOL ("inhibit-null-byte-detection",
11267                inhibit_null_byte_detection,
11268                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11269 By default, Emacs treats it as binary data, and does not attempt to
11270 decode it.  The effect is as if you specified `no-conversion' for
11271 reading that text.
11272
11273 Set this to non-nil when a regular text happens to include null bytes.
11274 Examples are Index nodes of Info files and null-byte delimited output
11275 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11276 decode text as usual.  */);
11277   inhibit_null_byte_detection = 0;
11278
11279   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11280                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11281 Internal use only.  To be removed once the experimental optimizer is stable.  */);
11282   disable_ascii_optimization = 0;
11283
11284   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11285                doc: /* Char table for translating self-inserting characters.
11286 This is applied to the result of input methods, not their input.
11287 See also `keyboard-translate-table'.
11288
11289 Use of this variable for character code unification was rendered
11290 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11291 internal character representation.  */);
11292     Vtranslation_table_for_input = Qnil;
11293
11294   {
11295     Lisp_Object args[coding_arg_undecided_max];
11296     Lisp_Object plist[16];
11297     int i;
11298
11299     for (i = 0; i < coding_arg_undecided_max; i++)
11300       args[i] = Qnil;
11301
11302     plist[0] = intern_c_string (":name");
11303     plist[1] = args[coding_arg_name] = Qno_conversion;
11304     plist[2] = intern_c_string (":mnemonic");
11305     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11306     plist[4] = intern_c_string (":coding-type");
11307     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11308     plist[6] = intern_c_string (":ascii-compatible-p");
11309     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11310     plist[8] = intern_c_string (":default-char");
11311     plist[9] = args[coding_arg_default_char] = make_number (0);
11312     plist[10] = intern_c_string (":for-unibyte");
11313     plist[11] = args[coding_arg_for_unibyte] = Qt;
11314     plist[12] = intern_c_string (":docstring");
11315     plist[13] = build_pure_c_string ("Do no conversion.\n\
11316 \n\
11317 When you visit a file with this coding, the file is read into a\n\
11318 unibyte buffer as is, thus each byte of a file is treated as a\n\
11319 character.");
11320     plist[14] = intern_c_string (":eol-type");
11321     plist[15] = args[coding_arg_eol_type] = Qunix;
11322     args[coding_arg_plist] = Flist (16, plist);
11323     Fdefine_coding_system_internal (coding_arg_max, args);
11324
11325     plist[1] = args[coding_arg_name] = Qundecided;
11326     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11327     plist[5] = args[coding_arg_coding_type] = Qundecided;
11328     /* This is already set.
11329        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11330     plist[8] = intern_c_string (":charset-list");
11331     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11332     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11333     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11334     plist[15] = args[coding_arg_eol_type] = Qnil;
11335     args[coding_arg_plist] = Flist (16, plist);
11336     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11337     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11338     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11339   }
11340
11341   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11342
11343   {
11344     int i;
11345
11346     for (i = 0; i < coding_category_max; i++)
11347       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11348   }
11349 #if defined (DOS_NT)
11350   system_eol_type = Qdos;
11351 #else
11352   system_eol_type = Qunix;
11353 #endif
11354   staticpro (&system_eol_type);
11355 }
11356
11357 char *
11358 emacs_strerror (int error_number)
11359 {
11360   char *str;
11361
11362   synchronize_system_messages_locale ();
11363   str = strerror (error_number);
11364
11365   if (! NILP (Vlocale_coding_system))
11366     {
11367       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11368                                                       Vlocale_coding_system,
11369                                                       0);
11370       str = SSDATA (dec);
11371     }
11372
11373   return str;
11374 }
11375
11376 #endif /* emacs */