code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section */
 503 #define CODING_UTF_8_BOM(coding)        \
 504   ((coding)->spec.utf_8_bom)
 505
 506 /* UTF-16 section */
 507 #define CODING_UTF_16_BOM(coding)       \
 508   ((coding)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(coding)    \
 511   ((coding)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(coding) \
 514   ((coding)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section */
 518 #define CODING_CCL_DECODER(coding)      \
 519   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 522 #define CODING_CCL_VALIDS(coding)                                          \
 523   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories' */
 526
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575
 576 /* This value is returned if detect_coding_mask () find nothing other
 577    than ASCII characters.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO_7                  \
 580    | CATEGORY_MASK_ISO_7_TIGHT          \
 581    | CATEGORY_MASK_ISO_8_1              \
 582    | CATEGORY_MASK_ISO_8_2              \
 583    | CATEGORY_MASK_ISO_7_ELSE           \
 584    | CATEGORY_MASK_ISO_8_ELSE           \
 585    | CATEGORY_MASK_UTF_8_AUTO           \
 586    | CATEGORY_MASK_UTF_8_NOSIG          \
 587    | CATEGORY_MASK_UTF_8_SIG            \
 588    | CATEGORY_MASK_UTF_16_AUTO          \
 589    | CATEGORY_MASK_UTF_16_BE            \
 590    | CATEGORY_MASK_UTF_16_LE            \
 591    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 592    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 593    | CATEGORY_MASK_CHARSET              \
 594    | CATEGORY_MASK_SJIS                 \
 595    | CATEGORY_MASK_BIG5                 \
 596    | CATEGORY_MASK_CCL                  \
 597    | CATEGORY_MASK_EMACS_MULE)
 598
 599
 600 #define CATEGORY_MASK_ISO_7BIT \
 601   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT \
 604   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
 609 #define CATEGORY_MASK_ISO_ESCAPE        \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_7_ELSE           \
 613    | CATEGORY_MASK_ISO_8_ELSE)
 614
 615 #define CATEGORY_MASK_ISO       \
 616   (  CATEGORY_MASK_ISO_7BIT     \
 617      | CATEGORY_MASK_ISO_8BIT   \
 618      | CATEGORY_MASK_ISO_ELSE)
 619
 620 #define CATEGORY_MASK_UTF_16            \
 621   (CATEGORY_MASK_UTF_16_AUTO            \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
 627 #define CATEGORY_MASK_UTF_8     \
 628   (CATEGORY_MASK_UTF_8_AUTO     \
 629    | CATEGORY_MASK_UTF_8_NOSIG  \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding-categories ordered by priority.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category.  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644
 645 #ifndef min
 646 #define min(a, b) ((a) < (b) ? (a) : (b))
 647 #endif
 648 #ifndef max
 649 #define max(a, b) ((a) > (b) ? (a) : (b))
 650 #endif
 651
 652 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 653   do {                                                  \
 654     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 655     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 656   } while (0)
 657
 658 static void
 659 CHECK_NATNUM_CAR (Lisp_Object x)
 660 {
 661   Lisp_Object tmp = XCAR (x);
 662   CHECK_NATNUM (tmp);
 663   XSETCAR (x, tmp);
 664 }
 665
 666 static void
 667 CHECK_NATNUM_CDR (Lisp_Object x)
 668 {
 669   Lisp_Object tmp = XCDR (x);
 670   CHECK_NATNUM (tmp);
 671   XSETCDR (x, tmp);
 672 }
 673
 674
 675 /* Safely get one byte from the source text pointed by SRC which ends
 676    at SRC_END, and set C to that byte.  If there are not enough bytes
 677    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 678    and a multibyte character is found at SRC, set C to the
 679    negative value of the character code.  The caller should declare
 680    and set these variables appropriately in advance:
 681         src, src_end, multibytep */
 682
 683 #define ONE_MORE_BYTE(c)                                \
 684   do {                                                  \
 685     if (src == src_end)                                 \
 686       {                                                 \
 687         if (src_base < src)                             \
 688           record_conversion_result                      \
 689             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 690         goto no_more_source;                            \
 691       }                                                 \
 692     c = *src++;                                         \
 693     if (multibytep && (c & 0x80))                       \
 694       {                                                 \
 695         if ((c & 0xFE) == 0xC0)                         \
 696           c = ((c & 1) << 6) | *src++;                  \
 697         else                                            \
 698           {                                             \
 699             src--;                                      \
 700             c = - string_char (src, &src, NULL);        \
 701             record_conversion_result                    \
 702               (coding, CODING_RESULT_INVALID_SRC);      \
 703           }                                             \
 704       }                                                 \
 705     consumed_chars++;                                   \
 706   } while (0)
 707
 708 /* Safely get two bytes from the source text pointed by SRC which ends
 709    at SRC_END, and set C1 and C2 to those bytes while skipping the
 710    heading multibyte characters.  If there are not enough bytes in the
 711    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 712    a multibyte character is found for C2, set C2 to the negative value
 713    of the character code.  The caller should declare and set these
 714    variables appropriately in advance:
 715         src, src_end, multibytep
 716    It is intended that this macro is used in detect_coding_utf_16.  */
 717
 718 #define TWO_MORE_BYTES(c1, c2)                          \
 719   do {                                                  \
 720     do {                                                \
 721       if (src == src_end)                               \
 722         goto no_more_source;                            \
 723       c1 = *src++;                                      \
 724       if (multibytep && (c1 & 0x80))                    \
 725         {                                               \
 726           if ((c1 & 0xFE) == 0xC0)                      \
 727             c1 = ((c1 & 1) << 6) | *src++;              \
 728           else                                          \
 729             {                                           \
 730               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 731               c1 = -1;                                  \
 732             }                                           \
 733         }                                               \
 734     } while (c1 < 0);                                   \
 735     if (src == src_end)                                 \
 736       goto no_more_source;                              \
 737     c2 = *src++;                                        \
 738     if (multibytep && (c2 & 0x80))                      \
 739       {                                                 \
 740         if ((c2 & 0xFE) == 0xC0)                        \
 741           c2 = ((c2 & 1) << 6) | *src++;                \
 742         else                                            \
 743           c2 = -1;                                      \
 744       }                                                 \
 745   } while (0)
 746
 747
 748 /* Store a byte C in the place pointed by DST and increment DST to the
 749    next free point, and increment PRODUCED_CHARS.  The caller should
 750    assure that C is 0..127, and declare and set the variable `dst'
 751    appropriately in advance.
 752 */
 753
 754
 755 #define EMIT_ONE_ASCII_BYTE(c)  \
 756   do {                          \
 757     produced_chars++;           \
 758     *dst++ = (c);               \
 759   } while (0)
 760
 761
 762 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 763
 764 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 765   do {                                  \
 766     produced_chars += 2;                \
 767     *dst++ = (c1), *dst++ = (c2);       \
 768   } while (0)
 769
 770
 771 /* Store a byte C in the place pointed by DST and increment DST to the
 772    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 773    store in an appropriate multibyte form.  The caller should
 774    declare and set the variables `dst' and `multibytep' appropriately
 775    in advance.  */
 776
 777 #define EMIT_ONE_BYTE(c)                \
 778   do {                                  \
 779     produced_chars++;                   \
 780     if (multibytep)                     \
 781       {                                 \
 782         unsigned ch = (c);              \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       *dst++ = (c);                     \
 789   } while (0)
 790
 791
 792 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 793
 794 #define EMIT_TWO_BYTES(c1, c2)          \
 795   do {                                  \
 796     produced_chars += 2;                \
 797     if (multibytep)                     \
 798       {                                 \
 799         unsigned ch;                    \
 800                                         \
 801         ch = (c1);                      \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805         ch = (c2);                      \
 806         if (ch >= 0x80)                 \
 807           ch = BYTE8_TO_CHAR (ch);      \
 808         CHAR_STRING_ADVANCE (ch, dst);  \
 809       }                                 \
 810     else                                \
 811       {                                 \
 812         *dst++ = (c1);                  \
 813         *dst++ = (c2);                  \
 814       }                                 \
 815   } while (0)
 816
 817
 818 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 819   do {                                  \
 820     EMIT_ONE_BYTE (c1);                 \
 821     EMIT_TWO_BYTES (c2, c3);            \
 822   } while (0)
 823
 824
 825 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 826   do {                                          \
 827     EMIT_TWO_BYTES (c1, c2);                    \
 828     EMIT_TWO_BYTES (c3, c4);                    \
 829   } while (0)
 830
 831
 832 static void
 833 record_conversion_result (struct coding_system *coding,
 834                           enum coding_result_code result)
 835 {
 836   coding->result = result;
 837   switch (result)
 838     {
 839     case CODING_RESULT_INSUFFICIENT_SRC:
 840       Vlast_code_conversion_error = Qinsufficient_source;
 841       break;
 842     case CODING_RESULT_INVALID_SRC:
 843       Vlast_code_conversion_error = Qinvalid_source;
 844       break;
 845     case CODING_RESULT_INTERRUPT:
 846       Vlast_code_conversion_error = Qinterrupted;
 847       break;
 848     case CODING_RESULT_INSUFFICIENT_DST:
 849       /* Don't record this error in Vlast_code_conversion_error
 850          because it happens just temporarily and is resolved when the
 851          whole conversion is finished.  */
 852       break;
 853     case CODING_RESULT_SUCCESS:
 854       break;
 855     default:
 856       Vlast_code_conversion_error = intern ("Unknown error");
 857     }
 858 }
 859
 860 /* These wrapper macros are used to preserve validity of pointers into
 861    buffer text across calls to decode_char, encode_char, etc, which
 862    could cause relocation of buffers if it loads a charset map,
 863    because loading a charset map allocates large structures.  */
 864
 865 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 866   do {                                                                       \
 867     ptrdiff_t offset;                                                        \
 868                                                                              \
 869     charset_map_loaded = 0;                                                  \
 870     c = DECODE_CHAR (charset, code);                                         \
 871     if (charset_map_loaded                                                   \
 872         && (offset = coding_change_source (coding)))                         \
 873       {                                                                      \
 874         src += offset;                                                       \
 875         src_base += offset;                                                  \
 876         src_end += offset;                                                   \
 877       }                                                                      \
 878   } while (0)
 879
 880 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     code = ENCODE_CHAR (charset, c);                                    \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 895   do {                                                                  \
 896     ptrdiff_t offset;                                                   \
 897                                                                         \
 898     charset_map_loaded = 0;                                             \
 899     charset = char_charset (c, charset_list, code_return);              \
 900     if (charset_map_loaded                                              \
 901         && (offset = coding_change_destination (coding)))               \
 902       {                                                                 \
 903         dst += offset;                                                  \
 904         dst_end += offset;                                              \
 905       }                                                                 \
 906   } while (0)
 907
 908 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 909   do {                                                                  \
 910     ptrdiff_t offset;                                                   \
 911                                                                         \
 912     charset_map_loaded = 0;                                             \
 913     result = CHAR_CHARSET_P (c, charset);                               \
 914     if (charset_map_loaded                                              \
 915         && (offset = coding_change_destination (coding)))               \
 916       {                                                                 \
 917         dst += offset;                                                  \
 918         dst_end += offset;                                              \
 919       }                                                                 \
 920   } while (0)
 921
 922
 923 /* If there are at least BYTES length of room at dst, allocate memory
 924    for coding->destination and update dst and dst_end.  We don't have
 925    to take care of coding->source which will be relocated.  It is
 926    handled by calling coding_set_source in encode_coding.  */
 927
 928 #define ASSURE_DESTINATION(bytes)                               \
 929   do {                                                          \
 930     if (dst + (bytes) >= dst_end)                               \
 931       {                                                         \
 932         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 933                                                                 \
 934         dst = alloc_destination (coding, more_bytes, dst);      \
 935         dst_end = coding->destination + coding->dst_bytes;      \
 936       }                                                         \
 937   } while (0)
 938
 939
 940 /* Store multibyte form of the character C in P, and advance P to the
 941    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 942    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 943    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 944
 945 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 946
 947 /* Return the character code of character whose multibyte form is at
 948    P, and advance P to the end of the multibyte form.  This used to be
 949    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 950    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 951
 952 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 953
 954 /* Set coding->source from coding->src_object.  */
 955
 956 static void
 957 coding_set_source (struct coding_system *coding)
 958 {
 959   if (BUFFERP (coding->src_object))
 960     {
 961       struct buffer *buf = XBUFFER (coding->src_object);
 962
 963       if (coding->src_pos < 0)
 964         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 965       else
 966         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 967     }
 968   else if (STRINGP (coding->src_object))
 969     {
 970       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 971     }
 972   else
 973     {
 974       /* Otherwise, the source is C string and is never relocated
 975          automatically.  Thus we don't have to update anything.  */
 976     }
 977 }
 978
 979
 980 /* Set coding->source from coding->src_object, and return how many
 981    bytes coding->source was changed.  */
 982
 983 static ptrdiff_t
 984 coding_change_source (struct coding_system *coding)
 985 {
 986   const unsigned char *orig = coding->source;
 987   coding_set_source (coding);
 988   return coding->source - orig;
 989 }
 990
 991
 992 /* Set coding->destination from coding->dst_object.  */
 993
 994 static void
 995 coding_set_destination (struct coding_system *coding)
 996 {
 997   if (BUFFERP (coding->dst_object))
 998     {
 999       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1000         {
1001           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1002           coding->dst_bytes = (GAP_END_ADDR
1003                                - (coding->src_bytes - coding->consumed)
1004                                - coding->destination);
1005         }
1006       else
1007         {
1008           /* We are sure that coding->dst_pos_byte is before the gap
1009              of the buffer. */
1010           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1011                                  + coding->dst_pos_byte - BEG_BYTE);
1012           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1013                                - coding->destination);
1014         }
1015     }
1016   else
1017     {
1018       /* Otherwise, the destination is C string and is never relocated
1019          automatically.  Thus we don't have to update anything.  */
1020     }
1021 }
1022
1023
1024 /* Set coding->destination from coding->dst_object, and return how
1025    many bytes coding->destination was changed.  */
1026
1027 static ptrdiff_t
1028 coding_change_destination (struct coding_system *coding)
1029 {
1030   const unsigned char *orig = coding->destination;
1031   coding_set_destination (coding);
1032   return coding->destination - orig;
1033 }
1034
1035
1036 static void
1037 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1038 {
1039   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1040     string_overflow ();
1041   coding->destination = xrealloc (coding->destination,
1042                                   coding->dst_bytes + bytes);
1043   coding->dst_bytes += bytes;
1044 }
1045
1046 static void
1047 coding_alloc_by_making_gap (struct coding_system *coding,
1048                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1049 {
1050   if (EQ (coding->src_object, coding->dst_object))
1051     {
1052       /* The gap may contain the produced data at the head and not-yet
1053          consumed data at the tail.  To preserve those data, we at
1054          first make the gap size to zero, then increase the gap
1055          size.  */
1056       ptrdiff_t add = GAP_SIZE;
1057
1058       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1059       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1060       make_gap (bytes);
1061       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1062       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1063     }
1064   else
1065     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1066 }
1067
1068
1069 static unsigned char *
1070 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1071                    unsigned char *dst)
1072 {
1073   ptrdiff_t offset = dst - coding->destination;
1074
1075   if (BUFFERP (coding->dst_object))
1076     {
1077       struct buffer *buf = XBUFFER (coding->dst_object);
1078
1079       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1080     }
1081   else
1082     coding_alloc_by_realloc (coding, nbytes);
1083   coding_set_destination (coding);
1084   dst = coding->destination + offset;
1085   return dst;
1086 }
1087
1088 /** Macros for annotations.  */
1089
1090 /* An annotation data is stored in the array coding->charbuf in this
1091    format:
1092      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1093    LENGTH is the number of elements in the annotation.
1094    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1095    NCHARS is the number of characters in the text annotated.
1096
1097    The format of the following elements depend on ANNOTATION_MASK.
1098
1099    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1100    follows:
1101      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1102
1103    NBYTES is the number of bytes specified in the header part of
1104    old-style emacs-mule encoding, or 0 for the other kind of
1105    composition.
1106
1107    METHOD is one of enum composition_method.
1108
1109    Optional COMPOSITION-COMPONENTS are characters and composition
1110    rules.
1111
1112    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1113    follows.
1114
1115    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1116    recover from an invalid annotation, and should be skipped by
1117    produce_annotation.  */
1118
1119 /* Maximum length of the header of annotation data.  */
1120 #define MAX_ANNOTATION_LENGTH 5
1121
1122 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1123   do {                                                  \
1124     *(buf)++ = -(len);                                  \
1125     *(buf)++ = (mask);                                  \
1126     *(buf)++ = (nchars);                                \
1127     coding->annotated = 1;                              \
1128   } while (0);
1129
1130 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1131   do {                                                                      \
1132     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1133     *buf++ = nbytes;                                                        \
1134     *buf++ = method;                                                        \
1135   } while (0)
1136
1137
1138 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1139   do {                                                                  \
1140     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1141     *buf++ = id;                                                        \
1142   } while (0)
1143
1144
1145 /* Bitmasks for coding->eol_seen.  */
1146
1147 #define EOL_SEEN_NONE   0
1148 #define EOL_SEEN_LF     1
1149 #define EOL_SEEN_CR     2
1150 #define EOL_SEEN_CRLF   4
1151
1152 \f
1153 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1154
1155
1156
1157 \f
1158 /*** 3. UTF-8 ***/
1159
1160 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1161    Return true if a text is encoded in UTF-8.  */
1162
1163 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1164 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1165 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1166 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1167 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1168 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1169
1170 #define UTF_8_BOM_1 0xEF
1171 #define UTF_8_BOM_2 0xBB
1172 #define UTF_8_BOM_3 0xBF
1173
1174 /* Unlike the other detect_coding_XXX, this function counts number of
1175    characters and check EOL format.  */
1176
1177 static bool
1178 detect_coding_utf_8 (struct coding_system *coding,
1179                      struct coding_detection_info *detect_info)
1180 {
1181   const unsigned char *src = coding->source, *src_base;
1182   const unsigned char *src_end = coding->source + coding->src_bytes;
1183   bool multibytep = coding->src_multibyte;
1184   ptrdiff_t consumed_chars = 0;
1185   bool bom_found = 0;
1186   int nchars = coding->head_ascii;
1187   int eol_seen = coding->eol_seen;
1188
1189   detect_info->checked |= CATEGORY_MASK_UTF_8;
1190   /* A coding system of this category is always ASCII compatible.  */
1191   src += nchars;
1192
1193   if (src == coding->source     /* BOM should be at the head.  */
1194       && src + 3 < src_end      /* BOM is 3-byte long.  */
1195       && src[0] == UTF_8_BOM_1
1196       && src[1] == UTF_8_BOM_2
1197       && src[2] == UTF_8_BOM_3)
1198     {
1199       bom_found = 1;
1200       src += 3;
1201       nchars++;
1202     }
1203
1204   while (1)
1205     {
1206       int c, c1, c2, c3, c4;
1207
1208       src_base = src;
1209       ONE_MORE_BYTE (c);
1210       if (c < 0 || UTF_8_1_OCTET_P (c))
1211         {
1212           nchars++;
1213           if (c == '\r')
1214             {
1215               if (src < src_end && *src == '\n')
1216                 {
1217                   eol_seen |= EOL_SEEN_CRLF;
1218                   src++;
1219                   nchars++;
1220                 }
1221               else
1222                 eol_seen |= EOL_SEEN_CR;
1223             }
1224           else if (c == '\n')
1225             eol_seen |= EOL_SEEN_LF;
1226           continue;
1227         }
1228       ONE_MORE_BYTE (c1);
1229       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1230         break;
1231       if (UTF_8_2_OCTET_LEADING_P (c))
1232         {
1233           nchars++;
1234           continue;
1235         }
1236       ONE_MORE_BYTE (c2);
1237       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1238         break;
1239       if (UTF_8_3_OCTET_LEADING_P (c))
1240         {
1241           nchars++;
1242           continue;
1243         }
1244       ONE_MORE_BYTE (c3);
1245       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1246         break;
1247       if (UTF_8_4_OCTET_LEADING_P (c))
1248         {
1249           nchars++;
1250           continue;
1251         }
1252       ONE_MORE_BYTE (c4);
1253       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1254         break;
1255       if (UTF_8_5_OCTET_LEADING_P (c))
1256         {
1257           nchars++;
1258           continue;
1259         }
1260       break;
1261     }
1262   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1263   return 0;
1264
1265  no_more_source:
1266   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1267     {
1268       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1269       return 0;
1270     }
1271   if (bom_found)
1272     {
1273       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1274       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1275     }
1276   else
1277     {
1278       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1279       if (nchars < src_end - coding->source)
1280         /* The found characters are less than source bytes, which
1281            means that we found a valid non-ASCII characters.  */
1282         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1283     }
1284   coding->detected_utf8_chars = nchars;
1285   return 1;
1286 }
1287
1288
1289 static void
1290 decode_coding_utf_8 (struct coding_system *coding)
1291 {
1292   const unsigned char *src = coding->source + coding->consumed;
1293   const unsigned char *src_end = coding->source + coding->src_bytes;
1294   const unsigned char *src_base;
1295   int *charbuf = coding->charbuf + coding->charbuf_used;
1296   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1297   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1298   bool multibytep = coding->src_multibyte;
1299   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1300   bool eol_dos
1301     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1302   int byte_after_cr = -1;
1303
1304   if (bom != utf_without_bom)
1305     {
1306       int c1, c2, c3;
1307
1308       src_base = src;
1309       ONE_MORE_BYTE (c1);
1310       if (! UTF_8_3_OCTET_LEADING_P (c1))
1311         src = src_base;
1312       else
1313         {
1314           ONE_MORE_BYTE (c2);
1315           if (! UTF_8_EXTRA_OCTET_P (c2))
1316             src = src_base;
1317           else
1318             {
1319               ONE_MORE_BYTE (c3);
1320               if (! UTF_8_EXTRA_OCTET_P (c3))
1321                 src = src_base;
1322               else
1323                 {
1324                   if ((c1 != UTF_8_BOM_1)
1325                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1326                     src = src_base;
1327                   else
1328                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1329                 }
1330             }
1331         }
1332     }
1333   CODING_UTF_8_BOM (coding) = utf_without_bom;
1334
1335   while (1)
1336     {
1337       int c, c1, c2, c3, c4, c5;
1338
1339       src_base = src;
1340       consumed_chars_base = consumed_chars;
1341
1342       if (charbuf >= charbuf_end)
1343         {
1344           if (byte_after_cr >= 0)
1345             src_base--;
1346           break;
1347         }
1348
1349       if (byte_after_cr >= 0)
1350         c1 = byte_after_cr, byte_after_cr = -1;
1351       else
1352         ONE_MORE_BYTE (c1);
1353       if (c1 < 0)
1354         {
1355           c = - c1;
1356         }
1357       else if (UTF_8_1_OCTET_P (c1))
1358         {
1359           if (eol_dos && c1 == '\r')
1360             ONE_MORE_BYTE (byte_after_cr);
1361           c = c1;
1362         }
1363       else
1364         {
1365           ONE_MORE_BYTE (c2);
1366           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1367             goto invalid_code;
1368           if (UTF_8_2_OCTET_LEADING_P (c1))
1369             {
1370               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1371               /* Reject overlong sequences here and below.  Encoders
1372                  producing them are incorrect, they can be misleading,
1373                  and they mess up read/write invariance.  */
1374               if (c < 128)
1375                 goto invalid_code;
1376             }
1377           else
1378             {
1379               ONE_MORE_BYTE (c3);
1380               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1381                 goto invalid_code;
1382               if (UTF_8_3_OCTET_LEADING_P (c1))
1383                 {
1384                   c = (((c1 & 0xF) << 12)
1385                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1386                   if (c < 0x800
1387                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1388                     goto invalid_code;
1389                 }
1390               else
1391                 {
1392                   ONE_MORE_BYTE (c4);
1393                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1394                     goto invalid_code;
1395                   if (UTF_8_4_OCTET_LEADING_P (c1))
1396                     {
1397                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1398                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1399                     if (c < 0x10000)
1400                       goto invalid_code;
1401                     }
1402                   else
1403                     {
1404                       ONE_MORE_BYTE (c5);
1405                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1406                         goto invalid_code;
1407                       if (UTF_8_5_OCTET_LEADING_P (c1))
1408                         {
1409                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1410                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1411                                | (c5 & 0x3F));
1412                           if ((c > MAX_CHAR) || (c < 0x200000))
1413                             goto invalid_code;
1414                         }
1415                       else
1416                         goto invalid_code;
1417                     }
1418                 }
1419             }
1420         }
1421
1422       *charbuf++ = c;
1423       continue;
1424
1425     invalid_code:
1426       src = src_base;
1427       consumed_chars = consumed_chars_base;
1428       ONE_MORE_BYTE (c);
1429       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1430       coding->errors++;
1431     }
1432
1433  no_more_source:
1434   coding->consumed_char += consumed_chars_base;
1435   coding->consumed = src_base - coding->source;
1436   coding->charbuf_used = charbuf - coding->charbuf;
1437 }
1438
1439
1440 static bool
1441 encode_coding_utf_8 (struct coding_system *coding)
1442 {
1443   bool multibytep = coding->dst_multibyte;
1444   int *charbuf = coding->charbuf;
1445   int *charbuf_end = charbuf + coding->charbuf_used;
1446   unsigned char *dst = coding->destination + coding->produced;
1447   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1448   ptrdiff_t produced_chars = 0;
1449   int c;
1450
1451   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1452     {
1453       ASSURE_DESTINATION (3);
1454       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1455       CODING_UTF_8_BOM (coding) = utf_without_bom;
1456     }
1457
1458   if (multibytep)
1459     {
1460       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1461
1462       while (charbuf < charbuf_end)
1463         {
1464           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1465
1466           ASSURE_DESTINATION (safe_room);
1467           c = *charbuf++;
1468           if (CHAR_BYTE8_P (c))
1469             {
1470               c = CHAR_TO_BYTE8 (c);
1471               EMIT_ONE_BYTE (c);
1472             }
1473           else
1474             {
1475               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1476               for (p = str; p < pend; p++)
1477                 EMIT_ONE_BYTE (*p);
1478             }
1479         }
1480     }
1481   else
1482     {
1483       int safe_room = MAX_MULTIBYTE_LENGTH;
1484
1485       while (charbuf < charbuf_end)
1486         {
1487           ASSURE_DESTINATION (safe_room);
1488           c = *charbuf++;
1489           if (CHAR_BYTE8_P (c))
1490             *dst++ = CHAR_TO_BYTE8 (c);
1491           else
1492             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1493           produced_chars++;
1494         }
1495     }
1496   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1497   coding->produced_char += produced_chars;
1498   coding->produced = dst - coding->destination;
1499   return 0;
1500 }
1501
1502
1503 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1504    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1505
1506 #define UTF_16_HIGH_SURROGATE_P(val) \
1507   (((val) & 0xFC00) == 0xD800)
1508
1509 #define UTF_16_LOW_SURROGATE_P(val) \
1510   (((val) & 0xFC00) == 0xDC00)
1511
1512
1513 static bool
1514 detect_coding_utf_16 (struct coding_system *coding,
1515                       struct coding_detection_info *detect_info)
1516 {
1517   const unsigned char *src = coding->source;
1518   const unsigned char *src_end = coding->source + coding->src_bytes;
1519   bool multibytep = coding->src_multibyte;
1520   int c1, c2;
1521
1522   detect_info->checked |= CATEGORY_MASK_UTF_16;
1523   if (coding->mode & CODING_MODE_LAST_BLOCK
1524       && (coding->src_chars & 1))
1525     {
1526       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1527       return 0;
1528     }
1529
1530   TWO_MORE_BYTES (c1, c2);
1531   if ((c1 == 0xFF) && (c2 == 0xFE))
1532     {
1533       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1534                              | CATEGORY_MASK_UTF_16_AUTO);
1535       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1536                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1537                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1538     }
1539   else if ((c1 == 0xFE) && (c2 == 0xFF))
1540     {
1541       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1542                              | CATEGORY_MASK_UTF_16_AUTO);
1543       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1544                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1545                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1546     }
1547   else if (c2 < 0)
1548     {
1549       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1550       return 0;
1551     }
1552   else
1553     {
1554       /* We check the dispersion of Eth and Oth bytes where E is even and
1555          O is odd.  If both are high, we assume binary data.*/
1556       unsigned char e[256], o[256];
1557       unsigned e_num = 1, o_num = 1;
1558
1559       memset (e, 0, 256);
1560       memset (o, 0, 256);
1561       e[c1] = 1;
1562       o[c2] = 1;
1563
1564       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1565                                 |CATEGORY_MASK_UTF_16_BE
1566                                 | CATEGORY_MASK_UTF_16_LE);
1567
1568       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1569              != CATEGORY_MASK_UTF_16)
1570         {
1571           TWO_MORE_BYTES (c1, c2);
1572           if (c2 < 0)
1573             break;
1574           if (! e[c1])
1575             {
1576               e[c1] = 1;
1577               e_num++;
1578               if (e_num >= 128)
1579                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1580             }
1581           if (! o[c2])
1582             {
1583               o[c2] = 1;
1584               o_num++;
1585               if (o_num >= 128)
1586                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1587             }
1588         }
1589       return 0;
1590     }
1591
1592  no_more_source:
1593   return 1;
1594 }
1595
1596 static void
1597 decode_coding_utf_16 (struct coding_system *coding)
1598 {
1599   const unsigned char *src = coding->source + coding->consumed;
1600   const unsigned char *src_end = coding->source + coding->src_bytes;
1601   const unsigned char *src_base;
1602   int *charbuf = coding->charbuf + coding->charbuf_used;
1603   /* We may produces at most 3 chars in one loop.  */
1604   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1605   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1606   bool multibytep = coding->src_multibyte;
1607   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1608   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1609   int surrogate = CODING_UTF_16_SURROGATE (coding);
1610   bool eol_dos
1611     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1612   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1613
1614   if (bom == utf_with_bom)
1615     {
1616       int c, c1, c2;
1617
1618       src_base = src;
1619       ONE_MORE_BYTE (c1);
1620       ONE_MORE_BYTE (c2);
1621       c = (c1 << 8) | c2;
1622
1623       if (endian == utf_16_big_endian
1624           ? c != 0xFEFF : c != 0xFFFE)
1625         {
1626           /* The first two bytes are not BOM.  Treat them as bytes
1627              for a normal character.  */
1628           src = src_base;
1629           coding->errors++;
1630         }
1631       CODING_UTF_16_BOM (coding) = utf_without_bom;
1632     }
1633   else if (bom == utf_detect_bom)
1634     {
1635       /* We have already tried to detect BOM and failed in
1636          detect_coding.  */
1637       CODING_UTF_16_BOM (coding) = utf_without_bom;
1638     }
1639
1640   while (1)
1641     {
1642       int c, c1, c2;
1643
1644       src_base = src;
1645       consumed_chars_base = consumed_chars;
1646
1647       if (charbuf >= charbuf_end)
1648         {
1649           if (byte_after_cr1 >= 0)
1650             src_base -= 2;
1651           break;
1652         }
1653
1654       if (byte_after_cr1 >= 0)
1655         c1 = byte_after_cr1, byte_after_cr1 = -1;
1656       else
1657         ONE_MORE_BYTE (c1);
1658       if (c1 < 0)
1659         {
1660           *charbuf++ = -c1;
1661           continue;
1662         }
1663       if (byte_after_cr2 >= 0)
1664         c2 = byte_after_cr2, byte_after_cr2 = -1;
1665       else
1666         ONE_MORE_BYTE (c2);
1667       if (c2 < 0)
1668         {
1669           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1670           *charbuf++ = -c2;
1671           continue;
1672         }
1673       c = (endian == utf_16_big_endian
1674            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1675
1676       if (surrogate)
1677         {
1678           if (! UTF_16_LOW_SURROGATE_P (c))
1679             {
1680               if (endian == utf_16_big_endian)
1681                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1682               else
1683                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1684               *charbuf++ = c1;
1685               *charbuf++ = c2;
1686               coding->errors++;
1687               if (UTF_16_HIGH_SURROGATE_P (c))
1688                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1689               else
1690                 *charbuf++ = c;
1691             }
1692           else
1693             {
1694               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1695               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1696               *charbuf++ = 0x10000 + c;
1697             }
1698         }
1699       else
1700         {
1701           if (UTF_16_HIGH_SURROGATE_P (c))
1702             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1703           else
1704             {
1705               if (eol_dos && c == '\r')
1706                 {
1707                   ONE_MORE_BYTE (byte_after_cr1);
1708                   ONE_MORE_BYTE (byte_after_cr2);
1709                 }
1710               *charbuf++ = c;
1711             }
1712         }
1713     }
1714
1715  no_more_source:
1716   coding->consumed_char += consumed_chars_base;
1717   coding->consumed = src_base - coding->source;
1718   coding->charbuf_used = charbuf - coding->charbuf;
1719 }
1720
1721 static bool
1722 encode_coding_utf_16 (struct coding_system *coding)
1723 {
1724   bool multibytep = coding->dst_multibyte;
1725   int *charbuf = coding->charbuf;
1726   int *charbuf_end = charbuf + coding->charbuf_used;
1727   unsigned char *dst = coding->destination + coding->produced;
1728   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1729   int safe_room = 8;
1730   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1731   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1732   ptrdiff_t produced_chars = 0;
1733   int c;
1734
1735   if (bom != utf_without_bom)
1736     {
1737       ASSURE_DESTINATION (safe_room);
1738       if (big_endian)
1739         EMIT_TWO_BYTES (0xFE, 0xFF);
1740       else
1741         EMIT_TWO_BYTES (0xFF, 0xFE);
1742       CODING_UTF_16_BOM (coding) = utf_without_bom;
1743     }
1744
1745   while (charbuf < charbuf_end)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       c = *charbuf++;
1749       if (c > MAX_UNICODE_CHAR)
1750         c = coding->default_char;
1751
1752       if (c < 0x10000)
1753         {
1754           if (big_endian)
1755             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1756           else
1757             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1758         }
1759       else
1760         {
1761           int c1, c2;
1762
1763           c -= 0x10000;
1764           c1 = (c >> 10) + 0xD800;
1765           c2 = (c & 0x3FF) + 0xDC00;
1766           if (big_endian)
1767             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1768           else
1769             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1770         }
1771     }
1772   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1773   coding->produced = dst - coding->destination;
1774   coding->produced_char += produced_chars;
1775   return 0;
1776 }
1777
1778 \f
1779 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1780
1781 /* Emacs' internal format for representation of multiple character
1782    sets is a kind of multi-byte encoding, i.e. characters are
1783    represented by variable-length sequences of one-byte codes.
1784
1785    ASCII characters and control characters (e.g. `tab', `newline') are
1786    represented by one-byte sequences which are their ASCII codes, in
1787    the range 0x00 through 0x7F.
1788
1789    8-bit characters of the range 0x80..0x9F are represented by
1790    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1791    code + 0x20).
1792
1793    8-bit characters of the range 0xA0..0xFF are represented by
1794    one-byte sequences which are their 8-bit code.
1795
1796    The other characters are represented by a sequence of `base
1797    leading-code', optional `extended leading-code', and one or two
1798    `position-code's.  The length of the sequence is determined by the
1799    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1800    whereas extended leading-code and position-code take the range 0xA0
1801    through 0xFF.  See `charset.h' for more details about leading-code
1802    and position-code.
1803
1804    --- CODE RANGE of Emacs' internal format ---
1805    character set        range
1806    -------------        -----
1807    ascii                0x00..0x7F
1808    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1809    eight-bit-graphic    0xA0..0xBF
1810    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1811    ---------------------------------------------
1812
1813    As this is the internal character representation, the format is
1814    usually not used externally (i.e. in a file or in a data sent to a
1815    process).  But, it is possible to have a text externally in this
1816    format (i.e. by encoding by the coding system `emacs-mule').
1817
1818    In that case, a sequence of one-byte codes has a slightly different
1819    form.
1820
1821    At first, all characters in eight-bit-control are represented by
1822    one-byte sequences which are their 8-bit code.
1823
1824    Next, character composition data are represented by the byte
1825    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1826    where,
1827         METHOD is 0xF2 plus one of composition method (enum
1828         composition_method),
1829
1830         BYTES is 0xA0 plus a byte length of this composition data,
1831
1832         CHARS is 0xA0 plus a number of characters composed by this
1833         data,
1834
1835         COMPONENTs are characters of multibyte form or composition
1836         rules encoded by two-byte of ASCII codes.
1837
1838    In addition, for backward compatibility, the following formats are
1839    also recognized as composition data on decoding.
1840
1841    0x80 MSEQ ...
1842    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1843
1844    Here,
1845         MSEQ is a multibyte form but in these special format:
1846           ASCII: 0xA0 ASCII_CODE+0x80,
1847           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1848         RULE is a one byte code of the range 0xA0..0xF0 that
1849         represents a composition rule.
1850   */
1851
1852 char emacs_mule_bytes[256];
1853
1854
1855 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1856    Return true if a text is encoded in 'emacs-mule'.  */
1857
1858 static bool
1859 detect_coding_emacs_mule (struct coding_system *coding,
1860                           struct coding_detection_info *detect_info)
1861 {
1862   const unsigned char *src = coding->source, *src_base;
1863   const unsigned char *src_end = coding->source + coding->src_bytes;
1864   bool multibytep = coding->src_multibyte;
1865   ptrdiff_t consumed_chars = 0;
1866   int c;
1867   int found = 0;
1868
1869   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1870   /* A coding system of this category is always ASCII compatible.  */
1871   src += coding->head_ascii;
1872
1873   while (1)
1874     {
1875       src_base = src;
1876       ONE_MORE_BYTE (c);
1877       if (c < 0)
1878         continue;
1879       if (c == 0x80)
1880         {
1881           /* Perhaps the start of composite character.  We simply skip
1882              it because analyzing it is too heavy for detecting.  But,
1883              at least, we check that the composite character
1884              constitutes of more than 4 bytes.  */
1885           const unsigned char *src_start;
1886
1887         repeat:
1888           src_start = src;
1889           do
1890             {
1891               ONE_MORE_BYTE (c);
1892             }
1893           while (c >= 0xA0);
1894
1895           if (src - src_start <= 4)
1896             break;
1897           found = CATEGORY_MASK_EMACS_MULE;
1898           if (c == 0x80)
1899             goto repeat;
1900         }
1901
1902       if (c < 0x80)
1903         {
1904           if (c < 0x20
1905               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1906             break;
1907         }
1908       else
1909         {
1910           int more_bytes = emacs_mule_bytes[c] - 1;
1911
1912           while (more_bytes > 0)
1913             {
1914               ONE_MORE_BYTE (c);
1915               if (c < 0xA0)
1916                 {
1917                   src--;        /* Unread the last byte.  */
1918                   break;
1919                 }
1920               more_bytes--;
1921             }
1922           if (more_bytes != 0)
1923             break;
1924           found = CATEGORY_MASK_EMACS_MULE;
1925         }
1926     }
1927   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1928   return 0;
1929
1930  no_more_source:
1931   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1932     {
1933       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1934       return 0;
1935     }
1936   detect_info->found |= found;
1937   return 1;
1938 }
1939
1940
1941 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1942    character.  If CMP_STATUS indicates that we must expect MSEQ or
1943    RULE described above, decode it and return the negative value of
1944    the decoded character or rule.  If an invalid byte is found, return
1945    -1.  If SRC is too short, return -2.  */
1946
1947 static int
1948 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1949                  int *nbytes, int *nchars, int *id,
1950                  struct composition_status *cmp_status)
1951 {
1952   const unsigned char *src_end = coding->source + coding->src_bytes;
1953   const unsigned char *src_base = src;
1954   bool multibytep = coding->src_multibyte;
1955   int charset_ID;
1956   unsigned code;
1957   int c;
1958   int consumed_chars = 0;
1959   bool mseq_found = 0;
1960
1961   ONE_MORE_BYTE (c);
1962   if (c < 0)
1963     {
1964       c = -c;
1965       charset_ID = emacs_mule_charset[0];
1966     }
1967   else
1968     {
1969       if (c >= 0xA0)
1970         {
1971           if (cmp_status->state != COMPOSING_NO
1972               && cmp_status->old_form)
1973             {
1974               if (cmp_status->state == COMPOSING_CHAR)
1975                 {
1976                   if (c == 0xA0)
1977                     {
1978                       ONE_MORE_BYTE (c);
1979                       c -= 0x80;
1980                       if (c < 0)
1981                         goto invalid_code;
1982                     }
1983                   else
1984                     c -= 0x20;
1985                   mseq_found = 1;
1986                 }
1987               else
1988                 {
1989                   *nbytes = src - src_base;
1990                   *nchars = consumed_chars;
1991                   return -c;
1992                 }
1993             }
1994           else
1995             goto invalid_code;
1996         }
1997
1998       switch (emacs_mule_bytes[c])
1999         {
2000         case 2:
2001           if ((charset_ID = emacs_mule_charset[c]) < 0)
2002             goto invalid_code;
2003           ONE_MORE_BYTE (c);
2004           if (c < 0xA0)
2005             goto invalid_code;
2006           code = c & 0x7F;
2007           break;
2008
2009         case 3:
2010           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2011               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2012             {
2013               ONE_MORE_BYTE (c);
2014               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2015                 goto invalid_code;
2016               ONE_MORE_BYTE (c);
2017               if (c < 0xA0)
2018                 goto invalid_code;
2019               code = c & 0x7F;
2020             }
2021           else
2022             {
2023               if ((charset_ID = emacs_mule_charset[c]) < 0)
2024                 goto invalid_code;
2025               ONE_MORE_BYTE (c);
2026               if (c < 0xA0)
2027                 goto invalid_code;
2028               code = (c & 0x7F) << 8;
2029               ONE_MORE_BYTE (c);
2030               if (c < 0xA0)
2031                 goto invalid_code;
2032               code |= c & 0x7F;
2033             }
2034           break;
2035
2036         case 4:
2037           ONE_MORE_BYTE (c);
2038           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2039             goto invalid_code;
2040           ONE_MORE_BYTE (c);
2041           if (c < 0xA0)
2042             goto invalid_code;
2043           code = (c & 0x7F) << 8;
2044           ONE_MORE_BYTE (c);
2045           if (c < 0xA0)
2046             goto invalid_code;
2047           code |= c & 0x7F;
2048           break;
2049
2050         case 1:
2051           code = c;
2052           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2053           break;
2054
2055         default:
2056           emacs_abort ();
2057         }
2058       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2059                           CHARSET_FROM_ID (charset_ID), code, c);
2060       if (c < 0)
2061         goto invalid_code;
2062     }
2063   *nbytes = src - src_base;
2064   *nchars = consumed_chars;
2065   if (id)
2066     *id = charset_ID;
2067   return (mseq_found ? -c : c);
2068
2069  no_more_source:
2070   return -2;
2071
2072  invalid_code:
2073   return -1;
2074 }
2075
2076
2077 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2078
2079 /* Handle these composition sequence ('|': the end of header elements,
2080    BYTES and CHARS >= 0xA0):
2081
2082    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2083    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2084    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2085
2086    and these old form:
2087
2088    (4) relative composition: 0x80 | MSEQ ... MSEQ
2089    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2090
2091    When the starter 0x80 and the following header elements are found,
2092    this annotation header is produced.
2093
2094         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2095
2096    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2097    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2098
2099    Then, upon reading the following elements, these codes are produced
2100    until the composition end is found:
2101
2102    (1) CHAR ... CHAR
2103    (2) ALT ... ALT CHAR ... CHAR
2104    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2105    (4) CHAR ... CHAR
2106    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2107
2108    When the composition end is found, LENGTH and NCHARS in the
2109    annotation header is updated as below:
2110
2111    (1) LENGTH: unchanged, NCHARS: unchanged
2112    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2113    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2114    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2115    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2116
2117    If an error is found while composing, the annotation header is
2118    changed to the original composition header (plus filler -1s) as
2119    below:
2120
2121    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2122    (5)          [ 0x80 0xFF -1 -1- -1 ]
2123
2124    and the sequence [ -2 DECODED-RULE ] is changed to the original
2125    byte sequence as below:
2126         o the original byte sequence is B: [ B -1 ]
2127         o the original byte sequence is B1 B2: [ B1 B2 ]
2128
2129    Most of the routines are implemented by macros because many
2130    variables and labels in the caller decode_coding_emacs_mule must be
2131    accessible, and they are usually called just once (thus doesn't
2132    increase the size of compiled object).  */
2133
2134 /* Decode a composition rule represented by C as a component of
2135    composition sequence of Emacs 20 style.  Set RULE to the decoded
2136    rule. */
2137
2138 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2139   do {                                                  \
2140     int gref, nref;                                     \
2141                                                         \
2142     c -= 0xA0;                                          \
2143     if (c < 0 || c >= 81)                               \
2144       goto invalid_code;                                \
2145     gref = c / 9, nref = c % 9;                         \
2146     if (gref == 4) gref = 10;                           \
2147     if (nref == 4) nref = 10;                           \
2148     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2149   } while (0)
2150
2151
2152 /* Decode a composition rule represented by C and the following byte
2153    at SRC as a component of composition sequence of Emacs 21 style.
2154    Set RULE to the decoded rule.  */
2155
2156 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2157   do {                                                  \
2158     int gref, nref;                                     \
2159                                                         \
2160     gref = c - 0x20;                                    \
2161     if (gref < 0 || gref >= 81)                         \
2162       goto invalid_code;                                \
2163     ONE_MORE_BYTE (c);                                  \
2164     nref = c - 0x20;                                    \
2165     if (nref < 0 || nref >= 81)                         \
2166       goto invalid_code;                                \
2167     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2168   } while (0)
2169
2170
2171 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2172    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2173    byte length of this composition information, CHARS is the number of
2174    characters composed by this composition.  */
2175
2176 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2177   do {                                                                  \
2178     enum composition_method method = c - 0xF2;                          \
2179     int nbytes, nchars;                                                 \
2180                                                                         \
2181     ONE_MORE_BYTE (c);                                                  \
2182     if (c < 0)                                                          \
2183       goto invalid_code;                                                \
2184     nbytes = c - 0xA0;                                                  \
2185     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2186       goto invalid_code;                                                \
2187     ONE_MORE_BYTE (c);                                                  \
2188     nchars = c - 0xA0;                                                  \
2189     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2190       goto invalid_code;                                                \
2191     cmp_status->old_form = 0;                                           \
2192     cmp_status->method = method;                                        \
2193     if (method == COMPOSITION_RELATIVE)                                 \
2194       cmp_status->state = COMPOSING_CHAR;                               \
2195     else                                                                \
2196       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2197     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2198     cmp_status->nchars = nchars;                                        \
2199     cmp_status->ncomps = nbytes - 4;                                    \
2200     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2201   } while (0)
2202
2203
2204 /* Start of Emacs 20 style format for relative composition.  */
2205
2206 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2207   do {                                                          \
2208     cmp_status->old_form = 1;                                   \
2209     cmp_status->method = COMPOSITION_RELATIVE;                  \
2210     cmp_status->state = COMPOSING_CHAR;                         \
2211     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2212     cmp_status->nchars = cmp_status->ncomps = 0;                \
2213     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2214   } while (0)
2215
2216
2217 /* Start of Emacs 20 style format for rule-base composition.  */
2218
2219 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2220   do {                                                          \
2221     cmp_status->old_form = 1;                                   \
2222     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2223     cmp_status->state = COMPOSING_CHAR;                         \
2224     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2225     cmp_status->nchars = cmp_status->ncomps = 0;                \
2226     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2227   } while (0)
2228
2229
2230 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2231   do {                                                  \
2232     const unsigned char *current_src = src;             \
2233                                                         \
2234     ONE_MORE_BYTE (c);                                  \
2235     if (c < 0)                                          \
2236       goto invalid_code;                                \
2237     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2238         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2239       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2240     else if (c < 0xA0)                                  \
2241       goto invalid_code;                                \
2242     else if (c < 0xC0)                                  \
2243       {                                                 \
2244         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2245         /* Re-read C as a composition component.  */    \
2246         src = current_src;                              \
2247       }                                                 \
2248     else if (c == 0xFF)                                 \
2249       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2250     else                                                \
2251       goto invalid_code;                                \
2252   } while (0)
2253
2254 #define EMACS_MULE_COMPOSITION_END()                            \
2255   do {                                                          \
2256     int idx = - cmp_status->length;                             \
2257                                                                 \
2258     if (cmp_status->old_form)                                   \
2259       charbuf[idx + 2] = cmp_status->nchars;                    \
2260     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2261       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2262     cmp_status->state = COMPOSING_NO;                           \
2263   } while (0)
2264
2265
2266 static int
2267 emacs_mule_finish_composition (int *charbuf,
2268                                struct composition_status *cmp_status)
2269 {
2270   int idx = - cmp_status->length;
2271   int new_chars;
2272
2273   if (cmp_status->old_form && cmp_status->nchars > 0)
2274     {
2275       charbuf[idx + 2] = cmp_status->nchars;
2276       new_chars = 0;
2277       if (cmp_status->method == COMPOSITION_WITH_RULE
2278           && cmp_status->state == COMPOSING_CHAR)
2279         {
2280           /* The last rule was invalid.  */
2281           int rule = charbuf[-1] + 0xA0;
2282
2283           charbuf[-2] = BYTE8_TO_CHAR (rule);
2284           charbuf[-1] = -1;
2285           new_chars = 1;
2286         }
2287     }
2288   else
2289     {
2290       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2291
2292       if (cmp_status->method == COMPOSITION_WITH_RULE)
2293         {
2294           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2295           charbuf[idx++] = -3;
2296           charbuf[idx++] = 0;
2297           new_chars = 1;
2298         }
2299       else
2300         {
2301           int nchars = charbuf[idx + 1] + 0xA0;
2302           int nbytes = charbuf[idx + 2] + 0xA0;
2303
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2305           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2306           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2307           charbuf[idx++] = -1;
2308           new_chars = 4;
2309         }
2310     }
2311   cmp_status->state = COMPOSING_NO;
2312   return new_chars;
2313 }
2314
2315 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2316   do {                                                                    \
2317     if (cmp_status->state != COMPOSING_NO)                                \
2318       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2319   } while (0)
2320
2321
2322 static void
2323 decode_coding_emacs_mule (struct coding_system *coding)
2324 {
2325   const unsigned char *src = coding->source + coding->consumed;
2326   const unsigned char *src_end = coding->source + coding->src_bytes;
2327   const unsigned char *src_base;
2328   int *charbuf = coding->charbuf + coding->charbuf_used;
2329   /* We may produce two annotations (charset and composition) in one
2330      loop and one more charset annotation at the end.  */
2331   int *charbuf_end
2332     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2333       /* We can produce up to 2 characters in a loop.  */
2334       - 1;
2335   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2336   bool multibytep = coding->src_multibyte;
2337   ptrdiff_t char_offset = coding->produced_char;
2338   ptrdiff_t last_offset = char_offset;
2339   int last_id = charset_ascii;
2340   bool eol_dos
2341     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2342   int byte_after_cr = -1;
2343   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2344
2345   if (cmp_status->state != COMPOSING_NO)
2346     {
2347       int i;
2348
2349       if (charbuf_end - charbuf < cmp_status->length)
2350         emacs_abort ();
2351       for (i = 0; i < cmp_status->length; i++)
2352         *charbuf++ = cmp_status->carryover[i];
2353       coding->annotated = 1;
2354     }
2355
2356   while (1)
2357     {
2358       int c, id IF_LINT (= 0);
2359
2360       src_base = src;
2361       consumed_chars_base = consumed_chars;
2362
2363       if (charbuf >= charbuf_end)
2364         {
2365           if (byte_after_cr >= 0)
2366             src_base--;
2367           break;
2368         }
2369
2370       if (byte_after_cr >= 0)
2371         c = byte_after_cr, byte_after_cr = -1;
2372       else
2373         ONE_MORE_BYTE (c);
2374
2375       if (c < 0 || c == 0x80)
2376         {
2377           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2378           if (c < 0)
2379             {
2380               *charbuf++ = -c;
2381               char_offset++;
2382             }
2383           else
2384             DECODE_EMACS_MULE_COMPOSITION_START ();
2385           continue;
2386         }
2387
2388       if (c < 0x80)
2389         {
2390           if (eol_dos && c == '\r')
2391             ONE_MORE_BYTE (byte_after_cr);
2392           id = charset_ascii;
2393           if (cmp_status->state != COMPOSING_NO)
2394             {
2395               if (cmp_status->old_form)
2396                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2397               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2398                 cmp_status->ncomps--;
2399             }
2400         }
2401       else
2402         {
2403           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2404           /* emacs_mule_char can load a charset map from a file, which
2405              allocates a large structure and might cause buffer text
2406              to be relocated as result.  Thus, we need to remember the
2407              original pointer to buffer text, and fix up all related
2408              pointers after the call.  */
2409           const unsigned char *orig = coding->source;
2410           ptrdiff_t offset;
2411
2412           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2413                                cmp_status);
2414           offset = coding->source - orig;
2415           if (offset)
2416             {
2417               src += offset;
2418               src_base += offset;
2419               src_end += offset;
2420             }
2421           if (c < 0)
2422             {
2423               if (c == -1)
2424                 goto invalid_code;
2425               if (c == -2)
2426                 break;
2427             }
2428           src = src_base + nbytes;
2429           consumed_chars = consumed_chars_base + nchars;
2430           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2431             cmp_status->ncomps -= nchars;
2432         }
2433
2434       /* Now if C >= 0, we found a normally encoded character, if C <
2435          0, we found an old-style composition component character or
2436          rule.  */
2437
2438       if (cmp_status->state == COMPOSING_NO)
2439         {
2440           if (last_id != id)
2441             {
2442               if (last_id != charset_ascii)
2443                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2444                                   last_id);
2445               last_id = id;
2446               last_offset = char_offset;
2447             }
2448           *charbuf++ = c;
2449           char_offset++;
2450         }
2451       else if (cmp_status->state == COMPOSING_CHAR)
2452         {
2453           if (cmp_status->old_form)
2454             {
2455               if (c >= 0)
2456                 {
2457                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2458                   *charbuf++ = c;
2459                   char_offset++;
2460                 }
2461               else
2462                 {
2463                   *charbuf++ = -c;
2464                   cmp_status->nchars++;
2465                   cmp_status->length++;
2466                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2467                     EMACS_MULE_COMPOSITION_END ();
2468                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2469                     cmp_status->state = COMPOSING_RULE;
2470                 }
2471             }
2472           else
2473             {
2474               *charbuf++ = c;
2475               cmp_status->length++;
2476               cmp_status->nchars--;
2477               if (cmp_status->nchars == 0)
2478                 EMACS_MULE_COMPOSITION_END ();
2479             }
2480         }
2481       else if (cmp_status->state == COMPOSING_RULE)
2482         {
2483           int rule;
2484
2485           if (c >= 0)
2486             {
2487               EMACS_MULE_COMPOSITION_END ();
2488               *charbuf++ = c;
2489               char_offset++;
2490             }
2491           else
2492             {
2493               c = -c;
2494               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2495               if (rule < 0)
2496                 goto invalid_code;
2497               *charbuf++ = -2;
2498               *charbuf++ = rule;
2499               cmp_status->length += 2;
2500               cmp_status->state = COMPOSING_CHAR;
2501             }
2502         }
2503       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2504         {
2505           *charbuf++ = c;
2506           cmp_status->length++;
2507           if (cmp_status->ncomps == 0)
2508             cmp_status->state = COMPOSING_CHAR;
2509           else if (cmp_status->ncomps > 0)
2510             {
2511               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2512                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2513             }
2514           else
2515             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2516         }
2517       else                      /* COMPOSING_COMPONENT_RULE */
2518         {
2519           int rule;
2520
2521           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2522           if (rule < 0)
2523             goto invalid_code;
2524           *charbuf++ = -2;
2525           *charbuf++ = rule;
2526           cmp_status->length += 2;
2527           cmp_status->ncomps--;
2528           if (cmp_status->ncomps > 0)
2529             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2530           else
2531             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2532         }
2533       continue;
2534
2535     invalid_code:
2536       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2537       src = src_base;
2538       consumed_chars = consumed_chars_base;
2539       ONE_MORE_BYTE (c);
2540       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2541       char_offset++;
2542       coding->errors++;
2543     }
2544
2545  no_more_source:
2546   if (cmp_status->state != COMPOSING_NO)
2547     {
2548       if (coding->mode & CODING_MODE_LAST_BLOCK)
2549         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2550       else
2551         {
2552           int i;
2553
2554           charbuf -= cmp_status->length;
2555           for (i = 0; i < cmp_status->length; i++)
2556             cmp_status->carryover[i] = charbuf[i];
2557         }
2558     }
2559   if (last_id != charset_ascii)
2560     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2561   coding->consumed_char += consumed_chars_base;
2562   coding->consumed = src_base - coding->source;
2563   coding->charbuf_used = charbuf - coding->charbuf;
2564 }
2565
2566
2567 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2568   do {                                          \
2569     if (id < 0xA0)                              \
2570       codes[0] = id, codes[1] = 0;              \
2571     else if (id < 0xE0)                         \
2572       codes[0] = 0x9A, codes[1] = id;           \
2573     else if (id < 0xF0)                         \
2574       codes[0] = 0x9B, codes[1] = id;           \
2575     else if (id < 0xF5)                         \
2576       codes[0] = 0x9C, codes[1] = id;           \
2577     else                                        \
2578       codes[0] = 0x9D, codes[1] = id;           \
2579   } while (0);
2580
2581
2582 static bool
2583 encode_coding_emacs_mule (struct coding_system *coding)
2584 {
2585   bool multibytep = coding->dst_multibyte;
2586   int *charbuf = coding->charbuf;
2587   int *charbuf_end = charbuf + coding->charbuf_used;
2588   unsigned char *dst = coding->destination + coding->produced;
2589   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2590   int safe_room = 8;
2591   ptrdiff_t produced_chars = 0;
2592   Lisp_Object attrs, charset_list;
2593   int c;
2594   int preferred_charset_id = -1;
2595
2596   CODING_GET_INFO (coding, attrs, charset_list);
2597   if (! EQ (charset_list, Vemacs_mule_charset_list))
2598     {
2599       charset_list = Vemacs_mule_charset_list;
2600       ASET (attrs, coding_attr_charset_list, charset_list);
2601     }
2602
2603   while (charbuf < charbuf_end)
2604     {
2605       ASSURE_DESTINATION (safe_room);
2606       c = *charbuf++;
2607
2608       if (c < 0)
2609         {
2610           /* Handle an annotation.  */
2611           switch (*charbuf)
2612             {
2613             case CODING_ANNOTATE_COMPOSITION_MASK:
2614               /* Not yet implemented.  */
2615               break;
2616             case CODING_ANNOTATE_CHARSET_MASK:
2617               preferred_charset_id = charbuf[3];
2618               if (preferred_charset_id >= 0
2619                   && NILP (Fmemq (make_number (preferred_charset_id),
2620                                   charset_list)))
2621                 preferred_charset_id = -1;
2622               break;
2623             default:
2624               emacs_abort ();
2625             }
2626           charbuf += -c - 1;
2627           continue;
2628         }
2629
2630       if (ASCII_CHAR_P (c))
2631         EMIT_ONE_ASCII_BYTE (c);
2632       else if (CHAR_BYTE8_P (c))
2633         {
2634           c = CHAR_TO_BYTE8 (c);
2635           EMIT_ONE_BYTE (c);
2636         }
2637       else
2638         {
2639           struct charset *charset;
2640           unsigned code;
2641           int dimension;
2642           int emacs_mule_id;
2643           unsigned char leading_codes[2];
2644
2645           if (preferred_charset_id >= 0)
2646             {
2647               bool result;
2648
2649               charset = CHARSET_FROM_ID (preferred_charset_id);
2650               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2651               if (result)
2652                 code = ENCODE_CHAR (charset, c);
2653               else
2654                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2655                                      &code, charset);
2656             }
2657           else
2658             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2659                                  &code, charset);
2660           if (! charset)
2661             {
2662               c = coding->default_char;
2663               if (ASCII_CHAR_P (c))
2664                 {
2665                   EMIT_ONE_ASCII_BYTE (c);
2666                   continue;
2667                 }
2668               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669                                    &code, charset);
2670             }
2671           dimension = CHARSET_DIMENSION (charset);
2672           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2673           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2674           EMIT_ONE_BYTE (leading_codes[0]);
2675           if (leading_codes[1])
2676             EMIT_ONE_BYTE (leading_codes[1]);
2677           if (dimension == 1)
2678             EMIT_ONE_BYTE (code | 0x80);
2679           else
2680             {
2681               code |= 0x8080;
2682               EMIT_ONE_BYTE (code >> 8);
2683               EMIT_ONE_BYTE (code & 0xFF);
2684             }
2685         }
2686     }
2687   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2688   coding->produced_char += produced_chars;
2689   coding->produced = dst - coding->destination;
2690   return 0;
2691 }
2692
2693 \f
2694 /*** 7. ISO2022 handlers ***/
2695
2696 /* The following note describes the coding system ISO2022 briefly.
2697    Since the intention of this note is to help understand the
2698    functions in this file, some parts are NOT ACCURATE or are OVERLY
2699    SIMPLIFIED.  For thorough understanding, please refer to the
2700    original document of ISO2022.  This is equivalent to the standard
2701    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2702
2703    ISO2022 provides many mechanisms to encode several character sets
2704    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2705    is encoded using bytes less than 128.  This may make the encoded
2706    text a little bit longer, but the text passes more easily through
2707    several types of gateway, some of which strip off the MSB (Most
2708    Significant Bit).
2709
2710    There are two kinds of character sets: control character sets and
2711    graphic character sets.  The former contain control characters such
2712    as `newline' and `escape' to provide control functions (control
2713    functions are also provided by escape sequences).  The latter
2714    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2715    two control character sets and many graphic character sets.
2716
2717    Graphic character sets are classified into one of the following
2718    four classes, according to the number of bytes (DIMENSION) and
2719    number of characters in one dimension (CHARS) of the set:
2720    - DIMENSION1_CHARS94
2721    - DIMENSION1_CHARS96
2722    - DIMENSION2_CHARS94
2723    - DIMENSION2_CHARS96
2724
2725    In addition, each character set is assigned an identification tag,
2726    unique for each set, called the "final character" (denoted as <F>
2727    hereafter).  The <F> of each character set is decided by ECMA(*)
2728    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2729    (0x30..0x3F are for private use only).
2730
2731    Note (*): ECMA = European Computer Manufacturers Association
2732
2733    Here are examples of graphic character sets [NAME(<F>)]:
2734         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2735         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2736         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2737         o DIMENSION2_CHARS96 -- none for the moment
2738
2739    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2740         C0 [0x00..0x1F] -- control character plane 0
2741         GL [0x20..0x7F] -- graphic character plane 0
2742         C1 [0x80..0x9F] -- control character plane 1
2743         GR [0xA0..0xFF] -- graphic character plane 1
2744
2745    A control character set is directly designated and invoked to C0 or
2746    C1 by an escape sequence.  The most common case is that:
2747    - ISO646's  control character set is designated/invoked to C0, and
2748    - ISO6429's control character set is designated/invoked to C1,
2749    and usually these designations/invocations are omitted in encoded
2750    text.  In a 7-bit environment, only C0 can be used, and a control
2751    character for C1 is encoded by an appropriate escape sequence to
2752    fit into the environment.  All control characters for C1 are
2753    defined to have corresponding escape sequences.
2754
2755    A graphic character set is at first designated to one of four
2756    graphic registers (G0 through G3), then these graphic registers are
2757    invoked to GL or GR.  These designations and invocations can be
2758    done independently.  The most common case is that G0 is invoked to
2759    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2760    these invocations and designations are omitted in encoded text.
2761    In a 7-bit environment, only GL can be used.
2762
2763    When a graphic character set of CHARS94 is invoked to GL, codes
2764    0x20 and 0x7F of the GL area work as control characters SPACE and
2765    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2766    be used.
2767
2768    There are two ways of invocation: locking-shift and single-shift.
2769    With locking-shift, the invocation lasts until the next different
2770    invocation, whereas with single-shift, the invocation affects the
2771    following character only and doesn't affect the locking-shift
2772    state.  Invocations are done by the following control characters or
2773    escape sequences:
2774
2775    ----------------------------------------------------------------------
2776    abbrev  function                  cntrl escape seq   description
2777    ----------------------------------------------------------------------
2778    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2779    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2780    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2781    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2782    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2783    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2784    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2785    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2786    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2787    ----------------------------------------------------------------------
2788    (*) These are not used by any known coding system.
2789
2790    Control characters for these functions are defined by macros
2791    ISO_CODE_XXX in `coding.h'.
2792
2793    Designations are done by the following escape sequences:
2794    ----------------------------------------------------------------------
2795    escape sequence      description
2796    ----------------------------------------------------------------------
2797    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2798    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2799    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2800    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2801    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2802    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2803    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2804    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2805    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2806    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2807    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2808    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2809    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2810    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2811    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2812    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2813    ----------------------------------------------------------------------
2814
2815    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2816    of dimension 1, chars 94, and final character <F>, etc...
2817
2818    Note (*): Although these designations are not allowed in ISO2022,
2819    Emacs accepts them on decoding, and produces them on encoding
2820    CHARS96 character sets in a coding system which is characterized as
2821    7-bit environment, non-locking-shift, and non-single-shift.
2822
2823    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2824    '(' must be omitted.  We refer to this as "short-form" hereafter.
2825
2826    Now you may notice that there are a lot of ways of encoding the
2827    same multilingual text in ISO2022.  Actually, there exist many
2828    coding systems such as Compound Text (used in X11's inter client
2829    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2830    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2831    localized platforms), and all of these are variants of ISO2022.
2832
2833    In addition to the above, Emacs handles two more kinds of escape
2834    sequences: ISO6429's direction specification and Emacs' private
2835    sequence for specifying character composition.
2836
2837    ISO6429's direction specification takes the following form:
2838         o CSI ']'      -- end of the current direction
2839         o CSI '0' ']'  -- end of the current direction
2840         o CSI '1' ']'  -- start of left-to-right text
2841         o CSI '2' ']'  -- start of right-to-left text
2842    The control character CSI (0x9B: control sequence introducer) is
2843    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2844
2845    Character composition specification takes the following form:
2846         o ESC '0' -- start relative composition
2847         o ESC '1' -- end composition
2848         o ESC '2' -- start rule-base composition (*)
2849         o ESC '3' -- start relative composition with alternate chars  (**)
2850         o ESC '4' -- start rule-base composition with alternate chars  (**)
2851   Since these are not standard escape sequences of any ISO standard,
2852   the use of them with these meanings is restricted to Emacs only.
2853
2854   (*) This form is used only in Emacs 20.7 and older versions,
2855   but newer versions can safely decode it.
2856   (**) This form is used only in Emacs 21.1 and newer versions,
2857   and older versions can't decode it.
2858
2859   Here's a list of example usages of these composition escape
2860   sequences (categorized by `enum composition_method').
2861
2862   COMPOSITION_RELATIVE:
2863         ESC 0 CHAR [ CHAR ] ESC 1
2864   COMPOSITION_WITH_RULE:
2865         ESC 2 CHAR [ RULE CHAR ] ESC 1
2866   COMPOSITION_WITH_ALTCHARS:
2867         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2868   COMPOSITION_WITH_RULE_ALTCHARS:
2869         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2870
2871 static enum iso_code_class_type iso_code_class[256];
2872
2873 #define SAFE_CHARSET_P(coding, id)      \
2874   ((id) <= (coding)->max_charset_id     \
2875    && (coding)->safe_charsets[id] != 255)
2876
2877 static void
2878 setup_iso_safe_charsets (Lisp_Object attrs)
2879 {
2880   Lisp_Object charset_list, safe_charsets;
2881   Lisp_Object request;
2882   Lisp_Object reg_usage;
2883   Lisp_Object tail;
2884   EMACS_INT reg94, reg96;
2885   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2886   int max_charset_id;
2887
2888   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2889   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2890       && ! EQ (charset_list, Viso_2022_charset_list))
2891     {
2892       charset_list = Viso_2022_charset_list;
2893       ASET (attrs, coding_attr_charset_list, charset_list);
2894       ASET (attrs, coding_attr_safe_charsets, Qnil);
2895     }
2896
2897   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2898     return;
2899
2900   max_charset_id = 0;
2901   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2902     {
2903       int id = XINT (XCAR (tail));
2904       if (max_charset_id < id)
2905         max_charset_id = id;
2906     }
2907
2908   safe_charsets = make_uninit_string (max_charset_id + 1);
2909   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2910   request = AREF (attrs, coding_attr_iso_request);
2911   reg_usage = AREF (attrs, coding_attr_iso_usage);
2912   reg94 = XINT (XCAR (reg_usage));
2913   reg96 = XINT (XCDR (reg_usage));
2914
2915   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2916     {
2917       Lisp_Object id;
2918       Lisp_Object reg;
2919       struct charset *charset;
2920
2921       id = XCAR (tail);
2922       charset = CHARSET_FROM_ID (XINT (id));
2923       reg = Fcdr (Fassq (id, request));
2924       if (! NILP (reg))
2925         SSET (safe_charsets, XINT (id), XINT (reg));
2926       else if (charset->iso_chars_96)
2927         {
2928           if (reg96 < 4)
2929             SSET (safe_charsets, XINT (id), reg96);
2930         }
2931       else
2932         {
2933           if (reg94 < 4)
2934             SSET (safe_charsets, XINT (id), reg94);
2935         }
2936     }
2937   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2938 }
2939
2940
2941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2942    Return true if a text is encoded in one of ISO-2022 based coding
2943    systems.  */
2944
2945 static bool
2946 detect_coding_iso_2022 (struct coding_system *coding,
2947                         struct coding_detection_info *detect_info)
2948 {
2949   const unsigned char *src = coding->source, *src_base = src;
2950   const unsigned char *src_end = coding->source + coding->src_bytes;
2951   bool multibytep = coding->src_multibyte;
2952   bool single_shifting = 0;
2953   int id;
2954   int c, c1;
2955   ptrdiff_t consumed_chars = 0;
2956   int i;
2957   int rejected = 0;
2958   int found = 0;
2959   int composition_count = -1;
2960
2961   detect_info->checked |= CATEGORY_MASK_ISO;
2962
2963   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2964     {
2965       struct coding_system *this = &(coding_categories[i]);
2966       Lisp_Object attrs, val;
2967
2968       if (this->id < 0)
2969         continue;
2970       attrs = CODING_ID_ATTRS (this->id);
2971       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2972           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2973         setup_iso_safe_charsets (attrs);
2974       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2975       this->max_charset_id = SCHARS (val) - 1;
2976       this->safe_charsets = SDATA (val);
2977     }
2978
2979   /* A coding system of this category is always ASCII compatible.  */
2980   src += coding->head_ascii;
2981
2982   while (rejected != CATEGORY_MASK_ISO)
2983     {
2984       src_base = src;
2985       ONE_MORE_BYTE (c);
2986       switch (c)
2987         {
2988         case ISO_CODE_ESC:
2989           if (inhibit_iso_escape_detection)
2990             break;
2991           single_shifting = 0;
2992           ONE_MORE_BYTE (c);
2993           if (c == 'N' || c == 'O')
2994             {
2995               /* ESC <Fe> for SS2 or SS3.  */
2996               single_shifting = 1;
2997               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2998             }
2999           else if (c == '1')
3000             {
3001               /* End of composition.  */
3002               if (composition_count < 0
3003                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3004                 /* Invalid */
3005                 break;
3006               composition_count = -1;
3007               found |= CATEGORY_MASK_ISO;
3008             }
3009           else if (c >= '0' && c <= '4')
3010             {
3011               /* ESC <Fp> for start/end composition.  */
3012               composition_count = 0;
3013             }
3014           else
3015             {
3016               if (c >= '(' && c <= '/')
3017                 {
3018                   /* Designation sequence for a charset of dimension 1.  */
3019                   ONE_MORE_BYTE (c1);
3020                   if (c1 < ' ' || c1 >= 0x80
3021                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3022                     /* Invalid designation sequence.  Just ignore.  */
3023                     break;
3024                 }
3025               else if (c == '$')
3026                 {
3027                   /* Designation sequence for a charset of dimension 2.  */
3028                   ONE_MORE_BYTE (c);
3029                   if (c >= '@' && c <= 'B')
3030                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3031                     id = iso_charset_table[1][0][c];
3032                   else if (c >= '(' && c <= '/')
3033                     {
3034                       ONE_MORE_BYTE (c1);
3035                       if (c1 < ' ' || c1 >= 0x80
3036                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3037                         /* Invalid designation sequence.  Just ignore.  */
3038                         break;
3039                     }
3040                   else
3041                     /* Invalid designation sequence.  Just ignore it.  */
3042                     break;
3043                 }
3044               else
3045                 {
3046                   /* Invalid escape sequence.  Just ignore it.  */
3047                   break;
3048                 }
3049
3050               /* We found a valid designation sequence for CHARSET.  */
3051               rejected |= CATEGORY_MASK_ISO_8BIT;
3052               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3053                                   id))
3054                 found |= CATEGORY_MASK_ISO_7;
3055               else
3056                 rejected |= CATEGORY_MASK_ISO_7;
3057               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3058                                   id))
3059                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3060               else
3061                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3062               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3063                                   id))
3064                 found |= CATEGORY_MASK_ISO_7_ELSE;
3065               else
3066                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3067               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3068                                   id))
3069                 found |= CATEGORY_MASK_ISO_8_ELSE;
3070               else
3071                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3072             }
3073           break;
3074
3075         case ISO_CODE_SO:
3076         case ISO_CODE_SI:
3077           /* Locking shift out/in.  */
3078           if (inhibit_iso_escape_detection)
3079             break;
3080           single_shifting = 0;
3081           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3082           break;
3083
3084         case ISO_CODE_CSI:
3085           /* Control sequence introducer.  */
3086           single_shifting = 0;
3087           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088           found |= CATEGORY_MASK_ISO_8_ELSE;
3089           goto check_extra_latin;
3090
3091         case ISO_CODE_SS2:
3092         case ISO_CODE_SS3:
3093           /* Single shift.   */
3094           if (inhibit_iso_escape_detection)
3095             break;
3096           single_shifting = 0;
3097           rejected |= CATEGORY_MASK_ISO_7BIT;
3098           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3099               & CODING_ISO_FLAG_SINGLE_SHIFT)
3100             {
3101               found |= CATEGORY_MASK_ISO_8_1;
3102               single_shifting = 1;
3103             }
3104           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3105               & CODING_ISO_FLAG_SINGLE_SHIFT)
3106             {
3107               found |= CATEGORY_MASK_ISO_8_2;
3108               single_shifting = 1;
3109             }
3110           if (single_shifting)
3111             break;
3112           goto check_extra_latin;
3113
3114         default:
3115           if (c < 0)
3116             continue;
3117           if (c < 0x80)
3118             {
3119               if (composition_count >= 0)
3120                 composition_count++;
3121               single_shifting = 0;
3122               break;
3123             }
3124           if (c >= 0xA0)
3125             {
3126               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3127               found |= CATEGORY_MASK_ISO_8_1;
3128               /* Check the length of succeeding codes of the range
3129                  0xA0..0FF.  If the byte length is even, we include
3130                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3131                  only when we are not single shifting.  */
3132               if (! single_shifting
3133                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3134                 {
3135                   int len = 1;
3136                   while (src < src_end)
3137                     {
3138                       src_base = src;
3139                       ONE_MORE_BYTE (c);
3140                       if (c < 0xA0)
3141                         {
3142                           src = src_base;
3143                           break;
3144                         }
3145                       len++;
3146                     }
3147
3148                   if (len & 1 && src < src_end)
3149                     {
3150                       rejected |= CATEGORY_MASK_ISO_8_2;
3151                       if (composition_count >= 0)
3152                         composition_count += len;
3153                     }
3154                   else
3155                     {
3156                       found |= CATEGORY_MASK_ISO_8_2;
3157                       if (composition_count >= 0)
3158                         composition_count += len / 2;
3159                     }
3160                 }
3161               break;
3162             }
3163         check_extra_latin:
3164           if (! VECTORP (Vlatin_extra_code_table)
3165               || NILP (AREF (Vlatin_extra_code_table, c)))
3166             {
3167               rejected = CATEGORY_MASK_ISO;
3168               break;
3169             }
3170           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3171               & CODING_ISO_FLAG_LATIN_EXTRA)
3172             found |= CATEGORY_MASK_ISO_8_1;
3173           else
3174             rejected |= CATEGORY_MASK_ISO_8_1;
3175           rejected |= CATEGORY_MASK_ISO_8_2;
3176           break;
3177         }
3178     }
3179   detect_info->rejected |= CATEGORY_MASK_ISO;
3180   return 0;
3181
3182  no_more_source:
3183   detect_info->rejected |= rejected;
3184   detect_info->found |= (found & ~rejected);
3185   return 1;
3186 }
3187
3188
3189 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3190    escape sequence should be kept.  */
3191 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3192   do {                                                                  \
3193     int id, prev;                                                       \
3194                                                                         \
3195     if (final < '0' || final >= 128                                     \
3196         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3197         || !SAFE_CHARSET_P (coding, id))                                \
3198       {                                                                 \
3199         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3200         chars_96 = -1;                                                  \
3201         break;                                                          \
3202       }                                                                 \
3203     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3204     if (id == charset_jisx0201_roman)                                   \
3205       {                                                                 \
3206         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3207           id = charset_ascii;                                           \
3208       }                                                                 \
3209     else if (id == charset_jisx0208_1978)                               \
3210       {                                                                 \
3211         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3212           id = charset_jisx0208;                                        \
3213       }                                                                 \
3214     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3215     /* If there was an invalid designation to REG previously, and this  \
3216        designation is ASCII to REG, we should keep this designation     \
3217        sequence.  */                                                    \
3218     if (prev == -2 && id == charset_ascii)                              \
3219       chars_96 = -1;                                                    \
3220   } while (0)
3221
3222
3223 /* Handle these composition sequence (ALT: alternate char):
3224
3225    (1) relative composition: ESC 0 CHAR ... ESC 1
3226    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3227    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3228    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3229
3230    When the start sequence (ESC 0/2/3/4) is found, this annotation
3231    header is produced.
3232
3233         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3234
3235    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3236    produced until the end sequence (ESC 1) is found:
3237
3238    (1) CHAR ... CHAR
3239    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3240    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3241    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3242
3243    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3244    annotation header is updated as below:
3245
3246    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3247    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3248    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3249    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3250
3251    If an error is found while composing, the annotation header is
3252    changed to:
3253
3254         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3255
3256    and the sequence [ -2 DECODED-RULE ] is changed to the original
3257    byte sequence as below:
3258         o the original byte sequence is B: [ B -1 ]
3259         o the original byte sequence is B1 B2: [ B1 B2 ]
3260    and the sequence [ -1 -1 ] is changed to the original byte
3261    sequence:
3262         [ ESC '0' ]
3263 */
3264
3265 /* Decode a composition rule C1 and maybe one more byte from the
3266    source, and set RULE to the encoded composition rule.  If the rule
3267    is invalid, goto invalid_code.  */
3268
3269 #define DECODE_COMPOSITION_RULE(rule)                                   \
3270   do {                                                                  \
3271     rule = c1 - 32;                                                     \
3272     if (rule < 0)                                                       \
3273       goto invalid_code;                                                \
3274     if (rule < 81)              /* old format (before ver.21) */        \
3275       {                                                                 \
3276         int gref = (rule) / 9;                                          \
3277         int nref = (rule) % 9;                                          \
3278         if (gref == 4) gref = 10;                                       \
3279         if (nref == 4) nref = 10;                                       \
3280         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3281       }                                                                 \
3282     else                        /* new format (after ver.21) */         \
3283       {                                                                 \
3284         int b;                                                          \
3285                                                                         \
3286         ONE_MORE_BYTE (b);                                              \
3287         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3288           goto invalid_code;                                            \
3289         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3290         rule += 0x100;   /* Distinguish it from the old format.  */     \
3291       }                                                                 \
3292   } while (0)
3293
3294 #define ENCODE_COMPOSITION_RULE(rule)                           \
3295   do {                                                          \
3296     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3297                                                                 \
3298     if (rule < 0x100)           /* old format */                \
3299       {                                                         \
3300         if (gref == 10) gref = 4;                               \
3301         if (nref == 10) nref = 4;                               \
3302         charbuf[idx] = 32 + gref * 9 + nref;                    \
3303         charbuf[idx + 1] = -1;                                  \
3304         new_chars++;                                            \
3305       }                                                         \
3306     else                                /* new format */        \
3307       {                                                         \
3308         charbuf[idx] = 32 + 81 + gref;                          \
3309         charbuf[idx + 1] = 32 + nref;                           \
3310         new_chars += 2;                                         \
3311       }                                                         \
3312   } while (0)
3313
3314 /* Finish the current composition as invalid.  */
3315
3316 static int
3317 finish_composition (int *charbuf, struct composition_status *cmp_status)
3318 {
3319   int idx = - cmp_status->length;
3320   int new_chars;
3321
3322   /* Recover the original ESC sequence */
3323   charbuf[idx++] = ISO_CODE_ESC;
3324   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3325                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3326                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3327                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3328                     : '4');
3329   charbuf[idx++] = -2;
3330   charbuf[idx++] = 0;
3331   charbuf[idx++] = -1;
3332   new_chars = cmp_status->nchars;
3333   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3334     for (; idx < 0; idx++)
3335       {
3336         int elt = charbuf[idx];
3337
3338         if (elt == -2)
3339           {
3340             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3341             idx++;
3342           }
3343         else if (elt == -1)
3344           {
3345             charbuf[idx++] = ISO_CODE_ESC;
3346             charbuf[idx] = '0';
3347             new_chars += 2;
3348           }
3349       }
3350   cmp_status->state = COMPOSING_NO;
3351   return new_chars;
3352 }
3353
3354 /* If characters are under composition, finish the composition.  */
3355 #define MAYBE_FINISH_COMPOSITION()                              \
3356   do {                                                          \
3357     if (cmp_status->state != COMPOSING_NO)                      \
3358       char_offset += finish_composition (charbuf, cmp_status);  \
3359   } while (0)
3360
3361 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3362
3363    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3364    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3365    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3366    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3367
3368    Produce this annotation sequence now:
3369
3370    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3371 */
3372
3373 #define DECODE_COMPOSITION_START(c1)                                       \
3374   do {                                                                     \
3375     if (c1 == '0'                                                          \
3376         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3377              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3378             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3379                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3380       {                                                                    \
3381         *charbuf++ = -1;                                                   \
3382         *charbuf++= -1;                                                    \
3383         cmp_status->state = COMPOSING_CHAR;                                \
3384         cmp_status->length += 2;                                           \
3385       }                                                                    \
3386     else                                                                   \
3387       {                                                                    \
3388         MAYBE_FINISH_COMPOSITION ();                                       \
3389         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3390                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3391                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3392                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3393         cmp_status->state                                                  \
3394           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3395         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3396         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3397         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3398         coding->annotated = 1;                                             \
3399       }                                                                    \
3400   } while (0)
3401
3402
3403 /* Handle composition end sequence ESC 1.  */
3404
3405 #define DECODE_COMPOSITION_END()                                        \
3406   do {                                                                  \
3407     if (cmp_status->nchars == 0                                         \
3408         || ((cmp_status->state == COMPOSING_CHAR)                       \
3409             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3410       {                                                                 \
3411         MAYBE_FINISH_COMPOSITION ();                                    \
3412         goto invalid_code;                                              \
3413       }                                                                 \
3414     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3415       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3416     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3417       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3418     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3419     char_offset += cmp_status->nchars;                                  \
3420     cmp_status->state = COMPOSING_NO;                                   \
3421   } while (0)
3422
3423 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3424
3425 #define STORE_COMPOSITION_RULE(rule)    \
3426   do {                                  \
3427     *charbuf++ = -2;                    \
3428     *charbuf++ = rule;                  \
3429     cmp_status->length += 2;            \
3430     cmp_status->state--;                \
3431   } while (0)
3432
3433 /* Store a composed char or a component char C in charbuf, and update
3434    cmp_status.  */
3435
3436 #define STORE_COMPOSITION_CHAR(c)                                       \
3437   do {                                                                  \
3438     *charbuf++ = (c);                                                   \
3439     cmp_status->length++;                                               \
3440     if (cmp_status->state == COMPOSING_CHAR)                            \
3441       cmp_status->nchars++;                                             \
3442     else                                                                \
3443       cmp_status->ncomps++;                                             \
3444     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3445         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3446             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3447       cmp_status->state++;                                              \
3448   } while (0)
3449
3450
3451 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3452
3453 static void
3454 decode_coding_iso_2022 (struct coding_system *coding)
3455 {
3456   const unsigned char *src = coding->source + coding->consumed;
3457   const unsigned char *src_end = coding->source + coding->src_bytes;
3458   const unsigned char *src_base;
3459   int *charbuf = coding->charbuf + coding->charbuf_used;
3460   /* We may produce two annotations (charset and composition) in one
3461      loop and one more charset annotation at the end.  */
3462   int *charbuf_end
3463     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3464   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3465   bool multibytep = coding->src_multibyte;
3466   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3467   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3468   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3469   int charset_id_2, charset_id_3;
3470   struct charset *charset;
3471   int c;
3472   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3473   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3474   ptrdiff_t char_offset = coding->produced_char;
3475   ptrdiff_t last_offset = char_offset;
3476   int last_id = charset_ascii;
3477   bool eol_dos
3478     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3479   int byte_after_cr = -1;
3480   int i;
3481
3482   setup_iso_safe_charsets (attrs);
3483   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3484
3485   if (cmp_status->state != COMPOSING_NO)
3486     {
3487       if (charbuf_end - charbuf < cmp_status->length)
3488         emacs_abort ();
3489       for (i = 0; i < cmp_status->length; i++)
3490         *charbuf++ = cmp_status->carryover[i];
3491       coding->annotated = 1;
3492     }
3493
3494   while (1)
3495     {
3496       int c1, c2, c3;
3497
3498       src_base = src;
3499       consumed_chars_base = consumed_chars;
3500
3501       if (charbuf >= charbuf_end)
3502         {
3503           if (byte_after_cr >= 0)
3504             src_base--;
3505           break;
3506         }
3507
3508       if (byte_after_cr >= 0)
3509         c1 = byte_after_cr, byte_after_cr = -1;
3510       else
3511         ONE_MORE_BYTE (c1);
3512       if (c1 < 0)
3513         goto invalid_code;
3514
3515       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3516         {
3517           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3518           char_offset++;
3519           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3520           continue;
3521         }
3522
3523       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3524         {
3525           if (c1 == ISO_CODE_ESC)
3526             {
3527               if (src + 1 >= src_end)
3528                 goto no_more_source;
3529               *charbuf++ = ISO_CODE_ESC;
3530               char_offset++;
3531               if (src[0] == '%' && src[1] == '@')
3532                 {
3533                   src += 2;
3534                   consumed_chars += 2;
3535                   char_offset += 2;
3536                   /* We are sure charbuf can contain two more chars. */
3537                   *charbuf++ = '%';
3538                   *charbuf++ = '@';
3539                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3540                 }
3541             }
3542           else
3543             {
3544               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3545               char_offset++;
3546             }
3547           continue;
3548         }
3549
3550       if ((cmp_status->state == COMPOSING_RULE
3551            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3552           && c1 != ISO_CODE_ESC)
3553         {
3554           int rule;
3555
3556           DECODE_COMPOSITION_RULE (rule);
3557           STORE_COMPOSITION_RULE (rule);
3558           continue;
3559         }
3560
3561       /* We produce at most one character.  */
3562       switch (iso_code_class [c1])
3563         {
3564         case ISO_0x20_or_0x7F:
3565           if (charset_id_0 < 0
3566               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3567             /* This is SPACE or DEL.  */
3568             charset = CHARSET_FROM_ID (charset_ascii);
3569           else
3570             charset = CHARSET_FROM_ID (charset_id_0);
3571           break;
3572
3573         case ISO_graphic_plane_0:
3574           if (charset_id_0 < 0)
3575             charset = CHARSET_FROM_ID (charset_ascii);
3576           else
3577             charset = CHARSET_FROM_ID (charset_id_0);
3578           break;
3579
3580         case ISO_0xA0_or_0xFF:
3581           if (charset_id_1 < 0
3582               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3583               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3584             goto invalid_code;
3585           /* This is a graphic character, we fall down ... */
3586
3587         case ISO_graphic_plane_1:
3588           if (charset_id_1 < 0)
3589             goto invalid_code;
3590           charset = CHARSET_FROM_ID (charset_id_1);
3591           break;
3592
3593         case ISO_control_0:
3594           if (eol_dos && c1 == '\r')
3595             ONE_MORE_BYTE (byte_after_cr);
3596           MAYBE_FINISH_COMPOSITION ();
3597           charset = CHARSET_FROM_ID (charset_ascii);
3598           break;
3599
3600         case ISO_control_1:
3601           goto invalid_code;
3602
3603         case ISO_shift_out:
3604           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3605               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3606             goto invalid_code;
3607           CODING_ISO_INVOCATION (coding, 0) = 1;
3608           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3609           continue;
3610
3611         case ISO_shift_in:
3612           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3613             goto invalid_code;
3614           CODING_ISO_INVOCATION (coding, 0) = 0;
3615           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3616           continue;
3617
3618         case ISO_single_shift_2_7:
3619           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3620             goto invalid_code;
3621         case ISO_single_shift_2:
3622           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3623             goto invalid_code;
3624           /* SS2 is handled as an escape sequence of ESC 'N' */
3625           c1 = 'N';
3626           goto label_escape_sequence;
3627
3628         case ISO_single_shift_3:
3629           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3630             goto invalid_code;
3631           /* SS2 is handled as an escape sequence of ESC 'O' */
3632           c1 = 'O';
3633           goto label_escape_sequence;
3634
3635         case ISO_control_sequence_introducer:
3636           /* CSI is handled as an escape sequence of ESC '[' ...  */
3637           c1 = '[';
3638           goto label_escape_sequence;
3639
3640         case ISO_escape:
3641           ONE_MORE_BYTE (c1);
3642         label_escape_sequence:
3643           /* Escape sequences handled here are invocation,
3644              designation, direction specification, and character
3645              composition specification.  */
3646           switch (c1)
3647             {
3648             case '&':           /* revision of following character set */
3649               ONE_MORE_BYTE (c1);
3650               if (!(c1 >= '@' && c1 <= '~'))
3651                 goto invalid_code;
3652               ONE_MORE_BYTE (c1);
3653               if (c1 != ISO_CODE_ESC)
3654                 goto invalid_code;
3655               ONE_MORE_BYTE (c1);
3656               goto label_escape_sequence;
3657
3658             case '$':           /* designation of 2-byte character set */
3659               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3660                 goto invalid_code;
3661               {
3662                 int reg, chars96;
3663
3664                 ONE_MORE_BYTE (c1);
3665                 if (c1 >= '@' && c1 <= 'B')
3666                   {     /* designation of JISX0208.1978, GB2312.1980,
3667                            or JISX0208.1980 */
3668                     reg = 0, chars96 = 0;
3669                   }
3670                 else if (c1 >= 0x28 && c1 <= 0x2B)
3671                   { /* designation of DIMENSION2_CHARS94 character set */
3672                     reg = c1 - 0x28, chars96 = 0;
3673                     ONE_MORE_BYTE (c1);
3674                   }
3675                 else if (c1 >= 0x2C && c1 <= 0x2F)
3676                   { /* designation of DIMENSION2_CHARS96 character set */
3677                     reg = c1 - 0x2C, chars96 = 1;
3678                     ONE_MORE_BYTE (c1);
3679                   }
3680                 else
3681                   goto invalid_code;
3682                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3683                 /* We must update these variables now.  */
3684                 if (reg == 0)
3685                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3686                 else if (reg == 1)
3687                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3688                 if (chars96 < 0)
3689                   goto invalid_code;
3690               }
3691               continue;
3692
3693             case 'n':           /* invocation of locking-shift-2 */
3694               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3695                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3696                 goto invalid_code;
3697               CODING_ISO_INVOCATION (coding, 0) = 2;
3698               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3699               continue;
3700
3701             case 'o':           /* invocation of locking-shift-3 */
3702               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3703                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3704                 goto invalid_code;
3705               CODING_ISO_INVOCATION (coding, 0) = 3;
3706               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3707               continue;
3708
3709             case 'N':           /* invocation of single-shift-2 */
3710               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3711                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3712                 goto invalid_code;
3713               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3714               if (charset_id_2 < 0)
3715                 charset = CHARSET_FROM_ID (charset_ascii);
3716               else
3717                 charset = CHARSET_FROM_ID (charset_id_2);
3718               ONE_MORE_BYTE (c1);
3719               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3720                 goto invalid_code;
3721               break;
3722
3723             case 'O':           /* invocation of single-shift-3 */
3724               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3725                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3726                 goto invalid_code;
3727               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3728               if (charset_id_3 < 0)
3729                 charset = CHARSET_FROM_ID (charset_ascii);
3730               else
3731                 charset = CHARSET_FROM_ID (charset_id_3);
3732               ONE_MORE_BYTE (c1);
3733               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3734                 goto invalid_code;
3735               break;
3736
3737             case '0': case '2': case '3': case '4': /* start composition */
3738               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3739                 goto invalid_code;
3740               if (last_id != charset_ascii)
3741                 {
3742                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3743                   last_id = charset_ascii;
3744                   last_offset = char_offset;
3745                 }
3746               DECODE_COMPOSITION_START (c1);
3747               continue;
3748
3749             case '1':           /* end composition */
3750               if (cmp_status->state == COMPOSING_NO)
3751                 goto invalid_code;
3752               DECODE_COMPOSITION_END ();
3753               continue;
3754
3755             case '[':           /* specification of direction */
3756               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3757                 goto invalid_code;
3758               /* For the moment, nested direction is not supported.
3759                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3760                  left-to-right, and nonzero means right-to-left.  */
3761               ONE_MORE_BYTE (c1);
3762               switch (c1)
3763                 {
3764                 case ']':       /* end of the current direction */
3765                   coding->mode &= ~CODING_MODE_DIRECTION;
3766
3767                 case '0':       /* end of the current direction */
3768                 case '1':       /* start of left-to-right direction */
3769                   ONE_MORE_BYTE (c1);
3770                   if (c1 == ']')
3771                     coding->mode &= ~CODING_MODE_DIRECTION;
3772                   else
3773                     goto invalid_code;
3774                   break;
3775
3776                 case '2':       /* start of right-to-left direction */
3777                   ONE_MORE_BYTE (c1);
3778                   if (c1 == ']')
3779                     coding->mode |= CODING_MODE_DIRECTION;
3780                   else
3781                     goto invalid_code;
3782                   break;
3783
3784                 default:
3785                   goto invalid_code;
3786                 }
3787               continue;
3788
3789             case '%':
3790               ONE_MORE_BYTE (c1);
3791               if (c1 == '/')
3792                 {
3793                   /* CTEXT extended segment:
3794                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3795                      We keep these bytes as is for the moment.
3796                      They may be decoded by post-read-conversion.  */
3797                   int dim, M, L;
3798                   int size;
3799
3800                   ONE_MORE_BYTE (dim);
3801                   if (dim < '0' || dim > '4')
3802                     goto invalid_code;
3803                   ONE_MORE_BYTE (M);
3804                   if (M < 128)
3805                     goto invalid_code;
3806                   ONE_MORE_BYTE (L);
3807                   if (L < 128)
3808                     goto invalid_code;
3809                   size = ((M - 128) * 128) + (L - 128);
3810                   if (charbuf + 6 > charbuf_end)
3811                     goto break_loop;
3812                   *charbuf++ = ISO_CODE_ESC;
3813                   *charbuf++ = '%';
3814                   *charbuf++ = '/';
3815                   *charbuf++ = dim;
3816                   *charbuf++ = BYTE8_TO_CHAR (M);
3817                   *charbuf++ = BYTE8_TO_CHAR (L);
3818                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3819                 }
3820               else if (c1 == 'G')
3821                 {
3822                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3823                      ESC % G --UTF-8-BYTES-- ESC % @
3824                      We keep these bytes as is for the moment.
3825                      They may be decoded by post-read-conversion.  */
3826                   if (charbuf + 3 > charbuf_end)
3827                     goto break_loop;
3828                   *charbuf++ = ISO_CODE_ESC;
3829                   *charbuf++ = '%';
3830                   *charbuf++ = 'G';
3831                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3832                 }
3833               else
3834                 goto invalid_code;
3835               continue;
3836               break;
3837
3838             default:
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3840                 goto invalid_code;
3841               {
3842                 int reg, chars96;
3843
3844                 if (c1 >= 0x28 && c1 <= 0x2B)
3845                   { /* designation of DIMENSION1_CHARS94 character set */
3846                     reg = c1 - 0x28, chars96 = 0;
3847                     ONE_MORE_BYTE (c1);
3848                   }
3849                 else if (c1 >= 0x2C && c1 <= 0x2F)
3850                   { /* designation of DIMENSION1_CHARS96 character set */
3851                     reg = c1 - 0x2C, chars96 = 1;
3852                     ONE_MORE_BYTE (c1);
3853                   }
3854                 else
3855                   goto invalid_code;
3856                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3857                 /* We must update these variables now.  */
3858                 if (reg == 0)
3859                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3860                 else if (reg == 1)
3861                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3862                 if (chars96 < 0)
3863                   goto invalid_code;
3864               }
3865               continue;
3866             }
3867           break;
3868
3869         default:
3870           emacs_abort ();
3871         }
3872
3873       if (cmp_status->state == COMPOSING_NO
3874           && charset->id != charset_ascii
3875           && last_id != charset->id)
3876         {
3877           if (last_id != charset_ascii)
3878             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3879           last_id = charset->id;
3880           last_offset = char_offset;
3881         }
3882
3883       /* Now we know CHARSET and 1st position code C1 of a character.
3884          Produce a decoded character while getting 2nd and 3rd
3885          position codes C2, C3 if necessary.  */
3886       if (CHARSET_DIMENSION (charset) > 1)
3887         {
3888           ONE_MORE_BYTE (c2);
3889           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3890               || ((c1 & 0x80) != (c2 & 0x80)))
3891             /* C2 is not in a valid range.  */
3892             goto invalid_code;
3893           if (CHARSET_DIMENSION (charset) == 2)
3894             c1 = (c1 << 8) | c2;
3895           else
3896             {
3897               ONE_MORE_BYTE (c3);
3898               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3899                   || ((c1 & 0x80) != (c3 & 0x80)))
3900                 /* C3 is not in a valid range.  */
3901                 goto invalid_code;
3902               c1 = (c1 << 16) | (c2 << 8) | c2;
3903             }
3904         }
3905       c1 &= 0x7F7F7F;
3906       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3907       if (c < 0)
3908         {
3909           MAYBE_FINISH_COMPOSITION ();
3910           for (; src_base < src; src_base++, char_offset++)
3911             {
3912               if (ASCII_BYTE_P (*src_base))
3913                 *charbuf++ = *src_base;
3914               else
3915                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3916             }
3917         }
3918       else if (cmp_status->state == COMPOSING_NO)
3919         {
3920           *charbuf++ = c;
3921           char_offset++;
3922         }
3923       else if ((cmp_status->state == COMPOSING_CHAR
3924                 ? cmp_status->nchars
3925                 : cmp_status->ncomps)
3926                >= MAX_COMPOSITION_COMPONENTS)
3927         {
3928           /* Too long composition.  */
3929           MAYBE_FINISH_COMPOSITION ();
3930           *charbuf++ = c;
3931           char_offset++;
3932         }
3933       else
3934         STORE_COMPOSITION_CHAR (c);
3935       continue;
3936
3937     invalid_code:
3938       MAYBE_FINISH_COMPOSITION ();
3939       src = src_base;
3940       consumed_chars = consumed_chars_base;
3941       ONE_MORE_BYTE (c);
3942       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3943       char_offset++;
3944       coding->errors++;
3945       /* Reset the invocation and designation status to the safest
3946          one; i.e. designate ASCII to the graphic register 0, and
3947          invoke that register to the graphic plane 0.  This typically
3948          helps the case that an designation sequence for ASCII "ESC (
3949          B" is somehow broken (e.g. broken by a newline).  */
3950       CODING_ISO_INVOCATION (coding, 0) = 0;
3951       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3952       charset_id_0 = charset_ascii;
3953       continue;
3954
3955     break_loop:
3956       break;
3957     }
3958
3959  no_more_source:
3960   if (cmp_status->state != COMPOSING_NO)
3961     {
3962       if (coding->mode & CODING_MODE_LAST_BLOCK)
3963         MAYBE_FINISH_COMPOSITION ();
3964       else
3965         {
3966           charbuf -= cmp_status->length;
3967           for (i = 0; i < cmp_status->length; i++)
3968             cmp_status->carryover[i] = charbuf[i];
3969         }
3970     }
3971   else if (last_id != charset_ascii)
3972     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3973   coding->consumed_char += consumed_chars_base;
3974   coding->consumed = src_base - coding->source;
3975   coding->charbuf_used = charbuf - coding->charbuf;
3976 }
3977
3978
3979 /* ISO2022 encoding stuff.  */
3980
3981 /*
3982    It is not enough to say just "ISO2022" on encoding, we have to
3983    specify more details.  In Emacs, each coding system of ISO2022
3984    variant has the following specifications:
3985         1. Initial designation to G0 thru G3.
3986         2. Allows short-form designation?
3987         3. ASCII should be designated to G0 before control characters?
3988         4. ASCII should be designated to G0 at end of line?
3989         5. 7-bit environment or 8-bit environment?
3990         6. Use locking-shift?
3991         7. Use Single-shift?
3992    And the following two are only for Japanese:
3993         8. Use ASCII in place of JIS0201-1976-Roman?
3994         9. Use JISX0208-1983 in place of JISX0208-1978?
3995    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3996    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3997    details.
3998 */
3999
4000 /* Produce codes (escape sequence) for designating CHARSET to graphic
4001    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4002    '@', 'A', or 'B' and the coding system CODING allows, produce
4003    designation sequence of short-form.  */
4004
4005 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4006   do {                                                                  \
4007     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4008     const char *intermediate_char_94 = "()*+";                          \
4009     const char *intermediate_char_96 = ",-./";                          \
4010     int revision = -1;                                                  \
4011                                                                         \
4012     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4013       revision = CHARSET_ISO_REVISION (charset);                        \
4014                                                                         \
4015     if (revision >= 0)                                                  \
4016       {                                                                 \
4017         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4018         EMIT_ONE_BYTE ('@' + revision);                                 \
4019       }                                                                 \
4020     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4021     if (CHARSET_DIMENSION (charset) == 1)                               \
4022       {                                                                 \
4023         int b;                                                          \
4024         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4025           b = intermediate_char_94[reg];                                \
4026         else                                                            \
4027           b = intermediate_char_96[reg];                                \
4028         EMIT_ONE_ASCII_BYTE (b);                                        \
4029       }                                                                 \
4030     else                                                                \
4031       {                                                                 \
4032         EMIT_ONE_ASCII_BYTE ('$');                                      \
4033         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4034           {                                                             \
4035             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4036                 || reg != 0                                             \
4037                 || final_char < '@' || final_char > 'B')                \
4038               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4039           }                                                             \
4040         else                                                            \
4041           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4042       }                                                                 \
4043     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4044                                                                         \
4045     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4046   } while (0)
4047
4048
4049 /* The following two macros produce codes (control character or escape
4050    sequence) for ISO2022 single-shift functions (single-shift-2 and
4051    single-shift-3).  */
4052
4053 #define ENCODE_SINGLE_SHIFT_2                                           \
4054   do {                                                                  \
4055     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4056       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4057     else                                                                \
4058       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4059     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4060   } while (0)
4061
4062
4063 #define ENCODE_SINGLE_SHIFT_3                                           \
4064   do {                                                                  \
4065     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4066       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4067     else                                                                \
4068       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4069     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4070   } while (0)
4071
4072
4073 /* The following four macros produce codes (control character or
4074    escape sequence) for ISO2022 locking-shift functions (shift-in,
4075    shift-out, locking-shift-2, and locking-shift-3).  */
4076
4077 #define ENCODE_SHIFT_IN                                 \
4078   do {                                                  \
4079     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4080     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4081   } while (0)
4082
4083
4084 #define ENCODE_SHIFT_OUT                                \
4085   do {                                                  \
4086     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4087     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4088   } while (0)
4089
4090
4091 #define ENCODE_LOCKING_SHIFT_2                          \
4092   do {                                                  \
4093     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4094     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4095   } while (0)
4096
4097
4098 #define ENCODE_LOCKING_SHIFT_3                          \
4099   do {                                                  \
4100     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4101     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4102   } while (0)
4103
4104
4105 /* Produce codes for a DIMENSION1 character whose character set is
4106    CHARSET and whose position-code is C1.  Designation and invocation
4107    sequences are also produced in advance if necessary.  */
4108
4109 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4110   do {                                                                  \
4111     int id = CHARSET_ID (charset);                                      \
4112                                                                         \
4113     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4114         && id == charset_ascii)                                         \
4115       {                                                                 \
4116         id = charset_jisx0201_roman;                                    \
4117         charset = CHARSET_FROM_ID (id);                                 \
4118       }                                                                 \
4119                                                                         \
4120     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4121       {                                                                 \
4122         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4123           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4124         else                                                            \
4125           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4126         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4127         break;                                                          \
4128       }                                                                 \
4129     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4130       {                                                                 \
4131         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4132         break;                                                          \
4133       }                                                                 \
4134     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4135       {                                                                 \
4136         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4137         break;                                                          \
4138       }                                                                 \
4139     else                                                                \
4140       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4141          must invoke it, or, at first, designate it to some graphic     \
4142          register.  Then repeat the loop to actually produce the        \
4143          character.  */                                                 \
4144       dst = encode_invocation_designation (charset, coding, dst,        \
4145                                            &produced_chars);            \
4146   } while (1)
4147
4148
4149 /* Produce codes for a DIMENSION2 character whose character set is
4150    CHARSET and whose position-codes are C1 and C2.  Designation and
4151    invocation codes are also produced in advance if necessary.  */
4152
4153 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4154   do {                                                                  \
4155     int id = CHARSET_ID (charset);                                      \
4156                                                                         \
4157     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4158         && id == charset_jisx0208)                                      \
4159       {                                                                 \
4160         id = charset_jisx0208_1978;                                     \
4161         charset = CHARSET_FROM_ID (id);                                 \
4162       }                                                                 \
4163                                                                         \
4164     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4165       {                                                                 \
4166         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4167           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4168         else                                                            \
4169           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4170         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4171         break;                                                          \
4172       }                                                                 \
4173     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4174       {                                                                 \
4175         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4176         break;                                                          \
4177       }                                                                 \
4178     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4179       {                                                                 \
4180         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4181         break;                                                          \
4182       }                                                                 \
4183     else                                                                \
4184       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4185          must invoke it, or, at first, designate it to some graphic     \
4186          register.  Then repeat the loop to actually produce the        \
4187          character.  */                                                 \
4188       dst = encode_invocation_designation (charset, coding, dst,        \
4189                                            &produced_chars);            \
4190   } while (1)
4191
4192
4193 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4194   do {                                                                     \
4195     unsigned code;                                                         \
4196     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4197                                                                            \
4198     if (CHARSET_DIMENSION (charset) == 1)                                  \
4199       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4200     else                                                                   \
4201       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4202   } while (0)
4203
4204
4205 /* Produce designation and invocation codes at a place pointed by DST
4206    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4207    Return new DST.  */
4208
4209 static unsigned char *
4210 encode_invocation_designation (struct charset *charset,
4211                                struct coding_system *coding,
4212                                unsigned char *dst, ptrdiff_t *p_nchars)
4213 {
4214   bool multibytep = coding->dst_multibyte;
4215   ptrdiff_t produced_chars = *p_nchars;
4216   int reg;                      /* graphic register number */
4217   int id = CHARSET_ID (charset);
4218
4219   /* At first, check designations.  */
4220   for (reg = 0; reg < 4; reg++)
4221     if (id == CODING_ISO_DESIGNATION (coding, reg))
4222       break;
4223
4224   if (reg >= 4)
4225     {
4226       /* CHARSET is not yet designated to any graphic registers.  */
4227       /* At first check the requested designation.  */
4228       reg = CODING_ISO_REQUEST (coding, id);
4229       if (reg < 0)
4230         /* Since CHARSET requests no special designation, designate it
4231            to graphic register 0.  */
4232         reg = 0;
4233
4234       ENCODE_DESIGNATION (charset, reg, coding);
4235     }
4236
4237   if (CODING_ISO_INVOCATION (coding, 0) != reg
4238       && CODING_ISO_INVOCATION (coding, 1) != reg)
4239     {
4240       /* Since the graphic register REG is not invoked to any graphic
4241          planes, invoke it to graphic plane 0.  */
4242       switch (reg)
4243         {
4244         case 0:                 /* graphic register 0 */
4245           ENCODE_SHIFT_IN;
4246           break;
4247
4248         case 1:                 /* graphic register 1 */
4249           ENCODE_SHIFT_OUT;
4250           break;
4251
4252         case 2:                 /* graphic register 2 */
4253           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4254             ENCODE_SINGLE_SHIFT_2;
4255           else
4256             ENCODE_LOCKING_SHIFT_2;
4257           break;
4258
4259         case 3:                 /* graphic register 3 */
4260           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4261             ENCODE_SINGLE_SHIFT_3;
4262           else
4263             ENCODE_LOCKING_SHIFT_3;
4264           break;
4265         }
4266     }
4267
4268   *p_nchars = produced_chars;
4269   return dst;
4270 }
4271
4272
4273 /* Produce codes for designation and invocation to reset the graphic
4274    planes and registers to initial state.  */
4275 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4276   do {                                                                  \
4277     int reg;                                                            \
4278     struct charset *charset;                                            \
4279                                                                         \
4280     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4281       ENCODE_SHIFT_IN;                                                  \
4282     for (reg = 0; reg < 4; reg++)                                       \
4283       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4284           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4285               != CODING_ISO_INITIAL (coding, reg)))                     \
4286         {                                                               \
4287           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4288           ENCODE_DESIGNATION (charset, reg, coding);                    \
4289         }                                                               \
4290   } while (0)
4291
4292
4293 /* Produce designation sequences of charsets in the line started from
4294    CHARBUF to a place pointed by DST, and return the number of
4295    produced bytes.  DST should not directly point a buffer text area
4296    which may be relocated by char_charset call.
4297
4298    If the current block ends before any end-of-line, we may fail to
4299    find all the necessary designations.  */
4300
4301 static ptrdiff_t
4302 encode_designation_at_bol (struct coding_system *coding,
4303                            int *charbuf, int *charbuf_end,
4304                            unsigned char *dst)
4305 {
4306   unsigned char *orig = dst;
4307   struct charset *charset;
4308   /* Table of charsets to be designated to each graphic register.  */
4309   int r[4];
4310   int c, found = 0, reg;
4311   ptrdiff_t produced_chars = 0;
4312   bool multibytep = coding->dst_multibyte;
4313   Lisp_Object attrs;
4314   Lisp_Object charset_list;
4315
4316   attrs = CODING_ID_ATTRS (coding->id);
4317   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4318   if (EQ (charset_list, Qiso_2022))
4319     charset_list = Viso_2022_charset_list;
4320
4321   for (reg = 0; reg < 4; reg++)
4322     r[reg] = -1;
4323
4324   while (charbuf < charbuf_end && found < 4)
4325     {
4326       int id;
4327
4328       c = *charbuf++;
4329       if (c == '\n')
4330         break;
4331       charset = char_charset (c, charset_list, NULL);
4332       id = CHARSET_ID (charset);
4333       reg = CODING_ISO_REQUEST (coding, id);
4334       if (reg >= 0 && r[reg] < 0)
4335         {
4336           found++;
4337           r[reg] = id;
4338         }
4339     }
4340
4341   if (found)
4342     {
4343       for (reg = 0; reg < 4; reg++)
4344         if (r[reg] >= 0
4345             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4346           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4347     }
4348
4349   return dst - orig;
4350 }
4351
4352 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4353
4354 static bool
4355 encode_coding_iso_2022 (struct coding_system *coding)
4356 {
4357   bool multibytep = coding->dst_multibyte;
4358   int *charbuf = coding->charbuf;
4359   int *charbuf_end = charbuf + coding->charbuf_used;
4360   unsigned char *dst = coding->destination + coding->produced;
4361   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4362   int safe_room = 16;
4363   bool bol_designation
4364     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4365        && CODING_ISO_BOL (coding));
4366   ptrdiff_t produced_chars = 0;
4367   Lisp_Object attrs, eol_type, charset_list;
4368   bool ascii_compatible;
4369   int c;
4370   int preferred_charset_id = -1;
4371
4372   CODING_GET_INFO (coding, attrs, charset_list);
4373   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4374   if (VECTORP (eol_type))
4375     eol_type = Qunix;
4376
4377   setup_iso_safe_charsets (attrs);
4378   /* Charset list may have been changed.  */
4379   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4380   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4381
4382   ascii_compatible
4383     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4384        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4385                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4386
4387   while (charbuf < charbuf_end)
4388     {
4389       ASSURE_DESTINATION (safe_room);
4390
4391       if (bol_designation)
4392         {
4393           /* We have to produce designation sequences if any now.  */
4394           unsigned char desig_buf[16];
4395           int nbytes;
4396           ptrdiff_t offset;
4397
4398           charset_map_loaded = 0;
4399           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4400                                               desig_buf);
4401           if (charset_map_loaded
4402               && (offset = coding_change_destination (coding)))
4403             {
4404               dst += offset;
4405               dst_end += offset;
4406             }
4407           memcpy (dst, desig_buf, nbytes);
4408           dst += nbytes;
4409           /* We are sure that designation sequences are all ASCII bytes.  */
4410           produced_chars += nbytes;
4411           bol_designation = 0;
4412           ASSURE_DESTINATION (safe_room);
4413         }
4414
4415       c = *charbuf++;
4416
4417       if (c < 0)
4418         {
4419           /* Handle an annotation.  */
4420           switch (*charbuf)
4421             {
4422             case CODING_ANNOTATE_COMPOSITION_MASK:
4423               /* Not yet implemented.  */
4424               break;
4425             case CODING_ANNOTATE_CHARSET_MASK:
4426               preferred_charset_id = charbuf[2];
4427               if (preferred_charset_id >= 0
4428                   && NILP (Fmemq (make_number (preferred_charset_id),
4429                                   charset_list)))
4430                 preferred_charset_id = -1;
4431               break;
4432             default:
4433               emacs_abort ();
4434             }
4435           charbuf += -c - 1;
4436           continue;
4437         }
4438
4439       /* Now encode the character C.  */
4440       if (c < 0x20 || c == 0x7F)
4441         {
4442           if (c == '\n'
4443               || (c == '\r' && EQ (eol_type, Qmac)))
4444             {
4445               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4446                 ENCODE_RESET_PLANE_AND_REGISTER ();
4447               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4448                 {
4449                   int i;
4450
4451                   for (i = 0; i < 4; i++)
4452                     CODING_ISO_DESIGNATION (coding, i)
4453                       = CODING_ISO_INITIAL (coding, i);
4454                 }
4455               bol_designation = ((CODING_ISO_FLAGS (coding)
4456                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4457                                  != 0);
4458             }
4459           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4460             ENCODE_RESET_PLANE_AND_REGISTER ();
4461           EMIT_ONE_ASCII_BYTE (c);
4462         }
4463       else if (ASCII_CHAR_P (c))
4464         {
4465           if (ascii_compatible)
4466             EMIT_ONE_ASCII_BYTE (c);
4467           else
4468             {
4469               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4470               ENCODE_ISO_CHARACTER (charset, c);
4471             }
4472         }
4473       else if (CHAR_BYTE8_P (c))
4474         {
4475           c = CHAR_TO_BYTE8 (c);
4476           EMIT_ONE_BYTE (c);
4477         }
4478       else
4479         {
4480           struct charset *charset;
4481
4482           if (preferred_charset_id >= 0)
4483             {
4484               bool result;
4485
4486               charset = CHARSET_FROM_ID (preferred_charset_id);
4487               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4488               if (! result)
4489                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4490                                      NULL, charset);
4491             }
4492           else
4493             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4494                                  NULL, charset);
4495           if (!charset)
4496             {
4497               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4498                 {
4499                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4500                   charset = CHARSET_FROM_ID (charset_ascii);
4501                 }
4502               else
4503                 {
4504                   c = coding->default_char;
4505                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4506                                        charset_list, NULL, charset);
4507                 }
4508             }
4509           ENCODE_ISO_CHARACTER (charset, c);
4510         }
4511     }
4512
4513   if (coding->mode & CODING_MODE_LAST_BLOCK
4514       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4515     {
4516       ASSURE_DESTINATION (safe_room);
4517       ENCODE_RESET_PLANE_AND_REGISTER ();
4518     }
4519   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4520   CODING_ISO_BOL (coding) = bol_designation;
4521   coding->produced_char += produced_chars;
4522   coding->produced = dst - coding->destination;
4523   return 0;
4524 }
4525
4526 \f
4527 /*** 8,9. SJIS and BIG5 handlers ***/
4528
4529 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4530    quite widely.  So, for the moment, Emacs supports them in the bare
4531    C code.  But, in the future, they may be supported only by CCL.  */
4532
4533 /* SJIS is a coding system encoding three character sets: ASCII, right
4534    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4535    as is.  A character of charset katakana-jisx0201 is encoded by
4536    "position-code + 0x80".  A character of charset japanese-jisx0208
4537    is encoded in 2-byte but two position-codes are divided and shifted
4538    so that it fit in the range below.
4539
4540    --- CODE RANGE of SJIS ---
4541    (character set)      (range)
4542    ASCII                0x00 .. 0x7F
4543    KATAKANA-JISX0201    0xA0 .. 0xDF
4544    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4545             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4546    -------------------------------
4547
4548 */
4549
4550 /* BIG5 is a coding system encoding two character sets: ASCII and
4551    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4552    character set and is encoded in two-byte.
4553
4554    --- CODE RANGE of BIG5 ---
4555    (character set)      (range)
4556    ASCII                0x00 .. 0x7F
4557    Big5 (1st byte)      0xA1 .. 0xFE
4558         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4559    --------------------------
4560
4561   */
4562
4563 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4564    Return true if a text is encoded in SJIS.  */
4565
4566 static bool
4567 detect_coding_sjis (struct coding_system *coding,
4568                     struct coding_detection_info *detect_info)
4569 {
4570   const unsigned char *src = coding->source, *src_base;
4571   const unsigned char *src_end = coding->source + coding->src_bytes;
4572   bool multibytep = coding->src_multibyte;
4573   ptrdiff_t consumed_chars = 0;
4574   int found = 0;
4575   int c;
4576   Lisp_Object attrs, charset_list;
4577   int max_first_byte_of_2_byte_code;
4578
4579   CODING_GET_INFO (coding, attrs, charset_list);
4580   max_first_byte_of_2_byte_code
4581     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4582
4583   detect_info->checked |= CATEGORY_MASK_SJIS;
4584   /* A coding system of this category is always ASCII compatible.  */
4585   src += coding->head_ascii;
4586
4587   while (1)
4588     {
4589       src_base = src;
4590       ONE_MORE_BYTE (c);
4591       if (c < 0x80)
4592         continue;
4593       if ((c >= 0x81 && c <= 0x9F)
4594           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4595         {
4596           ONE_MORE_BYTE (c);
4597           if (c < 0x40 || c == 0x7F || c > 0xFC)
4598             break;
4599           found = CATEGORY_MASK_SJIS;
4600         }
4601       else if (c >= 0xA0 && c < 0xE0)
4602         found = CATEGORY_MASK_SJIS;
4603       else
4604         break;
4605     }
4606   detect_info->rejected |= CATEGORY_MASK_SJIS;
4607   return 0;
4608
4609  no_more_source:
4610   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4611     {
4612       detect_info->rejected |= CATEGORY_MASK_SJIS;
4613       return 0;
4614     }
4615   detect_info->found |= found;
4616   return 1;
4617 }
4618
4619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4620    Return true if a text is encoded in BIG5.  */
4621
4622 static bool
4623 detect_coding_big5 (struct coding_system *coding,
4624                     struct coding_detection_info *detect_info)
4625 {
4626   const unsigned char *src = coding->source, *src_base;
4627   const unsigned char *src_end = coding->source + coding->src_bytes;
4628   bool multibytep = coding->src_multibyte;
4629   ptrdiff_t consumed_chars = 0;
4630   int found = 0;
4631   int c;
4632
4633   detect_info->checked |= CATEGORY_MASK_BIG5;
4634   /* A coding system of this category is always ASCII compatible.  */
4635   src += coding->head_ascii;
4636
4637   while (1)
4638     {
4639       src_base = src;
4640       ONE_MORE_BYTE (c);
4641       if (c < 0x80)
4642         continue;
4643       if (c >= 0xA1)
4644         {
4645           ONE_MORE_BYTE (c);
4646           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4647             return 0;
4648           found = CATEGORY_MASK_BIG5;
4649         }
4650       else
4651         break;
4652     }
4653   detect_info->rejected |= CATEGORY_MASK_BIG5;
4654   return 0;
4655
4656  no_more_source:
4657   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4658     {
4659       detect_info->rejected |= CATEGORY_MASK_BIG5;
4660       return 0;
4661     }
4662   detect_info->found |= found;
4663   return 1;
4664 }
4665
4666 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4667
4668 static void
4669 decode_coding_sjis (struct coding_system *coding)
4670 {
4671   const unsigned char *src = coding->source + coding->consumed;
4672   const unsigned char *src_end = coding->source + coding->src_bytes;
4673   const unsigned char *src_base;
4674   int *charbuf = coding->charbuf + coding->charbuf_used;
4675   /* We may produce one charset annotation in one loop and one more at
4676      the end.  */
4677   int *charbuf_end
4678     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4679   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4680   bool multibytep = coding->src_multibyte;
4681   struct charset *charset_roman, *charset_kanji, *charset_kana;
4682   struct charset *charset_kanji2;
4683   Lisp_Object attrs, charset_list, val;
4684   ptrdiff_t char_offset = coding->produced_char;
4685   ptrdiff_t last_offset = char_offset;
4686   int last_id = charset_ascii;
4687   bool eol_dos
4688     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4689   int byte_after_cr = -1;
4690
4691   CODING_GET_INFO (coding, attrs, charset_list);
4692
4693   val = charset_list;
4694   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4695   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4696   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4697   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4698
4699   while (1)
4700     {
4701       int c, c1;
4702       struct charset *charset;
4703
4704       src_base = src;
4705       consumed_chars_base = consumed_chars;
4706
4707       if (charbuf >= charbuf_end)
4708         {
4709           if (byte_after_cr >= 0)
4710             src_base--;
4711           break;
4712         }
4713
4714       if (byte_after_cr >= 0)
4715         c = byte_after_cr, byte_after_cr = -1;
4716       else
4717         ONE_MORE_BYTE (c);
4718       if (c < 0)
4719         goto invalid_code;
4720       if (c < 0x80)
4721         {
4722           if (eol_dos && c == '\r')
4723             ONE_MORE_BYTE (byte_after_cr);
4724           charset = charset_roman;
4725         }
4726       else if (c == 0x80 || c == 0xA0)
4727         goto invalid_code;
4728       else if (c >= 0xA1 && c <= 0xDF)
4729         {
4730           /* SJIS -> JISX0201-Kana */
4731           c &= 0x7F;
4732           charset = charset_kana;
4733         }
4734       else if (c <= 0xEF)
4735         {
4736           /* SJIS -> JISX0208 */
4737           ONE_MORE_BYTE (c1);
4738           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4739             goto invalid_code;
4740           c = (c << 8) | c1;
4741           SJIS_TO_JIS (c);
4742           charset = charset_kanji;
4743         }
4744       else if (c <= 0xFC && charset_kanji2)
4745         {
4746           /* SJIS -> JISX0213-2 */
4747           ONE_MORE_BYTE (c1);
4748           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4749             goto invalid_code;
4750           c = (c << 8) | c1;
4751           SJIS_TO_JIS2 (c);
4752           charset = charset_kanji2;
4753         }
4754       else
4755         goto invalid_code;
4756       if (charset->id != charset_ascii
4757           && last_id != charset->id)
4758         {
4759           if (last_id != charset_ascii)
4760             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4761           last_id = charset->id;
4762           last_offset = char_offset;
4763         }
4764       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4765       *charbuf++ = c;
4766       char_offset++;
4767       continue;
4768
4769     invalid_code:
4770       src = src_base;
4771       consumed_chars = consumed_chars_base;
4772       ONE_MORE_BYTE (c);
4773       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4774       char_offset++;
4775       coding->errors++;
4776     }
4777
4778  no_more_source:
4779   if (last_id != charset_ascii)
4780     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4781   coding->consumed_char += consumed_chars_base;
4782   coding->consumed = src_base - coding->source;
4783   coding->charbuf_used = charbuf - coding->charbuf;
4784 }
4785
4786 static void
4787 decode_coding_big5 (struct coding_system *coding)
4788 {
4789   const unsigned char *src = coding->source + coding->consumed;
4790   const unsigned char *src_end = coding->source + coding->src_bytes;
4791   const unsigned char *src_base;
4792   int *charbuf = coding->charbuf + coding->charbuf_used;
4793   /* We may produce one charset annotation in one loop and one more at
4794      the end.  */
4795   int *charbuf_end
4796     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4797   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4798   bool multibytep = coding->src_multibyte;
4799   struct charset *charset_roman, *charset_big5;
4800   Lisp_Object attrs, charset_list, val;
4801   ptrdiff_t char_offset = coding->produced_char;
4802   ptrdiff_t last_offset = char_offset;
4803   int last_id = charset_ascii;
4804   bool eol_dos
4805     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4806   int byte_after_cr = -1;
4807
4808   CODING_GET_INFO (coding, attrs, charset_list);
4809   val = charset_list;
4810   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4811   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4812
4813   while (1)
4814     {
4815       int c, c1;
4816       struct charset *charset;
4817
4818       src_base = src;
4819       consumed_chars_base = consumed_chars;
4820
4821       if (charbuf >= charbuf_end)
4822         {
4823           if (byte_after_cr >= 0)
4824             src_base--;
4825           break;
4826         }
4827
4828       if (byte_after_cr >= 0)
4829         c = byte_after_cr, byte_after_cr = -1;
4830       else
4831         ONE_MORE_BYTE (c);
4832
4833       if (c < 0)
4834         goto invalid_code;
4835       if (c < 0x80)
4836         {
4837           if (eol_dos && c == '\r')
4838             ONE_MORE_BYTE (byte_after_cr);
4839           charset = charset_roman;
4840         }
4841       else
4842         {
4843           /* BIG5 -> Big5 */
4844           if (c < 0xA1 || c > 0xFE)
4845             goto invalid_code;
4846           ONE_MORE_BYTE (c1);
4847           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4848             goto invalid_code;
4849           c = c << 8 | c1;
4850           charset = charset_big5;
4851         }
4852       if (charset->id != charset_ascii
4853           && last_id != charset->id)
4854         {
4855           if (last_id != charset_ascii)
4856             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4857           last_id = charset->id;
4858           last_offset = char_offset;
4859         }
4860       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4861       *charbuf++ = c;
4862       char_offset++;
4863       continue;
4864
4865     invalid_code:
4866       src = src_base;
4867       consumed_chars = consumed_chars_base;
4868       ONE_MORE_BYTE (c);
4869       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4870       char_offset++;
4871       coding->errors++;
4872     }
4873
4874  no_more_source:
4875   if (last_id != charset_ascii)
4876     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4877   coding->consumed_char += consumed_chars_base;
4878   coding->consumed = src_base - coding->source;
4879   coding->charbuf_used = charbuf - coding->charbuf;
4880 }
4881
4882 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4883    This function can encode charsets `ascii', `katakana-jisx0201',
4884    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4885    are sure that all these charsets are registered as official charset
4886    (i.e. do not have extended leading-codes).  Characters of other
4887    charsets are produced without any encoding.  */
4888
4889 static bool
4890 encode_coding_sjis (struct coding_system *coding)
4891 {
4892   bool multibytep = coding->dst_multibyte;
4893   int *charbuf = coding->charbuf;
4894   int *charbuf_end = charbuf + coding->charbuf_used;
4895   unsigned char *dst = coding->destination + coding->produced;
4896   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4897   int safe_room = 4;
4898   ptrdiff_t produced_chars = 0;
4899   Lisp_Object attrs, charset_list, val;
4900   bool ascii_compatible;
4901   struct charset *charset_kanji, *charset_kana;
4902   struct charset *charset_kanji2;
4903   int c;
4904
4905   CODING_GET_INFO (coding, attrs, charset_list);
4906   val = XCDR (charset_list);
4907   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4908   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4909   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4910
4911   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4912
4913   while (charbuf < charbuf_end)
4914     {
4915       ASSURE_DESTINATION (safe_room);
4916       c = *charbuf++;
4917       /* Now encode the character C.  */
4918       if (ASCII_CHAR_P (c) && ascii_compatible)
4919         EMIT_ONE_ASCII_BYTE (c);
4920       else if (CHAR_BYTE8_P (c))
4921         {
4922           c = CHAR_TO_BYTE8 (c);
4923           EMIT_ONE_BYTE (c);
4924         }
4925       else
4926         {
4927           unsigned code;
4928           struct charset *charset;
4929           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4930                                &code, charset);
4931
4932           if (!charset)
4933             {
4934               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4935                 {
4936                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4937                   charset = CHARSET_FROM_ID (charset_ascii);
4938                 }
4939               else
4940                 {
4941                   c = coding->default_char;
4942                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4943                                        charset_list, &code, charset);
4944                 }
4945             }
4946           if (code == CHARSET_INVALID_CODE (charset))
4947             emacs_abort ();
4948           if (charset == charset_kanji)
4949             {
4950               int c1, c2;
4951               JIS_TO_SJIS (code);
4952               c1 = code >> 8, c2 = code & 0xFF;
4953               EMIT_TWO_BYTES (c1, c2);
4954             }
4955           else if (charset == charset_kana)
4956             EMIT_ONE_BYTE (code | 0x80);
4957           else if (charset_kanji2 && charset == charset_kanji2)
4958             {
4959               int c1, c2;
4960
4961               c1 = code >> 8;
4962               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4963                   || c1 == 0x28
4964                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4965                 {
4966                   JIS_TO_SJIS2 (code);
4967                   c1 = code >> 8, c2 = code & 0xFF;
4968                   EMIT_TWO_BYTES (c1, c2);
4969                 }
4970               else
4971                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4972             }
4973           else
4974             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4975         }
4976     }
4977   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4978   coding->produced_char += produced_chars;
4979   coding->produced = dst - coding->destination;
4980   return 0;
4981 }
4982
4983 static bool
4984 encode_coding_big5 (struct coding_system *coding)
4985 {
4986   bool multibytep = coding->dst_multibyte;
4987   int *charbuf = coding->charbuf;
4988   int *charbuf_end = charbuf + coding->charbuf_used;
4989   unsigned char *dst = coding->destination + coding->produced;
4990   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4991   int safe_room = 4;
4992   ptrdiff_t produced_chars = 0;
4993   Lisp_Object attrs, charset_list, val;
4994   bool ascii_compatible;
4995   struct charset *charset_big5;
4996   int c;
4997
4998   CODING_GET_INFO (coding, attrs, charset_list);
4999   val = XCDR (charset_list);
5000   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5001   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5002
5003   while (charbuf < charbuf_end)
5004     {
5005       ASSURE_DESTINATION (safe_room);
5006       c = *charbuf++;
5007       /* Now encode the character C.  */
5008       if (ASCII_CHAR_P (c) && ascii_compatible)
5009         EMIT_ONE_ASCII_BYTE (c);
5010       else if (CHAR_BYTE8_P (c))
5011         {
5012           c = CHAR_TO_BYTE8 (c);
5013           EMIT_ONE_BYTE (c);
5014         }
5015       else
5016         {
5017           unsigned code;
5018           struct charset *charset;
5019           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5020                                &code, charset);
5021
5022           if (! charset)
5023             {
5024               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5025                 {
5026                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5027                   charset = CHARSET_FROM_ID (charset_ascii);
5028                 }
5029               else
5030                 {
5031                   c = coding->default_char;
5032                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5033                                        charset_list, &code, charset);
5034                 }
5035             }
5036           if (code == CHARSET_INVALID_CODE (charset))
5037             emacs_abort ();
5038           if (charset == charset_big5)
5039             {
5040               int c1, c2;
5041
5042               c1 = code >> 8, c2 = code & 0xFF;
5043               EMIT_TWO_BYTES (c1, c2);
5044             }
5045           else
5046             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5047         }
5048     }
5049   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5050   coding->produced_char += produced_chars;
5051   coding->produced = dst - coding->destination;
5052   return 0;
5053 }
5054
5055 \f
5056 /*** 10. CCL handlers ***/
5057
5058 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5059    Return true if a text is encoded in a coding system of which
5060    encoder/decoder are written in CCL program.  */
5061
5062 static bool
5063 detect_coding_ccl (struct coding_system *coding,
5064                    struct coding_detection_info *detect_info)
5065 {
5066   const unsigned char *src = coding->source, *src_base;
5067   const unsigned char *src_end = coding->source + coding->src_bytes;
5068   bool multibytep = coding->src_multibyte;
5069   ptrdiff_t consumed_chars = 0;
5070   int found = 0;
5071   unsigned char *valids;
5072   ptrdiff_t head_ascii = coding->head_ascii;
5073   Lisp_Object attrs;
5074
5075   detect_info->checked |= CATEGORY_MASK_CCL;
5076
5077   coding = &coding_categories[coding_category_ccl];
5078   valids = CODING_CCL_VALIDS (coding);
5079   attrs = CODING_ID_ATTRS (coding->id);
5080   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5081     src += head_ascii;
5082
5083   while (1)
5084     {
5085       int c;
5086
5087       src_base = src;
5088       ONE_MORE_BYTE (c);
5089       if (c < 0 || ! valids[c])
5090         break;
5091       if ((valids[c] > 1))
5092         found = CATEGORY_MASK_CCL;
5093     }
5094   detect_info->rejected |= CATEGORY_MASK_CCL;
5095   return 0;
5096
5097  no_more_source:
5098   detect_info->found |= found;
5099   return 1;
5100 }
5101
5102 static void
5103 decode_coding_ccl (struct coding_system *coding)
5104 {
5105   const unsigned char *src = coding->source + coding->consumed;
5106   const unsigned char *src_end = coding->source + coding->src_bytes;
5107   int *charbuf = coding->charbuf + coding->charbuf_used;
5108   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5109   ptrdiff_t consumed_chars = 0;
5110   bool multibytep = coding->src_multibyte;
5111   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5112   int source_charbuf[1024];
5113   int source_byteidx[1025];
5114   Lisp_Object attrs, charset_list;
5115
5116   CODING_GET_INFO (coding, attrs, charset_list);
5117
5118   while (1)
5119     {
5120       const unsigned char *p = src;
5121       ptrdiff_t offset;
5122       int i = 0;
5123
5124       if (multibytep)
5125         {
5126           while (i < 1024 && p < src_end)
5127             {
5128               source_byteidx[i] = p - src;
5129               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5130             }
5131           source_byteidx[i] = p - src;
5132         }
5133       else
5134         while (i < 1024 && p < src_end)
5135           source_charbuf[i++] = *p++;
5136
5137       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5138         ccl->last_block = 1;
5139       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5140       charset_map_loaded = 0;
5141       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5142                   charset_list);
5143       if (charset_map_loaded
5144           && (offset = coding_change_source (coding)))
5145         {
5146           p += offset;
5147           src += offset;
5148           src_end += offset;
5149         }
5150       charbuf += ccl->produced;
5151       if (multibytep)
5152         src += source_byteidx[ccl->consumed];
5153       else
5154         src += ccl->consumed;
5155       consumed_chars += ccl->consumed;
5156       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5157         break;
5158     }
5159
5160   switch (ccl->status)
5161     {
5162     case CCL_STAT_SUSPEND_BY_SRC:
5163       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5164       break;
5165     case CCL_STAT_SUSPEND_BY_DST:
5166       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5167       break;
5168     case CCL_STAT_QUIT:
5169     case CCL_STAT_INVALID_CMD:
5170       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5171       break;
5172     default:
5173       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5174       break;
5175     }
5176   coding->consumed_char += consumed_chars;
5177   coding->consumed = src - coding->source;
5178   coding->charbuf_used = charbuf - coding->charbuf;
5179 }
5180
5181 static bool
5182 encode_coding_ccl (struct coding_system *coding)
5183 {
5184   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5185   bool multibytep = coding->dst_multibyte;
5186   int *charbuf = coding->charbuf;
5187   int *charbuf_end = charbuf + coding->charbuf_used;
5188   unsigned char *dst = coding->destination + coding->produced;
5189   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5190   int destination_charbuf[1024];
5191   ptrdiff_t produced_chars = 0;
5192   int i;
5193   Lisp_Object attrs, charset_list;
5194
5195   CODING_GET_INFO (coding, attrs, charset_list);
5196   if (coding->consumed_char == coding->src_chars
5197       && coding->mode & CODING_MODE_LAST_BLOCK)
5198     ccl->last_block = 1;
5199
5200   do
5201     {
5202       ptrdiff_t offset;
5203
5204       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5205       charset_map_loaded = 0;
5206       ccl_driver (ccl, charbuf, destination_charbuf,
5207                   charbuf_end - charbuf, 1024, charset_list);
5208       if (charset_map_loaded
5209           && (offset = coding_change_destination (coding)))
5210         dst += offset;
5211       if (multibytep)
5212         {
5213           ASSURE_DESTINATION (ccl->produced * 2);
5214           for (i = 0; i < ccl->produced; i++)
5215             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5216         }
5217       else
5218         {
5219           ASSURE_DESTINATION (ccl->produced);
5220           for (i = 0; i < ccl->produced; i++)
5221             *dst++ = destination_charbuf[i] & 0xFF;
5222           produced_chars += ccl->produced;
5223         }
5224       charbuf += ccl->consumed;
5225       if (ccl->status == CCL_STAT_QUIT
5226           || ccl->status == CCL_STAT_INVALID_CMD)
5227         break;
5228     }
5229   while (charbuf < charbuf_end);
5230
5231   switch (ccl->status)
5232     {
5233     case CCL_STAT_SUSPEND_BY_SRC:
5234       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5235       break;
5236     case CCL_STAT_SUSPEND_BY_DST:
5237       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5238       break;
5239     case CCL_STAT_QUIT:
5240     case CCL_STAT_INVALID_CMD:
5241       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5242       break;
5243     default:
5244       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5245       break;
5246     }
5247
5248   coding->produced_char += produced_chars;
5249   coding->produced = dst - coding->destination;
5250   return 0;
5251 }
5252
5253 \f
5254 /*** 10, 11. no-conversion handlers ***/
5255
5256 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5257
5258 static void
5259 decode_coding_raw_text (struct coding_system *coding)
5260 {
5261   bool eol_dos
5262     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5263
5264   coding->chars_at_source = 1;
5265   coding->consumed_char = coding->src_chars;
5266   coding->consumed = coding->src_bytes;
5267   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5268     {
5269       coding->consumed_char--;
5270       coding->consumed--;
5271       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5272     }
5273   else
5274     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5275 }
5276
5277 static bool
5278 encode_coding_raw_text (struct coding_system *coding)
5279 {
5280   bool multibytep = coding->dst_multibyte;
5281   int *charbuf = coding->charbuf;
5282   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5283   unsigned char *dst = coding->destination + coding->produced;
5284   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5285   ptrdiff_t produced_chars = 0;
5286   int c;
5287
5288   if (multibytep)
5289     {
5290       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5291
5292       if (coding->src_multibyte)
5293         while (charbuf < charbuf_end)
5294           {
5295             ASSURE_DESTINATION (safe_room);
5296             c = *charbuf++;
5297             if (ASCII_CHAR_P (c))
5298               EMIT_ONE_ASCII_BYTE (c);
5299             else if (CHAR_BYTE8_P (c))
5300               {
5301                 c = CHAR_TO_BYTE8 (c);
5302                 EMIT_ONE_BYTE (c);
5303               }
5304             else
5305               {
5306                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5307
5308                 CHAR_STRING_ADVANCE (c, p1);
5309                 do
5310                   {
5311                     EMIT_ONE_BYTE (*p0);
5312                     p0++;
5313                   }
5314                 while (p0 < p1);
5315               }
5316           }
5317       else
5318         while (charbuf < charbuf_end)
5319           {
5320             ASSURE_DESTINATION (safe_room);
5321             c = *charbuf++;
5322             EMIT_ONE_BYTE (c);
5323           }
5324     }
5325   else
5326     {
5327       if (coding->src_multibyte)
5328         {
5329           int safe_room = MAX_MULTIBYTE_LENGTH;
5330
5331           while (charbuf < charbuf_end)
5332             {
5333               ASSURE_DESTINATION (safe_room);
5334               c = *charbuf++;
5335               if (ASCII_CHAR_P (c))
5336                 *dst++ = c;
5337               else if (CHAR_BYTE8_P (c))
5338                 *dst++ = CHAR_TO_BYTE8 (c);
5339               else
5340                 CHAR_STRING_ADVANCE (c, dst);
5341             }
5342         }
5343       else
5344         {
5345           ASSURE_DESTINATION (charbuf_end - charbuf);
5346           while (charbuf < charbuf_end && dst < dst_end)
5347             *dst++ = *charbuf++;
5348         }
5349       produced_chars = dst - (coding->destination + coding->produced);
5350     }
5351   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5352   coding->produced_char += produced_chars;
5353   coding->produced = dst - coding->destination;
5354   return 0;
5355 }
5356
5357 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5358    Return true if a text is encoded in a charset-based coding system.  */
5359
5360 static bool
5361 detect_coding_charset (struct coding_system *coding,
5362                        struct coding_detection_info *detect_info)
5363 {
5364   const unsigned char *src = coding->source, *src_base;
5365   const unsigned char *src_end = coding->source + coding->src_bytes;
5366   bool multibytep = coding->src_multibyte;
5367   ptrdiff_t consumed_chars = 0;
5368   Lisp_Object attrs, valids, name;
5369   int found = 0;
5370   ptrdiff_t head_ascii = coding->head_ascii;
5371   bool check_latin_extra = 0;
5372
5373   detect_info->checked |= CATEGORY_MASK_CHARSET;
5374
5375   coding = &coding_categories[coding_category_charset];
5376   attrs = CODING_ID_ATTRS (coding->id);
5377   valids = AREF (attrs, coding_attr_charset_valids);
5378   name = CODING_ID_NAME (coding->id);
5379   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5380                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5381       || strncmp (SSDATA (SYMBOL_NAME (name)),
5382                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5383     check_latin_extra = 1;
5384
5385   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5386     src += head_ascii;
5387
5388   while (1)
5389     {
5390       int c;
5391       Lisp_Object val;
5392       struct charset *charset;
5393       int dim, idx;
5394
5395       src_base = src;
5396       ONE_MORE_BYTE (c);
5397       if (c < 0)
5398         continue;
5399       val = AREF (valids, c);
5400       if (NILP (val))
5401         break;
5402       if (c >= 0x80)
5403         {
5404           if (c < 0xA0
5405               && check_latin_extra
5406               && (!VECTORP (Vlatin_extra_code_table)
5407                   || NILP (AREF (Vlatin_extra_code_table, c))))
5408             break;
5409           found = CATEGORY_MASK_CHARSET;
5410         }
5411       if (INTEGERP (val))
5412         {
5413           charset = CHARSET_FROM_ID (XFASTINT (val));
5414           dim = CHARSET_DIMENSION (charset);
5415           for (idx = 1; idx < dim; idx++)
5416             {
5417               if (src == src_end)
5418                 goto too_short;
5419               ONE_MORE_BYTE (c);
5420               if (c < charset->code_space[(dim - 1 - idx) * 4]
5421                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5422                 break;
5423             }
5424           if (idx < dim)
5425             break;
5426         }
5427       else
5428         {
5429           idx = 1;
5430           for (; CONSP (val); val = XCDR (val))
5431             {
5432               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5433               dim = CHARSET_DIMENSION (charset);
5434               while (idx < dim)
5435                 {
5436                   if (src == src_end)
5437                     goto too_short;
5438                   ONE_MORE_BYTE (c);
5439                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5440                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5441                     break;
5442                   idx++;
5443                 }
5444               if (idx == dim)
5445                 {
5446                   val = Qnil;
5447                   break;
5448                 }
5449             }
5450           if (CONSP (val))
5451             break;
5452         }
5453     }
5454  too_short:
5455   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5456   return 0;
5457
5458  no_more_source:
5459   detect_info->found |= found;
5460   return 1;
5461 }
5462
5463 static void
5464 decode_coding_charset (struct coding_system *coding)
5465 {
5466   const unsigned char *src = coding->source + coding->consumed;
5467   const unsigned char *src_end = coding->source + coding->src_bytes;
5468   const unsigned char *src_base;
5469   int *charbuf = coding->charbuf + coding->charbuf_used;
5470   /* We may produce one charset annotation in one loop and one more at
5471      the end.  */
5472   int *charbuf_end
5473     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5474   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5475   bool multibytep = coding->src_multibyte;
5476   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5477   Lisp_Object valids;
5478   ptrdiff_t char_offset = coding->produced_char;
5479   ptrdiff_t last_offset = char_offset;
5480   int last_id = charset_ascii;
5481   bool eol_dos
5482     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5483   int byte_after_cr = -1;
5484
5485   valids = AREF (attrs, coding_attr_charset_valids);
5486
5487   while (1)
5488     {
5489       int c;
5490       Lisp_Object val;
5491       struct charset *charset;
5492       int dim;
5493       int len = 1;
5494       unsigned code;
5495
5496       src_base = src;
5497       consumed_chars_base = consumed_chars;
5498
5499       if (charbuf >= charbuf_end)
5500         {
5501           if (byte_after_cr >= 0)
5502             src_base--;
5503           break;
5504         }
5505
5506       if (byte_after_cr >= 0)
5507         {
5508           c = byte_after_cr;
5509           byte_after_cr = -1;
5510         }
5511       else
5512         {
5513           ONE_MORE_BYTE (c);
5514           if (eol_dos && c == '\r')
5515             ONE_MORE_BYTE (byte_after_cr);
5516         }
5517       if (c < 0)
5518         goto invalid_code;
5519       code = c;
5520
5521       val = AREF (valids, c);
5522       if (! INTEGERP (val) && ! CONSP (val))
5523         goto invalid_code;
5524       if (INTEGERP (val))
5525         {
5526           charset = CHARSET_FROM_ID (XFASTINT (val));
5527           dim = CHARSET_DIMENSION (charset);
5528           while (len < dim)
5529             {
5530               ONE_MORE_BYTE (c);
5531               code = (code << 8) | c;
5532               len++;
5533             }
5534           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5535                               charset, code, c);
5536         }
5537       else
5538         {
5539           /* VAL is a list of charset IDs.  It is assured that the
5540              list is sorted by charset dimensions (smaller one
5541              comes first).  */
5542           while (CONSP (val))
5543             {
5544               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5545               dim = CHARSET_DIMENSION (charset);
5546               while (len < dim)
5547                 {
5548                   ONE_MORE_BYTE (c);
5549                   code = (code << 8) | c;
5550                   len++;
5551                 }
5552               CODING_DECODE_CHAR (coding, src, src_base,
5553                                   src_end, charset, code, c);
5554               if (c >= 0)
5555                 break;
5556               val = XCDR (val);
5557             }
5558         }
5559       if (c < 0)
5560         goto invalid_code;
5561       if (charset->id != charset_ascii
5562           && last_id != charset->id)
5563         {
5564           if (last_id != charset_ascii)
5565             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5566           last_id = charset->id;
5567           last_offset = char_offset;
5568         }
5569
5570       *charbuf++ = c;
5571       char_offset++;
5572       continue;
5573
5574     invalid_code:
5575       src = src_base;
5576       consumed_chars = consumed_chars_base;
5577       ONE_MORE_BYTE (c);
5578       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5579       char_offset++;
5580       coding->errors++;
5581     }
5582
5583  no_more_source:
5584   if (last_id != charset_ascii)
5585     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5586   coding->consumed_char += consumed_chars_base;
5587   coding->consumed = src_base - coding->source;
5588   coding->charbuf_used = charbuf - coding->charbuf;
5589 }
5590
5591 static bool
5592 encode_coding_charset (struct coding_system *coding)
5593 {
5594   bool multibytep = coding->dst_multibyte;
5595   int *charbuf = coding->charbuf;
5596   int *charbuf_end = charbuf + coding->charbuf_used;
5597   unsigned char *dst = coding->destination + coding->produced;
5598   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5599   int safe_room = MAX_MULTIBYTE_LENGTH;
5600   ptrdiff_t produced_chars = 0;
5601   Lisp_Object attrs, charset_list;
5602   bool ascii_compatible;
5603   int c;
5604
5605   CODING_GET_INFO (coding, attrs, charset_list);
5606   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5607
5608   while (charbuf < charbuf_end)
5609     {
5610       struct charset *charset;
5611       unsigned code;
5612
5613       ASSURE_DESTINATION (safe_room);
5614       c = *charbuf++;
5615       if (ascii_compatible && ASCII_CHAR_P (c))
5616         EMIT_ONE_ASCII_BYTE (c);
5617       else if (CHAR_BYTE8_P (c))
5618         {
5619           c = CHAR_TO_BYTE8 (c);
5620           EMIT_ONE_BYTE (c);
5621         }
5622       else
5623         {
5624           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5625                                &code, charset);
5626
5627           if (charset)
5628             {
5629               if (CHARSET_DIMENSION (charset) == 1)
5630                 EMIT_ONE_BYTE (code);
5631               else if (CHARSET_DIMENSION (charset) == 2)
5632                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5633               else if (CHARSET_DIMENSION (charset) == 3)
5634                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5635               else
5636                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5637                                  (code >> 8) & 0xFF, code & 0xFF);
5638             }
5639           else
5640             {
5641               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5642                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5643               else
5644                 c = coding->default_char;
5645               EMIT_ONE_BYTE (c);
5646             }
5647         }
5648     }
5649
5650   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5651   coding->produced_char += produced_chars;
5652   coding->produced = dst - coding->destination;
5653   return 0;
5654 }
5655
5656 \f
5657 /*** 7. C library functions ***/
5658
5659 /* Setup coding context CODING from information about CODING_SYSTEM.
5660    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5661    CODING_SYSTEM is invalid, signal an error.  */
5662
5663 void
5664 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5665 {
5666   Lisp_Object attrs;
5667   Lisp_Object eol_type;
5668   Lisp_Object coding_type;
5669   Lisp_Object val;
5670
5671   if (NILP (coding_system))
5672     coding_system = Qundecided;
5673
5674   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5675
5676   attrs = CODING_ID_ATTRS (coding->id);
5677   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5678
5679   coding->mode = 0;
5680   if (VECTORP (eol_type))
5681     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5682                             | CODING_REQUIRE_DETECTION_MASK);
5683   else if (! EQ (eol_type, Qunix))
5684     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5685                             | CODING_REQUIRE_ENCODING_MASK);
5686   else
5687     coding->common_flags = 0;
5688   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5689     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5690   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5691     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5692   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5693     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5694
5695   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5696   coding->max_charset_id = SCHARS (val) - 1;
5697   coding->safe_charsets = SDATA (val);
5698   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5699   coding->carryover_bytes = 0;
5700
5701   coding_type = CODING_ATTR_TYPE (attrs);
5702   if (EQ (coding_type, Qundecided))
5703     {
5704       coding->detector = NULL;
5705       coding->decoder = decode_coding_raw_text;
5706       coding->encoder = encode_coding_raw_text;
5707       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5708       coding->spec.undecided.inhibit_nbd
5709         = (NILP (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection))
5710            ? -1
5711            : EQ (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection), Qt)
5712            ? 1
5713            : 0);
5714       coding->spec.undecided.inhibit_ied
5715         = (NILP (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection))
5716            ? -1
5717            : EQ (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection), Qt)
5718            ? 1
5719            : 0);
5720       coding->spec.undecided.prefer_utf_8
5721         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5722     }
5723   else if (EQ (coding_type, Qiso_2022))
5724     {
5725       int i;
5726       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5727
5728       /* Invoke graphic register 0 to plane 0.  */
5729       CODING_ISO_INVOCATION (coding, 0) = 0;
5730       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5731       CODING_ISO_INVOCATION (coding, 1)
5732         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5733       /* Setup the initial status of designation.  */
5734       for (i = 0; i < 4; i++)
5735         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5736       /* Not single shifting initially.  */
5737       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5738       /* Beginning of buffer should also be regarded as bol. */
5739       CODING_ISO_BOL (coding) = 1;
5740       coding->detector = detect_coding_iso_2022;
5741       coding->decoder = decode_coding_iso_2022;
5742       coding->encoder = encode_coding_iso_2022;
5743       if (flags & CODING_ISO_FLAG_SAFE)
5744         coding->mode |= CODING_MODE_SAFE_ENCODING;
5745       coding->common_flags
5746         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5747             | CODING_REQUIRE_FLUSHING_MASK);
5748       if (flags & CODING_ISO_FLAG_COMPOSITION)
5749         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5750       if (flags & CODING_ISO_FLAG_DESIGNATION)
5751         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5752       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5753         {
5754           setup_iso_safe_charsets (attrs);
5755           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5756           coding->max_charset_id = SCHARS (val) - 1;
5757           coding->safe_charsets = SDATA (val);
5758         }
5759       CODING_ISO_FLAGS (coding) = flags;
5760       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5761       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5762       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5763       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5764     }
5765   else if (EQ (coding_type, Qcharset))
5766     {
5767       coding->detector = detect_coding_charset;
5768       coding->decoder = decode_coding_charset;
5769       coding->encoder = encode_coding_charset;
5770       coding->common_flags
5771         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5772     }
5773   else if (EQ (coding_type, Qutf_8))
5774     {
5775       val = AREF (attrs, coding_attr_utf_bom);
5776       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5777                                    : EQ (val, Qt) ? utf_with_bom
5778                                    : utf_without_bom);
5779       coding->detector = detect_coding_utf_8;
5780       coding->decoder = decode_coding_utf_8;
5781       coding->encoder = encode_coding_utf_8;
5782       coding->common_flags
5783         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5784       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5785         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5786     }
5787   else if (EQ (coding_type, Qutf_16))
5788     {
5789       val = AREF (attrs, coding_attr_utf_bom);
5790       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5791                                     : EQ (val, Qt) ? utf_with_bom
5792                                     : utf_without_bom);
5793       val = AREF (attrs, coding_attr_utf_16_endian);
5794       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5795                                        : utf_16_little_endian);
5796       CODING_UTF_16_SURROGATE (coding) = 0;
5797       coding->detector = detect_coding_utf_16;
5798       coding->decoder = decode_coding_utf_16;
5799       coding->encoder = encode_coding_utf_16;
5800       coding->common_flags
5801         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5802       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5803         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5804     }
5805   else if (EQ (coding_type, Qccl))
5806     {
5807       coding->detector = detect_coding_ccl;
5808       coding->decoder = decode_coding_ccl;
5809       coding->encoder = encode_coding_ccl;
5810       coding->common_flags
5811         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5812             | CODING_REQUIRE_FLUSHING_MASK);
5813     }
5814   else if (EQ (coding_type, Qemacs_mule))
5815     {
5816       coding->detector = detect_coding_emacs_mule;
5817       coding->decoder = decode_coding_emacs_mule;
5818       coding->encoder = encode_coding_emacs_mule;
5819       coding->common_flags
5820         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5821       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5822           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5823         {
5824           Lisp_Object tail, safe_charsets;
5825           int max_charset_id = 0;
5826
5827           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5828                tail = XCDR (tail))
5829             if (max_charset_id < XFASTINT (XCAR (tail)))
5830               max_charset_id = XFASTINT (XCAR (tail));
5831           safe_charsets = make_uninit_string (max_charset_id + 1);
5832           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5833           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5834                tail = XCDR (tail))
5835             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5836           coding->max_charset_id = max_charset_id;
5837           coding->safe_charsets = SDATA (safe_charsets);
5838         }
5839       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5840       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5841     }
5842   else if (EQ (coding_type, Qshift_jis))
5843     {
5844       coding->detector = detect_coding_sjis;
5845       coding->decoder = decode_coding_sjis;
5846       coding->encoder = encode_coding_sjis;
5847       coding->common_flags
5848         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849     }
5850   else if (EQ (coding_type, Qbig5))
5851     {
5852       coding->detector = detect_coding_big5;
5853       coding->decoder = decode_coding_big5;
5854       coding->encoder = encode_coding_big5;
5855       coding->common_flags
5856         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5857     }
5858   else                          /* EQ (coding_type, Qraw_text) */
5859     {
5860       coding->detector = NULL;
5861       coding->decoder = decode_coding_raw_text;
5862       coding->encoder = encode_coding_raw_text;
5863       if (! EQ (eol_type, Qunix))
5864         {
5865           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5866           if (! VECTORP (eol_type))
5867             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5868         }
5869
5870     }
5871
5872   return;
5873 }
5874
5875 /* Return a list of charsets supported by CODING.  */
5876
5877 Lisp_Object
5878 coding_charset_list (struct coding_system *coding)
5879 {
5880   Lisp_Object attrs, charset_list;
5881
5882   CODING_GET_INFO (coding, attrs, charset_list);
5883   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5884     {
5885       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5886
5887       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5888         charset_list = Viso_2022_charset_list;
5889     }
5890   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5891     {
5892       charset_list = Vemacs_mule_charset_list;
5893     }
5894   return charset_list;
5895 }
5896
5897
5898 /* Return a list of charsets supported by CODING-SYSTEM.  */
5899
5900 Lisp_Object
5901 coding_system_charset_list (Lisp_Object coding_system)
5902 {
5903   ptrdiff_t id;
5904   Lisp_Object attrs, charset_list;
5905
5906   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5907   attrs = CODING_ID_ATTRS (id);
5908
5909   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5910     {
5911       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5912
5913       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5914         charset_list = Viso_2022_charset_list;
5915       else
5916         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5917     }
5918   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5919     {
5920       charset_list = Vemacs_mule_charset_list;
5921     }
5922   else
5923     {
5924       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5925     }
5926   return charset_list;
5927 }
5928
5929
5930 /* Return raw-text or one of its subsidiaries that has the same
5931    eol_type as CODING-SYSTEM.  */
5932
5933 Lisp_Object
5934 raw_text_coding_system (Lisp_Object coding_system)
5935 {
5936   Lisp_Object spec, attrs;
5937   Lisp_Object eol_type, raw_text_eol_type;
5938
5939   if (NILP (coding_system))
5940     return Qraw_text;
5941   spec = CODING_SYSTEM_SPEC (coding_system);
5942   attrs = AREF (spec, 0);
5943
5944   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5945     return coding_system;
5946
5947   eol_type = AREF (spec, 2);
5948   if (VECTORP (eol_type))
5949     return Qraw_text;
5950   spec = CODING_SYSTEM_SPEC (Qraw_text);
5951   raw_text_eol_type = AREF (spec, 2);
5952   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5953           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5954           : AREF (raw_text_eol_type, 2));
5955 }
5956
5957
5958 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5959    the subsidiary that has the same eol-spec as PARENT (if it is not
5960    nil and specifies end-of-line format) or the system's setting
5961    (system_eol_type).  */
5962
5963 Lisp_Object
5964 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5965 {
5966   Lisp_Object spec, eol_type;
5967
5968   if (NILP (coding_system))
5969     coding_system = Qraw_text;
5970   spec = CODING_SYSTEM_SPEC (coding_system);
5971   eol_type = AREF (spec, 2);
5972   if (VECTORP (eol_type))
5973     {
5974       Lisp_Object parent_eol_type;
5975
5976       if (! NILP (parent))
5977         {
5978           Lisp_Object parent_spec;
5979
5980           parent_spec = CODING_SYSTEM_SPEC (parent);
5981           parent_eol_type = AREF (parent_spec, 2);
5982           if (VECTORP (parent_eol_type))
5983             parent_eol_type = system_eol_type;
5984         }
5985       else
5986         parent_eol_type = system_eol_type;
5987       if (EQ (parent_eol_type, Qunix))
5988         coding_system = AREF (eol_type, 0);
5989       else if (EQ (parent_eol_type, Qdos))
5990         coding_system = AREF (eol_type, 1);
5991       else if (EQ (parent_eol_type, Qmac))
5992         coding_system = AREF (eol_type, 2);
5993     }
5994   return coding_system;
5995 }
5996
5997
5998 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5999    decided for writing to a process.  If not, complement them, and
6000    return a new coding system.  */
6001
6002 Lisp_Object
6003 complement_process_encoding_system (Lisp_Object coding_system)
6004 {
6005   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6006   Lisp_Object spec, attrs;
6007   int i;
6008
6009   for (i = 0; i < 3; i++)
6010     {
6011       if (i == 1)
6012         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6013       else if (i == 2)
6014         coding_system = preferred_coding_system ();
6015       spec = CODING_SYSTEM_SPEC (coding_system);
6016       if (NILP (spec))
6017         continue;
6018       attrs = AREF (spec, 0);
6019       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6020         coding_base = CODING_ATTR_BASE_NAME (attrs);
6021       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6022         eol_base = coding_system;
6023       if (! NILP (coding_base) && ! NILP (eol_base))
6024         break;
6025     }
6026
6027   if (i > 0)
6028     /* The original CODING_SYSTEM didn't specify text-conversion or
6029        eol-conversion.  Be sure that we return a fully complemented
6030        coding system.  */
6031     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6032   return coding_system;
6033 }
6034
6035
6036 /* Emacs has a mechanism to automatically detect a coding system if it
6037    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6038    it's impossible to distinguish some coding systems accurately
6039    because they use the same range of codes.  So, at first, coding
6040    systems are categorized into 7, those are:
6041
6042    o coding-category-emacs-mule
6043
6044         The category for a coding system which has the same code range
6045         as Emacs' internal format.  Assigned the coding-system (Lisp
6046         symbol) `emacs-mule' by default.
6047
6048    o coding-category-sjis
6049
6050         The category for a coding system which has the same code range
6051         as SJIS.  Assigned the coding-system (Lisp
6052         symbol) `japanese-shift-jis' by default.
6053
6054    o coding-category-iso-7
6055
6056         The category for a coding system which has the same code range
6057         as ISO2022 of 7-bit environment.  This doesn't use any locking
6058         shift and single shift functions.  This can encode/decode all
6059         charsets.  Assigned the coding-system (Lisp symbol)
6060         `iso-2022-7bit' by default.
6061
6062    o coding-category-iso-7-tight
6063
6064         Same as coding-category-iso-7 except that this can
6065         encode/decode only the specified charsets.
6066
6067    o coding-category-iso-8-1
6068
6069         The category for a coding system which has the same code range
6070         as ISO2022 of 8-bit environment and graphic plane 1 used only
6071         for DIMENSION1 charset.  This doesn't use any locking shift
6072         and single shift functions.  Assigned the coding-system (Lisp
6073         symbol) `iso-latin-1' by default.
6074
6075    o coding-category-iso-8-2
6076
6077         The category for a coding system which has the same code range
6078         as ISO2022 of 8-bit environment and graphic plane 1 used only
6079         for DIMENSION2 charset.  This doesn't use any locking shift
6080         and single shift functions.  Assigned the coding-system (Lisp
6081         symbol) `japanese-iso-8bit' by default.
6082
6083    o coding-category-iso-7-else
6084
6085         The category for a coding system which has the same code range
6086         as ISO2022 of 7-bit environment but uses locking shift or
6087         single shift functions.  Assigned the coding-system (Lisp
6088         symbol) `iso-2022-7bit-lock' by default.
6089
6090    o coding-category-iso-8-else
6091
6092         The category for a coding system which has the same code range
6093         as ISO2022 of 8-bit environment but uses locking shift or
6094         single shift functions.  Assigned the coding-system (Lisp
6095         symbol) `iso-2022-8bit-ss2' by default.
6096
6097    o coding-category-big5
6098
6099         The category for a coding system which has the same code range
6100         as BIG5.  Assigned the coding-system (Lisp symbol)
6101         `cn-big5' by default.
6102
6103    o coding-category-utf-8
6104
6105         The category for a coding system which has the same code range
6106         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6107         symbol) `utf-8' by default.
6108
6109    o coding-category-utf-16-be
6110
6111         The category for a coding system in which a text has an
6112         Unicode signature (cf. Unicode Standard) in the order of BIG
6113         endian at the head.  Assigned the coding-system (Lisp symbol)
6114         `utf-16-be' by default.
6115
6116    o coding-category-utf-16-le
6117
6118         The category for a coding system in which a text has an
6119         Unicode signature (cf. Unicode Standard) in the order of
6120         LITTLE endian at the head.  Assigned the coding-system (Lisp
6121         symbol) `utf-16-le' by default.
6122
6123    o coding-category-ccl
6124
6125         The category for a coding system of which encoder/decoder is
6126         written in CCL programs.  The default value is nil, i.e., no
6127         coding system is assigned.
6128
6129    o coding-category-binary
6130
6131         The category for a coding system not categorized in any of the
6132         above.  Assigned the coding-system (Lisp symbol)
6133         `no-conversion' by default.
6134
6135    Each of them is a Lisp symbol and the value is an actual
6136    `coding-system's (this is also a Lisp symbol) assigned by a user.
6137    What Emacs does actually is to detect a category of coding system.
6138    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6139    decide only one possible category, it selects a category of the
6140    highest priority.  Priorities of categories are also specified by a
6141    user in a Lisp variable `coding-category-list'.
6142
6143 */
6144
6145 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6146                                            int eol_seen);
6147
6148
6149 /* Return the number of ASCII characters at the head of the source.
6150    By side effects, set coding->head_ascii and update
6151    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6152    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6153    reliable only when all the source bytes are ASCII.  */
6154
6155 static int
6156 check_ascii (struct coding_system *coding)
6157 {
6158   const unsigned char *src, *end;
6159   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6160   int eol_seen = coding->eol_seen;
6161
6162   coding_set_source (coding);
6163   src = coding->source;
6164   end = src + coding->src_bytes;
6165
6166   if (inhibit_eol_conversion
6167       || SYMBOLP (eol_type))
6168     {
6169       /* We don't have to check EOL format.  */
6170       while (src < end && !( *src & 0x80))
6171         {
6172           if (*src++ == '\n')
6173             eol_seen |= EOL_SEEN_LF;
6174         }
6175     }
6176   else
6177     {
6178       end--;                /* We look ahead one byte for "CR LF".  */
6179       while (src < end)
6180         {
6181           int c = *src;
6182
6183           if (c & 0x80)
6184             break;
6185           src++;
6186           if (c == '\r')
6187             {
6188               if (*src == '\n')
6189                 {
6190                   eol_seen |= EOL_SEEN_CRLF;
6191                   src++;
6192                 }
6193               else
6194                 eol_seen |= EOL_SEEN_CR;
6195             }
6196           else if (c == '\n')
6197             eol_seen |= EOL_SEEN_LF;
6198         }
6199       if (src == end)
6200         {
6201           int c = *src;
6202
6203           /* All bytes but the last one C are ASCII.  */
6204           if (! (c & 0x80))
6205             {
6206               if (c == '\r')
6207                 eol_seen |= EOL_SEEN_CR;
6208               else if (c  == '\n')
6209                 eol_seen |= EOL_SEEN_LF;
6210               src++;
6211             }
6212         }
6213     }
6214   coding->head_ascii = src - coding->source;
6215   coding->eol_seen = eol_seen;
6216   return (coding->head_ascii);
6217 }
6218
6219
6220 /* Return the number of characters at the source if all the bytes are
6221    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6222    effects, update coding->eol_seen.  The value of coding->eol_seen is
6223    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6224    the value is reliable only when all the source bytes are valid
6225    UTF-8.  */
6226
6227 static int
6228 check_utf_8 (struct coding_system *coding)
6229 {
6230   const unsigned char *src, *end;
6231   int eol_seen;
6232   int nchars = coding->head_ascii;
6233
6234   if (coding->head_ascii < 0)
6235     check_ascii (coding);
6236   else
6237     coding_set_source (coding);
6238   src = coding->source + coding->head_ascii;
6239   /* We look ahead one byte for CR LF.  */
6240   end = coding->source + coding->src_bytes - 1;
6241   eol_seen = coding->eol_seen;
6242   while (src < end)
6243     {
6244       int c = *src;
6245
6246       if (UTF_8_1_OCTET_P (*src))
6247         {
6248           src++;
6249           if (c < 0x20)
6250             {
6251               if (c == '\r')
6252                 {
6253                   if (*src == '\n')
6254                     {
6255                       eol_seen |= EOL_SEEN_CRLF;
6256                       src++;
6257                       nchars++;
6258                     }
6259                   else
6260                     eol_seen |= EOL_SEEN_CR;
6261                 }
6262               else if (c == '\n')
6263                 eol_seen |= EOL_SEEN_LF;
6264             }
6265         }
6266       else if (UTF_8_2_OCTET_LEADING_P (c))
6267         {
6268           if (c < 0xC2          /* overlong sequence */
6269               || src + 1 >= end
6270               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6271             return -1;
6272           src += 2;
6273         }
6274       else if (UTF_8_3_OCTET_LEADING_P (c))
6275         {
6276           if (src + 2 >= end
6277               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6278                     && UTF_8_EXTRA_OCTET_P (src[2])))
6279             return -1;
6280           c = (((c & 0xF) << 12)
6281                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6282           if (c < 0x800                       /* overlong sequence */
6283               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6284             return -1;
6285           src += 3;
6286         }
6287       else if (UTF_8_4_OCTET_LEADING_P (c))
6288         {
6289           if (src + 3 >= end
6290               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6291                     && UTF_8_EXTRA_OCTET_P (src[2])
6292                     && UTF_8_EXTRA_OCTET_P (src[3])))
6293             return -1;
6294           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6295                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6296           if (c < 0x10000       /* overlong sequence */
6297               || c >= 0x110000) /* non-Unicode character  */
6298             return -1;
6299           src += 4;
6300         }
6301       else
6302         return -1;
6303       nchars++;
6304     }
6305
6306   if (src == end)
6307     {
6308       if (! UTF_8_1_OCTET_P (*src))
6309         return -1;
6310       nchars++;
6311       if (*src == '\r')
6312         eol_seen |= EOL_SEEN_CR;
6313       else if (*src  == '\n')
6314         eol_seen |= EOL_SEEN_LF;
6315     }
6316   coding->eol_seen = eol_seen;
6317   return nchars;
6318 }
6319
6320
6321 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6322    SOURCE is encoded.  If CATEGORY is one of
6323    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6324    two-byte, else they are encoded by one-byte.
6325
6326    Return one of EOL_SEEN_XXX.  */
6327
6328 #define MAX_EOL_CHECK_COUNT 3
6329
6330 static int
6331 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6332             enum coding_category category)
6333 {
6334   const unsigned char *src = source, *src_end = src + src_bytes;
6335   unsigned char c;
6336   int total  = 0;
6337   int eol_seen = EOL_SEEN_NONE;
6338
6339   if ((1 << category) & CATEGORY_MASK_UTF_16)
6340     {
6341       bool msb = category == (coding_category_utf_16_le
6342                               | coding_category_utf_16_le_nosig);
6343       bool lsb = !msb;
6344
6345       while (src + 1 < src_end)
6346         {
6347           c = src[lsb];
6348           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6349             {
6350               int this_eol;
6351
6352               if (c == '\n')
6353                 this_eol = EOL_SEEN_LF;
6354               else if (src + 3 >= src_end
6355                        || src[msb + 2] != 0
6356                        || src[lsb + 2] != '\n')
6357                 this_eol = EOL_SEEN_CR;
6358               else
6359                 {
6360                   this_eol = EOL_SEEN_CRLF;
6361                   src += 2;
6362                 }
6363
6364               if (eol_seen == EOL_SEEN_NONE)
6365                 /* This is the first end-of-line.  */
6366                 eol_seen = this_eol;
6367               else if (eol_seen != this_eol)
6368                 {
6369                   /* The found type is different from what found before.
6370                      Allow for stray ^M characters in DOS EOL files.  */
6371                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6372                       || (eol_seen == EOL_SEEN_CRLF
6373                           && this_eol == EOL_SEEN_CR))
6374                     eol_seen = EOL_SEEN_CRLF;
6375                   else
6376                     {
6377                       eol_seen = EOL_SEEN_LF;
6378                       break;
6379                     }
6380                 }
6381               if (++total == MAX_EOL_CHECK_COUNT)
6382                 break;
6383             }
6384           src += 2;
6385         }
6386     }
6387   else
6388     while (src < src_end)
6389       {
6390         c = *src++;
6391         if (c == '\n' || c == '\r')
6392           {
6393             int this_eol;
6394
6395             if (c == '\n')
6396               this_eol = EOL_SEEN_LF;
6397             else if (src >= src_end || *src != '\n')
6398               this_eol = EOL_SEEN_CR;
6399             else
6400               this_eol = EOL_SEEN_CRLF, src++;
6401
6402             if (eol_seen == EOL_SEEN_NONE)
6403               /* This is the first end-of-line.  */
6404               eol_seen = this_eol;
6405             else if (eol_seen != this_eol)
6406               {
6407                 /* The found type is different from what found before.
6408                    Allow for stray ^M characters in DOS EOL files.  */
6409                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6410                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6411                   eol_seen = EOL_SEEN_CRLF;
6412                 else
6413                   {
6414                     eol_seen = EOL_SEEN_LF;
6415                     break;
6416                   }
6417               }
6418             if (++total == MAX_EOL_CHECK_COUNT)
6419               break;
6420           }
6421       }
6422   return eol_seen;
6423 }
6424
6425
6426 static Lisp_Object
6427 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6428 {
6429   Lisp_Object eol_type;
6430
6431   eol_type = CODING_ID_EOL_TYPE (coding->id);
6432   if (! VECTORP (eol_type))
6433     /* Already adjusted.  */
6434     return eol_type;
6435   if (eol_seen & EOL_SEEN_LF)
6436     {
6437       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6438       eol_type = Qunix;
6439     }
6440   else if (eol_seen & EOL_SEEN_CRLF)
6441     {
6442       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6443       eol_type = Qdos;
6444     }
6445   else if (eol_seen & EOL_SEEN_CR)
6446     {
6447       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6448       eol_type = Qmac;
6449     }
6450   return eol_type;
6451 }
6452
6453 /* Detect how a text specified in CODING is encoded.  If a coding
6454    system is detected, update fields of CODING by the detected coding
6455    system.  */
6456
6457 static void
6458 detect_coding (struct coding_system *coding)
6459 {
6460   const unsigned char *src, *src_end;
6461   unsigned int saved_mode = coding->mode;
6462   Lisp_Object found = Qnil;
6463   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6464
6465   coding->consumed = coding->consumed_char = 0;
6466   coding->produced = coding->produced_char = 0;
6467   coding_set_source (coding);
6468
6469   src_end = coding->source + coding->src_bytes;
6470
6471   coding->eol_seen = EOL_SEEN_NONE;
6472   /* If we have not yet decided the text encoding type, detect it
6473      now.  */
6474   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6475     {
6476       int c, i;
6477       struct coding_detection_info detect_info;
6478       bool null_byte_found = 0, eight_bit_found = 0;
6479       int inhibit_nbd           /* null byte detection */
6480         = (coding->spec.undecided.inhibit_nbd > 0
6481            | (coding->spec.undecided.inhibit_nbd == 0
6482               & inhibit_null_byte_detection));
6483       int inhibit_ied           /* iso escape detection */
6484         = (coding->spec.undecided.inhibit_ied > 0
6485            | (coding->spec.undecided.inhibit_ied == 0
6486               & inhibit_iso_escape_detection));
6487       int prefer_utf_8
6488         = coding->spec.undecided.prefer_utf_8;
6489
6490       coding->head_ascii = 0;
6491       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6492       for (src = coding->source; src < src_end; src++)
6493         {
6494           c = *src;
6495           if (c & 0x80)
6496             {
6497               eight_bit_found = 1;
6498               if (null_byte_found)
6499                 break;
6500             }
6501           else if (c < 0x20)
6502             {
6503               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6504                   && ! inhibit_ied
6505                   && ! detect_info.checked)
6506                 {
6507                   if (detect_coding_iso_2022 (coding, &detect_info))
6508                     {
6509                       /* We have scanned the whole data.  */
6510                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6511                         {
6512                           /* We didn't find an 8-bit code.  We may
6513                              have found a null-byte, but it's very
6514                              rare that a binary file conforms to
6515                              ISO-2022.  */
6516                           src = src_end;
6517                           coding->head_ascii = src - coding->source;
6518                         }
6519                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6520                       break;
6521                     }
6522                 }
6523               else if (! c && !inhibit_nbd)
6524                 {
6525                   null_byte_found = 1;
6526                   if (eight_bit_found)
6527                     break;
6528                 }
6529               else if (! disable_ascii_optimization
6530                        && ! inhibit_eol_conversion)
6531                 {
6532                   if (c == '\r')
6533                     {
6534                       if (src < src_end && src[1] == '\n')
6535                         {
6536                           coding->eol_seen |= EOL_SEEN_CRLF;
6537                           src++;
6538                           if (! eight_bit_found)
6539                             coding->head_ascii++;
6540                         }
6541                       else
6542                         coding->eol_seen |= EOL_SEEN_CR;
6543                     }
6544                   else if (c == '\n')
6545                     {
6546                       coding->eol_seen |= EOL_SEEN_LF;
6547                     }
6548                 }
6549
6550               if (! eight_bit_found)
6551                 coding->head_ascii++;
6552             }
6553           else if (! eight_bit_found)
6554             coding->head_ascii++;
6555         }
6556
6557       if (null_byte_found || eight_bit_found
6558           || coding->head_ascii < coding->src_bytes
6559           || detect_info.found)
6560         {
6561           enum coding_category category;
6562           struct coding_system *this;
6563
6564           if (coding->head_ascii == coding->src_bytes)
6565             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6566             for (i = 0; i < coding_category_raw_text; i++)
6567               {
6568                 category = coding_priorities[i];
6569                 this = coding_categories + category;
6570                 if (detect_info.found & (1 << category))
6571                   break;
6572               }
6573           else
6574             {
6575               if (null_byte_found)
6576                 {
6577                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6578                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6579                 }
6580               else if (prefer_utf_8
6581                        && detect_coding_utf_8 (coding, &detect_info))
6582                 {
6583                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6584                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6585                 }
6586               for (i = 0; i < coding_category_raw_text; i++)
6587                 {
6588                   category = coding_priorities[i];
6589                   this = coding_categories + category;
6590                   /* Some of this->detector (e.g. detect_coding_sjis)
6591                      require this information.  */
6592                   coding->id = this->id;
6593                   if (this->id < 0)
6594                     {
6595                       /* No coding system of this category is defined.  */
6596                       detect_info.rejected |= (1 << category);
6597                     }
6598                   else if (category >= coding_category_raw_text)
6599                     continue;
6600                   else if (detect_info.checked & (1 << category))
6601                     {
6602                       if (detect_info.found & (1 << category))
6603                         break;
6604                     }
6605                   else if ((*(this->detector)) (coding, &detect_info)
6606                            && detect_info.found & (1 << category))
6607                     break;
6608                 }
6609             }
6610
6611           if (i < coding_category_raw_text)
6612             {
6613               if (category == coding_category_utf_8_auto)
6614                 {
6615                   Lisp_Object coding_systems;
6616
6617                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6618                                          coding_attr_utf_bom);
6619                   if (CONSP (coding_systems))
6620                     {
6621                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6622                         found = XCAR (coding_systems);
6623                       else
6624                         found = XCDR (coding_systems);
6625                     }
6626                   else
6627                     found = CODING_ID_NAME (this->id);
6628                 }
6629               else if (category == coding_category_utf_16_auto)
6630                 {
6631                   Lisp_Object coding_systems;
6632
6633                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6634                                          coding_attr_utf_bom);
6635                   if (CONSP (coding_systems))
6636                     {
6637                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6638                         found = XCAR (coding_systems);
6639                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6640                         found = XCDR (coding_systems);
6641                     }
6642                   else
6643                     found = CODING_ID_NAME (this->id);
6644                 }
6645               else
6646                 found = CODING_ID_NAME (this->id);
6647             }
6648           else if (null_byte_found)
6649             found = Qno_conversion;
6650           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6651                    == CATEGORY_MASK_ANY)
6652             found = Qraw_text;
6653           else if (detect_info.rejected)
6654             for (i = 0; i < coding_category_raw_text; i++)
6655               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6656                 {
6657                   this = coding_categories + coding_priorities[i];
6658                   found = CODING_ID_NAME (this->id);
6659                   break;
6660                 }
6661         }
6662     }
6663   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6664            == coding_category_utf_8_auto)
6665     {
6666       Lisp_Object coding_systems;
6667       struct coding_detection_info detect_info;
6668
6669       coding_systems
6670         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6671       detect_info.found = detect_info.rejected = 0;
6672       if (check_ascii (coding) == coding->src_bytes)
6673         {
6674           if (CONSP (coding_systems))
6675             found = XCDR (coding_systems);
6676         }
6677       else
6678         {
6679           if (CONSP (coding_systems)
6680               && detect_coding_utf_8 (coding, &detect_info))
6681             {
6682               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6683                 found = XCAR (coding_systems);
6684               else
6685                 found = XCDR (coding_systems);
6686             }
6687         }
6688     }
6689   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6690            == coding_category_utf_16_auto)
6691     {
6692       Lisp_Object coding_systems;
6693       struct coding_detection_info detect_info;
6694
6695       coding_systems
6696         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6697       detect_info.found = detect_info.rejected = 0;
6698       coding->head_ascii = 0;
6699       if (CONSP (coding_systems)
6700           && detect_coding_utf_16 (coding, &detect_info))
6701         {
6702           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6703             found = XCAR (coding_systems);
6704           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6705             found = XCDR (coding_systems);
6706         }
6707     }
6708
6709   if (! NILP (found))
6710     {
6711       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6712                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6713                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6714                            : EOL_SEEN_LF);
6715
6716       setup_coding_system (found, coding);
6717       if (specified_eol != EOL_SEEN_NONE)
6718         adjust_coding_eol_type (coding, specified_eol);
6719     }
6720
6721   coding->mode = saved_mode;
6722 }
6723
6724
6725 static void
6726 decode_eol (struct coding_system *coding)
6727 {
6728   Lisp_Object eol_type;
6729   unsigned char *p, *pbeg, *pend;
6730
6731   eol_type = CODING_ID_EOL_TYPE (coding->id);
6732   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6733     return;
6734
6735   if (NILP (coding->dst_object))
6736     pbeg = coding->destination;
6737   else
6738     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6739   pend = pbeg + coding->produced;
6740
6741   if (VECTORP (eol_type))
6742     {
6743       int eol_seen = EOL_SEEN_NONE;
6744
6745       for (p = pbeg; p < pend; p++)
6746         {
6747           if (*p == '\n')
6748             eol_seen |= EOL_SEEN_LF;
6749           else if (*p == '\r')
6750             {
6751               if (p + 1 < pend && *(p + 1) == '\n')
6752                 {
6753                   eol_seen |= EOL_SEEN_CRLF;
6754                   p++;
6755                 }
6756               else
6757                 eol_seen |= EOL_SEEN_CR;
6758             }
6759         }
6760       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6761       if ((eol_seen & EOL_SEEN_CRLF) != 0
6762           && (eol_seen & EOL_SEEN_CR) != 0
6763           && (eol_seen & EOL_SEEN_LF) == 0)
6764         eol_seen = EOL_SEEN_CRLF;
6765       else if (eol_seen != EOL_SEEN_NONE
6766           && eol_seen != EOL_SEEN_LF
6767           && eol_seen != EOL_SEEN_CRLF
6768           && eol_seen != EOL_SEEN_CR)
6769         eol_seen = EOL_SEEN_LF;
6770       if (eol_seen != EOL_SEEN_NONE)
6771         eol_type = adjust_coding_eol_type (coding, eol_seen);
6772     }
6773
6774   if (EQ (eol_type, Qmac))
6775     {
6776       for (p = pbeg; p < pend; p++)
6777         if (*p == '\r')
6778           *p = '\n';
6779     }
6780   else if (EQ (eol_type, Qdos))
6781     {
6782       ptrdiff_t n = 0;
6783
6784       if (NILP (coding->dst_object))
6785         {
6786           /* Start deleting '\r' from the tail to minimize the memory
6787              movement.  */
6788           for (p = pend - 2; p >= pbeg; p--)
6789             if (*p == '\r')
6790               {
6791                 memmove (p, p + 1, pend-- - p - 1);
6792                 n++;
6793               }
6794         }
6795       else
6796         {
6797           ptrdiff_t pos_byte = coding->dst_pos_byte;
6798           ptrdiff_t pos = coding->dst_pos;
6799           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6800
6801           while (pos < pos_end)
6802             {
6803               p = BYTE_POS_ADDR (pos_byte);
6804               if (*p == '\r' && p[1] == '\n')
6805                 {
6806                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6807                   n++;
6808                   pos_end--;
6809                 }
6810               pos++;
6811               if (coding->dst_multibyte)
6812                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6813               else
6814                 pos_byte++;
6815             }
6816         }
6817       coding->produced -= n;
6818       coding->produced_char -= n;
6819     }
6820 }
6821
6822
6823 /* Return a translation table (or list of them) from coding system
6824    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6825    not ENCODEP). */
6826
6827 static Lisp_Object
6828 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6829 {
6830   Lisp_Object standard, translation_table;
6831   Lisp_Object val;
6832
6833   if (NILP (Venable_character_translation))
6834     {
6835       if (max_lookup)
6836         *max_lookup = 0;
6837       return Qnil;
6838     }
6839   if (encodep)
6840     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6841       standard = Vstandard_translation_table_for_encode;
6842   else
6843     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6844       standard = Vstandard_translation_table_for_decode;
6845   if (NILP (translation_table))
6846     translation_table = standard;
6847   else
6848     {
6849       if (SYMBOLP (translation_table))
6850         translation_table = Fget (translation_table, Qtranslation_table);
6851       else if (CONSP (translation_table))
6852         {
6853           translation_table = Fcopy_sequence (translation_table);
6854           for (val = translation_table; CONSP (val); val = XCDR (val))
6855             if (SYMBOLP (XCAR (val)))
6856               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6857         }
6858       if (CHAR_TABLE_P (standard))
6859         {
6860           if (CONSP (translation_table))
6861             translation_table = nconc2 (translation_table,
6862                                         Fcons (standard, Qnil));
6863           else
6864             translation_table = Fcons (translation_table,
6865                                        Fcons (standard, Qnil));
6866         }
6867     }
6868
6869   if (max_lookup)
6870     {
6871       *max_lookup = 1;
6872       if (CHAR_TABLE_P (translation_table)
6873           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6874         {
6875           val = XCHAR_TABLE (translation_table)->extras[1];
6876           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6877             *max_lookup = XFASTINT (val);
6878         }
6879       else if (CONSP (translation_table))
6880         {
6881           Lisp_Object tail;
6882
6883           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6884             if (CHAR_TABLE_P (XCAR (tail))
6885                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6886               {
6887                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6888                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6889                   *max_lookup = XFASTINT (tailval);
6890               }
6891         }
6892     }
6893   return translation_table;
6894 }
6895
6896 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6897   do {                                                          \
6898     trans = Qnil;                                               \
6899     if (CHAR_TABLE_P (table))                                   \
6900       {                                                         \
6901         trans = CHAR_TABLE_REF (table, c);                      \
6902         if (CHARACTERP (trans))                                 \
6903           c = XFASTINT (trans), trans = Qnil;                   \
6904       }                                                         \
6905     else if (CONSP (table))                                     \
6906       {                                                         \
6907         Lisp_Object tail;                                       \
6908                                                                 \
6909         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6910           if (CHAR_TABLE_P (XCAR (tail)))                       \
6911             {                                                   \
6912               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6913               if (CHARACTERP (trans))                           \
6914                 c = XFASTINT (trans), trans = Qnil;             \
6915               else if (! NILP (trans))                          \
6916                 break;                                          \
6917             }                                                   \
6918       }                                                         \
6919   } while (0)
6920
6921
6922 /* Return a translation of character(s) at BUF according to TRANS.
6923    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6924    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6925    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6926    translation is found, and Qnil if not found..
6927    If BUF is too short to lookup characters in FROM, return Qt.  */
6928
6929 static Lisp_Object
6930 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6931 {
6932
6933   if (INTEGERP (trans))
6934     return trans;
6935   for (; CONSP (trans); trans = XCDR (trans))
6936     {
6937       Lisp_Object val = XCAR (trans);
6938       Lisp_Object from = XCAR (val);
6939       ptrdiff_t len = ASIZE (from);
6940       ptrdiff_t i;
6941
6942       for (i = 0; i < len; i++)
6943         {
6944           if (buf + i == buf_end)
6945             return Qt;
6946           if (XINT (AREF (from, i)) != buf[i])
6947             break;
6948         }
6949       if (i == len)
6950         return val;
6951     }
6952   return Qnil;
6953 }
6954
6955
6956 static int
6957 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6958                bool last_block)
6959 {
6960   unsigned char *dst = coding->destination + coding->produced;
6961   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6962   ptrdiff_t produced;
6963   ptrdiff_t produced_chars = 0;
6964   int carryover = 0;
6965
6966   if (! coding->chars_at_source)
6967     {
6968       /* Source characters are in coding->charbuf.  */
6969       int *buf = coding->charbuf;
6970       int *buf_end = buf + coding->charbuf_used;
6971
6972       if (EQ (coding->src_object, coding->dst_object))
6973         {
6974           coding_set_source (coding);
6975           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6976         }
6977
6978       while (buf < buf_end)
6979         {
6980           int c = *buf;
6981           ptrdiff_t i;
6982
6983           if (c >= 0)
6984             {
6985               ptrdiff_t from_nchars = 1, to_nchars = 1;
6986               Lisp_Object trans = Qnil;
6987
6988               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6989               if (! NILP (trans))
6990                 {
6991                   trans = get_translation (trans, buf, buf_end);
6992                   if (INTEGERP (trans))
6993                     c = XINT (trans);
6994                   else if (CONSP (trans))
6995                     {
6996                       from_nchars = ASIZE (XCAR (trans));
6997                       trans = XCDR (trans);
6998                       if (INTEGERP (trans))
6999                         c = XINT (trans);
7000                       else
7001                         {
7002                           to_nchars = ASIZE (trans);
7003                           c = XINT (AREF (trans, 0));
7004                         }
7005                     }
7006                   else if (EQ (trans, Qt) && ! last_block)
7007                     break;
7008                 }
7009
7010               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7011                 {
7012                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7013                        / MAX_MULTIBYTE_LENGTH)
7014                       < to_nchars)
7015                     memory_full (SIZE_MAX);
7016                   dst = alloc_destination (coding,
7017                                            buf_end - buf
7018                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7019                                            dst);
7020                   if (EQ (coding->src_object, coding->dst_object))
7021                     {
7022                       coding_set_source (coding);
7023                       dst_end = (((unsigned char *) coding->source)
7024                                  + coding->consumed);
7025                     }
7026                   else
7027                     dst_end = coding->destination + coding->dst_bytes;
7028                 }
7029
7030               for (i = 0; i < to_nchars; i++)
7031                 {
7032                   if (i > 0)
7033                     c = XINT (AREF (trans, i));
7034                   if (coding->dst_multibyte
7035                       || ! CHAR_BYTE8_P (c))
7036                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7037                   else
7038                     *dst++ = CHAR_TO_BYTE8 (c);
7039                 }
7040               produced_chars += to_nchars;
7041               buf += from_nchars;
7042             }
7043           else
7044             /* This is an annotation datum.  (-C) is the length.  */
7045             buf += -c;
7046         }
7047       carryover = buf_end - buf;
7048     }
7049   else
7050     {
7051       /* Source characters are at coding->source.  */
7052       const unsigned char *src = coding->source;
7053       const unsigned char *src_end = src + coding->consumed;
7054
7055       if (EQ (coding->dst_object, coding->src_object))
7056         dst_end = (unsigned char *) src;
7057       if (coding->src_multibyte != coding->dst_multibyte)
7058         {
7059           if (coding->src_multibyte)
7060             {
7061               bool multibytep = 1;
7062               ptrdiff_t consumed_chars = 0;
7063
7064               while (1)
7065                 {
7066                   const unsigned char *src_base = src;
7067                   int c;
7068
7069                   ONE_MORE_BYTE (c);
7070                   if (dst == dst_end)
7071                     {
7072                       if (EQ (coding->src_object, coding->dst_object))
7073                         dst_end = (unsigned char *) src;
7074                       if (dst == dst_end)
7075                         {
7076                           ptrdiff_t offset = src - coding->source;
7077
7078                           dst = alloc_destination (coding, src_end - src + 1,
7079                                                    dst);
7080                           dst_end = coding->destination + coding->dst_bytes;
7081                           coding_set_source (coding);
7082                           src = coding->source + offset;
7083                           src_end = coding->source + coding->consumed;
7084                           if (EQ (coding->src_object, coding->dst_object))
7085                             dst_end = (unsigned char *) src;
7086                         }
7087                     }
7088                   *dst++ = c;
7089                   produced_chars++;
7090                 }
7091             no_more_source:
7092               ;
7093             }
7094           else
7095             while (src < src_end)
7096               {
7097                 bool multibytep = 1;
7098                 int c = *src++;
7099
7100                 if (dst >= dst_end - 1)
7101                   {
7102                     if (EQ (coding->src_object, coding->dst_object))
7103                       dst_end = (unsigned char *) src;
7104                     if (dst >= dst_end - 1)
7105                       {
7106                         ptrdiff_t offset = src - coding->source;
7107                         ptrdiff_t more_bytes;
7108
7109                         if (EQ (coding->src_object, coding->dst_object))
7110                           more_bytes = ((src_end - src) / 2) + 2;
7111                         else
7112                           more_bytes = src_end - src + 2;
7113                         dst = alloc_destination (coding, more_bytes, dst);
7114                         dst_end = coding->destination + coding->dst_bytes;
7115                         coding_set_source (coding);
7116                         src = coding->source + offset;
7117                         src_end = coding->source + coding->consumed;
7118                         if (EQ (coding->src_object, coding->dst_object))
7119                           dst_end = (unsigned char *) src;
7120                       }
7121                   }
7122                 EMIT_ONE_BYTE (c);
7123               }
7124         }
7125       else
7126         {
7127           if (!EQ (coding->src_object, coding->dst_object))
7128             {
7129               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7130
7131               if (require > 0)
7132                 {
7133                   ptrdiff_t offset = src - coding->source;
7134
7135                   dst = alloc_destination (coding, require, dst);
7136                   coding_set_source (coding);
7137                   src = coding->source + offset;
7138                   src_end = coding->source + coding->consumed;
7139                 }
7140             }
7141           produced_chars = coding->consumed_char;
7142           while (src < src_end)
7143             *dst++ = *src++;
7144         }
7145     }
7146
7147   produced = dst - (coding->destination + coding->produced);
7148   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7149     insert_from_gap (produced_chars, produced, 0);
7150   coding->produced += produced;
7151   coding->produced_char += produced_chars;
7152   return carryover;
7153 }
7154
7155 /* Compose text in CODING->object according to the annotation data at
7156    CHARBUF.  CHARBUF is an array:
7157      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7158  */
7159
7160 static void
7161 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7162 {
7163   int len;
7164   ptrdiff_t to;
7165   enum composition_method method;
7166   Lisp_Object components;
7167
7168   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7169   to = pos + charbuf[2];
7170   method = (enum composition_method) (charbuf[4]);
7171
7172   if (method == COMPOSITION_RELATIVE)
7173     components = Qnil;
7174   else
7175     {
7176       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7177       int i, j;
7178
7179       if (method == COMPOSITION_WITH_RULE)
7180         len = charbuf[2] * 3 - 2;
7181       charbuf += MAX_ANNOTATION_LENGTH;
7182       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7183       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7184         {
7185           if (charbuf[i] >= 0)
7186             args[j] = make_number (charbuf[i]);
7187           else
7188             {
7189               i++;
7190               args[j] = make_number (charbuf[i] % 0x100);
7191             }
7192         }
7193       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7194     }
7195   compose_text (pos, to, components, Qnil, coding->dst_object);
7196 }
7197
7198
7199 /* Put `charset' property on text in CODING->object according to
7200    the annotation data at CHARBUF.  CHARBUF is an array:
7201      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7202  */
7203
7204 static void
7205 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7206 {
7207   ptrdiff_t from = pos - charbuf[2];
7208   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7209
7210   Fput_text_property (make_number (from), make_number (pos),
7211                       Qcharset, CHARSET_NAME (charset),
7212                       coding->dst_object);
7213 }
7214
7215
7216 #define CHARBUF_SIZE 0x4000
7217
7218 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7219   do {                                                                  \
7220     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7221     coding->charbuf_size = CHARBUF_SIZE;                                \
7222   } while (0)
7223
7224
7225 static void
7226 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7227 {
7228   int *charbuf = coding->charbuf;
7229   int *charbuf_end = charbuf + coding->charbuf_used;
7230
7231   if (NILP (coding->dst_object))
7232     return;
7233
7234   while (charbuf < charbuf_end)
7235     {
7236       if (*charbuf >= 0)
7237         pos++, charbuf++;
7238       else
7239         {
7240           int len = -*charbuf;
7241
7242           if (len > 2)
7243             switch (charbuf[1])
7244               {
7245               case CODING_ANNOTATE_COMPOSITION_MASK:
7246                 produce_composition (coding, charbuf, pos);
7247                 break;
7248               case CODING_ANNOTATE_CHARSET_MASK:
7249                 produce_charset (coding, charbuf, pos);
7250                 break;
7251               }
7252           charbuf += len;
7253         }
7254     }
7255 }
7256
7257 /* Decode the data at CODING->src_object into CODING->dst_object.
7258    CODING->src_object is a buffer, a string, or nil.
7259    CODING->dst_object is a buffer.
7260
7261    If CODING->src_object is a buffer, it must be the current buffer.
7262    In this case, if CODING->src_pos is positive, it is a position of
7263    the source text in the buffer, otherwise, the source text is in the
7264    gap area of the buffer, and CODING->src_pos specifies the offset of
7265    the text from GPT (which must be the same as PT).  If this is the
7266    same buffer as CODING->dst_object, CODING->src_pos must be
7267    negative.
7268
7269    If CODING->src_object is a string, CODING->src_pos is an index to
7270    that string.
7271
7272    If CODING->src_object is nil, CODING->source must already point to
7273    the non-relocatable memory area.  In this case, CODING->src_pos is
7274    an offset from CODING->source.
7275
7276    The decoded data is inserted at the current point of the buffer
7277    CODING->dst_object.
7278 */
7279
7280 static void
7281 decode_coding (struct coding_system *coding)
7282 {
7283   Lisp_Object attrs;
7284   Lisp_Object undo_list;
7285   Lisp_Object translation_table;
7286   struct ccl_spec cclspec;
7287   int carryover;
7288   int i;
7289
7290   USE_SAFE_ALLOCA;
7291
7292   if (BUFFERP (coding->src_object)
7293       && coding->src_pos > 0
7294       && coding->src_pos < GPT
7295       && coding->src_pos + coding->src_chars > GPT)
7296     move_gap_both (coding->src_pos, coding->src_pos_byte);
7297
7298   undo_list = Qt;
7299   if (BUFFERP (coding->dst_object))
7300     {
7301       set_buffer_internal (XBUFFER (coding->dst_object));
7302       if (GPT != PT)
7303         move_gap_both (PT, PT_BYTE);
7304
7305       /* We must disable undo_list in order to record the whole insert
7306          transaction via record_insert at the end.  But doing so also
7307          disables the recording of the first change to the undo_list.
7308          Therefore we check for first change here and record it via
7309          record_first_change if needed.  */
7310       if (MODIFF <= SAVE_MODIFF)
7311         record_first_change ();
7312
7313       undo_list = BVAR (current_buffer, undo_list);
7314       bset_undo_list (current_buffer, Qt);
7315     }
7316
7317   coding->consumed = coding->consumed_char = 0;
7318   coding->produced = coding->produced_char = 0;
7319   coding->chars_at_source = 0;
7320   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7321   coding->errors = 0;
7322
7323   ALLOC_CONVERSION_WORK_AREA (coding);
7324
7325   attrs = CODING_ID_ATTRS (coding->id);
7326   translation_table = get_translation_table (attrs, 0, NULL);
7327
7328   carryover = 0;
7329   if (coding->decoder == decode_coding_ccl)
7330     {
7331       coding->spec.ccl = &cclspec;
7332       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7333     }
7334   do
7335     {
7336       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7337
7338       coding_set_source (coding);
7339       coding->annotated = 0;
7340       coding->charbuf_used = carryover;
7341       (*(coding->decoder)) (coding);
7342       coding_set_destination (coding);
7343       carryover = produce_chars (coding, translation_table, 0);
7344       if (coding->annotated)
7345         produce_annotation (coding, pos);
7346       for (i = 0; i < carryover; i++)
7347         coding->charbuf[i]
7348           = coding->charbuf[coding->charbuf_used - carryover + i];
7349     }
7350   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7351          || (coding->consumed < coding->src_bytes
7352              && (coding->result == CODING_RESULT_SUCCESS
7353                  || coding->result == CODING_RESULT_INVALID_SRC)));
7354
7355   if (carryover > 0)
7356     {
7357       coding_set_destination (coding);
7358       coding->charbuf_used = carryover;
7359       produce_chars (coding, translation_table, 1);
7360     }
7361
7362   coding->carryover_bytes = 0;
7363   if (coding->consumed < coding->src_bytes)
7364     {
7365       int nbytes = coding->src_bytes - coding->consumed;
7366       const unsigned char *src;
7367
7368       coding_set_source (coding);
7369       coding_set_destination (coding);
7370       src = coding->source + coding->consumed;
7371
7372       if (coding->mode & CODING_MODE_LAST_BLOCK)
7373         {
7374           /* Flush out unprocessed data as binary chars.  We are sure
7375              that the number of data is less than the size of
7376              coding->charbuf.  */
7377           coding->charbuf_used = 0;
7378           coding->chars_at_source = 0;
7379
7380           while (nbytes-- > 0)
7381             {
7382               int c = *src++;
7383
7384               if (c & 0x80)
7385                 c = BYTE8_TO_CHAR (c);
7386               coding->charbuf[coding->charbuf_used++] = c;
7387             }
7388           produce_chars (coding, Qnil, 1);
7389         }
7390       else
7391         {
7392           /* Record unprocessed bytes in coding->carryover.  We are
7393              sure that the number of data is less than the size of
7394              coding->carryover.  */
7395           unsigned char *p = coding->carryover;
7396
7397           if (nbytes > sizeof coding->carryover)
7398             nbytes = sizeof coding->carryover;
7399           coding->carryover_bytes = nbytes;
7400           while (nbytes-- > 0)
7401             *p++ = *src++;
7402         }
7403       coding->consumed = coding->src_bytes;
7404     }
7405
7406   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7407       && !inhibit_eol_conversion)
7408     decode_eol (coding);
7409   if (BUFFERP (coding->dst_object))
7410     {
7411       bset_undo_list (current_buffer, undo_list);
7412       record_insert (coding->dst_pos, coding->produced_char);
7413     }
7414
7415   SAFE_FREE ();
7416 }
7417
7418
7419 /* Extract an annotation datum from a composition starting at POS and
7420    ending before LIMIT of CODING->src_object (buffer or string), store
7421    the data in BUF, set *STOP to a starting position of the next
7422    composition (if any) or to LIMIT, and return the address of the
7423    next element of BUF.
7424
7425    If such an annotation is not found, set *STOP to a starting
7426    position of a composition after POS (if any) or to LIMIT, and
7427    return BUF.  */
7428
7429 static int *
7430 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7431                                struct coding_system *coding, int *buf,
7432                                ptrdiff_t *stop)
7433 {
7434   ptrdiff_t start, end;
7435   Lisp_Object prop;
7436
7437   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7438       || end > limit)
7439     *stop = limit;
7440   else if (start > pos)
7441     *stop = start;
7442   else
7443     {
7444       if (start == pos)
7445         {
7446           /* We found a composition.  Store the corresponding
7447              annotation data in BUF.  */
7448           int *head = buf;
7449           enum composition_method method = COMPOSITION_METHOD (prop);
7450           int nchars = COMPOSITION_LENGTH (prop);
7451
7452           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7453           if (method != COMPOSITION_RELATIVE)
7454             {
7455               Lisp_Object components;
7456               ptrdiff_t i, len, i_byte;
7457
7458               components = COMPOSITION_COMPONENTS (prop);
7459               if (VECTORP (components))
7460                 {
7461                   len = ASIZE (components);
7462                   for (i = 0; i < len; i++)
7463                     *buf++ = XINT (AREF (components, i));
7464                 }
7465               else if (STRINGP (components))
7466                 {
7467                   len = SCHARS (components);
7468                   i = i_byte = 0;
7469                   while (i < len)
7470                     {
7471                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7472                       buf++;
7473                     }
7474                 }
7475               else if (INTEGERP (components))
7476                 {
7477                   len = 1;
7478                   *buf++ = XINT (components);
7479                 }
7480               else if (CONSP (components))
7481                 {
7482                   for (len = 0; CONSP (components);
7483                        len++, components = XCDR (components))
7484                     *buf++ = XINT (XCAR (components));
7485                 }
7486               else
7487                 emacs_abort ();
7488               *head -= len;
7489             }
7490         }
7491
7492       if (find_composition (end, limit, &start, &end, &prop,
7493                             coding->src_object)
7494           && end <= limit)
7495         *stop = start;
7496       else
7497         *stop = limit;
7498     }
7499   return buf;
7500 }
7501
7502
7503 /* Extract an annotation datum from a text property `charset' at POS of
7504    CODING->src_object (buffer of string), store the data in BUF, set
7505    *STOP to the position where the value of `charset' property changes
7506    (limiting by LIMIT), and return the address of the next element of
7507    BUF.
7508
7509    If the property value is nil, set *STOP to the position where the
7510    property value is non-nil (limiting by LIMIT), and return BUF.  */
7511
7512 static int *
7513 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7514                            struct coding_system *coding, int *buf,
7515                            ptrdiff_t *stop)
7516 {
7517   Lisp_Object val, next;
7518   int id;
7519
7520   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7521   if (! NILP (val) && CHARSETP (val))
7522     id = XINT (CHARSET_SYMBOL_ID (val));
7523   else
7524     id = -1;
7525   ADD_CHARSET_DATA (buf, 0, id);
7526   next = Fnext_single_property_change (make_number (pos), Qcharset,
7527                                        coding->src_object,
7528                                        make_number (limit));
7529   *stop = XINT (next);
7530   return buf;
7531 }
7532
7533
7534 static void
7535 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7536                int max_lookup)
7537 {
7538   int *buf = coding->charbuf;
7539   int *buf_end = coding->charbuf + coding->charbuf_size;
7540   const unsigned char *src = coding->source + coding->consumed;
7541   const unsigned char *src_end = coding->source + coding->src_bytes;
7542   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7543   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7544   bool multibytep = coding->src_multibyte;
7545   Lisp_Object eol_type;
7546   int c;
7547   ptrdiff_t stop, stop_composition, stop_charset;
7548   int *lookup_buf = NULL;
7549
7550   if (! NILP (translation_table))
7551     lookup_buf = alloca (sizeof (int) * max_lookup);
7552
7553   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7554   if (VECTORP (eol_type))
7555     eol_type = Qunix;
7556
7557   /* Note: composition handling is not yet implemented.  */
7558   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7559
7560   if (NILP (coding->src_object))
7561     stop = stop_composition = stop_charset = end_pos;
7562   else
7563     {
7564       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7565         stop = stop_composition = pos;
7566       else
7567         stop = stop_composition = end_pos;
7568       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7569         stop = stop_charset = pos;
7570       else
7571         stop_charset = end_pos;
7572     }
7573
7574   /* Compensate for CRLF and conversion.  */
7575   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7576   while (buf < buf_end)
7577     {
7578       Lisp_Object trans;
7579
7580       if (pos == stop)
7581         {
7582           if (pos == end_pos)
7583             break;
7584           if (pos == stop_composition)
7585             buf = handle_composition_annotation (pos, end_pos, coding,
7586                                                  buf, &stop_composition);
7587           if (pos == stop_charset)
7588             buf = handle_charset_annotation (pos, end_pos, coding,
7589                                              buf, &stop_charset);
7590           stop = (stop_composition < stop_charset
7591                   ? stop_composition : stop_charset);
7592         }
7593
7594       if (! multibytep)
7595         {
7596           int bytes;
7597
7598           if (coding->encoder == encode_coding_raw_text
7599               || coding->encoder == encode_coding_ccl)
7600             c = *src++, pos++;
7601           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7602             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7603           else
7604             c = BYTE8_TO_CHAR (*src), src++, pos++;
7605         }
7606       else
7607         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7608       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7609         c = '\n';
7610       if (! EQ (eol_type, Qunix))
7611         {
7612           if (c == '\n')
7613             {
7614               if (EQ (eol_type, Qdos))
7615                 *buf++ = '\r';
7616               else
7617                 c = '\r';
7618             }
7619         }
7620
7621       trans = Qnil;
7622       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7623       if (NILP (trans))
7624         *buf++ = c;
7625       else
7626         {
7627           ptrdiff_t from_nchars = 1, to_nchars = 1;
7628           int *lookup_buf_end;
7629           const unsigned char *p = src;
7630           int i;
7631
7632           lookup_buf[0] = c;
7633           for (i = 1; i < max_lookup && p < src_end; i++)
7634             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7635           lookup_buf_end = lookup_buf + i;
7636           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7637           if (INTEGERP (trans))
7638             c = XINT (trans);
7639           else if (CONSP (trans))
7640             {
7641               from_nchars = ASIZE (XCAR (trans));
7642               trans = XCDR (trans);
7643               if (INTEGERP (trans))
7644                 c = XINT (trans);
7645               else
7646                 {
7647                   to_nchars = ASIZE (trans);
7648                   if (buf_end - buf < to_nchars)
7649                     break;
7650                   c = XINT (AREF (trans, 0));
7651                 }
7652             }
7653           else
7654             break;
7655           *buf++ = c;
7656           for (i = 1; i < to_nchars; i++)
7657             *buf++ = XINT (AREF (trans, i));
7658           for (i = 1; i < from_nchars; i++, pos++)
7659             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7660         }
7661     }
7662
7663   coding->consumed = src - coding->source;
7664   coding->consumed_char = pos - coding->src_pos;
7665   coding->charbuf_used = buf - coding->charbuf;
7666   coding->chars_at_source = 0;
7667 }
7668
7669
7670 /* Encode the text at CODING->src_object into CODING->dst_object.
7671    CODING->src_object is a buffer or a string.
7672    CODING->dst_object is a buffer or nil.
7673
7674    If CODING->src_object is a buffer, it must be the current buffer.
7675    In this case, if CODING->src_pos is positive, it is a position of
7676    the source text in the buffer, otherwise. the source text is in the
7677    gap area of the buffer, and coding->src_pos specifies the offset of
7678    the text from GPT (which must be the same as PT).  If this is the
7679    same buffer as CODING->dst_object, CODING->src_pos must be
7680    negative and CODING should not have `pre-write-conversion'.
7681
7682    If CODING->src_object is a string, CODING should not have
7683    `pre-write-conversion'.
7684
7685    If CODING->dst_object is a buffer, the encoded data is inserted at
7686    the current point of that buffer.
7687
7688    If CODING->dst_object is nil, the encoded data is placed at the
7689    memory area specified by CODING->destination.  */
7690
7691 static void
7692 encode_coding (struct coding_system *coding)
7693 {
7694   Lisp_Object attrs;
7695   Lisp_Object translation_table;
7696   int max_lookup;
7697   struct ccl_spec cclspec;
7698
7699   USE_SAFE_ALLOCA;
7700
7701   attrs = CODING_ID_ATTRS (coding->id);
7702   if (coding->encoder == encode_coding_raw_text)
7703     translation_table = Qnil, max_lookup = 0;
7704   else
7705     translation_table = get_translation_table (attrs, 1, &max_lookup);
7706
7707   if (BUFFERP (coding->dst_object))
7708     {
7709       set_buffer_internal (XBUFFER (coding->dst_object));
7710       coding->dst_multibyte
7711         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7712     }
7713
7714   coding->consumed = coding->consumed_char = 0;
7715   coding->produced = coding->produced_char = 0;
7716   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7717   coding->errors = 0;
7718
7719   ALLOC_CONVERSION_WORK_AREA (coding);
7720
7721   if (coding->encoder == encode_coding_ccl)
7722     {
7723       coding->spec.ccl = &cclspec;
7724       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7725     }
7726   do {
7727     coding_set_source (coding);
7728     consume_chars (coding, translation_table, max_lookup);
7729     coding_set_destination (coding);
7730     (*(coding->encoder)) (coding);
7731   } while (coding->consumed_char < coding->src_chars);
7732
7733   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7734     insert_from_gap (coding->produced_char, coding->produced, 0);
7735
7736   SAFE_FREE ();
7737 }
7738
7739
7740 /* Name (or base name) of work buffer for code conversion.  */
7741 static Lisp_Object Vcode_conversion_workbuf_name;
7742
7743 /* A working buffer used by the top level conversion.  Once it is
7744    created, it is never destroyed.  It has the name
7745    Vcode_conversion_workbuf_name.  The other working buffers are
7746    destroyed after the use is finished, and their names are modified
7747    versions of Vcode_conversion_workbuf_name.  */
7748 static Lisp_Object Vcode_conversion_reused_workbuf;
7749
7750 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7751 static bool reused_workbuf_in_use;
7752
7753
7754 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7755    multibyteness of returning buffer.  */
7756
7757 static Lisp_Object
7758 make_conversion_work_buffer (bool multibyte)
7759 {
7760   Lisp_Object name, workbuf;
7761   struct buffer *current;
7762
7763   if (reused_workbuf_in_use)
7764     {
7765       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7766       workbuf = Fget_buffer_create (name);
7767     }
7768   else
7769     {
7770       reused_workbuf_in_use = 1;
7771       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7772         Vcode_conversion_reused_workbuf
7773           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7774       workbuf = Vcode_conversion_reused_workbuf;
7775     }
7776   current = current_buffer;
7777   set_buffer_internal (XBUFFER (workbuf));
7778   /* We can't allow modification hooks to run in the work buffer.  For
7779      instance, directory_files_internal assumes that file decoding
7780      doesn't compile new regexps.  */
7781   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7782   Ferase_buffer ();
7783   bset_undo_list (current_buffer, Qt);
7784   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7785   set_buffer_internal (current);
7786   return workbuf;
7787 }
7788
7789
7790 static Lisp_Object
7791 code_conversion_restore (Lisp_Object arg)
7792 {
7793   Lisp_Object current, workbuf;
7794   struct gcpro gcpro1;
7795
7796   GCPRO1 (arg);
7797   current = XCAR (arg);
7798   workbuf = XCDR (arg);
7799   if (! NILP (workbuf))
7800     {
7801       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7802         reused_workbuf_in_use = 0;
7803       else
7804         Fkill_buffer (workbuf);
7805     }
7806   set_buffer_internal (XBUFFER (current));
7807   UNGCPRO;
7808   return Qnil;
7809 }
7810
7811 Lisp_Object
7812 code_conversion_save (bool with_work_buf, bool multibyte)
7813 {
7814   Lisp_Object workbuf = Qnil;
7815
7816   if (with_work_buf)
7817     workbuf = make_conversion_work_buffer (multibyte);
7818   record_unwind_protect (code_conversion_restore,
7819                          Fcons (Fcurrent_buffer (), workbuf));
7820   return workbuf;
7821 }
7822
7823 void
7824 decode_coding_gap (struct coding_system *coding,
7825                    ptrdiff_t chars, ptrdiff_t bytes)
7826 {
7827   ptrdiff_t count = SPECPDL_INDEX ();
7828   Lisp_Object attrs;
7829
7830   coding->src_object = Fcurrent_buffer ();
7831   coding->src_chars = chars;
7832   coding->src_bytes = bytes;
7833   coding->src_pos = -chars;
7834   coding->src_pos_byte = -bytes;
7835   coding->src_multibyte = chars < bytes;
7836   coding->dst_object = coding->src_object;
7837   coding->dst_pos = PT;
7838   coding->dst_pos_byte = PT_BYTE;
7839   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7840
7841   coding->head_ascii = -1;
7842   coding->detected_utf8_chars = -1;
7843   coding->eol_seen = EOL_SEEN_NONE;
7844   if (CODING_REQUIRE_DETECTION (coding))
7845     detect_coding (coding);
7846   attrs = CODING_ID_ATTRS (coding->id);
7847   if (! disable_ascii_optimization
7848       && ! coding->src_multibyte
7849       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7850       && NILP (CODING_ATTR_POST_READ (attrs))
7851       && NILP (get_translation_table (attrs, 0, NULL)))
7852     {
7853       chars = coding->head_ascii;
7854       if (chars < 0)
7855         chars = check_ascii (coding);
7856       if (chars != bytes)
7857         {
7858           /* There exists a non-ASCII byte.  */
7859           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7860             {
7861               if (coding->detected_utf8_chars >= 0)
7862                 chars = coding->detected_utf8_chars;
7863               else
7864                 chars = check_utf_8 (coding);
7865               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7866                   && coding->head_ascii == 0
7867                   && coding->source[0] == UTF_8_BOM_1
7868                   && coding->source[1] == UTF_8_BOM_2
7869                   && coding->source[2] == UTF_8_BOM_3)
7870                 {
7871                   chars--;
7872                   bytes -= 3;
7873                   coding->src_bytes -= 3;
7874                 }
7875             }
7876           else
7877             chars = -1;
7878         }
7879       if (chars >= 0)
7880         {
7881           Lisp_Object eol_type;
7882
7883           eol_type = CODING_ID_EOL_TYPE (coding->id);
7884           if (VECTORP (eol_type))
7885             {
7886               if (coding->eol_seen != EOL_SEEN_NONE)
7887                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7888             }
7889           if (EQ (eol_type, Qmac))
7890             {
7891               unsigned char *src_end = GAP_END_ADDR;
7892               unsigned char *src = src_end - coding->src_bytes;
7893
7894               while (src < src_end)
7895                 {
7896                   if (*src++ == '\r')
7897                     src[-1] = '\n';
7898                 }
7899             }
7900           else if (EQ (eol_type, Qdos))
7901             {
7902               unsigned char *src = GAP_END_ADDR;
7903               unsigned char *src_beg = src - coding->src_bytes;
7904               unsigned char *dst = src;
7905               ptrdiff_t diff;
7906
7907               while (src_beg < src)
7908                 {
7909                   *--dst = *--src;
7910                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7911                     src--;
7912                 }
7913               diff = dst - src;
7914               bytes -= diff;
7915               chars -= diff;
7916             }
7917           coding->produced = bytes;
7918           coding->produced_char = chars;
7919           insert_from_gap (chars, bytes, 1);
7920           return;
7921         }
7922     }
7923   code_conversion_save (0, 0);
7924
7925   coding->mode |= CODING_MODE_LAST_BLOCK;
7926   current_buffer->text->inhibit_shrinking = 1;
7927   decode_coding (coding);
7928   current_buffer->text->inhibit_shrinking = 0;
7929
7930   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7931     {
7932       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7933       Lisp_Object val;
7934
7935       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7936       val = call1 (CODING_ATTR_POST_READ (attrs),
7937                    make_number (coding->produced_char));
7938       CHECK_NATNUM (val);
7939       coding->produced_char += Z - prev_Z;
7940       coding->produced += Z_BYTE - prev_Z_BYTE;
7941     }
7942
7943   unbind_to (count, Qnil);
7944 }
7945
7946
7947 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7948    SRC_OBJECT into DST_OBJECT by coding context CODING.
7949
7950    SRC_OBJECT is a buffer, a string, or Qnil.
7951
7952    If it is a buffer, the text is at point of the buffer.  FROM and TO
7953    are positions in the buffer.
7954
7955    If it is a string, the text is at the beginning of the string.
7956    FROM and TO are indices to the string.
7957
7958    If it is nil, the text is at coding->source.  FROM and TO are
7959    indices to coding->source.
7960
7961    DST_OBJECT is a buffer, Qt, or Qnil.
7962
7963    If it is a buffer, the decoded text is inserted at point of the
7964    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7965    is deleted.
7966
7967    If it is Qt, a string is made from the decoded text, and
7968    set in CODING->dst_object.
7969
7970    If it is Qnil, the decoded text is stored at CODING->destination.
7971    The caller must allocate CODING->dst_bytes bytes at
7972    CODING->destination by xmalloc.  If the decoded text is longer than
7973    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7974  */
7975
7976 void
7977 decode_coding_object (struct coding_system *coding,
7978                       Lisp_Object src_object,
7979                       ptrdiff_t from, ptrdiff_t from_byte,
7980                       ptrdiff_t to, ptrdiff_t to_byte,
7981                       Lisp_Object dst_object)
7982 {
7983   ptrdiff_t count = SPECPDL_INDEX ();
7984   unsigned char *destination IF_LINT (= NULL);
7985   ptrdiff_t dst_bytes IF_LINT (= 0);
7986   ptrdiff_t chars = to - from;
7987   ptrdiff_t bytes = to_byte - from_byte;
7988   Lisp_Object attrs;
7989   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7990   bool need_marker_adjustment = 0;
7991   Lisp_Object old_deactivate_mark;
7992
7993   old_deactivate_mark = Vdeactivate_mark;
7994
7995   if (NILP (dst_object))
7996     {
7997       destination = coding->destination;
7998       dst_bytes = coding->dst_bytes;
7999     }
8000
8001   coding->src_object = src_object;
8002   coding->src_chars = chars;
8003   coding->src_bytes = bytes;
8004   coding->src_multibyte = chars < bytes;
8005
8006   if (STRINGP (src_object))
8007     {
8008       coding->src_pos = from;
8009       coding->src_pos_byte = from_byte;
8010     }
8011   else if (BUFFERP (src_object))
8012     {
8013       set_buffer_internal (XBUFFER (src_object));
8014       if (from != GPT)
8015         move_gap_both (from, from_byte);
8016       if (EQ (src_object, dst_object))
8017         {
8018           struct Lisp_Marker *tail;
8019
8020           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8021             {
8022               tail->need_adjustment
8023                 = tail->charpos == (tail->insertion_type ? from : to);
8024               need_marker_adjustment |= tail->need_adjustment;
8025             }
8026           saved_pt = PT, saved_pt_byte = PT_BYTE;
8027           TEMP_SET_PT_BOTH (from, from_byte);
8028           current_buffer->text->inhibit_shrinking = 1;
8029           del_range_both (from, from_byte, to, to_byte, 1);
8030           coding->src_pos = -chars;
8031           coding->src_pos_byte = -bytes;
8032         }
8033       else
8034         {
8035           coding->src_pos = from;
8036           coding->src_pos_byte = from_byte;
8037         }
8038     }
8039
8040   if (CODING_REQUIRE_DETECTION (coding))
8041     detect_coding (coding);
8042   attrs = CODING_ID_ATTRS (coding->id);
8043
8044   if (EQ (dst_object, Qt)
8045       || (! NILP (CODING_ATTR_POST_READ (attrs))
8046           && NILP (dst_object)))
8047     {
8048       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8049       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8050       coding->dst_pos = BEG;
8051       coding->dst_pos_byte = BEG_BYTE;
8052     }
8053   else if (BUFFERP (dst_object))
8054     {
8055       code_conversion_save (0, 0);
8056       coding->dst_object = dst_object;
8057       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8058       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8059       coding->dst_multibyte
8060         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8061     }
8062   else
8063     {
8064       code_conversion_save (0, 0);
8065       coding->dst_object = Qnil;
8066       /* Most callers presume this will return a multibyte result, and they
8067          won't use `binary' or `raw-text' anyway, so let's not worry about
8068          CODING_FOR_UNIBYTE.  */
8069       coding->dst_multibyte = 1;
8070     }
8071
8072   decode_coding (coding);
8073
8074   if (BUFFERP (coding->dst_object))
8075     set_buffer_internal (XBUFFER (coding->dst_object));
8076
8077   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8078     {
8079       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8080       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8081       Lisp_Object val;
8082
8083       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8084       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8085               old_deactivate_mark);
8086       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8087                         make_number (coding->produced_char));
8088       UNGCPRO;
8089       CHECK_NATNUM (val);
8090       coding->produced_char += Z - prev_Z;
8091       coding->produced += Z_BYTE - prev_Z_BYTE;
8092     }
8093
8094   if (EQ (dst_object, Qt))
8095     {
8096       coding->dst_object = Fbuffer_string ();
8097     }
8098   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8099     {
8100       set_buffer_internal (XBUFFER (coding->dst_object));
8101       if (dst_bytes < coding->produced)
8102         {
8103           eassert (coding->produced > 0);
8104           destination = xrealloc (destination, coding->produced);
8105           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8106             move_gap_both (BEGV, BEGV_BYTE);
8107           memcpy (destination, BEGV_ADDR, coding->produced);
8108           coding->destination = destination;
8109         }
8110     }
8111
8112   if (saved_pt >= 0)
8113     {
8114       /* This is the case of:
8115          (BUFFERP (src_object) && EQ (src_object, dst_object))
8116          As we have moved PT while replacing the original buffer
8117          contents, we must recover it now.  */
8118       set_buffer_internal (XBUFFER (src_object));
8119       current_buffer->text->inhibit_shrinking = 0;
8120       if (saved_pt < from)
8121         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8122       else if (saved_pt < from + chars)
8123         TEMP_SET_PT_BOTH (from, from_byte);
8124       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8125         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8126                           saved_pt_byte + (coding->produced - bytes));
8127       else
8128         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8129                           saved_pt_byte + (coding->produced - bytes));
8130
8131       if (need_marker_adjustment)
8132         {
8133           struct Lisp_Marker *tail;
8134
8135           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8136             if (tail->need_adjustment)
8137               {
8138                 tail->need_adjustment = 0;
8139                 if (tail->insertion_type)
8140                   {
8141                     tail->bytepos = from_byte;
8142                     tail->charpos = from;
8143                   }
8144                 else
8145                   {
8146                     tail->bytepos = from_byte + coding->produced;
8147                     tail->charpos
8148                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8149                          ? tail->bytepos : from + coding->produced_char);
8150                   }
8151               }
8152         }
8153     }
8154
8155   Vdeactivate_mark = old_deactivate_mark;
8156   unbind_to (count, coding->dst_object);
8157 }
8158
8159
8160 void
8161 encode_coding_object (struct coding_system *coding,
8162                       Lisp_Object src_object,
8163                       ptrdiff_t from, ptrdiff_t from_byte,
8164                       ptrdiff_t to, ptrdiff_t to_byte,
8165                       Lisp_Object dst_object)
8166 {
8167   ptrdiff_t count = SPECPDL_INDEX ();
8168   ptrdiff_t chars = to - from;
8169   ptrdiff_t bytes = to_byte - from_byte;
8170   Lisp_Object attrs;
8171   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8172   bool need_marker_adjustment = 0;
8173   bool kill_src_buffer = 0;
8174   Lisp_Object old_deactivate_mark;
8175
8176   old_deactivate_mark = Vdeactivate_mark;
8177
8178   coding->src_object = src_object;
8179   coding->src_chars = chars;
8180   coding->src_bytes = bytes;
8181   coding->src_multibyte = chars < bytes;
8182
8183   attrs = CODING_ID_ATTRS (coding->id);
8184
8185   if (EQ (src_object, dst_object))
8186     {
8187       struct Lisp_Marker *tail;
8188
8189       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8190         {
8191           tail->need_adjustment
8192             = tail->charpos == (tail->insertion_type ? from : to);
8193           need_marker_adjustment |= tail->need_adjustment;
8194         }
8195     }
8196
8197   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8198     {
8199       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8200       set_buffer_internal (XBUFFER (coding->src_object));
8201       if (STRINGP (src_object))
8202         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8203       else if (BUFFERP (src_object))
8204         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8205       else
8206         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8207
8208       if (EQ (src_object, dst_object))
8209         {
8210           set_buffer_internal (XBUFFER (src_object));
8211           saved_pt = PT, saved_pt_byte = PT_BYTE;
8212           del_range_both (from, from_byte, to, to_byte, 1);
8213           set_buffer_internal (XBUFFER (coding->src_object));
8214         }
8215
8216       {
8217         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8218
8219         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8220                 old_deactivate_mark);
8221         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8222                     make_number (BEG), make_number (Z));
8223         UNGCPRO;
8224       }
8225       if (XBUFFER (coding->src_object) != current_buffer)
8226         kill_src_buffer = 1;
8227       coding->src_object = Fcurrent_buffer ();
8228       if (BEG != GPT)
8229         move_gap_both (BEG, BEG_BYTE);
8230       coding->src_chars = Z - BEG;
8231       coding->src_bytes = Z_BYTE - BEG_BYTE;
8232       coding->src_pos = BEG;
8233       coding->src_pos_byte = BEG_BYTE;
8234       coding->src_multibyte = Z < Z_BYTE;
8235     }
8236   else if (STRINGP (src_object))
8237     {
8238       code_conversion_save (0, 0);
8239       coding->src_pos = from;
8240       coding->src_pos_byte = from_byte;
8241     }
8242   else if (BUFFERP (src_object))
8243     {
8244       code_conversion_save (0, 0);
8245       set_buffer_internal (XBUFFER (src_object));
8246       if (EQ (src_object, dst_object))
8247         {
8248           saved_pt = PT, saved_pt_byte = PT_BYTE;
8249           coding->src_object = del_range_1 (from, to, 1, 1);
8250           coding->src_pos = 0;
8251           coding->src_pos_byte = 0;
8252         }
8253       else
8254         {
8255           if (from < GPT && to >= GPT)
8256             move_gap_both (from, from_byte);
8257           coding->src_pos = from;
8258           coding->src_pos_byte = from_byte;
8259         }
8260     }
8261   else
8262     code_conversion_save (0, 0);
8263
8264   if (BUFFERP (dst_object))
8265     {
8266       coding->dst_object = dst_object;
8267       if (EQ (src_object, dst_object))
8268         {
8269           coding->dst_pos = from;
8270           coding->dst_pos_byte = from_byte;
8271         }
8272       else
8273         {
8274           struct buffer *current = current_buffer;
8275
8276           set_buffer_temp (XBUFFER (dst_object));
8277           coding->dst_pos = PT;
8278           coding->dst_pos_byte = PT_BYTE;
8279           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8280           set_buffer_temp (current);
8281         }
8282       coding->dst_multibyte
8283         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8284     }
8285   else if (EQ (dst_object, Qt))
8286     {
8287       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8288       coding->dst_object = Qnil;
8289       coding->destination = xmalloc (dst_bytes);
8290       coding->dst_bytes = dst_bytes;
8291       coding->dst_multibyte = 0;
8292     }
8293   else
8294     {
8295       coding->dst_object = Qnil;
8296       coding->dst_multibyte = 0;
8297     }
8298
8299   encode_coding (coding);
8300
8301   if (EQ (dst_object, Qt))
8302     {
8303       if (BUFFERP (coding->dst_object))
8304         coding->dst_object = Fbuffer_string ();
8305       else
8306         {
8307           coding->dst_object
8308             = make_unibyte_string ((char *) coding->destination,
8309                                    coding->produced);
8310           xfree (coding->destination);
8311         }
8312     }
8313
8314   if (saved_pt >= 0)
8315     {
8316       /* This is the case of:
8317          (BUFFERP (src_object) && EQ (src_object, dst_object))
8318          As we have moved PT while replacing the original buffer
8319          contents, we must recover it now.  */
8320       set_buffer_internal (XBUFFER (src_object));
8321       if (saved_pt < from)
8322         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8323       else if (saved_pt < from + chars)
8324         TEMP_SET_PT_BOTH (from, from_byte);
8325       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8326         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8327                           saved_pt_byte + (coding->produced - bytes));
8328       else
8329         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8330                           saved_pt_byte + (coding->produced - bytes));
8331
8332       if (need_marker_adjustment)
8333         {
8334           struct Lisp_Marker *tail;
8335
8336           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8337             if (tail->need_adjustment)
8338               {
8339                 tail->need_adjustment = 0;
8340                 if (tail->insertion_type)
8341                   {
8342                     tail->bytepos = from_byte;
8343                     tail->charpos = from;
8344                   }
8345                 else
8346                   {
8347                     tail->bytepos = from_byte + coding->produced;
8348                     tail->charpos
8349                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8350                          ? tail->bytepos : from + coding->produced_char);
8351                   }
8352               }
8353         }
8354     }
8355
8356   if (kill_src_buffer)
8357     Fkill_buffer (coding->src_object);
8358
8359   Vdeactivate_mark = old_deactivate_mark;
8360   unbind_to (count, Qnil);
8361 }
8362
8363
8364 Lisp_Object
8365 preferred_coding_system (void)
8366 {
8367   int id = coding_categories[coding_priorities[0]].id;
8368
8369   return CODING_ID_NAME (id);
8370 }
8371
8372 #if defined (WINDOWSNT) || defined (CYGWIN)
8373
8374 Lisp_Object
8375 from_unicode (Lisp_Object str)
8376 {
8377   CHECK_STRING (str);
8378   if (!STRING_MULTIBYTE (str) &&
8379       SBYTES (str) & 1)
8380     {
8381       str = Fsubstring (str, make_number (0), make_number (-1));
8382     }
8383
8384   return code_convert_string_norecord (str, Qutf_16le, 0);
8385 }
8386
8387 Lisp_Object
8388 from_unicode_buffer (const wchar_t* wstr)
8389 {
8390     return from_unicode (
8391         make_unibyte_string (
8392             (char*) wstr,
8393             /* we get one of the two final 0 bytes for free. */
8394             1 + sizeof (wchar_t) * wcslen (wstr)));
8395 }
8396
8397 wchar_t *
8398 to_unicode (Lisp_Object str, Lisp_Object *buf)
8399 {
8400   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8401   /* We need to make another copy (in addition to the one made by
8402      code_convert_string_norecord) to ensure that the final string is
8403      _doubly_ zero terminated --- that is, that the string is
8404      terminated by two zero bytes and one utf-16le null character.
8405      Because strings are already terminated with a single zero byte,
8406      we just add one additional zero. */
8407   str = make_uninit_string (SBYTES (*buf) + 1);
8408   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8409   SDATA (str) [SBYTES (*buf)] = '\0';
8410   *buf = str;
8411   return WCSDATA (*buf);
8412 }
8413
8414 #endif /* WINDOWSNT || CYGWIN */
8415
8416 \f
8417 #ifdef emacs
8418 /*** 8. Emacs Lisp library functions ***/
8419
8420 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8421        doc: /* Return t if OBJECT is nil or a coding-system.
8422 See the documentation of `define-coding-system' for information
8423 about coding-system objects.  */)
8424   (Lisp_Object object)
8425 {
8426   if (NILP (object)
8427       || CODING_SYSTEM_ID (object) >= 0)
8428     return Qt;
8429   if (! SYMBOLP (object)
8430       || NILP (Fget (object, Qcoding_system_define_form)))
8431     return Qnil;
8432   return Qt;
8433 }
8434
8435 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8436        Sread_non_nil_coding_system, 1, 1, 0,
8437        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8438   (Lisp_Object prompt)
8439 {
8440   Lisp_Object val;
8441   do
8442     {
8443       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8444                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8445     }
8446   while (SCHARS (val) == 0);
8447   return (Fintern (val, Qnil));
8448 }
8449
8450 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8451        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8452 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8453 Ignores case when completing coding systems (all Emacs coding systems
8454 are lower-case).  */)
8455   (Lisp_Object prompt, Lisp_Object default_coding_system)
8456 {
8457   Lisp_Object val;
8458   ptrdiff_t count = SPECPDL_INDEX ();
8459
8460   if (SYMBOLP (default_coding_system))
8461     default_coding_system = SYMBOL_NAME (default_coding_system);
8462   specbind (Qcompletion_ignore_case, Qt);
8463   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8464                           Qt, Qnil, Qcoding_system_history,
8465                           default_coding_system, Qnil);
8466   unbind_to (count, Qnil);
8467   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8468 }
8469
8470 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8471        1, 1, 0,
8472        doc: /* Check validity of CODING-SYSTEM.
8473 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8474 It is valid if it is nil or a symbol defined as a coding system by the
8475 function `define-coding-system'.  */)
8476   (Lisp_Object coding_system)
8477 {
8478   Lisp_Object define_form;
8479
8480   define_form = Fget (coding_system, Qcoding_system_define_form);
8481   if (! NILP (define_form))
8482     {
8483       Fput (coding_system, Qcoding_system_define_form, Qnil);
8484       safe_eval (define_form);
8485     }
8486   if (!NILP (Fcoding_system_p (coding_system)))
8487     return coding_system;
8488   xsignal1 (Qcoding_system_error, coding_system);
8489 }
8490
8491 \f
8492 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8493    HIGHEST, return the coding system of the highest
8494    priority among the detected coding systems.  Otherwise return a
8495    list of detected coding systems sorted by their priorities.  If
8496    MULTIBYTEP, it is assumed that the bytes are in correct
8497    multibyte form but contains only ASCII and eight-bit chars.
8498    Otherwise, the bytes are raw bytes.
8499
8500    CODING-SYSTEM controls the detection as below:
8501
8502    If it is nil, detect both text-format and eol-format.  If the
8503    text-format part of CODING-SYSTEM is already specified
8504    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8505    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8506    detect only text-format.  */
8507
8508 Lisp_Object
8509 detect_coding_system (const unsigned char *src,
8510                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8511                       bool highest, bool multibytep,
8512                       Lisp_Object coding_system)
8513 {
8514   const unsigned char *src_end = src + src_bytes;
8515   Lisp_Object attrs, eol_type;
8516   Lisp_Object val = Qnil;
8517   struct coding_system coding;
8518   ptrdiff_t id;
8519   struct coding_detection_info detect_info;
8520   enum coding_category base_category;
8521   bool null_byte_found = 0, eight_bit_found = 0;
8522
8523   if (NILP (coding_system))
8524     coding_system = Qundecided;
8525   setup_coding_system (coding_system, &coding);
8526   attrs = CODING_ID_ATTRS (coding.id);
8527   eol_type = CODING_ID_EOL_TYPE (coding.id);
8528   coding_system = CODING_ATTR_BASE_NAME (attrs);
8529
8530   coding.source = src;
8531   coding.src_chars = src_chars;
8532   coding.src_bytes = src_bytes;
8533   coding.src_multibyte = multibytep;
8534   coding.consumed = 0;
8535   coding.mode |= CODING_MODE_LAST_BLOCK;
8536   coding.head_ascii = 0;
8537
8538   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8539
8540   /* At first, detect text-format if necessary.  */
8541   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8542   if (base_category == coding_category_undecided)
8543     {
8544       enum coding_category category IF_LINT (= 0);
8545       struct coding_system *this IF_LINT (= NULL);
8546       int c, i;
8547       int inhibit_nbd           /* null byte detection */
8548         = (coding.spec.undecided.inhibit_nbd > 0
8549            | (coding.spec.undecided.inhibit_nbd == 0
8550               & inhibit_null_byte_detection));
8551       int inhibit_ied           /* iso escape detection */
8552         = (coding.spec.undecided.inhibit_ied > 0
8553            | (coding.spec.undecided.inhibit_ied == 0
8554               & inhibit_iso_escape_detection));
8555       int prefer_utf_8
8556         = coding.spec.undecided.prefer_utf_8;
8557
8558
8559       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8560       for (; src < src_end; src++)
8561         {
8562           c = *src;
8563           if (c & 0x80)
8564             {
8565               eight_bit_found = 1;
8566               if (null_byte_found)
8567                 break;
8568             }
8569           else if (c < 0x20)
8570             {
8571               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8572                   && ! inhibit_ied
8573                   && ! detect_info.checked)
8574                 {
8575                   if (detect_coding_iso_2022 (&coding, &detect_info))
8576                     {
8577                       /* We have scanned the whole data.  */
8578                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8579                         {
8580                           /* We didn't find an 8-bit code.  We may
8581                              have found a null-byte, but it's very
8582                              rare that a binary file confirm to
8583                              ISO-2022.  */
8584                           src = src_end;
8585                           coding.head_ascii = src - coding.source;
8586                         }
8587                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8588                       break;
8589                     }
8590                 }
8591               else if (! c && !inhibit_nbd)
8592                 {
8593                   null_byte_found = 1;
8594                   if (eight_bit_found)
8595                     break;
8596                 }
8597               if (! eight_bit_found)
8598                 coding.head_ascii++;
8599             }
8600           else if (! eight_bit_found)
8601             coding.head_ascii++;
8602         }
8603
8604       if (null_byte_found || eight_bit_found
8605           || coding.head_ascii < coding.src_bytes
8606           || detect_info.found)
8607         {
8608           if (coding.head_ascii == coding.src_bytes)
8609             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8610             for (i = 0; i < coding_category_raw_text; i++)
8611               {
8612                 category = coding_priorities[i];
8613                 this = coding_categories + category;
8614                 if (detect_info.found & (1 << category))
8615                   break;
8616               }
8617           else
8618             {
8619               if (null_byte_found)
8620                 {
8621                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8622                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8623                 }
8624               else if (prefer_utf_8
8625                        && detect_coding_utf_8 (&coding, &detect_info))
8626                 {
8627                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8628                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8629                 }
8630               for (i = 0; i < coding_category_raw_text; i++)
8631                 {
8632                   category = coding_priorities[i];
8633                   this = coding_categories + category;
8634
8635                   if (this->id < 0)
8636                     {
8637                       /* No coding system of this category is defined.  */
8638                       detect_info.rejected |= (1 << category);
8639                     }
8640                   else if (category >= coding_category_raw_text)
8641                     continue;
8642                   else if (detect_info.checked & (1 << category))
8643                     {
8644                       if (highest
8645                           && (detect_info.found & (1 << category)))
8646                         break;
8647                     }
8648                   else if ((*(this->detector)) (&coding, &detect_info)
8649                            && highest
8650                            && (detect_info.found & (1 << category)))
8651                     {
8652                       if (category == coding_category_utf_16_auto)
8653                         {
8654                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8655                             category = coding_category_utf_16_le;
8656                           else
8657                             category = coding_category_utf_16_be;
8658                         }
8659                       break;
8660                     }
8661                 }
8662             }
8663         }
8664
8665       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8666           || null_byte_found)
8667         {
8668           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8669           id = CODING_SYSTEM_ID (Qno_conversion);
8670           val = Fcons (make_number (id), Qnil);
8671         }
8672       else if (! detect_info.rejected && ! detect_info.found)
8673         {
8674           detect_info.found = CATEGORY_MASK_ANY;
8675           id = coding_categories[coding_category_undecided].id;
8676           val = Fcons (make_number (id), Qnil);
8677         }
8678       else if (highest)
8679         {
8680           if (detect_info.found)
8681             {
8682               detect_info.found = 1 << category;
8683               val = Fcons (make_number (this->id), Qnil);
8684             }
8685           else
8686             for (i = 0; i < coding_category_raw_text; i++)
8687               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8688                 {
8689                   detect_info.found = 1 << coding_priorities[i];
8690                   id = coding_categories[coding_priorities[i]].id;
8691                   val = Fcons (make_number (id), Qnil);
8692                   break;
8693                 }
8694         }
8695       else
8696         {
8697           int mask = detect_info.rejected | detect_info.found;
8698           int found = 0;
8699
8700           for (i = coding_category_raw_text - 1; i >= 0; i--)
8701             {
8702               category = coding_priorities[i];
8703               if (! (mask & (1 << category)))
8704                 {
8705                   found |= 1 << category;
8706                   id = coding_categories[category].id;
8707                   if (id >= 0)
8708                     val = Fcons (make_number (id), val);
8709                 }
8710             }
8711           for (i = coding_category_raw_text - 1; i >= 0; i--)
8712             {
8713               category = coding_priorities[i];
8714               if (detect_info.found & (1 << category))
8715                 {
8716                   id = coding_categories[category].id;
8717                   val = Fcons (make_number (id), val);
8718                 }
8719             }
8720           detect_info.found |= found;
8721         }
8722     }
8723   else if (base_category == coding_category_utf_8_auto)
8724     {
8725       if (detect_coding_utf_8 (&coding, &detect_info))
8726         {
8727           struct coding_system *this;
8728
8729           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8730             this = coding_categories + coding_category_utf_8_sig;
8731           else
8732             this = coding_categories + coding_category_utf_8_nosig;
8733           val = Fcons (make_number (this->id), Qnil);
8734         }
8735     }
8736   else if (base_category == coding_category_utf_16_auto)
8737     {
8738       if (detect_coding_utf_16 (&coding, &detect_info))
8739         {
8740           struct coding_system *this;
8741
8742           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8743             this = coding_categories + coding_category_utf_16_le;
8744           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8745             this = coding_categories + coding_category_utf_16_be;
8746           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8747             this = coding_categories + coding_category_utf_16_be_nosig;
8748           else
8749             this = coding_categories + coding_category_utf_16_le_nosig;
8750           val = Fcons (make_number (this->id), Qnil);
8751         }
8752     }
8753   else
8754     {
8755       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8756       val = Fcons (make_number (coding.id), Qnil);
8757     }
8758
8759   /* Then, detect eol-format if necessary.  */
8760   {
8761     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8762     Lisp_Object tail;
8763
8764     if (VECTORP (eol_type))
8765       {
8766         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8767           {
8768             if (null_byte_found)
8769               normal_eol = EOL_SEEN_LF;
8770             else
8771               normal_eol = detect_eol (coding.source, src_bytes,
8772                                        coding_category_raw_text);
8773           }
8774         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8775                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8776           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8777                                       coding_category_utf_16_be);
8778         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8779                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8780           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8781                                       coding_category_utf_16_le);
8782       }
8783     else
8784       {
8785         if (EQ (eol_type, Qunix))
8786           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8787         else if (EQ (eol_type, Qdos))
8788           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8789         else
8790           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8791       }
8792
8793     for (tail = val; CONSP (tail); tail = XCDR (tail))
8794       {
8795         enum coding_category category;
8796         int this_eol;
8797
8798         id = XINT (XCAR (tail));
8799         attrs = CODING_ID_ATTRS (id);
8800         category = XINT (CODING_ATTR_CATEGORY (attrs));
8801         eol_type = CODING_ID_EOL_TYPE (id);
8802         if (VECTORP (eol_type))
8803           {
8804             if (category == coding_category_utf_16_be
8805                 || category == coding_category_utf_16_be_nosig)
8806               this_eol = utf_16_be_eol;
8807             else if (category == coding_category_utf_16_le
8808                      || category == coding_category_utf_16_le_nosig)
8809               this_eol = utf_16_le_eol;
8810             else
8811               this_eol = normal_eol;
8812
8813             if (this_eol == EOL_SEEN_LF)
8814               XSETCAR (tail, AREF (eol_type, 0));
8815             else if (this_eol == EOL_SEEN_CRLF)
8816               XSETCAR (tail, AREF (eol_type, 1));
8817             else if (this_eol == EOL_SEEN_CR)
8818               XSETCAR (tail, AREF (eol_type, 2));
8819             else
8820               XSETCAR (tail, CODING_ID_NAME (id));
8821           }
8822         else
8823           XSETCAR (tail, CODING_ID_NAME (id));
8824       }
8825   }
8826
8827   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8828 }
8829
8830
8831 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8832        2, 3, 0,
8833        doc: /* Detect coding system of the text in the region between START and END.
8834 Return a list of possible coding systems ordered by priority.
8835 The coding systems to try and their priorities follows what
8836 the function `coding-system-priority-list' (which see) returns.
8837
8838 If only ASCII characters are found (except for such ISO-2022 control
8839 characters as ESC), it returns a list of single element `undecided'
8840 or its subsidiary coding system according to a detected end-of-line
8841 format.
8842
8843 If optional argument HIGHEST is non-nil, return the coding system of
8844 highest priority.  */)
8845   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8846 {
8847   ptrdiff_t from, to;
8848   ptrdiff_t from_byte, to_byte;
8849
8850   validate_region (&start, &end);
8851   from = XINT (start), to = XINT (end);
8852   from_byte = CHAR_TO_BYTE (from);
8853   to_byte = CHAR_TO_BYTE (to);
8854
8855   if (from < GPT && to >= GPT)
8856     move_gap_both (to, to_byte);
8857
8858   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8859                                to - from, to_byte - from_byte,
8860                                !NILP (highest),
8861                                !NILP (BVAR (current_buffer
8862                                       , enable_multibyte_characters)),
8863                                Qnil);
8864 }
8865
8866 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8867        1, 2, 0,
8868        doc: /* Detect coding system of the text in STRING.
8869 Return a list of possible coding systems ordered by priority.
8870 The coding systems to try and their priorities follows what
8871 the function `coding-system-priority-list' (which see) returns.
8872
8873 If only ASCII characters are found (except for such ISO-2022 control
8874 characters as ESC), it returns a list of single element `undecided'
8875 or its subsidiary coding system according to a detected end-of-line
8876 format.
8877
8878 If optional argument HIGHEST is non-nil, return the coding system of
8879 highest priority.  */)
8880   (Lisp_Object string, Lisp_Object highest)
8881 {
8882   CHECK_STRING (string);
8883
8884   return detect_coding_system (SDATA (string),
8885                                SCHARS (string), SBYTES (string),
8886                                !NILP (highest), STRING_MULTIBYTE (string),
8887                                Qnil);
8888 }
8889
8890
8891 static bool
8892 char_encodable_p (int c, Lisp_Object attrs)
8893 {
8894   Lisp_Object tail;
8895   struct charset *charset;
8896   Lisp_Object translation_table;
8897
8898   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8899   if (! NILP (translation_table))
8900     c = translate_char (translation_table, c);
8901   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8902        CONSP (tail); tail = XCDR (tail))
8903     {
8904       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8905       if (CHAR_CHARSET_P (c, charset))
8906         break;
8907     }
8908   return (! NILP (tail));
8909 }
8910
8911
8912 /* Return a list of coding systems that safely encode the text between
8913    START and END.  If EXCLUDE is non-nil, it is a list of coding
8914    systems not to check.  The returned list doesn't contain any such
8915    coding systems.  In any case, if the text contains only ASCII or is
8916    unibyte, return t.  */
8917
8918 DEFUN ("find-coding-systems-region-internal",
8919        Ffind_coding_systems_region_internal,
8920        Sfind_coding_systems_region_internal, 2, 3, 0,
8921        doc: /* Internal use only.  */)
8922   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8923 {
8924   Lisp_Object coding_attrs_list, safe_codings;
8925   ptrdiff_t start_byte, end_byte;
8926   const unsigned char *p, *pbeg, *pend;
8927   int c;
8928   Lisp_Object tail, elt, work_table;
8929
8930   if (STRINGP (start))
8931     {
8932       if (!STRING_MULTIBYTE (start)
8933           || SCHARS (start) == SBYTES (start))
8934         return Qt;
8935       start_byte = 0;
8936       end_byte = SBYTES (start);
8937     }
8938   else
8939     {
8940       CHECK_NUMBER_COERCE_MARKER (start);
8941       CHECK_NUMBER_COERCE_MARKER (end);
8942       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8943         args_out_of_range (start, end);
8944       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8945         return Qt;
8946       start_byte = CHAR_TO_BYTE (XINT (start));
8947       end_byte = CHAR_TO_BYTE (XINT (end));
8948       if (XINT (end) - XINT (start) == end_byte - start_byte)
8949         return Qt;
8950
8951       if (XINT (start) < GPT && XINT (end) > GPT)
8952         {
8953           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8954             move_gap_both (XINT (start), start_byte);
8955           else
8956             move_gap_both (XINT (end), end_byte);
8957         }
8958     }
8959
8960   coding_attrs_list = Qnil;
8961   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8962     if (NILP (exclude)
8963         || NILP (Fmemq (XCAR (tail), exclude)))
8964       {
8965         Lisp_Object attrs;
8966
8967         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8968         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8969           {
8970             ASET (attrs, coding_attr_trans_tbl,
8971                   get_translation_table (attrs, 1, NULL));
8972             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8973           }
8974       }
8975
8976   if (STRINGP (start))
8977     p = pbeg = SDATA (start);
8978   else
8979     p = pbeg = BYTE_POS_ADDR (start_byte);
8980   pend = p + (end_byte - start_byte);
8981
8982   while (p < pend && ASCII_BYTE_P (*p)) p++;
8983   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8984
8985   work_table = Fmake_char_table (Qnil, Qnil);
8986   while (p < pend)
8987     {
8988       if (ASCII_BYTE_P (*p))
8989         p++;
8990       else
8991         {
8992           c = STRING_CHAR_ADVANCE (p);
8993           if (!NILP (char_table_ref (work_table, c)))
8994             /* This character was already checked.  Ignore it.  */
8995             continue;
8996
8997           charset_map_loaded = 0;
8998           for (tail = coding_attrs_list; CONSP (tail);)
8999             {
9000               elt = XCAR (tail);
9001               if (NILP (elt))
9002                 tail = XCDR (tail);
9003               else if (char_encodable_p (c, elt))
9004                 tail = XCDR (tail);
9005               else if (CONSP (XCDR (tail)))
9006                 {
9007                   XSETCAR (tail, XCAR (XCDR (tail)));
9008                   XSETCDR (tail, XCDR (XCDR (tail)));
9009                 }
9010               else
9011                 {
9012                   XSETCAR (tail, Qnil);
9013                   tail = XCDR (tail);
9014                 }
9015             }
9016           if (charset_map_loaded)
9017             {
9018               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9019
9020               if (STRINGP (start))
9021                 pbeg = SDATA (start);
9022               else
9023                 pbeg = BYTE_POS_ADDR (start_byte);
9024               p = pbeg + p_offset;
9025               pend = pbeg + pend_offset;
9026             }
9027           char_table_set (work_table, c, Qt);
9028         }
9029     }
9030
9031   safe_codings = list2 (Qraw_text, Qno_conversion);
9032   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9033     if (! NILP (XCAR (tail)))
9034       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9035
9036   return safe_codings;
9037 }
9038
9039
9040 DEFUN ("unencodable-char-position", Funencodable_char_position,
9041        Sunencodable_char_position, 3, 5, 0,
9042        doc: /*
9043 Return position of first un-encodable character in a region.
9044 START and END specify the region and CODING-SYSTEM specifies the
9045 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9046
9047 If optional 4th argument COUNT is non-nil, it specifies at most how
9048 many un-encodable characters to search.  In this case, the value is a
9049 list of positions.
9050
9051 If optional 5th argument STRING is non-nil, it is a string to search
9052 for un-encodable characters.  In that case, START and END are indexes
9053 to the string.  */)
9054   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9055 {
9056   EMACS_INT n;
9057   struct coding_system coding;
9058   Lisp_Object attrs, charset_list, translation_table;
9059   Lisp_Object positions;
9060   ptrdiff_t from, to;
9061   const unsigned char *p, *stop, *pend;
9062   bool ascii_compatible;
9063
9064   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9065   attrs = CODING_ID_ATTRS (coding.id);
9066   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9067     return Qnil;
9068   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9069   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9070   translation_table = get_translation_table (attrs, 1, NULL);
9071
9072   if (NILP (string))
9073     {
9074       validate_region (&start, &end);
9075       from = XINT (start);
9076       to = XINT (end);
9077       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9078           || (ascii_compatible
9079               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9080         return Qnil;
9081       p = CHAR_POS_ADDR (from);
9082       pend = CHAR_POS_ADDR (to);
9083       if (from < GPT && to >= GPT)
9084         stop = GPT_ADDR;
9085       else
9086         stop = pend;
9087     }
9088   else
9089     {
9090       CHECK_STRING (string);
9091       CHECK_NATNUM (start);
9092       CHECK_NATNUM (end);
9093       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9094         args_out_of_range_3 (string, start, end);
9095       from = XINT (start);
9096       to = XINT (end);
9097       if (! STRING_MULTIBYTE (string))
9098         return Qnil;
9099       p = SDATA (string) + string_char_to_byte (string, from);
9100       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9101       if (ascii_compatible && (to - from) == (pend - p))
9102         return Qnil;
9103     }
9104
9105   if (NILP (count))
9106     n = 1;
9107   else
9108     {
9109       CHECK_NATNUM (count);
9110       n = XINT (count);
9111     }
9112
9113   positions = Qnil;
9114   charset_map_loaded = 0;
9115   while (1)
9116     {
9117       int c;
9118
9119       if (ascii_compatible)
9120         while (p < stop && ASCII_BYTE_P (*p))
9121           p++, from++;
9122       if (p >= stop)
9123         {
9124           if (p >= pend)
9125             break;
9126           stop = pend;
9127           p = GAP_END_ADDR;
9128         }
9129
9130       c = STRING_CHAR_ADVANCE (p);
9131       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9132           && ! char_charset (translate_char (translation_table, c),
9133                              charset_list, NULL))
9134         {
9135           positions = Fcons (make_number (from), positions);
9136           n--;
9137           if (n == 0)
9138             break;
9139         }
9140
9141       from++;
9142       if (charset_map_loaded && NILP (string))
9143         {
9144           p = CHAR_POS_ADDR (from);
9145           pend = CHAR_POS_ADDR (to);
9146           if (from < GPT && to >= GPT)
9147             stop = GPT_ADDR;
9148           else
9149             stop = pend;
9150           charset_map_loaded = 0;
9151         }
9152     }
9153
9154   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9155 }
9156
9157
9158 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9159        Scheck_coding_systems_region, 3, 3, 0,
9160        doc: /* Check if the region is encodable by coding systems.
9161
9162 START and END are buffer positions specifying the region.
9163 CODING-SYSTEM-LIST is a list of coding systems to check.
9164
9165 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9166 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9167 whole region, POS0, POS1, ... are buffer positions where non-encodable
9168 characters are found.
9169
9170 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9171 value is nil.
9172
9173 START may be a string.  In that case, check if the string is
9174 encodable, and the value contains indices to the string instead of
9175 buffer positions.  END is ignored.
9176
9177 If the current buffer (or START if it is a string) is unibyte, the value
9178 is nil.  */)
9179   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9180 {
9181   Lisp_Object list;
9182   ptrdiff_t start_byte, end_byte;
9183   ptrdiff_t pos;
9184   const unsigned char *p, *pbeg, *pend;
9185   int c;
9186   Lisp_Object tail, elt, attrs;
9187
9188   if (STRINGP (start))
9189     {
9190       if (!STRING_MULTIBYTE (start)
9191           || SCHARS (start) == SBYTES (start))
9192         return Qnil;
9193       start_byte = 0;
9194       end_byte = SBYTES (start);
9195       pos = 0;
9196     }
9197   else
9198     {
9199       CHECK_NUMBER_COERCE_MARKER (start);
9200       CHECK_NUMBER_COERCE_MARKER (end);
9201       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9202         args_out_of_range (start, end);
9203       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9204         return Qnil;
9205       start_byte = CHAR_TO_BYTE (XINT (start));
9206       end_byte = CHAR_TO_BYTE (XINT (end));
9207       if (XINT (end) - XINT (start) == end_byte - start_byte)
9208         return Qnil;
9209
9210       if (XINT (start) < GPT && XINT (end) > GPT)
9211         {
9212           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9213             move_gap_both (XINT (start), start_byte);
9214           else
9215             move_gap_both (XINT (end), end_byte);
9216         }
9217       pos = XINT (start);
9218     }
9219
9220   list = Qnil;
9221   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9222     {
9223       elt = XCAR (tail);
9224       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9225       ASET (attrs, coding_attr_trans_tbl,
9226             get_translation_table (attrs, 1, NULL));
9227       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9228     }
9229
9230   if (STRINGP (start))
9231     p = pbeg = SDATA (start);
9232   else
9233     p = pbeg = BYTE_POS_ADDR (start_byte);
9234   pend = p + (end_byte - start_byte);
9235
9236   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9237   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9238
9239   while (p < pend)
9240     {
9241       if (ASCII_BYTE_P (*p))
9242         p++;
9243       else
9244         {
9245           c = STRING_CHAR_ADVANCE (p);
9246
9247           charset_map_loaded = 0;
9248           for (tail = list; CONSP (tail); tail = XCDR (tail))
9249             {
9250               elt = XCDR (XCAR (tail));
9251               if (! char_encodable_p (c, XCAR (elt)))
9252                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9253             }
9254           if (charset_map_loaded)
9255             {
9256               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9257
9258               if (STRINGP (start))
9259                 pbeg = SDATA (start);
9260               else
9261                 pbeg = BYTE_POS_ADDR (start_byte);
9262               p = pbeg + p_offset;
9263               pend = pbeg + pend_offset;
9264             }
9265         }
9266       pos++;
9267     }
9268
9269   tail = list;
9270   list = Qnil;
9271   for (; CONSP (tail); tail = XCDR (tail))
9272     {
9273       elt = XCAR (tail);
9274       if (CONSP (XCDR (XCDR (elt))))
9275         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9276                       list);
9277     }
9278
9279   return list;
9280 }
9281
9282
9283 static Lisp_Object
9284 code_convert_region (Lisp_Object start, Lisp_Object end,
9285                      Lisp_Object coding_system, Lisp_Object dst_object,
9286                      bool encodep, bool norecord)
9287 {
9288   struct coding_system coding;
9289   ptrdiff_t from, from_byte, to, to_byte;
9290   Lisp_Object src_object;
9291
9292   if (NILP (coding_system))
9293     coding_system = Qno_conversion;
9294   else
9295     CHECK_CODING_SYSTEM (coding_system);
9296   src_object = Fcurrent_buffer ();
9297   if (NILP (dst_object))
9298     dst_object = src_object;
9299   else if (! EQ (dst_object, Qt))
9300     CHECK_BUFFER (dst_object);
9301
9302   validate_region (&start, &end);
9303   from = XFASTINT (start);
9304   from_byte = CHAR_TO_BYTE (from);
9305   to = XFASTINT (end);
9306   to_byte = CHAR_TO_BYTE (to);
9307
9308   setup_coding_system (coding_system, &coding);
9309   coding.mode |= CODING_MODE_LAST_BLOCK;
9310
9311   if (encodep)
9312     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9313                           dst_object);
9314   else
9315     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9316                           dst_object);
9317   if (! norecord)
9318     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9319
9320   return (BUFFERP (dst_object)
9321           ? make_number (coding.produced_char)
9322           : coding.dst_object);
9323 }
9324
9325
9326 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9327        3, 4, "r\nzCoding system: ",
9328        doc: /* Decode the current region from the specified coding system.
9329 When called from a program, takes four arguments:
9330         START, END, CODING-SYSTEM, and DESTINATION.
9331 START and END are buffer positions.
9332
9333 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9334 If nil, the region between START and END is replaced by the decoded text.
9335 If buffer, the decoded text is inserted in that buffer after point (point
9336 does not move).
9337 In those cases, the length of the decoded text is returned.
9338 If DESTINATION is t, the decoded text is returned.
9339
9340 This function sets `last-coding-system-used' to the precise coding system
9341 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9342 not fully specified.)  */)
9343   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9344 {
9345   return code_convert_region (start, end, coding_system, destination, 0, 0);
9346 }
9347
9348 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9349        3, 4, "r\nzCoding system: ",
9350        doc: /* Encode the current region by specified coding system.
9351 When called from a program, takes four arguments:
9352         START, END, CODING-SYSTEM and DESTINATION.
9353 START and END are buffer positions.
9354
9355 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9356 If nil, the region between START and END is replace by the encoded text.
9357 If buffer, the encoded text is inserted in that buffer after point (point
9358 does not move).
9359 In those cases, the length of the encoded text is returned.
9360 If DESTINATION is t, the encoded text is returned.
9361
9362 This function sets `last-coding-system-used' to the precise coding system
9363 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9364 not fully specified.)  */)
9365   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9366 {
9367   return code_convert_region (start, end, coding_system, destination, 1, 0);
9368 }
9369
9370 Lisp_Object
9371 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9372                      Lisp_Object dst_object, bool encodep, bool nocopy,
9373                      bool norecord)
9374 {
9375   struct coding_system coding;
9376   ptrdiff_t chars, bytes;
9377
9378   CHECK_STRING (string);
9379   if (NILP (coding_system))
9380     {
9381       if (! norecord)
9382         Vlast_coding_system_used = Qno_conversion;
9383       if (NILP (dst_object))
9384         return (nocopy ? Fcopy_sequence (string) : string);
9385     }
9386
9387   if (NILP (coding_system))
9388     coding_system = Qno_conversion;
9389   else
9390     CHECK_CODING_SYSTEM (coding_system);
9391   if (NILP (dst_object))
9392     dst_object = Qt;
9393   else if (! EQ (dst_object, Qt))
9394     CHECK_BUFFER (dst_object);
9395
9396   setup_coding_system (coding_system, &coding);
9397   coding.mode |= CODING_MODE_LAST_BLOCK;
9398   chars = SCHARS (string);
9399   bytes = SBYTES (string);
9400   if (encodep)
9401     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9402   else
9403     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9404   if (! norecord)
9405     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9406
9407   return (BUFFERP (dst_object)
9408           ? make_number (coding.produced_char)
9409           : coding.dst_object);
9410 }
9411
9412
9413 /* Encode or decode STRING according to CODING_SYSTEM.
9414    Do not set Vlast_coding_system_used.
9415
9416    This function is called only from macros DECODE_FILE and
9417    ENCODE_FILE, thus we ignore character composition.  */
9418
9419 Lisp_Object
9420 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9421                               bool encodep)
9422 {
9423   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9424 }
9425
9426
9427 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9428        2, 4, 0,
9429        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9430
9431 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9432 if the decoding operation is trivial.
9433
9434 Optional fourth arg BUFFER non-nil means that the decoded text is
9435 inserted in that buffer after point (point does not move).  In this
9436 case, the return value is the length of the decoded text.
9437
9438 This function sets `last-coding-system-used' to the precise coding system
9439 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9440 not fully specified.)  */)
9441   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9442 {
9443   return code_convert_string (string, coding_system, buffer,
9444                               0, ! NILP (nocopy), 0);
9445 }
9446
9447 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9448        2, 4, 0,
9449        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9450
9451 Optional third arg NOCOPY non-nil means it is OK to return STRING
9452 itself if the encoding operation is trivial.
9453
9454 Optional fourth arg BUFFER non-nil means that the encoded text is
9455 inserted in that buffer after point (point does not move).  In this
9456 case, the return value is the length of the encoded text.
9457
9458 This function sets `last-coding-system-used' to the precise coding system
9459 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9460 not fully specified.)  */)
9461   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9462 {
9463   return code_convert_string (string, coding_system, buffer,
9464                               1, ! NILP (nocopy), 0);
9465 }
9466
9467 \f
9468 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9469        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9470 Return the corresponding character.  */)
9471   (Lisp_Object code)
9472 {
9473   Lisp_Object spec, attrs, val;
9474   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9475   EMACS_INT ch;
9476   int c;
9477
9478   CHECK_NATNUM (code);
9479   ch = XFASTINT (code);
9480   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9481   attrs = AREF (spec, 0);
9482
9483   if (ASCII_BYTE_P (ch)
9484       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9485     return code;
9486
9487   val = CODING_ATTR_CHARSET_LIST (attrs);
9488   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9489   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9490   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9491
9492   if (ch <= 0x7F)
9493     {
9494       c = ch;
9495       charset = charset_roman;
9496     }
9497   else if (ch >= 0xA0 && ch < 0xDF)
9498     {
9499       c = ch - 0x80;
9500       charset = charset_kana;
9501     }
9502   else
9503     {
9504       EMACS_INT c1 = ch >> 8;
9505       int c2 = ch & 0xFF;
9506
9507       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9508           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9509         error ("Invalid code: %"pI"d", ch);
9510       c = ch;
9511       SJIS_TO_JIS (c);
9512       charset = charset_kanji;
9513     }
9514   c = DECODE_CHAR (charset, c);
9515   if (c < 0)
9516     error ("Invalid code: %"pI"d", ch);
9517   return make_number (c);
9518 }
9519
9520
9521 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9522        doc: /* Encode a Japanese character CH to shift_jis encoding.
9523 Return the corresponding code in SJIS.  */)
9524   (Lisp_Object ch)
9525 {
9526   Lisp_Object spec, attrs, charset_list;
9527   int c;
9528   struct charset *charset;
9529   unsigned code;
9530
9531   CHECK_CHARACTER (ch);
9532   c = XFASTINT (ch);
9533   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9534   attrs = AREF (spec, 0);
9535
9536   if (ASCII_CHAR_P (c)
9537       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9538     return ch;
9539
9540   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9541   charset = char_charset (c, charset_list, &code);
9542   if (code == CHARSET_INVALID_CODE (charset))
9543     error ("Can't encode by shift_jis encoding: %c", c);
9544   JIS_TO_SJIS (code);
9545
9546   return make_number (code);
9547 }
9548
9549 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9550        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9551 Return the corresponding character.  */)
9552   (Lisp_Object code)
9553 {
9554   Lisp_Object spec, attrs, val;
9555   struct charset *charset_roman, *charset_big5, *charset;
9556   EMACS_INT ch;
9557   int c;
9558
9559   CHECK_NATNUM (code);
9560   ch = XFASTINT (code);
9561   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9562   attrs = AREF (spec, 0);
9563
9564   if (ASCII_BYTE_P (ch)
9565       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9566     return code;
9567
9568   val = CODING_ATTR_CHARSET_LIST (attrs);
9569   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9570   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9571
9572   if (ch <= 0x7F)
9573     {
9574       c = ch;
9575       charset = charset_roman;
9576     }
9577   else
9578     {
9579       EMACS_INT b1 = ch >> 8;
9580       int b2 = ch & 0x7F;
9581       if (b1 < 0xA1 || b1 > 0xFE
9582           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9583         error ("Invalid code: %"pI"d", ch);
9584       c = ch;
9585       charset = charset_big5;
9586     }
9587   c = DECODE_CHAR (charset, c);
9588   if (c < 0)
9589     error ("Invalid code: %"pI"d", ch);
9590   return make_number (c);
9591 }
9592
9593 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9594        doc: /* Encode the Big5 character CH to BIG5 coding system.
9595 Return the corresponding character code in Big5.  */)
9596   (Lisp_Object ch)
9597 {
9598   Lisp_Object spec, attrs, charset_list;
9599   struct charset *charset;
9600   int c;
9601   unsigned code;
9602
9603   CHECK_CHARACTER (ch);
9604   c = XFASTINT (ch);
9605   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9606   attrs = AREF (spec, 0);
9607   if (ASCII_CHAR_P (c)
9608       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9609     return ch;
9610
9611   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9612   charset = char_charset (c, charset_list, &code);
9613   if (code == CHARSET_INVALID_CODE (charset))
9614     error ("Can't encode by Big5 encoding: %c", c);
9615
9616   return make_number (code);
9617 }
9618
9619 \f
9620 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9621        Sset_terminal_coding_system_internal, 1, 2, 0,
9622        doc: /* Internal use only.  */)
9623   (Lisp_Object coding_system, Lisp_Object terminal)
9624 {
9625   struct terminal *term = get_terminal (terminal, 1);
9626   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9627   CHECK_SYMBOL (coding_system);
9628   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9629   /* We had better not send unsafe characters to terminal.  */
9630   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9631   /* Character composition should be disabled.  */
9632   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9633   terminal_coding->src_multibyte = 1;
9634   terminal_coding->dst_multibyte = 0;
9635   tset_charset_list
9636     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9637             ? coding_charset_list (terminal_coding)
9638             : Fcons (make_number (charset_ascii), Qnil)));
9639   return Qnil;
9640 }
9641
9642 DEFUN ("set-safe-terminal-coding-system-internal",
9643        Fset_safe_terminal_coding_system_internal,
9644        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9645        doc: /* Internal use only.  */)
9646   (Lisp_Object coding_system)
9647 {
9648   CHECK_SYMBOL (coding_system);
9649   setup_coding_system (Fcheck_coding_system (coding_system),
9650                        &safe_terminal_coding);
9651   /* Character composition should be disabled.  */
9652   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9653   safe_terminal_coding.src_multibyte = 1;
9654   safe_terminal_coding.dst_multibyte = 0;
9655   return Qnil;
9656 }
9657
9658 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9659        Sterminal_coding_system, 0, 1, 0,
9660        doc: /* Return coding system specified for terminal output on the given terminal.
9661 TERMINAL may be a terminal object, a frame, or nil for the selected
9662 frame's terminal device.  */)
9663   (Lisp_Object terminal)
9664 {
9665   struct coding_system *terminal_coding
9666     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9667   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9668
9669   /* For backward compatibility, return nil if it is `undecided'.  */
9670   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9671 }
9672
9673 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9674        Sset_keyboard_coding_system_internal, 1, 2, 0,
9675        doc: /* Internal use only.  */)
9676   (Lisp_Object coding_system, Lisp_Object terminal)
9677 {
9678   struct terminal *t = get_terminal (terminal, 1);
9679   CHECK_SYMBOL (coding_system);
9680   if (NILP (coding_system))
9681     coding_system = Qno_conversion;
9682   else
9683     Fcheck_coding_system (coding_system);
9684   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9685   /* Character composition should be disabled.  */
9686   TERMINAL_KEYBOARD_CODING (t)->common_flags
9687     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9688   return Qnil;
9689 }
9690
9691 DEFUN ("keyboard-coding-system",
9692        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9693        doc: /* Return coding system specified for decoding keyboard input.  */)
9694   (Lisp_Object terminal)
9695 {
9696   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9697                          (get_terminal (terminal, 1))->id);
9698 }
9699
9700 \f
9701 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9702        Sfind_operation_coding_system,  1, MANY, 0,
9703        doc: /* Choose a coding system for an operation based on the target name.
9704 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9705 DECODING-SYSTEM is the coding system to use for decoding
9706 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9707 for encoding (in case OPERATION does encoding).
9708
9709 The first argument OPERATION specifies an I/O primitive:
9710   For file I/O, `insert-file-contents' or `write-region'.
9711   For process I/O, `call-process', `call-process-region', or `start-process'.
9712   For network I/O, `open-network-stream'.
9713
9714 The remaining arguments should be the same arguments that were passed
9715 to the primitive.  Depending on which primitive, one of those arguments
9716 is selected as the TARGET.  For example, if OPERATION does file I/O,
9717 whichever argument specifies the file name is TARGET.
9718
9719 TARGET has a meaning which depends on OPERATION:
9720   For file I/O, TARGET is a file name (except for the special case below).
9721   For process I/O, TARGET is a process name.
9722   For network I/O, TARGET is a service name or a port number.
9723
9724 This function looks up what is specified for TARGET in
9725 `file-coding-system-alist', `process-coding-system-alist',
9726 or `network-coding-system-alist' depending on OPERATION.
9727 They may specify a coding system, a cons of coding systems,
9728 or a function symbol to call.
9729 In the last case, we call the function with one argument,
9730 which is a list of all the arguments given to this function.
9731 If the function can't decide a coding system, it can return
9732 `undecided' so that the normal code-detection is performed.
9733
9734 If OPERATION is `insert-file-contents', the argument corresponding to
9735 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9736 file name to look up, and BUFFER is a buffer that contains the file's
9737 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9738 function to call for FILENAME, that function should examine the
9739 contents of BUFFER instead of reading the file.
9740
9741 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9742   (ptrdiff_t nargs, Lisp_Object *args)
9743 {
9744   Lisp_Object operation, target_idx, target, val;
9745   register Lisp_Object chain;
9746
9747   if (nargs < 2)
9748     error ("Too few arguments");
9749   operation = args[0];
9750   if (!SYMBOLP (operation)
9751       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9752     error ("Invalid first argument");
9753   if (nargs <= 1 + XFASTINT (target_idx))
9754     error ("Too few arguments for operation `%s'",
9755            SDATA (SYMBOL_NAME (operation)));
9756   target = args[XFASTINT (target_idx) + 1];
9757   if (!(STRINGP (target)
9758         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9759             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9760         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9761     error ("Invalid argument %"pI"d of operation `%s'",
9762            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9763   if (CONSP (target))
9764     target = XCAR (target);
9765
9766   chain = ((EQ (operation, Qinsert_file_contents)
9767             || EQ (operation, Qwrite_region))
9768            ? Vfile_coding_system_alist
9769            : (EQ (operation, Qopen_network_stream)
9770               ? Vnetwork_coding_system_alist
9771               : Vprocess_coding_system_alist));
9772   if (NILP (chain))
9773     return Qnil;
9774
9775   for (; CONSP (chain); chain = XCDR (chain))
9776     {
9777       Lisp_Object elt;
9778
9779       elt = XCAR (chain);
9780       if (CONSP (elt)
9781           && ((STRINGP (target)
9782                && STRINGP (XCAR (elt))
9783                && fast_string_match (XCAR (elt), target) >= 0)
9784               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9785         {
9786           val = XCDR (elt);
9787           /* Here, if VAL is both a valid coding system and a valid
9788              function symbol, we return VAL as a coding system.  */
9789           if (CONSP (val))
9790             return val;
9791           if (! SYMBOLP (val))
9792             return Qnil;
9793           if (! NILP (Fcoding_system_p (val)))
9794             return Fcons (val, val);
9795           if (! NILP (Ffboundp (val)))
9796             {
9797               /* We use call1 rather than safe_call1
9798                  so as to get bug reports about functions called here
9799                  which don't handle the current interface.  */
9800               val = call1 (val, Flist (nargs, args));
9801               if (CONSP (val))
9802                 return val;
9803               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9804                 return Fcons (val, val);
9805             }
9806           return Qnil;
9807         }
9808     }
9809   return Qnil;
9810 }
9811
9812 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9813        Sset_coding_system_priority, 0, MANY, 0,
9814        doc: /* Assign higher priority to the coding systems given as arguments.
9815 If multiple coding systems belong to the same category,
9816 all but the first one are ignored.
9817
9818 usage: (set-coding-system-priority &rest coding-systems)  */)
9819   (ptrdiff_t nargs, Lisp_Object *args)
9820 {
9821   ptrdiff_t i, j;
9822   bool changed[coding_category_max];
9823   enum coding_category priorities[coding_category_max];
9824
9825   memset (changed, 0, sizeof changed);
9826
9827   for (i = j = 0; i < nargs; i++)
9828     {
9829       enum coding_category category;
9830       Lisp_Object spec, attrs;
9831
9832       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9833       attrs = AREF (spec, 0);
9834       category = XINT (CODING_ATTR_CATEGORY (attrs));
9835       if (changed[category])
9836         /* Ignore this coding system because a coding system of the
9837            same category already had a higher priority.  */
9838         continue;
9839       changed[category] = 1;
9840       priorities[j++] = category;
9841       if (coding_categories[category].id >= 0
9842           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9843         setup_coding_system (args[i], &coding_categories[category]);
9844       Fset (AREF (Vcoding_category_table, category), args[i]);
9845     }
9846
9847   /* Now we have decided top J priorities.  Reflect the order of the
9848      original priorities to the remaining priorities.  */
9849
9850   for (i = j, j = 0; i < coding_category_max; i++, j++)
9851     {
9852       while (j < coding_category_max
9853              && changed[coding_priorities[j]])
9854         j++;
9855       if (j == coding_category_max)
9856         emacs_abort ();
9857       priorities[i] = coding_priorities[j];
9858     }
9859
9860   memcpy (coding_priorities, priorities, sizeof priorities);
9861
9862   /* Update `coding-category-list'.  */
9863   Vcoding_category_list = Qnil;
9864   for (i = coding_category_max; i-- > 0; )
9865     Vcoding_category_list
9866       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9867                Vcoding_category_list);
9868
9869   return Qnil;
9870 }
9871
9872 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9873        Scoding_system_priority_list, 0, 1, 0,
9874        doc: /* Return a list of coding systems ordered by their priorities.
9875 The list contains a subset of coding systems; i.e. coding systems
9876 assigned to each coding category (see `coding-category-list').
9877
9878 HIGHESTP non-nil means just return the highest priority one.  */)
9879   (Lisp_Object highestp)
9880 {
9881   int i;
9882   Lisp_Object val;
9883
9884   for (i = 0, val = Qnil; i < coding_category_max; i++)
9885     {
9886       enum coding_category category = coding_priorities[i];
9887       int id = coding_categories[category].id;
9888       Lisp_Object attrs;
9889
9890       if (id < 0)
9891         continue;
9892       attrs = CODING_ID_ATTRS (id);
9893       if (! NILP (highestp))
9894         return CODING_ATTR_BASE_NAME (attrs);
9895       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9896     }
9897   return Fnreverse (val);
9898 }
9899
9900 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9901
9902 static Lisp_Object
9903 make_subsidiaries (Lisp_Object base)
9904 {
9905   Lisp_Object subsidiaries;
9906   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9907   char *buf = alloca (base_name_len + 6);
9908   int i;
9909
9910   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9911   subsidiaries = make_uninit_vector (3);
9912   for (i = 0; i < 3; i++)
9913     {
9914       strcpy (buf + base_name_len, suffixes[i]);
9915       ASET (subsidiaries, i, intern (buf));
9916     }
9917   return subsidiaries;
9918 }
9919
9920
9921 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9922        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9923        doc: /* For internal use only.
9924 usage: (define-coding-system-internal ...)  */)
9925   (ptrdiff_t nargs, Lisp_Object *args)
9926 {
9927   Lisp_Object name;
9928   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9929   Lisp_Object attrs;            /* Vector of attributes.  */
9930   Lisp_Object eol_type;
9931   Lisp_Object aliases;
9932   Lisp_Object coding_type, charset_list, safe_charsets;
9933   enum coding_category category;
9934   Lisp_Object tail, val;
9935   int max_charset_id = 0;
9936   int i;
9937
9938   if (nargs < coding_arg_max)
9939     goto short_args;
9940
9941   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9942
9943   name = args[coding_arg_name];
9944   CHECK_SYMBOL (name);
9945   ASET (attrs, coding_attr_base_name, name);
9946
9947   val = args[coding_arg_mnemonic];
9948   if (! STRINGP (val))
9949     CHECK_CHARACTER (val);
9950   ASET (attrs, coding_attr_mnemonic, val);
9951
9952   coding_type = args[coding_arg_coding_type];
9953   CHECK_SYMBOL (coding_type);
9954   ASET (attrs, coding_attr_type, coding_type);
9955
9956   charset_list = args[coding_arg_charset_list];
9957   if (SYMBOLP (charset_list))
9958     {
9959       if (EQ (charset_list, Qiso_2022))
9960         {
9961           if (! EQ (coding_type, Qiso_2022))
9962             error ("Invalid charset-list");
9963           charset_list = Viso_2022_charset_list;
9964         }
9965       else if (EQ (charset_list, Qemacs_mule))
9966         {
9967           if (! EQ (coding_type, Qemacs_mule))
9968             error ("Invalid charset-list");
9969           charset_list = Vemacs_mule_charset_list;
9970         }
9971       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9972         {
9973           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9974             error ("Invalid charset-list");
9975           if (max_charset_id < XFASTINT (XCAR (tail)))
9976             max_charset_id = XFASTINT (XCAR (tail));
9977         }
9978     }
9979   else
9980     {
9981       charset_list = Fcopy_sequence (charset_list);
9982       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9983         {
9984           struct charset *charset;
9985
9986           val = XCAR (tail);
9987           CHECK_CHARSET_GET_CHARSET (val, charset);
9988           if (EQ (coding_type, Qiso_2022)
9989               ? CHARSET_ISO_FINAL (charset) < 0
9990               : EQ (coding_type, Qemacs_mule)
9991               ? CHARSET_EMACS_MULE_ID (charset) < 0
9992               : 0)
9993             error ("Can't handle charset `%s'",
9994                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9995
9996           XSETCAR (tail, make_number (charset->id));
9997           if (max_charset_id < charset->id)
9998             max_charset_id = charset->id;
9999         }
10000     }
10001   ASET (attrs, coding_attr_charset_list, charset_list);
10002
10003   safe_charsets = make_uninit_string (max_charset_id + 1);
10004   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10005   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10006     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10007   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10008
10009   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10010
10011   val = args[coding_arg_decode_translation_table];
10012   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10013     CHECK_SYMBOL (val);
10014   ASET (attrs, coding_attr_decode_tbl, val);
10015
10016   val = args[coding_arg_encode_translation_table];
10017   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10018     CHECK_SYMBOL (val);
10019   ASET (attrs, coding_attr_encode_tbl, val);
10020
10021   val = args[coding_arg_post_read_conversion];
10022   CHECK_SYMBOL (val);
10023   ASET (attrs, coding_attr_post_read, val);
10024
10025   val = args[coding_arg_pre_write_conversion];
10026   CHECK_SYMBOL (val);
10027   ASET (attrs, coding_attr_pre_write, val);
10028
10029   val = args[coding_arg_default_char];
10030   if (NILP (val))
10031     ASET (attrs, coding_attr_default_char, make_number (' '));
10032   else
10033     {
10034       CHECK_CHARACTER (val);
10035       ASET (attrs, coding_attr_default_char, val);
10036     }
10037
10038   val = args[coding_arg_for_unibyte];
10039   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10040
10041   val = args[coding_arg_plist];
10042   CHECK_LIST (val);
10043   ASET (attrs, coding_attr_plist, val);
10044
10045   if (EQ (coding_type, Qcharset))
10046     {
10047       /* Generate a lisp vector of 256 elements.  Each element is nil,
10048          integer, or a list of charset IDs.
10049
10050          If Nth element is nil, the byte code N is invalid in this
10051          coding system.
10052
10053          If Nth element is a number NUM, N is the first byte of a
10054          charset whose ID is NUM.
10055
10056          If Nth element is a list of charset IDs, N is the first byte
10057          of one of them.  The list is sorted by dimensions of the
10058          charsets.  A charset of smaller dimension comes first. */
10059       val = Fmake_vector (make_number (256), Qnil);
10060
10061       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10062         {
10063           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10064           int dim = CHARSET_DIMENSION (charset);
10065           int idx = (dim - 1) * 4;
10066
10067           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10068             ASET (attrs, coding_attr_ascii_compat, Qt);
10069
10070           for (i = charset->code_space[idx];
10071                i <= charset->code_space[idx + 1]; i++)
10072             {
10073               Lisp_Object tmp, tmp2;
10074               int dim2;
10075
10076               tmp = AREF (val, i);
10077               if (NILP (tmp))
10078                 tmp = XCAR (tail);
10079               else if (NUMBERP (tmp))
10080                 {
10081                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10082                   if (dim < dim2)
10083                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
10084                   else
10085                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
10086                 }
10087               else
10088                 {
10089                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10090                     {
10091                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10092                       if (dim < dim2)
10093                         break;
10094                     }
10095                   if (NILP (tmp2))
10096                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
10097                   else
10098                     {
10099                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10100                       XSETCAR (tmp2, XCAR (tail));
10101                     }
10102                 }
10103               ASET (val, i, tmp);
10104             }
10105         }
10106       ASET (attrs, coding_attr_charset_valids, val);
10107       category = coding_category_charset;
10108     }
10109   else if (EQ (coding_type, Qccl))
10110     {
10111       Lisp_Object valids;
10112
10113       if (nargs < coding_arg_ccl_max)
10114         goto short_args;
10115
10116       val = args[coding_arg_ccl_decoder];
10117       CHECK_CCL_PROGRAM (val);
10118       if (VECTORP (val))
10119         val = Fcopy_sequence (val);
10120       ASET (attrs, coding_attr_ccl_decoder, val);
10121
10122       val = args[coding_arg_ccl_encoder];
10123       CHECK_CCL_PROGRAM (val);
10124       if (VECTORP (val))
10125         val = Fcopy_sequence (val);
10126       ASET (attrs, coding_attr_ccl_encoder, val);
10127
10128       val = args[coding_arg_ccl_valids];
10129       valids = Fmake_string (make_number (256), make_number (0));
10130       for (tail = val; CONSP (tail); tail = XCDR (tail))
10131         {
10132           int from, to;
10133
10134           val = XCAR (tail);
10135           if (INTEGERP (val))
10136             {
10137               if (! (0 <= XINT (val) && XINT (val) <= 255))
10138                 args_out_of_range_3 (val, make_number (0), make_number (255));
10139               from = to = XINT (val);
10140             }
10141           else
10142             {
10143               CHECK_CONS (val);
10144               CHECK_NATNUM_CAR (val);
10145               CHECK_NUMBER_CDR (val);
10146               if (XINT (XCAR (val)) > 255)
10147                 args_out_of_range_3 (XCAR (val),
10148                                      make_number (0), make_number (255));
10149               from = XINT (XCAR (val));
10150               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10151                 args_out_of_range_3 (XCDR (val),
10152                                      XCAR (val), make_number (255));
10153               to = XINT (XCDR (val));
10154             }
10155           for (i = from; i <= to; i++)
10156             SSET (valids, i, 1);
10157         }
10158       ASET (attrs, coding_attr_ccl_valids, valids);
10159
10160       category = coding_category_ccl;
10161     }
10162   else if (EQ (coding_type, Qutf_16))
10163     {
10164       Lisp_Object bom, endian;
10165
10166       ASET (attrs, coding_attr_ascii_compat, Qnil);
10167
10168       if (nargs < coding_arg_utf16_max)
10169         goto short_args;
10170
10171       bom = args[coding_arg_utf16_bom];
10172       if (! NILP (bom) && ! EQ (bom, Qt))
10173         {
10174           CHECK_CONS (bom);
10175           val = XCAR (bom);
10176           CHECK_CODING_SYSTEM (val);
10177           val = XCDR (bom);
10178           CHECK_CODING_SYSTEM (val);
10179         }
10180       ASET (attrs, coding_attr_utf_bom, bom);
10181
10182       endian = args[coding_arg_utf16_endian];
10183       CHECK_SYMBOL (endian);
10184       if (NILP (endian))
10185         endian = Qbig;
10186       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10187         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10188       ASET (attrs, coding_attr_utf_16_endian, endian);
10189
10190       category = (CONSP (bom)
10191                   ? coding_category_utf_16_auto
10192                   : NILP (bom)
10193                   ? (EQ (endian, Qbig)
10194                      ? coding_category_utf_16_be_nosig
10195                      : coding_category_utf_16_le_nosig)
10196                   : (EQ (endian, Qbig)
10197                      ? coding_category_utf_16_be
10198                      : coding_category_utf_16_le));
10199     }
10200   else if (EQ (coding_type, Qiso_2022))
10201     {
10202       Lisp_Object initial, reg_usage, request, flags;
10203
10204       if (nargs < coding_arg_iso2022_max)
10205         goto short_args;
10206
10207       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10208       CHECK_VECTOR (initial);
10209       for (i = 0; i < 4; i++)
10210         {
10211           val = AREF (initial, i);
10212           if (! NILP (val))
10213             {
10214               struct charset *charset;
10215
10216               CHECK_CHARSET_GET_CHARSET (val, charset);
10217               ASET (initial, i, make_number (CHARSET_ID (charset)));
10218               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10219                 ASET (attrs, coding_attr_ascii_compat, Qt);
10220             }
10221           else
10222             ASET (initial, i, make_number (-1));
10223         }
10224
10225       reg_usage = args[coding_arg_iso2022_reg_usage];
10226       CHECK_CONS (reg_usage);
10227       CHECK_NUMBER_CAR (reg_usage);
10228       CHECK_NUMBER_CDR (reg_usage);
10229
10230       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10231       for (tail = request; CONSP (tail); tail = XCDR (tail))
10232         {
10233           int id;
10234           Lisp_Object tmp1;
10235
10236           val = XCAR (tail);
10237           CHECK_CONS (val);
10238           tmp1 = XCAR (val);
10239           CHECK_CHARSET_GET_ID (tmp1, id);
10240           CHECK_NATNUM_CDR (val);
10241           if (XINT (XCDR (val)) >= 4)
10242             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10243           XSETCAR (val, make_number (id));
10244         }
10245
10246       flags = args[coding_arg_iso2022_flags];
10247       CHECK_NATNUM (flags);
10248       i = XINT (flags) & INT_MAX;
10249       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10250         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10251       flags = make_number (i);
10252
10253       ASET (attrs, coding_attr_iso_initial, initial);
10254       ASET (attrs, coding_attr_iso_usage, reg_usage);
10255       ASET (attrs, coding_attr_iso_request, request);
10256       ASET (attrs, coding_attr_iso_flags, flags);
10257       setup_iso_safe_charsets (attrs);
10258
10259       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10260         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10261                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10262                     ? coding_category_iso_7_else
10263                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10264                     ? coding_category_iso_7
10265                     : coding_category_iso_7_tight);
10266       else
10267         {
10268           int id = XINT (AREF (initial, 1));
10269
10270           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10271                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10272                        || id < 0)
10273                       ? coding_category_iso_8_else
10274                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10275                       ? coding_category_iso_8_1
10276                       : coding_category_iso_8_2);
10277         }
10278       if (category != coding_category_iso_8_1
10279           && category != coding_category_iso_8_2)
10280         ASET (attrs, coding_attr_ascii_compat, Qnil);
10281     }
10282   else if (EQ (coding_type, Qemacs_mule))
10283     {
10284       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10285         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10286       ASET (attrs, coding_attr_ascii_compat, Qt);
10287       category = coding_category_emacs_mule;
10288     }
10289   else if (EQ (coding_type, Qshift_jis))
10290     {
10291
10292       struct charset *charset;
10293
10294       if (XINT (Flength (charset_list)) != 3
10295           && XINT (Flength (charset_list)) != 4)
10296         error ("There should be three or four charsets");
10297
10298       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10299       if (CHARSET_DIMENSION (charset) != 1)
10300         error ("Dimension of charset %s is not one",
10301                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10302       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10303         ASET (attrs, coding_attr_ascii_compat, Qt);
10304
10305       charset_list = XCDR (charset_list);
10306       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10307       if (CHARSET_DIMENSION (charset) != 1)
10308         error ("Dimension of charset %s is not one",
10309                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10310
10311       charset_list = XCDR (charset_list);
10312       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10313       if (CHARSET_DIMENSION (charset) != 2)
10314         error ("Dimension of charset %s is not two",
10315                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10316
10317       charset_list = XCDR (charset_list);
10318       if (! NILP (charset_list))
10319         {
10320           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10321           if (CHARSET_DIMENSION (charset) != 2)
10322             error ("Dimension of charset %s is not two",
10323                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10324         }
10325
10326       category = coding_category_sjis;
10327       Vsjis_coding_system = name;
10328     }
10329   else if (EQ (coding_type, Qbig5))
10330     {
10331       struct charset *charset;
10332
10333       if (XINT (Flength (charset_list)) != 2)
10334         error ("There should be just two charsets");
10335
10336       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10337       if (CHARSET_DIMENSION (charset) != 1)
10338         error ("Dimension of charset %s is not one",
10339                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10340       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10341         ASET (attrs, coding_attr_ascii_compat, Qt);
10342
10343       charset_list = XCDR (charset_list);
10344       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10345       if (CHARSET_DIMENSION (charset) != 2)
10346         error ("Dimension of charset %s is not two",
10347                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10348
10349       category = coding_category_big5;
10350       Vbig5_coding_system = name;
10351     }
10352   else if (EQ (coding_type, Qraw_text))
10353     {
10354       category = coding_category_raw_text;
10355       ASET (attrs, coding_attr_ascii_compat, Qt);
10356     }
10357   else if (EQ (coding_type, Qutf_8))
10358     {
10359       Lisp_Object bom;
10360
10361       if (nargs < coding_arg_utf8_max)
10362         goto short_args;
10363
10364       bom = args[coding_arg_utf8_bom];
10365       if (! NILP (bom) && ! EQ (bom, Qt))
10366         {
10367           CHECK_CONS (bom);
10368           val = XCAR (bom);
10369           CHECK_CODING_SYSTEM (val);
10370           val = XCDR (bom);
10371           CHECK_CODING_SYSTEM (val);
10372         }
10373       ASET (attrs, coding_attr_utf_bom, bom);
10374       if (NILP (bom))
10375         ASET (attrs, coding_attr_ascii_compat, Qt);
10376
10377       category = (CONSP (bom) ? coding_category_utf_8_auto
10378                   : NILP (bom) ? coding_category_utf_8_nosig
10379                   : coding_category_utf_8_sig);
10380     }
10381   else if (EQ (coding_type, Qundecided))
10382     {
10383       if (nargs < coding_arg_undecided_max)
10384         goto short_args;
10385       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10386             args[coding_arg_undecided_inhibit_null_byte_detection]);
10387       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10388             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10389       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10390             args[coding_arg_undecided_prefer_utf_8]);
10391       category = coding_category_undecided;
10392     }
10393   else
10394     error ("Invalid coding system type: %s",
10395            SDATA (SYMBOL_NAME (coding_type)));
10396
10397   ASET (attrs, coding_attr_category, make_number (category));
10398   ASET (attrs, coding_attr_plist,
10399         Fcons (QCcategory,
10400                Fcons (AREF (Vcoding_category_table, category),
10401                       CODING_ATTR_PLIST (attrs))));
10402   ASET (attrs, coding_attr_plist,
10403         Fcons (QCascii_compatible_p,
10404                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10405                       CODING_ATTR_PLIST (attrs))));
10406
10407   eol_type = args[coding_arg_eol_type];
10408   if (! NILP (eol_type)
10409       && ! EQ (eol_type, Qunix)
10410       && ! EQ (eol_type, Qdos)
10411       && ! EQ (eol_type, Qmac))
10412     error ("Invalid eol-type");
10413
10414   aliases = Fcons (name, Qnil);
10415
10416   if (NILP (eol_type))
10417     {
10418       eol_type = make_subsidiaries (name);
10419       for (i = 0; i < 3; i++)
10420         {
10421           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10422
10423           this_name = AREF (eol_type, i);
10424           this_aliases = Fcons (this_name, Qnil);
10425           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10426           this_spec = make_uninit_vector (3);
10427           ASET (this_spec, 0, attrs);
10428           ASET (this_spec, 1, this_aliases);
10429           ASET (this_spec, 2, this_eol_type);
10430           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10431           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10432           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10433           if (NILP (val))
10434             Vcoding_system_alist
10435               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10436                        Vcoding_system_alist);
10437         }
10438     }
10439
10440   spec_vec = make_uninit_vector (3);
10441   ASET (spec_vec, 0, attrs);
10442   ASET (spec_vec, 1, aliases);
10443   ASET (spec_vec, 2, eol_type);
10444
10445   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10446   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10447   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10448   if (NILP (val))
10449     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10450                                   Vcoding_system_alist);
10451
10452   {
10453     int id = coding_categories[category].id;
10454
10455     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10456       setup_coding_system (name, &coding_categories[category]);
10457   }
10458
10459   return Qnil;
10460
10461  short_args:
10462   return Fsignal (Qwrong_number_of_arguments,
10463                   Fcons (intern ("define-coding-system-internal"),
10464                          make_number (nargs)));
10465 }
10466
10467
10468 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10469        3, 3, 0,
10470        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10471   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10472 {
10473   Lisp_Object spec, attrs;
10474
10475   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10476   attrs = AREF (spec, 0);
10477   if (EQ (prop, QCmnemonic))
10478     {
10479       if (! STRINGP (val))
10480         CHECK_CHARACTER (val);
10481       ASET (attrs, coding_attr_mnemonic, val);
10482     }
10483   else if (EQ (prop, QCdefault_char))
10484     {
10485       if (NILP (val))
10486         val = make_number (' ');
10487       else
10488         CHECK_CHARACTER (val);
10489       ASET (attrs, coding_attr_default_char, val);
10490     }
10491   else if (EQ (prop, QCdecode_translation_table))
10492     {
10493       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10494         CHECK_SYMBOL (val);
10495       ASET (attrs, coding_attr_decode_tbl, val);
10496     }
10497   else if (EQ (prop, QCencode_translation_table))
10498     {
10499       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10500         CHECK_SYMBOL (val);
10501       ASET (attrs, coding_attr_encode_tbl, val);
10502     }
10503   else if (EQ (prop, QCpost_read_conversion))
10504     {
10505       CHECK_SYMBOL (val);
10506       ASET (attrs, coding_attr_post_read, val);
10507     }
10508   else if (EQ (prop, QCpre_write_conversion))
10509     {
10510       CHECK_SYMBOL (val);
10511       ASET (attrs, coding_attr_pre_write, val);
10512     }
10513   else if (EQ (prop, QCascii_compatible_p))
10514     {
10515       ASET (attrs, coding_attr_ascii_compat, val);
10516     }
10517
10518   ASET (attrs, coding_attr_plist,
10519         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10520   return val;
10521 }
10522
10523
10524 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10525        Sdefine_coding_system_alias, 2, 2, 0,
10526        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10527   (Lisp_Object alias, Lisp_Object coding_system)
10528 {
10529   Lisp_Object spec, aliases, eol_type, val;
10530
10531   CHECK_SYMBOL (alias);
10532   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10533   aliases = AREF (spec, 1);
10534   /* ALIASES should be a list of length more than zero, and the first
10535      element is a base coding system.  Append ALIAS at the tail of the
10536      list.  */
10537   while (!NILP (XCDR (aliases)))
10538     aliases = XCDR (aliases);
10539   XSETCDR (aliases, Fcons (alias, Qnil));
10540
10541   eol_type = AREF (spec, 2);
10542   if (VECTORP (eol_type))
10543     {
10544       Lisp_Object subsidiaries;
10545       int i;
10546
10547       subsidiaries = make_subsidiaries (alias);
10548       for (i = 0; i < 3; i++)
10549         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10550                                      AREF (eol_type, i));
10551     }
10552
10553   Fputhash (alias, spec, Vcoding_system_hash_table);
10554   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10555   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10556   if (NILP (val))
10557     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10558                                   Vcoding_system_alist);
10559
10560   return Qnil;
10561 }
10562
10563 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10564        1, 1, 0,
10565        doc: /* Return the base of CODING-SYSTEM.
10566 Any alias or subsidiary coding system is not a base coding system.  */)
10567   (Lisp_Object coding_system)
10568 {
10569   Lisp_Object spec, attrs;
10570
10571   if (NILP (coding_system))
10572     return (Qno_conversion);
10573   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10574   attrs = AREF (spec, 0);
10575   return CODING_ATTR_BASE_NAME (attrs);
10576 }
10577
10578 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10579        1, 1, 0,
10580        doc: "Return the property list of CODING-SYSTEM.")
10581   (Lisp_Object coding_system)
10582 {
10583   Lisp_Object spec, attrs;
10584
10585   if (NILP (coding_system))
10586     coding_system = Qno_conversion;
10587   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10588   attrs = AREF (spec, 0);
10589   return CODING_ATTR_PLIST (attrs);
10590 }
10591
10592
10593 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10594        1, 1, 0,
10595        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10596   (Lisp_Object coding_system)
10597 {
10598   Lisp_Object spec;
10599
10600   if (NILP (coding_system))
10601     coding_system = Qno_conversion;
10602   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10603   return AREF (spec, 1);
10604 }
10605
10606 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10607        Scoding_system_eol_type, 1, 1, 0,
10608        doc: /* Return eol-type of CODING-SYSTEM.
10609 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10610
10611 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10612 and CR respectively.
10613
10614 A vector value indicates that a format of end-of-line should be
10615 detected automatically.  Nth element of the vector is the subsidiary
10616 coding system whose eol-type is N.  */)
10617   (Lisp_Object coding_system)
10618 {
10619   Lisp_Object spec, eol_type;
10620   int n;
10621
10622   if (NILP (coding_system))
10623     coding_system = Qno_conversion;
10624   if (! CODING_SYSTEM_P (coding_system))
10625     return Qnil;
10626   spec = CODING_SYSTEM_SPEC (coding_system);
10627   eol_type = AREF (spec, 2);
10628   if (VECTORP (eol_type))
10629     return Fcopy_sequence (eol_type);
10630   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10631   return make_number (n);
10632 }
10633
10634 #endif /* emacs */
10635
10636 \f
10637 /*** 9. Post-amble ***/
10638
10639 void
10640 init_coding_once (void)
10641 {
10642   int i;
10643
10644   for (i = 0; i < coding_category_max; i++)
10645     {
10646       coding_categories[i].id = -1;
10647       coding_priorities[i] = i;
10648     }
10649
10650   /* ISO2022 specific initialize routine.  */
10651   for (i = 0; i < 0x20; i++)
10652     iso_code_class[i] = ISO_control_0;
10653   for (i = 0x21; i < 0x7F; i++)
10654     iso_code_class[i] = ISO_graphic_plane_0;
10655   for (i = 0x80; i < 0xA0; i++)
10656     iso_code_class[i] = ISO_control_1;
10657   for (i = 0xA1; i < 0xFF; i++)
10658     iso_code_class[i] = ISO_graphic_plane_1;
10659   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10660   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10661   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10662   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10663   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10664   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10665   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10666   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10667   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10668
10669   for (i = 0; i < 256; i++)
10670     {
10671       emacs_mule_bytes[i] = 1;
10672     }
10673   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10674   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10675   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10676   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10677 }
10678
10679 #ifdef emacs
10680
10681 void
10682 syms_of_coding (void)
10683 {
10684   staticpro (&Vcoding_system_hash_table);
10685   {
10686     Lisp_Object args[2];
10687     args[0] = QCtest;
10688     args[1] = Qeq;
10689     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10690   }
10691
10692   staticpro (&Vsjis_coding_system);
10693   Vsjis_coding_system = Qnil;
10694
10695   staticpro (&Vbig5_coding_system);
10696   Vbig5_coding_system = Qnil;
10697
10698   staticpro (&Vcode_conversion_reused_workbuf);
10699   Vcode_conversion_reused_workbuf = Qnil;
10700
10701   staticpro (&Vcode_conversion_workbuf_name);
10702   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10703
10704   reused_workbuf_in_use = 0;
10705
10706   DEFSYM (Qcharset, "charset");
10707   DEFSYM (Qtarget_idx, "target-idx");
10708   DEFSYM (Qcoding_system_history, "coding-system-history");
10709   Fset (Qcoding_system_history, Qnil);
10710
10711   /* Target FILENAME is the first argument.  */
10712   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10713   /* Target FILENAME is the third argument.  */
10714   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10715
10716   DEFSYM (Qcall_process, "call-process");
10717   /* Target PROGRAM is the first argument.  */
10718   Fput (Qcall_process, Qtarget_idx, make_number (0));
10719
10720   DEFSYM (Qcall_process_region, "call-process-region");
10721   /* Target PROGRAM is the third argument.  */
10722   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10723
10724   DEFSYM (Qstart_process, "start-process");
10725   /* Target PROGRAM is the third argument.  */
10726   Fput (Qstart_process, Qtarget_idx, make_number (2));
10727
10728   DEFSYM (Qopen_network_stream, "open-network-stream");
10729   /* Target SERVICE is the fourth argument.  */
10730   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10731
10732   DEFSYM (Qcoding_system, "coding-system");
10733   DEFSYM (Qcoding_aliases, "coding-aliases");
10734
10735   DEFSYM (Qeol_type, "eol-type");
10736   DEFSYM (Qunix, "unix");
10737   DEFSYM (Qdos, "dos");
10738   DEFSYM (Qmac, "mac");
10739
10740   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10741   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10742   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10743   DEFSYM (Qdefault_char, "default-char");
10744   DEFSYM (Qundecided, "undecided");
10745   DEFSYM (Qno_conversion, "no-conversion");
10746   DEFSYM (Qraw_text, "raw-text");
10747
10748   DEFSYM (Qiso_2022, "iso-2022");
10749
10750   DEFSYM (Qutf_8, "utf-8");
10751   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10752
10753 #if defined (WINDOWSNT) || defined (CYGWIN)
10754   /* No, not utf-16-le: that one has a BOM.  */
10755   DEFSYM (Qutf_16le, "utf-16le");
10756 #endif
10757
10758   DEFSYM (Qutf_16, "utf-16");
10759   DEFSYM (Qbig, "big");
10760   DEFSYM (Qlittle, "little");
10761
10762   DEFSYM (Qshift_jis, "shift-jis");
10763   DEFSYM (Qbig5, "big5");
10764
10765   DEFSYM (Qcoding_system_p, "coding-system-p");
10766
10767   DEFSYM (Qcoding_system_error, "coding-system-error");
10768   Fput (Qcoding_system_error, Qerror_conditions,
10769         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10770   Fput (Qcoding_system_error, Qerror_message,
10771         build_pure_c_string ("Invalid coding system"));
10772
10773   /* Intern this now in case it isn't already done.
10774      Setting this variable twice is harmless.
10775      But don't staticpro it here--that is done in alloc.c.  */
10776   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10777
10778   DEFSYM (Qtranslation_table, "translation-table");
10779   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10780   DEFSYM (Qtranslation_table_id, "translation-table-id");
10781   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10782   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10783
10784   DEFSYM (Qvalid_codes, "valid-codes");
10785
10786   DEFSYM (Qemacs_mule, "emacs-mule");
10787
10788   DEFSYM (QCcategory, ":category");
10789   DEFSYM (QCmnemonic, ":mnemonic");
10790   DEFSYM (QCdefault_char, ":default-char");
10791   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10792   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10793   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10794   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10795   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10796
10797   Vcoding_category_table
10798     = Fmake_vector (make_number (coding_category_max), Qnil);
10799   staticpro (&Vcoding_category_table);
10800   /* Followings are target of code detection.  */
10801   ASET (Vcoding_category_table, coding_category_iso_7,
10802         intern_c_string ("coding-category-iso-7"));
10803   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10804         intern_c_string ("coding-category-iso-7-tight"));
10805   ASET (Vcoding_category_table, coding_category_iso_8_1,
10806         intern_c_string ("coding-category-iso-8-1"));
10807   ASET (Vcoding_category_table, coding_category_iso_8_2,
10808         intern_c_string ("coding-category-iso-8-2"));
10809   ASET (Vcoding_category_table, coding_category_iso_7_else,
10810         intern_c_string ("coding-category-iso-7-else"));
10811   ASET (Vcoding_category_table, coding_category_iso_8_else,
10812         intern_c_string ("coding-category-iso-8-else"));
10813   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10814         intern_c_string ("coding-category-utf-8-auto"));
10815   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10816         intern_c_string ("coding-category-utf-8"));
10817   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10818         intern_c_string ("coding-category-utf-8-sig"));
10819   ASET (Vcoding_category_table, coding_category_utf_16_be,
10820         intern_c_string ("coding-category-utf-16-be"));
10821   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10822         intern_c_string ("coding-category-utf-16-auto"));
10823   ASET (Vcoding_category_table, coding_category_utf_16_le,
10824         intern_c_string ("coding-category-utf-16-le"));
10825   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10826         intern_c_string ("coding-category-utf-16-be-nosig"));
10827   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10828         intern_c_string ("coding-category-utf-16-le-nosig"));
10829   ASET (Vcoding_category_table, coding_category_charset,
10830         intern_c_string ("coding-category-charset"));
10831   ASET (Vcoding_category_table, coding_category_sjis,
10832         intern_c_string ("coding-category-sjis"));
10833   ASET (Vcoding_category_table, coding_category_big5,
10834         intern_c_string ("coding-category-big5"));
10835   ASET (Vcoding_category_table, coding_category_ccl,
10836         intern_c_string ("coding-category-ccl"));
10837   ASET (Vcoding_category_table, coding_category_emacs_mule,
10838         intern_c_string ("coding-category-emacs-mule"));
10839   /* Followings are NOT target of code detection.  */
10840   ASET (Vcoding_category_table, coding_category_raw_text,
10841         intern_c_string ("coding-category-raw-text"));
10842   ASET (Vcoding_category_table, coding_category_undecided,
10843         intern_c_string ("coding-category-undecided"));
10844
10845   DEFSYM (Qinsufficient_source, "insufficient-source");
10846   DEFSYM (Qinvalid_source, "invalid-source");
10847   DEFSYM (Qinterrupted, "interrupted");
10848   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10849
10850   defsubr (&Scoding_system_p);
10851   defsubr (&Sread_coding_system);
10852   defsubr (&Sread_non_nil_coding_system);
10853   defsubr (&Scheck_coding_system);
10854   defsubr (&Sdetect_coding_region);
10855   defsubr (&Sdetect_coding_string);
10856   defsubr (&Sfind_coding_systems_region_internal);
10857   defsubr (&Sunencodable_char_position);
10858   defsubr (&Scheck_coding_systems_region);
10859   defsubr (&Sdecode_coding_region);
10860   defsubr (&Sencode_coding_region);
10861   defsubr (&Sdecode_coding_string);
10862   defsubr (&Sencode_coding_string);
10863   defsubr (&Sdecode_sjis_char);
10864   defsubr (&Sencode_sjis_char);
10865   defsubr (&Sdecode_big5_char);
10866   defsubr (&Sencode_big5_char);
10867   defsubr (&Sset_terminal_coding_system_internal);
10868   defsubr (&Sset_safe_terminal_coding_system_internal);
10869   defsubr (&Sterminal_coding_system);
10870   defsubr (&Sset_keyboard_coding_system_internal);
10871   defsubr (&Skeyboard_coding_system);
10872   defsubr (&Sfind_operation_coding_system);
10873   defsubr (&Sset_coding_system_priority);
10874   defsubr (&Sdefine_coding_system_internal);
10875   defsubr (&Sdefine_coding_system_alias);
10876   defsubr (&Scoding_system_put);
10877   defsubr (&Scoding_system_base);
10878   defsubr (&Scoding_system_plist);
10879   defsubr (&Scoding_system_aliases);
10880   defsubr (&Scoding_system_eol_type);
10881   defsubr (&Scoding_system_priority_list);
10882
10883   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10884                doc: /* List of coding systems.
10885
10886 Do not alter the value of this variable manually.  This variable should be
10887 updated by the functions `define-coding-system' and
10888 `define-coding-system-alias'.  */);
10889   Vcoding_system_list = Qnil;
10890
10891   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10892                doc: /* Alist of coding system names.
10893 Each element is one element list of coding system name.
10894 This variable is given to `completing-read' as COLLECTION argument.
10895
10896 Do not alter the value of this variable manually.  This variable should be
10897 updated by the functions `make-coding-system' and
10898 `define-coding-system-alias'.  */);
10899   Vcoding_system_alist = Qnil;
10900
10901   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10902                doc: /* List of coding-categories (symbols) ordered by priority.
10903
10904 On detecting a coding system, Emacs tries code detection algorithms
10905 associated with each coding-category one by one in this order.  When
10906 one algorithm agrees with a byte sequence of source text, the coding
10907 system bound to the corresponding coding-category is selected.
10908
10909 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10910   {
10911     int i;
10912
10913     Vcoding_category_list = Qnil;
10914     for (i = coding_category_max - 1; i >= 0; i--)
10915       Vcoding_category_list
10916         = Fcons (AREF (Vcoding_category_table, i),
10917                  Vcoding_category_list);
10918   }
10919
10920   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10921                doc: /* Specify the coding system for read operations.
10922 It is useful to bind this variable with `let', but do not set it globally.
10923 If the value is a coding system, it is used for decoding on read operation.
10924 If not, an appropriate element is used from one of the coding system alists.
10925 There are three such tables: `file-coding-system-alist',
10926 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10927   Vcoding_system_for_read = Qnil;
10928
10929   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10930                doc: /* Specify the coding system for write operations.
10931 Programs bind this variable with `let', but you should not set it globally.
10932 If the value is a coding system, it is used for encoding of output,
10933 when writing it to a file and when sending it to a file or subprocess.
10934
10935 If this does not specify a coding system, an appropriate element
10936 is used from one of the coding system alists.
10937 There are three such tables: `file-coding-system-alist',
10938 `process-coding-system-alist', and `network-coding-system-alist'.
10939 For output to files, if the above procedure does not specify a coding system,
10940 the value of `buffer-file-coding-system' is used.  */);
10941   Vcoding_system_for_write = Qnil;
10942
10943   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10944                doc: /*
10945 Coding system used in the latest file or process I/O.  */);
10946   Vlast_coding_system_used = Qnil;
10947
10948   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10949                doc: /*
10950 Error status of the last code conversion.
10951
10952 When an error was detected in the last code conversion, this variable
10953 is set to one of the following symbols.
10954   `insufficient-source'
10955   `inconsistent-eol'
10956   `invalid-source'
10957   `interrupted'
10958   `insufficient-memory'
10959 When no error was detected, the value doesn't change.  So, to check
10960 the error status of a code conversion by this variable, you must
10961 explicitly set this variable to nil before performing code
10962 conversion.  */);
10963   Vlast_code_conversion_error = Qnil;
10964
10965   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10966                doc: /*
10967 *Non-nil means always inhibit code conversion of end-of-line format.
10968 See info node `Coding Systems' and info node `Text and Binary' concerning
10969 such conversion.  */);
10970   inhibit_eol_conversion = 0;
10971
10972   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10973                doc: /*
10974 Non-nil means process buffer inherits coding system of process output.
10975 Bind it to t if the process output is to be treated as if it were a file
10976 read from some filesystem.  */);
10977   inherit_process_coding_system = 0;
10978
10979   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10980                doc: /*
10981 Alist to decide a coding system to use for a file I/O operation.
10982 The format is ((PATTERN . VAL) ...),
10983 where PATTERN is a regular expression matching a file name,
10984 VAL is a coding system, a cons of coding systems, or a function symbol.
10985 If VAL is a coding system, it is used for both decoding and encoding
10986 the file contents.
10987 If VAL is a cons of coding systems, the car part is used for decoding,
10988 and the cdr part is used for encoding.
10989 If VAL is a function symbol, the function must return a coding system
10990 or a cons of coding systems which are used as above.  The function is
10991 called with an argument that is a list of the arguments with which
10992 `find-operation-coding-system' was called.  If the function can't decide
10993 a coding system, it can return `undecided' so that the normal
10994 code-detection is performed.
10995
10996 See also the function `find-operation-coding-system'
10997 and the variable `auto-coding-alist'.  */);
10998   Vfile_coding_system_alist = Qnil;
10999
11000   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11001                doc: /*
11002 Alist to decide a coding system to use for a process I/O operation.
11003 The format is ((PATTERN . VAL) ...),
11004 where PATTERN is a regular expression matching a program name,
11005 VAL is a coding system, a cons of coding systems, or a function symbol.
11006 If VAL is a coding system, it is used for both decoding what received
11007 from the program and encoding what sent to the program.
11008 If VAL is a cons of coding systems, the car part is used for decoding,
11009 and the cdr part is used for encoding.
11010 If VAL is a function symbol, the function must return a coding system
11011 or a cons of coding systems which are used as above.
11012
11013 See also the function `find-operation-coding-system'.  */);
11014   Vprocess_coding_system_alist = Qnil;
11015
11016   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11017                doc: /*
11018 Alist to decide a coding system to use for a network I/O operation.
11019 The format is ((PATTERN . VAL) ...),
11020 where PATTERN is a regular expression matching a network service name
11021 or is a port number to connect to,
11022 VAL is a coding system, a cons of coding systems, or a function symbol.
11023 If VAL is a coding system, it is used for both decoding what received
11024 from the network stream and encoding what sent to the network stream.
11025 If VAL is a cons of coding systems, the car part is used for decoding,
11026 and the cdr part is used for encoding.
11027 If VAL is a function symbol, the function must return a coding system
11028 or a cons of coding systems which are used as above.
11029
11030 See also the function `find-operation-coding-system'.  */);
11031   Vnetwork_coding_system_alist = Qnil;
11032
11033   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11034                doc: /* Coding system to use with system messages.
11035 Also used for decoding keyboard input on X Window system.  */);
11036   Vlocale_coding_system = Qnil;
11037
11038   /* The eol mnemonics are reset in startup.el system-dependently.  */
11039   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11040                doc: /*
11041 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11042   eol_mnemonic_unix = build_pure_c_string (":");
11043
11044   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11045                doc: /*
11046 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11047   eol_mnemonic_dos = build_pure_c_string ("\\");
11048
11049   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11050                doc: /*
11051 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11052   eol_mnemonic_mac = build_pure_c_string ("/");
11053
11054   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11055                doc: /*
11056 *String displayed in mode line when end-of-line format is not yet determined.  */);
11057   eol_mnemonic_undecided = build_pure_c_string (":");
11058
11059   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11060                doc: /*
11061 *Non-nil enables character translation while encoding and decoding.  */);
11062   Venable_character_translation = Qt;
11063
11064   DEFVAR_LISP ("standard-translation-table-for-decode",
11065                Vstandard_translation_table_for_decode,
11066                doc: /* Table for translating characters while decoding.  */);
11067   Vstandard_translation_table_for_decode = Qnil;
11068
11069   DEFVAR_LISP ("standard-translation-table-for-encode",
11070                Vstandard_translation_table_for_encode,
11071                doc: /* Table for translating characters while encoding.  */);
11072   Vstandard_translation_table_for_encode = Qnil;
11073
11074   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11075                doc: /* Alist of charsets vs revision numbers.
11076 While encoding, if a charset (car part of an element) is found,
11077 designate it with the escape sequence identifying revision (cdr part
11078 of the element).  */);
11079   Vcharset_revision_table = Qnil;
11080
11081   DEFVAR_LISP ("default-process-coding-system",
11082                Vdefault_process_coding_system,
11083                doc: /* Cons of coding systems used for process I/O by default.
11084 The car part is used for decoding a process output,
11085 the cdr part is used for encoding a text to be sent to a process.  */);
11086   Vdefault_process_coding_system = Qnil;
11087
11088   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11089                doc: /*
11090 Table of extra Latin codes in the range 128..159 (inclusive).
11091 This is a vector of length 256.
11092 If Nth element is non-nil, the existence of code N in a file
11093 \(or output of subprocess) doesn't prevent it to be detected as
11094 a coding system of ISO 2022 variant which has a flag
11095 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11096 or reading output of a subprocess.
11097 Only 128th through 159th elements have a meaning.  */);
11098   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11099
11100   DEFVAR_LISP ("select-safe-coding-system-function",
11101                Vselect_safe_coding_system_function,
11102                doc: /*
11103 Function to call to select safe coding system for encoding a text.
11104
11105 If set, this function is called to force a user to select a proper
11106 coding system which can encode the text in the case that a default
11107 coding system used in each operation can't encode the text.  The
11108 function should take care that the buffer is not modified while
11109 the coding system is being selected.
11110
11111 The default value is `select-safe-coding-system' (which see).  */);
11112   Vselect_safe_coding_system_function = Qnil;
11113
11114   DEFVAR_BOOL ("coding-system-require-warning",
11115                coding_system_require_warning,
11116                doc: /* Internal use only.
11117 If non-nil, on writing a file, `select-safe-coding-system-function' is
11118 called even if `coding-system-for-write' is non-nil.  The command
11119 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11120   coding_system_require_warning = 0;
11121
11122
11123   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11124                inhibit_iso_escape_detection,
11125                doc: /*
11126 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11127
11128 When Emacs reads text, it tries to detect how the text is encoded.
11129 This code detection is sensitive to escape sequences.  If Emacs sees
11130 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11131 of the ISO2022 encodings, and decodes text by the corresponding coding
11132 system (e.g. `iso-2022-7bit').
11133
11134 However, there may be a case that you want to read escape sequences in
11135 a file as is.  In such a case, you can set this variable to non-nil.
11136 Then the code detection will ignore any escape sequences, and no text is
11137 detected as encoded in some ISO-2022 encoding.  The result is that all
11138 escape sequences become visible in a buffer.
11139
11140 The default value is nil, and it is strongly recommended not to change
11141 it.  That is because many Emacs Lisp source files that contain
11142 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11143 in Emacs's distribution, and they won't be decoded correctly on
11144 reading if you suppress escape sequence detection.
11145
11146 The other way to read escape sequences in a file without decoding is
11147 to explicitly specify some coding system that doesn't use ISO-2022
11148 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11149   inhibit_iso_escape_detection = 0;
11150
11151   DEFVAR_BOOL ("inhibit-null-byte-detection",
11152                inhibit_null_byte_detection,
11153                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11154 By default, Emacs treats it as binary data, and does not attempt to
11155 decode it.  The effect is as if you specified `no-conversion' for
11156 reading that text.
11157
11158 Set this to non-nil when a regular text happens to include null bytes.
11159 Examples are Index nodes of Info files and null-byte delimited output
11160 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11161 decode text as usual.  */);
11162   inhibit_null_byte_detection = 0;
11163
11164   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11165                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11166 Internal use only.  Removed after the experimental optimizer gets stable. */);
11167   disable_ascii_optimization = 0;
11168
11169   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11170                doc: /* Char table for translating self-inserting characters.
11171 This is applied to the result of input methods, not their input.
11172 See also `keyboard-translate-table'.
11173
11174 Use of this variable for character code unification was rendered
11175 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11176 internal character representation.  */);
11177     Vtranslation_table_for_input = Qnil;
11178
11179   {
11180     Lisp_Object args[coding_arg_undecided_max];
11181     Lisp_Object plist[16];
11182     int i;
11183
11184     for (i = 0; i < coding_arg_undecided_max; i++)
11185       args[i] = Qnil;
11186
11187     plist[0] = intern_c_string (":name");
11188     plist[1] = args[coding_arg_name] = Qno_conversion;
11189     plist[2] = intern_c_string (":mnemonic");
11190     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11191     plist[4] = intern_c_string (":coding-type");
11192     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11193     plist[6] = intern_c_string (":ascii-compatible-p");
11194     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11195     plist[8] = intern_c_string (":default-char");
11196     plist[9] = args[coding_arg_default_char] = make_number (0);
11197     plist[10] = intern_c_string (":for-unibyte");
11198     plist[11] = args[coding_arg_for_unibyte] = Qt;
11199     plist[12] = intern_c_string (":docstring");
11200     plist[13] = build_pure_c_string ("Do no conversion.\n\
11201 \n\
11202 When you visit a file with this coding, the file is read into a\n\
11203 unibyte buffer as is, thus each byte of a file is treated as a\n\
11204 character.");
11205     plist[14] = intern_c_string (":eol-type");
11206     plist[15] = args[coding_arg_eol_type] = Qunix;
11207     args[coding_arg_plist] = Flist (16, plist);
11208     Fdefine_coding_system_internal (coding_arg_max, args);
11209
11210     plist[1] = args[coding_arg_name] = Qundecided;
11211     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11212     plist[5] = args[coding_arg_coding_type] = Qundecided;
11213     /* This is already set.
11214        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11215     plist[8] = intern_c_string (":charset-list");
11216     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11217     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11218     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11219     plist[15] = args[coding_arg_eol_type] = Qnil;
11220     args[coding_arg_plist] = Flist (16, plist);
11221     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11222   }
11223
11224   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11225
11226   {
11227     int i;
11228
11229     for (i = 0; i < coding_category_max; i++)
11230       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11231   }
11232 #if defined (DOS_NT)
11233   system_eol_type = Qdos;
11234 #else
11235   system_eol_type = Qunix;
11236 #endif
11237   staticpro (&system_eol_type);
11238 }
11239
11240 char *
11241 emacs_strerror (int error_number)
11242 {
11243   char *str;
11244
11245   synchronize_system_messages_locale ();
11246   str = strerror (error_number);
11247
11248   if (! NILP (Vlocale_coding_system))
11249     {
11250       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11251                                                       Vlocale_coding_system,
11252                                                       0);
11253       str = SSDATA (dec);
11254     }
11255
11256   return str;
11257 }
11258
11259 #endif /* emacs */