code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce an encoded text until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 497
 498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 499
 500 /* A character to be produced on output if encoding of the original
 501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
 504 /* UTF-8 section */
 505 #define CODING_UTF_8_BOM(coding)        \
 506   ((coding)->spec.utf_8_bom)
 507
 508 /* UTF-16 section */
 509 #define CODING_UTF_16_BOM(coding)       \
 510   ((coding)->spec.utf_16.bom)
 511
 512 #define CODING_UTF_16_ENDIAN(coding)    \
 513   ((coding)->spec.utf_16.endian)
 514
 515 #define CODING_UTF_16_SURROGATE(coding) \
 516   ((coding)->spec.utf_16.surrogate)
 517
 518
 519 /* CCL section */
 520 #define CODING_CCL_DECODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 522 #define CODING_CCL_ENCODER(coding)      \
 523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 524 #define CODING_CCL_VALIDS(coding)                                          \
 525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
 527 /* Index for each coding category in `coding_categories' */
 528
 529 enum coding_category
 530   {
 531     coding_category_iso_7,
 532     coding_category_iso_7_tight,
 533     coding_category_iso_8_1,
 534     coding_category_iso_8_2,
 535     coding_category_iso_7_else,
 536     coding_category_iso_8_else,
 537     coding_category_utf_8_auto,
 538     coding_category_utf_8_nosig,
 539     coding_category_utf_8_sig,
 540     coding_category_utf_16_auto,
 541     coding_category_utf_16_be,
 542     coding_category_utf_16_le,
 543     coding_category_utf_16_be_nosig,
 544     coding_category_utf_16_le_nosig,
 545     coding_category_charset,
 546     coding_category_sjis,
 547     coding_category_big5,
 548     coding_category_ccl,
 549     coding_category_emacs_mule,
 550     /* All above are targets of code detection.  */
 551     coding_category_raw_text,
 552     coding_category_undecided,
 553     coding_category_max
 554   };
 555
 556 /* Definitions of flag bits used in detect_coding_XXXX.  */
 557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 577
 578 /* This value is returned if detect_coding_mask () find nothing other
 579    than ASCII characters.  */
 580 #define CATEGORY_MASK_ANY               \
 581   (CATEGORY_MASK_ISO_7                  \
 582    | CATEGORY_MASK_ISO_7_TIGHT          \
 583    | CATEGORY_MASK_ISO_8_1              \
 584    | CATEGORY_MASK_ISO_8_2              \
 585    | CATEGORY_MASK_ISO_7_ELSE           \
 586    | CATEGORY_MASK_ISO_8_ELSE           \
 587    | CATEGORY_MASK_UTF_8_AUTO           \
 588    | CATEGORY_MASK_UTF_8_NOSIG          \
 589    | CATEGORY_MASK_UTF_8_SIG            \
 590    | CATEGORY_MASK_UTF_16_AUTO          \
 591    | CATEGORY_MASK_UTF_16_BE            \
 592    | CATEGORY_MASK_UTF_16_LE            \
 593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 595    | CATEGORY_MASK_CHARSET              \
 596    | CATEGORY_MASK_SJIS                 \
 597    | CATEGORY_MASK_BIG5                 \
 598    | CATEGORY_MASK_CCL                  \
 599    | CATEGORY_MASK_EMACS_MULE)
 600
 601
 602 #define CATEGORY_MASK_ISO_7BIT \
 603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 604
 605 #define CATEGORY_MASK_ISO_8BIT \
 606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 607
 608 #define CATEGORY_MASK_ISO_ELSE \
 609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO_ESCAPE        \
 612   (CATEGORY_MASK_ISO_7                  \
 613    | CATEGORY_MASK_ISO_7_TIGHT          \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE)
 616
 617 #define CATEGORY_MASK_ISO       \
 618   (  CATEGORY_MASK_ISO_7BIT     \
 619      | CATEGORY_MASK_ISO_8BIT   \
 620      | CATEGORY_MASK_ISO_ELSE)
 621
 622 #define CATEGORY_MASK_UTF_16            \
 623   (CATEGORY_MASK_UTF_16_AUTO            \
 624    | CATEGORY_MASK_UTF_16_BE            \
 625    | CATEGORY_MASK_UTF_16_LE            \
 626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 628
 629 #define CATEGORY_MASK_UTF_8     \
 630   (CATEGORY_MASK_UTF_8_AUTO     \
 631    | CATEGORY_MASK_UTF_8_NOSIG  \
 632    | CATEGORY_MASK_UTF_8_SIG)
 633
 634 /* Table of coding categories (Lisp symbols).  This variable is for
 635    internal use only.  */
 636 static Lisp_Object Vcoding_category_table;
 637
 638 /* Table of coding-categories ordered by priority.  */
 639 static enum coding_category coding_priorities[coding_category_max];
 640
 641 /* Nth element is a coding context for the coding system bound to the
 642    Nth coding category.  */
 643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
 647 #ifndef min
 648 #define min(a, b) ((a) < (b) ? (a) : (b))
 649 #endif
 650 #ifndef max
 651 #define max(a, b) ((a) > (b) ? (a) : (b))
 652 #endif
 653
 654 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return NILP (flag) ? -1 : EQ (flag, Qt);
 660 }
 661
 662 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 663    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
 668   return 0 < encoded_flag + var;
 669 }
 670
 671 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 672   do {                                                  \
 673     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 674     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 675   } while (0)
 676
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object tmp = XCAR (x);
 681   CHECK_NATNUM (tmp);
 682   XSETCAR (x, tmp);
 683 }
 684
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tmp = XCDR (x);
 689   CHECK_NATNUM (tmp);
 690   XSETCDR (x, tmp);
 691 }
 692
 693
 694 /* Safely get one byte from the source text pointed by SRC which ends
 695    at SRC_END, and set C to that byte.  If there are not enough bytes
 696    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 697    and a multibyte character is found at SRC, set C to the
 698    negative value of the character code.  The caller should declare
 699    and set these variables appropriately in advance:
 700         src, src_end, multibytep */
 701
 702 #define ONE_MORE_BYTE(c)                                \
 703   do {                                                  \
 704     if (src == src_end)                                 \
 705       {                                                 \
 706         if (src_base < src)                             \
 707           record_conversion_result                      \
 708             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 709         goto no_more_source;                            \
 710       }                                                 \
 711     c = *src++;                                         \
 712     if (multibytep && (c & 0x80))                       \
 713       {                                                 \
 714         if ((c & 0xFE) == 0xC0)                         \
 715           c = ((c & 1) << 6) | *src++;                  \
 716         else                                            \
 717           {                                             \
 718             src--;                                      \
 719             c = - string_char (src, &src, NULL);        \
 720             record_conversion_result                    \
 721               (coding, CODING_RESULT_INVALID_SRC);      \
 722           }                                             \
 723       }                                                 \
 724     consumed_chars++;                                   \
 725   } while (0)
 726
 727 /* Safely get two bytes from the source text pointed by SRC which ends
 728    at SRC_END, and set C1 and C2 to those bytes while skipping the
 729    heading multibyte characters.  If there are not enough bytes in the
 730    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 731    a multibyte character is found for C2, set C2 to the negative value
 732    of the character code.  The caller should declare and set these
 733    variables appropriately in advance:
 734         src, src_end, multibytep
 735    It is intended that this macro is used in detect_coding_utf_16.  */
 736
 737 #define TWO_MORE_BYTES(c1, c2)                          \
 738   do {                                                  \
 739     do {                                                \
 740       if (src == src_end)                               \
 741         goto no_more_source;                            \
 742       c1 = *src++;                                      \
 743       if (multibytep && (c1 & 0x80))                    \
 744         {                                               \
 745           if ((c1 & 0xFE) == 0xC0)                      \
 746             c1 = ((c1 & 1) << 6) | *src++;              \
 747           else                                          \
 748             {                                           \
 749               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 750               c1 = -1;                                  \
 751             }                                           \
 752         }                                               \
 753     } while (c1 < 0);                                   \
 754     if (src == src_end)                                 \
 755       goto no_more_source;                              \
 756     c2 = *src++;                                        \
 757     if (multibytep && (c2 & 0x80))                      \
 758       {                                                 \
 759         if ((c2 & 0xFE) == 0xC0)                        \
 760           c2 = ((c2 & 1) << 6) | *src++;                \
 761         else                                            \
 762           c2 = -1;                                      \
 763       }                                                 \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed by DST and increment DST to the
 768    next free point, and increment PRODUCED_CHARS.  The caller should
 769    assure that C is 0..127, and declare and set the variable `dst'
 770    appropriately in advance.
 771 */
 772
 773
 774 #define EMIT_ONE_ASCII_BYTE(c)  \
 775   do {                          \
 776     produced_chars++;           \
 777     *dst++ = (c);               \
 778   } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
 879 /* These wrapper macros are used to preserve validity of pointers into
 880    buffer text across calls to decode_char, encode_char, etc, which
 881    could cause relocation of buffers if it loads a charset map,
 882    because loading a charset map allocates large structures.  */
 883
 884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 885   do {                                                                       \
 886     ptrdiff_t offset;                                                        \
 887                                                                              \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded                                                   \
 891         && (offset = coding_change_source (coding)))                         \
 892       {                                                                      \
 893         src += offset;                                                       \
 894         src_base += offset;                                                  \
 895         src_end += offset;                                                   \
 896       }                                                                      \
 897   } while (0)
 898
 899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 900   do {                                                                  \
 901     ptrdiff_t offset;                                                   \
 902                                                                         \
 903     charset_map_loaded = 0;                                             \
 904     code = ENCODE_CHAR (charset, c);                                    \
 905     if (charset_map_loaded                                              \
 906         && (offset = coding_change_destination (coding)))               \
 907       {                                                                 \
 908         dst += offset;                                                  \
 909         dst_end += offset;                                              \
 910       }                                                                 \
 911   } while (0)
 912
 913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 914   do {                                                                  \
 915     ptrdiff_t offset;                                                   \
 916                                                                         \
 917     charset_map_loaded = 0;                                             \
 918     charset = char_charset (c, charset_list, code_return);              \
 919     if (charset_map_loaded                                              \
 920         && (offset = coding_change_destination (coding)))               \
 921       {                                                                 \
 922         dst += offset;                                                  \
 923         dst_end += offset;                                              \
 924       }                                                                 \
 925   } while (0)
 926
 927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 928   do {                                                                  \
 929     ptrdiff_t offset;                                                   \
 930                                                                         \
 931     charset_map_loaded = 0;                                             \
 932     result = CHAR_CHARSET_P (c, charset);                               \
 933     if (charset_map_loaded                                              \
 934         && (offset = coding_change_destination (coding)))               \
 935       {                                                                 \
 936         dst += offset;                                                  \
 937         dst_end += offset;                                              \
 938       }                                                                 \
 939   } while (0)
 940
 941
 942 /* If there are at least BYTES length of room at dst, allocate memory
 943    for coding->destination and update dst and dst_end.  We don't have
 944    to take care of coding->source which will be relocated.  It is
 945    handled by calling coding_set_source in encode_coding.  */
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
 959 /* Store multibyte form of the character C in P, and advance P to the
 960    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 961    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 962    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE, so this macro now simply
        expands to CHAR_STRING_ADVANCE with the same arguments.  */
 963
 964 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
 966 /* Return the character code of character whose multibyte form is at
 967    P, and advance P to the end of the multibyte form.  This used to be
 968    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 969    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR, so this
        macro now simply expands to STRING_CHAR_ADVANCE with the same
        argument.  */
 970
 971 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 {
1069   if (EQ (coding->src_object, coding->dst_object))
1070     {
1071       /* The gap may contain the produced data at the head and not-yet
1072          consumed data at the tail.  To preserve those data, we at
1073          first make the gap size to zero, then increase the gap
1074          size.  */
1075       ptrdiff_t add = GAP_SIZE;
1076
1077       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079       make_gap (bytes);
1080       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082     }
1083   else
1084     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1085 }
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
1107 /** Macros for annotations.  */
1108
1109 /* An annotation data is stored in the array coding->charbuf in this
1110    format:
1111      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112    LENGTH is the number of elements in the annotation.
1113    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114    NCHARS is the number of characters in the text annotated.
1115
1116    The format of the following elements depends on ANNOTATION_MASK.
1117
1118    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1119    follow:
1120      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121
1122    NBYTES is the number of bytes specified in the header part of
1123    old-style emacs-mule encoding, or 0 for the other kind of
1124    composition.
1125
1126    METHOD is one of enum composition_method.
1127
1128    Optional COMPOSITION-COMPONENTS are characters and composition
1129    rules.
1130
1131    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132    follows.
1133
1134    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1135    recover from an invalid annotation, and should be skipped by
1136    produce_annotation.  */
1137
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF, namely
   -LEN, MASK and NCHARS, advancing BUF past them, and mark the
   conversion as annotated by setting coding->annotated.  The caller
   must have `coding' in scope.

   The expansion deliberately does not end with a semicolon, so that
   the macro can be used as a single statement anywhere, including
   before an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1148
/* Store the header of a composition annotation at BUF and advance BUF
   past it: the common part written by ADD_ANNOTATION_DATA (length 5,
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS), followed by NBYTES and
   METHOD.  BUF is parenthesized so that an expression such as `*pp'
   is advanced correctly.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1155
1156
/* Store a charset annotation at BUF and advance BUF past it: the
   common part written by ADD_ANNOTATION_DATA (length 4,
   CODING_ANNOTATE_CHARSET_MASK, NCHARS), followed by the charset ID.
   BUF is parenthesized so that an expression such as `*pp' is
   advanced correctly.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1162
1163
1164 /* Bitmasks for coding->eol_seen.  Apart from EOL_SEEN_NONE, which is
        zero, each value is a distinct single bit, so several of them can
        be combined with bitwise OR in one integer.  */
1165
1166 #define EOL_SEEN_NONE   0
1167 #define EOL_SEEN_LF     1
1168 #define EOL_SEEN_CR     2
1169 #define EOL_SEEN_CRLF   4
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
1179 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1180    Return true if a text is encoded in UTF-8.  */
1181
     /* Predicates classifying one octet C of a UTF-8 sequence by its
        high-order bits.  UTF_8_1_OCTET_P is a plain comparison, so it is
        also true for a negative C; the others look only at the low eight
        bits of C:
          UTF_8_1_OCTET_P          C is below 0x80
          UTF_8_EXTRA_OCTET_P      bit pattern 10xxxxxx
          UTF_8_2_OCTET_LEADING_P  bit pattern 110xxxxx
          UTF_8_3_OCTET_LEADING_P  bit pattern 1110xxxx
          UTF_8_4_OCTET_LEADING_P  bit pattern 11110xxx
          UTF_8_5_OCTET_LEADING_P  bit pattern 111110xx  */
1182 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1183 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1184 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1185 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1186 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1187 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1188
     /* The three octets of the UTF-8 byte order mark, in order.  */
1189 #define UTF_8_BOM_1 0xEF
1190 #define UTF_8_BOM_2 0xBB
1191 #define UTF_8_BOM_3 0xBF
1192
1193 /* Unlike the other detect_coding_XXX, this function counts number of
1194    characters and check EOL format.  */
1195
1196 static bool
1197 detect_coding_utf_8 (struct coding_system *coding,
1198                      struct coding_detection_info *detect_info)
1199 {
1200   const unsigned char *src = coding->source, *src_base;
1201   const unsigned char *src_end = coding->source + coding->src_bytes;
1202   bool multibytep = coding->src_multibyte;
1203   ptrdiff_t consumed_chars = 0;
1204   bool bom_found = 0;
1205   ptrdiff_t nchars = coding->head_ascii;
1206   int eol_seen = coding->eol_seen;
1207
1208   detect_info->checked |= CATEGORY_MASK_UTF_8;
1209   /* A coding system of this category is always ASCII compatible.  */
1210   src += nchars;
1211
1212   if (src == coding->source     /* BOM should be at the head.  */
1213       && src + 3 < src_end      /* BOM is 3-byte long.  */
1214       && src[0] == UTF_8_BOM_1
1215       && src[1] == UTF_8_BOM_2
1216       && src[2] == UTF_8_BOM_3)
1217     {
1218       bom_found = 1;
1219       src += 3;
1220       nchars++;
1221     }
1222
1223   while (1)
1224     {
1225       int c, c1, c2, c3, c4;
1226
1227       src_base = src;
1228       ONE_MORE_BYTE (c);
1229       if (c < 0 || UTF_8_1_OCTET_P (c))
1230         {
1231           nchars++;
1232           if (c == '\r')
1233             {
1234               if (src < src_end && *src == '\n')
1235                 {
1236                   eol_seen |= EOL_SEEN_CRLF;
1237                   src++;
1238                   nchars++;
1239                 }
1240               else
1241                 eol_seen |= EOL_SEEN_CR;
1242             }
1243           else if (c == '\n')
1244             eol_seen |= EOL_SEEN_LF;
1245           continue;
1246         }
1247       ONE_MORE_BYTE (c1);
1248       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1249         break;
1250       if (UTF_8_2_OCTET_LEADING_P (c))
1251         {
1252           nchars++;
1253           continue;
1254         }
1255       ONE_MORE_BYTE (c2);
1256       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1257         break;
1258       if (UTF_8_3_OCTET_LEADING_P (c))
1259         {
1260           nchars++;
1261           continue;
1262         }
1263       ONE_MORE_BYTE (c3);
1264       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1265         break;
1266       if (UTF_8_4_OCTET_LEADING_P (c))
1267         {
1268           nchars++;
1269           continue;
1270         }
1271       ONE_MORE_BYTE (c4);
1272       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1273         break;
1274       if (UTF_8_5_OCTET_LEADING_P (c))
1275         {
1276           nchars++;
1277           continue;
1278         }
1279       break;
1280     }
1281   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1282   return 0;
1283
1284  no_more_source:
1285   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1286     {
1287       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288       return 0;
1289     }
1290   if (bom_found)
1291     {
1292       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1293       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1294     }
1295   else
1296     {
1297       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1298       if (nchars < src_end - coding->source)
1299         /* The found characters are less than source bytes, which
1300            means that we found a valid non-ASCII characters.  */
1301         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1302     }
1303   coding->detected_utf8_bytes = src_base - coding->source;
1304   coding->detected_utf8_chars = nchars;
1305   return 1;
1306 }
1307
1308
     /* Decode UTF-8 bytes from the source of CODING into character codes
        stored in CODING->charbuf.

        Reading starts at CODING->source + CODING->consumed and stops at
        CODING->source + CODING->src_bytes; codes are appended from
        CODING->charbuf_used up to CODING->charbuf_size.  On exit
        CODING->consumed_char, CODING->consumed and CODING->charbuf_used
        are updated.

        Unless the BOM setting of CODING is utf_without_bom, a leading
        EF BB BF is skipped when present.  In all cases the BOM setting
        is then changed to utf_without_bom.

        Overlong forms, the surrogate range 0xD800..0xDFFF and five-octet
        codes above MAX_CHAR are treated as invalid.  For an invalid
        sequence, one byte is re-read at SRC_BASE and stored by itself
        (as is if ASCII, otherwise through BYTE8_TO_CHAR), and decoding
        resumes with the byte after it.

        ONE_MORE_BYTE is defined earlier in this file; it uses the locals
        `src', `src_end', `src_base', `multibytep' and `consumed_chars',
        and jumps to `no_more_source' when the source is exhausted.  */
1309 static void
1310 decode_coding_utf_8 (struct coding_system *coding)
1311 {
1312   const unsigned char *src = coding->source + coding->consumed;
1313   const unsigned char *src_end = coding->source + coding->src_bytes;
1314   const unsigned char *src_base;
1315   int *charbuf = coding->charbuf + coding->charbuf_used;
1316   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1317   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1318   bool multibytep = coding->src_multibyte;
1319   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1320   bool eol_dos
1321     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte fetched ahead of time after a CR when EOL_DOS is set; -1
          means that no such byte is pending.  */
1322   int byte_after_cr = -1;
1323
       /* Look for a BOM at the start.  Whenever the first three bytes
          are not exactly EF BB BF, rewind SRC to SRC_BASE so that they
          are decoded as ordinary text.  */
1324   if (bom != utf_without_bom)
1325     {
1326       int c1, c2, c3;
1327
1328       src_base = src;
1329       ONE_MORE_BYTE (c1);
1330       if (! UTF_8_3_OCTET_LEADING_P (c1))
1331         src = src_base;
1332       else
1333         {
1334           ONE_MORE_BYTE (c2);
1335           if (! UTF_8_EXTRA_OCTET_P (c2))
1336             src = src_base;
1337           else
1338             {
1339               ONE_MORE_BYTE (c3);
1340               if (! UTF_8_EXTRA_OCTET_P (c3))
1341                 src = src_base;
1342               else
1343                 {
1344                   if ((c1 != UTF_8_BOM_1)
1345                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1346                     src = src_base;
1347                   else
                         /* The same assignment is repeated
                            unconditionally right after this block.  */
1348                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1349                 }
1350             }
1351         }
1352     }
       /* Presumably this keeps later calls for the same CODING from
          looking for a BOM again -- confirm against the callers.  */
1353   CODING_UTF_8_BOM (coding) = utf_without_bom;
1354
1355   while (1)
1356     {
1357       int c, c1, c2, c3, c4, c5;
1358
           /* Remember where this character starts; these values are
              what gets reported back if the source runs out in the
              middle of it.  */
1359       src_base = src;
1360       consumed_chars_base = consumed_chars;
1361
1362       if (charbuf >= charbuf_end)
1363         {
               /* Give back the byte that was fetched ahead after a CR,
                  so that it is read again later.  */
1364           if (byte_after_cr >= 0)
1365             src_base--;
1366           break;
1367         }
1368
1369       /* In the simple case, rapidly handle ordinary characters */
1370       if (multibytep && ! eol_dos
1371           && charbuf < charbuf_end - 6 && src < src_end - 6)
1372         {
               /* Copy bytes without the high bit straight to CHARBUF,
                  up to four per round, stopping at the first byte that
                  has the high bit set.  */
1373           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1374             {
1375               c1 = *src;
1376               if (c1 & 0x80)
1377                 break;
1378               src++;
1379               consumed_chars++;
1380               *charbuf++ = c1;
1381
1382               c1 = *src;
1383               if (c1 & 0x80)
1384                 break;
1385               src++;
1386               consumed_chars++;
1387               *charbuf++ = c1;
1388
1389               c1 = *src;
1390               if (c1 & 0x80)
1391                 break;
1392               src++;
1393               consumed_chars++;
1394               *charbuf++ = c1;
1395
1396               c1 = *src;
1397               if (c1 & 0x80)
1398                 break;
1399               src++;
1400               consumed_chars++;
1401               *charbuf++ = c1;
1402             }
1403           /* If we handled at least one character, restart the main loop.  */
1404           if (src != src_base)
1405             continue;
1406         }
1407
           /* NOTE(review): when the byte taken from BYTE_AFTER_CR starts
              a sequence that turns out to be invalid or truncated,
              SRC_BASE already points past that byte, so the recovery
              at `invalid_code' and the bookkeeping at `no_more_source'
              do not cover it -- confirm that it cannot be dropped.
              Also, a negative value fetched into BYTE_AFTER_CR looks
              the same as "nothing pending"; verify.  */
1408       if (byte_after_cr >= 0)
1409         c1 = byte_after_cr, byte_after_cr = -1;
1410       else
1411         ONE_MORE_BYTE (c1);
1412       if (c1 < 0)
1413         {
               /* A negative value from ONE_MORE_BYTE is stored negated.  */
1414           c = - c1;
1415         }
1416       else if (UTF_8_1_OCTET_P (c1))
1417         {
               /* With DOS end-of-line conversion, fetch the byte after
                  a CR right away and keep it pending for the next
                  round of the loop.  */
1418           if (eol_dos && c1 == '\r')
1419             ONE_MORE_BYTE (byte_after_cr);
1420           c = c1;
1421         }
1422       else
1423         {
1424           ONE_MORE_BYTE (c2);
1425           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1426             goto invalid_code;
1427           if (UTF_8_2_OCTET_LEADING_P (c1))
1428             {
1429               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1430               /* Reject overlong sequences here and below.  Encoders
1431                  producing them are incorrect, they can be misleading,
1432                  and they mess up read/write invariance.  */
1433               if (c < 128)
1434                 goto invalid_code;
1435             }
1436           else
1437             {
1438               ONE_MORE_BYTE (c3);
1439               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1440                 goto invalid_code;
1441               if (UTF_8_3_OCTET_LEADING_P (c1))
1442                 {
1443                   c = (((c1 & 0xF) << 12)
1444                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1445                   if (c < 0x800
1446                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1447                     goto invalid_code;
1448                 }
1449               else
1450                 {
1451                   ONE_MORE_BYTE (c4);
1452                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1453                     goto invalid_code;
1454                   if (UTF_8_4_OCTET_LEADING_P (c1))
1455                     {
1456                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1457                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1458                     if (c < 0x10000)
1459                       goto invalid_code;
1460                     }
1461                   else
1462                     {
1463                       ONE_MORE_BYTE (c5);
1464                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1465                         goto invalid_code;
1466                       if (UTF_8_5_OCTET_LEADING_P (c1))
1467                         {
1468                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1469                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1470                                | (c5 & 0x3F));
                               /* Out of range, or an overlong form.  */
1471                           if ((c > MAX_CHAR) || (c < 0x200000))
1472                             goto invalid_code;
1473                         }
1474                       else
1475                         goto invalid_code;
1476                     }
1477                 }
1478             }
1479         }
1480
1481       *charbuf++ = c;
1482       continue;
1483
1484     invalid_code:
           /* Go back to SRC_BASE, store one byte by itself, and carry
              on with the byte that follows it.  */
1485       src = src_base;
1486       consumed_chars = consumed_chars_base;
1487       ONE_MORE_BYTE (c);
1488       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1489     }
1490
1491  no_more_source:
       /* Report progress only up to the start of the last character
          that was not completely processed.  */
1492   coding->consumed_char += consumed_chars_base;
1493   coding->consumed = src_base - coding->source;
1494   coding->charbuf_used = charbuf - coding->charbuf;
1495 }
1496
1497
1498 static bool
1499 encode_coding_utf_8 (struct coding_system *coding)
1500 {
1501   bool multibytep = coding->dst_multibyte;
1502   int *charbuf = coding->charbuf;
1503   int *charbuf_end = charbuf + coding->charbuf_used;
1504   unsigned char *dst = coding->destination + coding->produced;
1505   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1506   ptrdiff_t produced_chars = 0;
1507   int c;
1508
1509   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1510     {
1511       ASSURE_DESTINATION (3);
1512       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1513       CODING_UTF_8_BOM (coding) = utf_without_bom;
1514     }
1515
1516   if (multibytep)
1517     {
1518       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1519
1520       while (charbuf < charbuf_end)
1521         {
1522           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1523
1524           ASSURE_DESTINATION (safe_room);
1525           c = *charbuf++;
1526           if (CHAR_BYTE8_P (c))
1527             {
1528               c = CHAR_TO_BYTE8 (c);
1529               EMIT_ONE_BYTE (c);
1530             }
1531           else
1532             {
1533               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1534               for (p = str; p < pend; p++)
1535                 EMIT_ONE_BYTE (*p);
1536             }
1537         }
1538     }
1539   else
1540     {
1541       int safe_room = MAX_MULTIBYTE_LENGTH;
1542
1543       while (charbuf < charbuf_end)
1544         {
1545           ASSURE_DESTINATION (safe_room);
1546           c = *charbuf++;
1547           if (CHAR_BYTE8_P (c))
1548             *dst++ = CHAR_TO_BYTE8 (c);
1549           else
1550             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1551         }
1552       produced_chars = dst - (coding->destination + coding->produced);
1553     }
1554   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1555   coding->produced_char += produced_chars;
1556   coding->produced = dst - coding->destination;
1557   return 0;
1558 }
1559
1560
1561 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1562    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1563
     /* True if bits 10..15 of VAL match those of 0xD800, i.e. the low
        16 bits of VAL are in the range 0xD800..0xDBFF.  */
1564 #define UTF_16_HIGH_SURROGATE_P(val) \
1565   (((val) & 0xFC00) == 0xD800)
1566
     /* True if bits 10..15 of VAL match those of 0xDC00, i.e. the low
        16 bits of VAL are in the range 0xDC00..0xDFFF.  */
1567 #define UTF_16_LOW_SURROGATE_P(val) \
1568   (((val) & 0xFC00) == 0xDC00)
1569
1570
1571 static bool
1572 detect_coding_utf_16 (struct coding_system *coding,
1573                       struct coding_detection_info *detect_info)
1574 {
1575   const unsigned char *src = coding->source;
1576   const unsigned char *src_end = coding->source + coding->src_bytes;
1577   bool multibytep = coding->src_multibyte;
1578   int c1, c2;
1579
1580   detect_info->checked |= CATEGORY_MASK_UTF_16;
1581   if (coding->mode & CODING_MODE_LAST_BLOCK
1582       && (coding->src_chars & 1))
1583     {
1584       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1585       return 0;
1586     }
1587
1588   TWO_MORE_BYTES (c1, c2);
1589   if ((c1 == 0xFF) && (c2 == 0xFE))
1590     {
1591       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1592                              | CATEGORY_MASK_UTF_16_AUTO);
1593       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1594                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1595                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1596     }
1597   else if ((c1 == 0xFE) && (c2 == 0xFF))
1598     {
1599       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1600                              | CATEGORY_MASK_UTF_16_AUTO);
1601       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1602                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1603                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1604     }
1605   else if (c2 < 0)
1606     {
1607       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1608       return 0;
1609     }
1610   else
1611     {
1612       /* We check the dispersion of Eth and Oth bytes where E is even and
1613          O is odd.  If both are high, we assume binary data.*/
1614       unsigned char e[256], o[256];
1615       unsigned e_num = 1, o_num = 1;
1616
1617       memset (e, 0, 256);
1618       memset (o, 0, 256);
1619       e[c1] = 1;
1620       o[c2] = 1;
1621
1622       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1623                                 |CATEGORY_MASK_UTF_16_BE
1624                                 | CATEGORY_MASK_UTF_16_LE);
1625
1626       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1627              != CATEGORY_MASK_UTF_16)
1628         {
1629           TWO_MORE_BYTES (c1, c2);
1630           if (c2 < 0)
1631             break;
1632           if (! e[c1])
1633             {
1634               e[c1] = 1;
1635               e_num++;
1636               if (e_num >= 128)
1637                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1638             }
1639           if (! o[c2])
1640             {
1641               o[c2] = 1;
1642               o_num++;
1643               if (o_num >= 128)
1644                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1645             }
1646         }
1647       return 0;
1648     }
1649
1650  no_more_source:
1651   return 1;
1652 }
1653
1654 static void
1655 decode_coding_utf_16 (struct coding_system *coding)
1656 {
1657   const unsigned char *src = coding->source + coding->consumed;
1658   const unsigned char *src_end = coding->source + coding->src_bytes;
1659   const unsigned char *src_base;
1660   int *charbuf = coding->charbuf + coding->charbuf_used;
1661   /* We may produces at most 3 chars in one loop.  */
1662   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1663   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1664   bool multibytep = coding->src_multibyte;
1665   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1666   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1667   int surrogate = CODING_UTF_16_SURROGATE (coding);
1668   bool eol_dos
1669     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1670   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1671
1672   if (bom == utf_with_bom)
1673     {
1674       int c, c1, c2;
1675
1676       src_base = src;
1677       ONE_MORE_BYTE (c1);
1678       ONE_MORE_BYTE (c2);
1679       c = (c1 << 8) | c2;
1680
1681       if (endian == utf_16_big_endian
1682           ? c != 0xFEFF : c != 0xFFFE)
1683         {
1684           /* The first two bytes are not BOM.  Treat them as bytes
1685              for a normal character.  */
1686           src = src_base;
1687         }
1688       CODING_UTF_16_BOM (coding) = utf_without_bom;
1689     }
1690   else if (bom == utf_detect_bom)
1691     {
1692       /* We have already tried to detect BOM and failed in
1693          detect_coding.  */
1694       CODING_UTF_16_BOM (coding) = utf_without_bom;
1695     }
1696
1697   while (1)
1698     {
1699       int c, c1, c2;
1700
1701       src_base = src;
1702       consumed_chars_base = consumed_chars;
1703
1704       if (charbuf >= charbuf_end)
1705         {
1706           if (byte_after_cr1 >= 0)
1707             src_base -= 2;
1708           break;
1709         }
1710
1711       if (byte_after_cr1 >= 0)
1712         c1 = byte_after_cr1, byte_after_cr1 = -1;
1713       else
1714         ONE_MORE_BYTE (c1);
1715       if (c1 < 0)
1716         {
1717           *charbuf++ = -c1;
1718           continue;
1719         }
1720       if (byte_after_cr2 >= 0)
1721         c2 = byte_after_cr2, byte_after_cr2 = -1;
1722       else
1723         ONE_MORE_BYTE (c2);
1724       if (c2 < 0)
1725         {
1726           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1727           *charbuf++ = -c2;
1728           continue;
1729         }
1730       c = (endian == utf_16_big_endian
1731            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1732
1733       if (surrogate)
1734         {
1735           if (! UTF_16_LOW_SURROGATE_P (c))
1736             {
1737               if (endian == utf_16_big_endian)
1738                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1739               else
1740                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1741               *charbuf++ = c1;
1742               *charbuf++ = c2;
1743               if (UTF_16_HIGH_SURROGATE_P (c))
1744                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1745               else
1746                 *charbuf++ = c;
1747             }
1748           else
1749             {
1750               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1751               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1752               *charbuf++ = 0x10000 + c;
1753             }
1754         }
1755       else
1756         {
1757           if (UTF_16_HIGH_SURROGATE_P (c))
1758             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1759           else
1760             {
1761               if (eol_dos && c == '\r')
1762                 {
1763                   ONE_MORE_BYTE (byte_after_cr1);
1764                   ONE_MORE_BYTE (byte_after_cr2);
1765                 }
1766               *charbuf++ = c;
1767             }
1768         }
1769     }
1770
1771  no_more_source:
1772   coding->consumed_char += consumed_chars_base;
1773   coding->consumed = src_base - coding->source;
1774   coding->charbuf_used = charbuf - coding->charbuf;
1775 }
1776
1777 static bool
1778 encode_coding_utf_16 (struct coding_system *coding)
1779 {
1780   bool multibytep = coding->dst_multibyte;
1781   int *charbuf = coding->charbuf;
1782   int *charbuf_end = charbuf + coding->charbuf_used;
1783   unsigned char *dst = coding->destination + coding->produced;
1784   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1785   int safe_room = 8;
1786   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1787   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1788   ptrdiff_t produced_chars = 0;
1789   int c;
1790
1791   if (bom != utf_without_bom)
1792     {
1793       ASSURE_DESTINATION (safe_room);
1794       if (big_endian)
1795         EMIT_TWO_BYTES (0xFE, 0xFF);
1796       else
1797         EMIT_TWO_BYTES (0xFF, 0xFE);
1798       CODING_UTF_16_BOM (coding) = utf_without_bom;
1799     }
1800
1801   while (charbuf < charbuf_end)
1802     {
1803       ASSURE_DESTINATION (safe_room);
1804       c = *charbuf++;
1805       if (c > MAX_UNICODE_CHAR)
1806         c = coding->default_char;
1807
1808       if (c < 0x10000)
1809         {
1810           if (big_endian)
1811             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1812           else
1813             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1814         }
1815       else
1816         {
1817           int c1, c2;
1818
1819           c -= 0x10000;
1820           c1 = (c >> 10) + 0xD800;
1821           c2 = (c & 0x3FF) + 0xDC00;
1822           if (big_endian)
1823             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1824           else
1825             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1826         }
1827     }
1828   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1829   coding->produced = dst - coding->destination;
1830   coding->produced_char += produced_chars;
1831   return 0;
1832 }
1833
1834 \f
1835 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1836
1837 /* Emacs' internal format for representation of multiple character
1838    sets is a kind of multi-byte encoding, i.e. characters are
1839    represented by variable-length sequences of one-byte codes.
1840
1841    ASCII characters and control characters (e.g. `tab', `newline') are
1842    represented by one-byte sequences which are their ASCII codes, in
1843    the range 0x00 through 0x7F.
1844
1845    8-bit characters of the range 0x80..0x9F are represented by
1846    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1847    code + 0x20).
1848
1849    8-bit characters of the range 0xA0..0xFF are represented by
1850    one-byte sequences which are their 8-bit code.
1851
1852    The other characters are represented by a sequence of `base
1853    leading-code', optional `extended leading-code', and one or two
1854    `position-code's.  The length of the sequence is determined by the
1855    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1856    whereas extended leading-code and position-code take the range 0xA0
1857    through 0xFF.  See `charset.h' for more details about leading-code
1858    and position-code.
1859
1860    --- CODE RANGE of Emacs' internal format ---
1861    character set        range
1862    -------------        -----
1863    ascii                0x00..0x7F
1864    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1866    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1867    ---------------------------------------------
1868
   As this is the internal character representation, the format is
   usually not used externally (i.e. in a file or in data sent to a
   process).  But, it is possible to have a text externally in this
   format (i.e. by encoding with the coding system `emacs-mule').
1873
1874    In that case, a sequence of one-byte codes has a slightly different
1875    form.
1876
1877    At first, all characters in eight-bit-control are represented by
1878    one-byte sequences which are their 8-bit code.
1879
1880    Next, character composition data are represented by the byte
1881    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1882    where,
        METHOD is 0xF2 plus one of the composition methods (enum
        composition_method),
1885
1886         BYTES is 0xA0 plus a byte length of this composition data,
1887
1888         CHARS is 0xA0 plus a number of characters composed by this
1889         data,
1890
1891         COMPONENTs are characters of multibyte form or composition
1892         rules encoded by two-byte of ASCII codes.
1893
1894    In addition, for backward compatibility, the following formats are
1895    also recognized as composition data on decoding.
1896
1897    0x80 MSEQ ...
1898    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1899
1900    Here,
        MSEQ is a multibyte form but in this special format:
1902           ASCII: 0xA0 ASCII_CODE+0x80,
1903           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1904         RULE is a one byte code of the range 0xA0..0xF0 that
1905         represents a composition rule.
1906   */
1907
/* Table indexed by a byte value.  detect_coding_emacs_mule and
   emacs_mule_char read an entry as the total byte length of the
   emacs-mule sequence that the byte starts; emacs_mule_char handles
   the lengths 1 through 4 and aborts on any other value.  */
char emacs_mule_bytes[256];
1909
1910
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in 'emacs-mule'.

   The source of CODING is scanned from just after its first
   CODING->head_ascii bytes.  CATEGORY_MASK_EMACS_MULE is always added
   to DETECT_INFO->checked.  The category is added to
   DETECT_INFO->rejected, and false returned, as soon as the scan
   meets ESC, SI or SO, a composition sequence that is too short, or a
   multibyte sequence with too few trailing bytes.  When the source
   runs out instead, the category is rejected only if the last
   sequence is incomplete and this is the last block; otherwise FOUND
   is merged into DETECT_INFO->found and true is returned.  */

static bool
detect_coding_emacs_mule (struct coding_system *coding,
                          struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int c;
  /* Becomes CATEGORY_MASK_EMACS_MULE once a composition sequence or a
     sequence starting with a byte >= 0x80 has been accepted.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Start of the sequence being examined; compared with SRC at
         no_more_source to tell whether the source ended inside it.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      if (c == 0x80)
        {
          /* Perhaps the start of composite character.  We simply skip
             it because analyzing it is too heavy for detecting.  But,
             at least, we check that the composite character
             consists of more than 4 bytes.  */
          const unsigned char *src_start;

        repeat:
          src_start = src;
          /* Skip every byte >= 0xA0; C ends up as the first value
             below 0xA0.  */
          do
            {
              ONE_MORE_BYTE (c);
            }
          while (c >= 0xA0);

          /* The count includes the byte that ended the skip.  */
          if (src - src_start <= 4)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
          /* Another composition follows immediately.  */
          if (c == 0x80)
            goto repeat;
        }

      if (c < 0x80)
        {
          /* These ISO-2022 control codes rule out emacs-mule.  */
          if (c < 0x20
              && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
            break;
        }
      else
        {
          /* Number of bytes that must follow the leading byte C.  */
          int more_bytes = emacs_mule_bytes[c] - 1;

          while (more_bytes > 0)
            {
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                {
                  src--;        /* Unread the last byte.  */
                  break;
                }
              more_bytes--;
            }
          /* A trailing byte was missing.  */
          if (more_bytes != 0)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
        }
    }
  /* Reached only through the `break's above.  */
  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  return 0;

 no_more_source:
  /* The source ended in the middle of a sequence; that is an error
     only if no more blocks will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
1995
1996
/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   Unless -1 or -2 is returned, store in *NBYTES the number of source
   bytes consumed and in *NCHARS the value of the local CONSUMED_CHARS
   counter (presumably advanced by ONE_MORE_BYTE -- confirm against
   that macro).  If ID is non-null, store the charset ID of the
   character in *ID; this is not done when a RULE byte is returned.  */

static int
emacs_mule_char (struct coding_system *coding, const unsigned char *src,
                 int *nbytes, int *nchars, int *id,
                 struct composition_status *cmp_status)
{
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base = src;
  bool multibytep = coding->src_multibyte;
  int charset_ID;
  unsigned code;
  int c;
  ptrdiff_t consumed_chars = 0;
  /* True if the character was read in the MSEQ form, in which case
     the result is returned negated.  */
  bool mseq_found = 0;

  ONE_MORE_BYTE (c);
  if (c < 0)
    {
      /* Negate the value back and return it as the character, using
         the charset ID in slot 0 of emacs_mule_charset.  */
      c = -c;
      charset_ID = emacs_mule_charset[0];
    }
  else
    {
      if (c >= 0xA0)
        {
          /* A leading byte in this range is accepted only inside an
             old-form composition.  */
          if (cmp_status->state != COMPOSING_NO
              && cmp_status->old_form)
            {
              if (cmp_status->state == COMPOSING_CHAR)
                {
                  /* Expecting MSEQ: 0xA0 announces an ASCII code
                     stored with 0x80 added; any other byte is a
                     leading code stored with 0x20 added.  */
                  if (c == 0xA0)
                    {
                      ONE_MORE_BYTE (c);
                      c -= 0x80;
                      if (c < 0)
                        goto invalid_code;
                    }
                  else
                    c -= 0x20;
                  mseq_found = 1;
                }
              else
                {
                  /* Expecting RULE: hand the byte back negated.  */
                  *nbytes = src - src_base;
                  *nchars = consumed_chars;
                  return -c;
                }
            }
          else
            goto invalid_code;
        }

      /* The leading byte determines the length of the sequence.  */
      switch (emacs_mule_bytes[c])
        {
        case 2:
          /* Leading code and one position code.  */
          if ((charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = c & 0x7F;
          break;

        case 3:
          if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
              || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
            {
              /* The next byte selects the charset; one position code
                 follows.  */
              ONE_MORE_BYTE (c);
              if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = c & 0x7F;
            }
          else
            {
              /* Leading code and two position codes.  */
              if ((charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = (c & 0x7F) << 8;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code |= c & 0x7F;
            }
          break;

        case 4:
          /* The next byte selects the charset; two position codes
             follow.  */
          ONE_MORE_BYTE (c);
          if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = (c & 0x7F) << 8;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code |= c & 0x7F;
          break;

        case 1:
          /* A single byte stands for itself.  */
          code = c;
          charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
          break;

        default:
          emacs_abort ();
        }
      /* Turn the charset and code point into a character in C; a
         negative result is treated as invalid.  */
      CODING_DECODE_CHAR (coding, src, src_base, src_end,
                          CHARSET_FROM_ID (charset_ID), code, c);
      if (c < 0)
        goto invalid_code;
    }
  *nbytes = src - src_base;
  *nchars = consumed_chars;
  if (id)
    *id = charset_ID;
  return (mseq_found ? -c : c);

 no_more_source:
  return -2;

 invalid_code:
  return -1;
}
2131
2132
2133 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2134
/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):
2137
2138    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2139    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2140    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2141
   and these old forms:
2143
2144    (4) relative composition: 0x80 | MSEQ ... MSEQ
2145    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2146
2147    When the starter 0x80 and the following header elements are found,
2148    this annotation header is produced.
2149
2150         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2151
2152    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2153    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2154
2155    Then, upon reading the following elements, these codes are produced
2156    until the composition end is found:
2157
2158    (1) CHAR ... CHAR
2159    (2) ALT ... ALT CHAR ... CHAR
2160    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2161    (4) CHAR ... CHAR
2162    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2163
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2166
2167    (1) LENGTH: unchanged, NCHARS: unchanged
2168    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2169    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2170    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2171    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2172
2173    If an error is found while composing, the annotation header is
2174    changed to the original composition header (plus filler -1s) as
2175    below:
2176
2177    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2179
2180    and the sequence [ -2 DECODED-RULE ] is changed to the original
2181    byte sequence as below:
2182         o the original byte sequence is B: [ B -1 ]
2183         o the original byte sequence is B1 B2: [ B1 B2 ]
2184
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (and thus do not
   increase the size of the compiled object).  */
2189
/* Decode the Emacs 20 style composition rule held in the byte C and
   store the result in RULE.  C is reduced by 0xA0 in place and must
   then lie in 0..80, otherwise control jumps to invalid_code.  The
   quotient and the remainder of that value by 9 give the two
   reference points, where the value 4 is replaced by 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (0 <= c && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    gref = (gref == 4 ? 10 : gref);                     \
    nref = (nref == 4 ? 10 : nref);                     \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2206
2207
/* Decode the Emacs 21 style composition rule made of the byte C and
   the byte that follows it in the source, and store the result in
   RULE.  Each byte carries one reference point with 0x20 added; a
   reference point outside 0..80 makes control jump to invalid_code.
   The second byte is fetched into C with ONE_MORE_BYTE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20;                                \
    int nref;                                           \
                                                        \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2225
2226
/* Start of Emacs 21 style format.  On entry C holds the METHOD byte,
   i.e. 0xF2 plus the composition method.  The next two source bytes
   are read here: the first is 0xA0 plus BYTES, the byte length of
   this composition information, and the second is 0xA0 plus CHARS,
   the number of characters composed by this composition.

   Control jumps to invalid_code unless BYTES is at least 3 (exactly 4
   for a relative composition) and CHARS is between 1 and
   MAX_COMPOSITION_COMPONENTS - 1.  Otherwise the composition is
   recorded in CMP_STATUS and handed to ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2258
2259
/* Start of Emacs 20 style format for relative composition.  Reset
   CMP_STATUS to an old-form relative composition with no characters
   or components counted yet, then pass zero counts and the method to
   ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2271
2272
/* Start of Emacs 20 style format for rule-base composition.  Reset
   CMP_STATUS to an old-form composition with rules and with no
   characters or components counted yet, then pass zero counts and the
   method to ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2284
2285
/* Handle the byte that follows the composition starter 0x80.  The
   byte is read into C and selects the format:

     0xF2 + a method from COMPOSITION_RELATIVE through
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
     0xA0..0xBF: Emacs 20 style relative composition; the byte is the
       first component, so SRC is moved back to read it again;
     0xFF: Emacs 20 style rule-base composition.

   A negative value or any other byte makes control jump to
   invalid_code.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c < 0xA0)                                  \
      goto invalid_code;                                \
    else if (c < 0xC0)                                  \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2309
/* Finish a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition its third element is set to the number of characters
   counted; for a new-form composition other than a relative one its
   first element is set to the third element minus
   CMP_STATUS->length.  The state then goes back to COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *annotation = charbuf - cmp_status->length;             \
                                                                \
    if (cmp_status->old_form)                                   \
      annotation[2] = cmp_status->nchars;                       \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      annotation[0] = annotation[2] - cmp_status->length;       \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2320
2321
2322 static int
2323 emacs_mule_finish_composition (int *charbuf,
2324                                struct composition_status *cmp_status)
2325 {
2326   int idx = - cmp_status->length;
2327   int new_chars;
2328
2329   if (cmp_status->old_form && cmp_status->nchars > 0)
2330     {
2331       charbuf[idx + 2] = cmp_status->nchars;
2332       new_chars = 0;
2333       if (cmp_status->method == COMPOSITION_WITH_RULE
2334           && cmp_status->state == COMPOSING_CHAR)
2335         {
2336           /* The last rule was invalid.  */
2337           int rule = charbuf[-1] + 0xA0;
2338
2339           charbuf[-2] = BYTE8_TO_CHAR (rule);
2340           charbuf[-1] = -1;
2341           new_chars = 1;
2342         }
2343     }
2344   else
2345     {
2346       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2347
2348       if (cmp_status->method == COMPOSITION_WITH_RULE)
2349         {
2350           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2351           charbuf[idx++] = -3;
2352           charbuf[idx++] = 0;
2353           new_chars = 1;
2354         }
2355       else
2356         {
2357           int nchars = charbuf[idx + 1] + 0xA0;
2358           int nbytes = charbuf[idx + 2] + 0xA0;
2359
2360           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2361           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2362           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2363           charbuf[idx++] = -1;
2364           new_chars = 4;
2365         }
2366     }
2367   cmp_status->state = COMPOSING_NO;
2368   return new_chars;
2369 }
2370
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the value it returns to
   CHAR_OFFSET.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state != COMPOSING_NO)                              \
      {                                                                 \
        int restored_chars                                              \
          = emacs_mule_finish_composition (charbuf, cmp_status);        \
                                                                        \
        char_offset += restored_chars;                                  \
      }                                                                 \
  } while (0)
2376
2377
2378 static void
2379 decode_coding_emacs_mule (struct coding_system *coding)
2380 {
2381   const unsigned char *src = coding->source + coding->consumed;
2382   const unsigned char *src_end = coding->source + coding->src_bytes;
2383   const unsigned char *src_base;
2384   int *charbuf = coding->charbuf + coding->charbuf_used;
2385   /* We may produce two annotations (charset and composition) in one
2386      loop and one more charset annotation at the end.  */
2387   int *charbuf_end
2388     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2389       /* We can produce up to 2 characters in a loop.  */
2390       - 1;
2391   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2392   bool multibytep = coding->src_multibyte;
2393   ptrdiff_t char_offset = coding->produced_char;
2394   ptrdiff_t last_offset = char_offset;
2395   int last_id = charset_ascii;
2396   bool eol_dos
2397     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2398   int byte_after_cr = -1;
2399   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2400
2401   if (cmp_status->state != COMPOSING_NO)
2402     {
2403       int i;
2404
2405       if (charbuf_end - charbuf < cmp_status->length)
2406         emacs_abort ();
2407       for (i = 0; i < cmp_status->length; i++)
2408         *charbuf++ = cmp_status->carryover[i];
2409       coding->annotated = 1;
2410     }
2411
2412   while (1)
2413     {
2414       int c, id IF_LINT (= 0);
2415
2416       src_base = src;
2417       consumed_chars_base = consumed_chars;
2418
2419       if (charbuf >= charbuf_end)
2420         {
2421           if (byte_after_cr >= 0)
2422             src_base--;
2423           break;
2424         }
2425
2426       if (byte_after_cr >= 0)
2427         c = byte_after_cr, byte_after_cr = -1;
2428       else
2429         ONE_MORE_BYTE (c);
2430
2431       if (c < 0 || c == 0x80)
2432         {
2433           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2434           if (c < 0)
2435             {
2436               *charbuf++ = -c;
2437               char_offset++;
2438             }
2439           else
2440             DECODE_EMACS_MULE_COMPOSITION_START ();
2441           continue;
2442         }
2443
2444       if (c < 0x80)
2445         {
2446           if (eol_dos && c == '\r')
2447             ONE_MORE_BYTE (byte_after_cr);
2448           id = charset_ascii;
2449           if (cmp_status->state != COMPOSING_NO)
2450             {
2451               if (cmp_status->old_form)
2452                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2453               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2454                 cmp_status->ncomps--;
2455             }
2456         }
2457       else
2458         {
2459           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2460           /* emacs_mule_char can load a charset map from a file, which
2461              allocates a large structure and might cause buffer text
2462              to be relocated as result.  Thus, we need to remember the
2463              original pointer to buffer text, and fix up all related
2464              pointers after the call.  */
2465           const unsigned char *orig = coding->source;
2466           ptrdiff_t offset;
2467
2468           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2469                                cmp_status);
2470           offset = coding->source - orig;
2471           if (offset)
2472             {
2473               src += offset;
2474               src_base += offset;
2475               src_end += offset;
2476             }
2477           if (c < 0)
2478             {
2479               if (c == -1)
2480                 goto invalid_code;
2481               if (c == -2)
2482                 break;
2483             }
2484           src = src_base + nbytes;
2485           consumed_chars = consumed_chars_base + nchars;
2486           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2487             cmp_status->ncomps -= nchars;
2488         }
2489
2490       /* Now if C >= 0, we found a normally encoded character, if C <
2491          0, we found an old-style composition component character or
2492          rule.  */
2493
2494       if (cmp_status->state == COMPOSING_NO)
2495         {
2496           if (last_id != id)
2497             {
2498               if (last_id != charset_ascii)
2499                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2500                                   last_id);
2501               last_id = id;
2502               last_offset = char_offset;
2503             }
2504           *charbuf++ = c;
2505           char_offset++;
2506         }
2507       else if (cmp_status->state == COMPOSING_CHAR)
2508         {
2509           if (cmp_status->old_form)
2510             {
2511               if (c >= 0)
2512                 {
2513                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2514                   *charbuf++ = c;
2515                   char_offset++;
2516                 }
2517               else
2518                 {
2519                   *charbuf++ = -c;
2520                   cmp_status->nchars++;
2521                   cmp_status->length++;
2522                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2523                     EMACS_MULE_COMPOSITION_END ();
2524                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2525                     cmp_status->state = COMPOSING_RULE;
2526                 }
2527             }
2528           else
2529             {
2530               *charbuf++ = c;
2531               cmp_status->length++;
2532               cmp_status->nchars--;
2533               if (cmp_status->nchars == 0)
2534                 EMACS_MULE_COMPOSITION_END ();
2535             }
2536         }
2537       else if (cmp_status->state == COMPOSING_RULE)
2538         {
2539           int rule;
2540
2541           if (c >= 0)
2542             {
2543               EMACS_MULE_COMPOSITION_END ();
2544               *charbuf++ = c;
2545               char_offset++;
2546             }
2547           else
2548             {
2549               c = -c;
2550               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2551               if (rule < 0)
2552                 goto invalid_code;
2553               *charbuf++ = -2;
2554               *charbuf++ = rule;
2555               cmp_status->length += 2;
2556               cmp_status->state = COMPOSING_CHAR;
2557             }
2558         }
2559       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2560         {
2561           *charbuf++ = c;
2562           cmp_status->length++;
2563           if (cmp_status->ncomps == 0)
2564             cmp_status->state = COMPOSING_CHAR;
2565           else if (cmp_status->ncomps > 0)
2566             {
2567               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2568                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2569             }
2570           else
2571             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2572         }
2573       else                      /* COMPOSING_COMPONENT_RULE */
2574         {
2575           int rule;
2576
2577           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2578           if (rule < 0)
2579             goto invalid_code;
2580           *charbuf++ = -2;
2581           *charbuf++ = rule;
2582           cmp_status->length += 2;
2583           cmp_status->ncomps--;
2584           if (cmp_status->ncomps > 0)
2585             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2586           else
2587             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2588         }
2589       continue;
2590
2591     invalid_code:
2592       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2593       src = src_base;
2594       consumed_chars = consumed_chars_base;
2595       ONE_MORE_BYTE (c);
2596       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2597       char_offset++;
2598     }
2599
2600  no_more_source:
2601   if (cmp_status->state != COMPOSING_NO)
2602     {
2603       if (coding->mode & CODING_MODE_LAST_BLOCK)
2604         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2605       else
2606         {
2607           int i;
2608
2609           charbuf -= cmp_status->length;
2610           for (i = 0; i < cmp_status->length; i++)
2611             cmp_status->carryover[i] = charbuf[i];
2612         }
2613     }
2614   if (last_id != charset_ascii)
2615     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2616   coding->consumed_char += consumed_chars_base;
2617   coding->consumed = src_base - coding->source;
2618   coding->charbuf_used = charbuf - coding->charbuf;
2619 }
2620
2621
/* Store in CODES[0] and CODES[1] the emacs-mule leading code(s) for
   the charset whose emacs-mule id is ID.

   If ID is less than 0xA0, ID itself is the only leading code and
   CODES[1] is set to 0 (meaning "no second code").  Otherwise
   CODES[0] is a prefix byte chosen by the range ID falls in
   (0x9A for ID < 0xE0, 0x9B for ID < 0xF0, 0x9C for ID < 0xF5,
   0x9D for the rest) and CODES[1] is ID.

   Both arguments are evaluated more than once, so they must be free
   of side effects.  The expansion deliberately does not end with a
   semicolon: the caller supplies it, which keeps the macro usable as
   the sole statement of an `if' that has an `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2635
2636
2637 static bool
2638 encode_coding_emacs_mule (struct coding_system *coding)
2639 {
2640   bool multibytep = coding->dst_multibyte;
2641   int *charbuf = coding->charbuf;
2642   int *charbuf_end = charbuf + coding->charbuf_used;
2643   unsigned char *dst = coding->destination + coding->produced;
2644   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2645   int safe_room = 8;
2646   ptrdiff_t produced_chars = 0;
2647   Lisp_Object attrs, charset_list;
2648   int c;
2649   int preferred_charset_id = -1;
2650
2651   CODING_GET_INFO (coding, attrs, charset_list);
2652   if (! EQ (charset_list, Vemacs_mule_charset_list))
2653     {
2654       charset_list = Vemacs_mule_charset_list;
2655       ASET (attrs, coding_attr_charset_list, charset_list);
2656     }
2657
2658   while (charbuf < charbuf_end)
2659     {
2660       ASSURE_DESTINATION (safe_room);
2661       c = *charbuf++;
2662
2663       if (c < 0)
2664         {
2665           /* Handle an annotation.  */
2666           switch (*charbuf)
2667             {
2668             case CODING_ANNOTATE_COMPOSITION_MASK:
2669               /* Not yet implemented.  */
2670               break;
2671             case CODING_ANNOTATE_CHARSET_MASK:
2672               preferred_charset_id = charbuf[3];
2673               if (preferred_charset_id >= 0
2674                   && NILP (Fmemq (make_number (preferred_charset_id),
2675                                   charset_list)))
2676                 preferred_charset_id = -1;
2677               break;
2678             default:
2679               emacs_abort ();
2680             }
2681           charbuf += -c - 1;
2682           continue;
2683         }
2684
2685       if (ASCII_CHAR_P (c))
2686         EMIT_ONE_ASCII_BYTE (c);
2687       else if (CHAR_BYTE8_P (c))
2688         {
2689           c = CHAR_TO_BYTE8 (c);
2690           EMIT_ONE_BYTE (c);
2691         }
2692       else
2693         {
2694           struct charset *charset;
2695           unsigned code;
2696           int dimension;
2697           int emacs_mule_id;
2698           unsigned char leading_codes[2];
2699
2700           if (preferred_charset_id >= 0)
2701             {
2702               bool result;
2703
2704               charset = CHARSET_FROM_ID (preferred_charset_id);
2705               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2706               if (result)
2707                 code = ENCODE_CHAR (charset, c);
2708               else
2709                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2710                                      &code, charset);
2711             }
2712           else
2713             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2714                                  &code, charset);
2715           if (! charset)
2716             {
2717               c = coding->default_char;
2718               if (ASCII_CHAR_P (c))
2719                 {
2720                   EMIT_ONE_ASCII_BYTE (c);
2721                   continue;
2722                 }
2723               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2724                                    &code, charset);
2725             }
2726           dimension = CHARSET_DIMENSION (charset);
2727           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2728           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2729           EMIT_ONE_BYTE (leading_codes[0]);
2730           if (leading_codes[1])
2731             EMIT_ONE_BYTE (leading_codes[1]);
2732           if (dimension == 1)
2733             EMIT_ONE_BYTE (code | 0x80);
2734           else
2735             {
2736               code |= 0x8080;
2737               EMIT_ONE_BYTE (code >> 8);
2738               EMIT_ONE_BYTE (code & 0xFF);
2739             }
2740         }
2741     }
2742   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2743   coding->produced_char += produced_chars;
2744   coding->produced = dst - coding->destination;
2745   return 0;
2746 }
2747
2748 \f
2749 /*** 7. ISO2022 handlers ***/
2750
2751 /* The following note describes the coding system ISO2022 briefly.
2752    Since the intention of this note is to help understand the
2753    functions in this file, some parts are NOT ACCURATE or are OVERLY
2754    SIMPLIFIED.  For thorough understanding, please refer to the
2755    original document of ISO2022.  This is equivalent to the standard
2756    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2757
2758    ISO2022 provides many mechanisms to encode several character sets
2759    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2760    is encoded using bytes less than 128.  This may make the encoded
2761    text a little bit longer, but the text passes more easily through
2762    several types of gateway, some of which strip off the MSB (Most
2763    Significant Bit).
2764
2765    There are two kinds of character sets: control character sets and
2766    graphic character sets.  The former contain control characters such
2767    as `newline' and `escape' to provide control functions (control
2768    functions are also provided by escape sequences).  The latter
2769    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2770    two control character sets and many graphic character sets.
2771
2772    Graphic character sets are classified into one of the following
2773    four classes, according to the number of bytes (DIMENSION) and
2774    number of characters in one dimension (CHARS) of the set:
2775    - DIMENSION1_CHARS94
2776    - DIMENSION1_CHARS96
2777    - DIMENSION2_CHARS94
2778    - DIMENSION2_CHARS96
2779
2780    In addition, each character set is assigned an identification tag,
2781    unique for each set, called the "final character" (denoted as <F>
2782    hereafter).  The <F> of each character set is decided by ECMA(*)
2783    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2784    (0x30..0x3F are for private use only).
2785
2786    Note (*): ECMA = European Computer Manufacturers Association
2787
2788    Here are examples of graphic character sets [NAME(<F>)]:
2789         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2790         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2791         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2792         o DIMENSION2_CHARS96 -- none for the moment
2793
2794    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2795         C0 [0x00..0x1F] -- control character plane 0
2796         GL [0x20..0x7F] -- graphic character plane 0
2797         C1 [0x80..0x9F] -- control character plane 1
2798         GR [0xA0..0xFF] -- graphic character plane 1
2799
2800    A control character set is directly designated and invoked to C0 or
2801    C1 by an escape sequence.  The most common case is that:
2802    - ISO646's  control character set is designated/invoked to C0, and
2803    - ISO6429's control character set is designated/invoked to C1,
2804    and usually these designations/invocations are omitted in encoded
2805    text.  In a 7-bit environment, only C0 can be used, and a control
2806    character for C1 is encoded by an appropriate escape sequence to
2807    fit into the environment.  All control characters for C1 are
2808    defined to have corresponding escape sequences.
2809
2810    A graphic character set is at first designated to one of four
2811    graphic registers (G0 through G3), then these graphic registers are
2812    invoked to GL or GR.  These designations and invocations can be
2813    done independently.  The most common case is that G0 is invoked to
2814    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2815    these invocations and designations are omitted in encoded text.
2816    In a 7-bit environment, only GL can be used.
2817
2818    When a graphic character set of CHARS94 is invoked to GL, codes
2819    0x20 and 0x7F of the GL area work as control characters SPACE and
2820    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2821    be used.
2822
2823    There are two ways of invocation: locking-shift and single-shift.
2824    With locking-shift, the invocation lasts until the next different
2825    invocation, whereas with single-shift, the invocation affects the
2826    following character only and doesn't affect the locking-shift
2827    state.  Invocations are done by the following control characters or
2828    escape sequences:
2829
2830    ----------------------------------------------------------------------
2831    abbrev  function                  cntrl escape seq   description
2832    ----------------------------------------------------------------------
2833    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2834    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2835    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2836    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2837    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2838    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2839    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2840    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2841    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2842    ----------------------------------------------------------------------
2843    (*) These are not used by any known coding system.
2844
2845    Control characters for these functions are defined by macros
2846    ISO_CODE_XXX in `coding.h'.
2847
2848    Designations are done by the following escape sequences:
2849    ----------------------------------------------------------------------
2850    escape sequence      description
2851    ----------------------------------------------------------------------
2852    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2853    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2854    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2855    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2856    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2857    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2858    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2859    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2860    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2861    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2862    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2863    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2864    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2865    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2866    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2867    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2868    ----------------------------------------------------------------------
2869
2870    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2871    of dimension 1, chars 94, and final character <F>, etc...
2872
2873    Note (*): Although these designations are not allowed in ISO2022,
2874    Emacs accepts them on decoding, and produces them on encoding
2875    CHARS96 character sets in a coding system which is characterized as
2876    7-bit environment, non-locking-shift, and non-single-shift.
2877
2878    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2879    '(' must be omitted.  We refer to this as "short-form" hereafter.
2880
2881    Now you may notice that there are a lot of ways of encoding the
2882    same multilingual text in ISO2022.  Actually, there exist many
2883    coding systems such as Compound Text (used in X11's inter client
2884    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2885    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2886    localized platforms), and all of these are variants of ISO2022.
2887
2888    In addition to the above, Emacs handles two more kinds of escape
2889    sequences: ISO6429's direction specification and Emacs' private
2890    sequence for specifying character composition.
2891
2892    ISO6429's direction specification takes the following form:
2893         o CSI ']'      -- end of the current direction
2894         o CSI '0' ']'  -- end of the current direction
2895         o CSI '1' ']'  -- start of left-to-right text
2896         o CSI '2' ']'  -- start of right-to-left text
2897    The control character CSI (0x9B: control sequence introducer) is
2898    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2899
2900    Character composition specification takes the following form:
2901         o ESC '0' -- start relative composition
2902         o ESC '1' -- end composition
2903         o ESC '2' -- start rule-base composition (*)
2904         o ESC '3' -- start relative composition with alternate chars  (**)
2905         o ESC '4' -- start rule-base composition with alternate chars  (**)
2906   Since these are not standard escape sequences of any ISO standard,
2907   the use of them with these meanings is restricted to Emacs only.
2908
2909   (*) This form is used only in Emacs 20.7 and older versions,
2910   but newer versions can safely decode it.
2911   (**) This form is used only in Emacs 21.1 and newer versions,
2912   and older versions can't decode it.
2913
2914   Here's a list of example usages of these composition escape
2915   sequences (categorized by `enum composition_method').
2916
2917   COMPOSITION_RELATIVE:
2918         ESC 0 CHAR [ CHAR ] ESC 1
2919   COMPOSITION_WITH_RULE:
2920         ESC 2 CHAR [ RULE CHAR ] ESC 1
2921   COMPOSITION_WITH_ALTCHARS:
2922         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2923   COMPOSITION_WITH_RULE_ALTCHARS:
2924         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2925
2926 static enum iso_code_class_type iso_code_class[256];
2927
/* True if the charset with id ID is marked safe in CODING, i.e. ID
   is within 0..CODING->max_charset_id and the ID'th byte of
   CODING->safe_charsets is not 255.

   A negative ID (used elsewhere in this file to signal "no such
   charset") is rejected rather than used as an array index.  ID is
   evaluated more than once, so it must be free of side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2931
2932 static void
2933 setup_iso_safe_charsets (Lisp_Object attrs)
2934 {
2935   Lisp_Object charset_list, safe_charsets;
2936   Lisp_Object request;
2937   Lisp_Object reg_usage;
2938   Lisp_Object tail;
2939   EMACS_INT reg94, reg96;
2940   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2941   int max_charset_id;
2942
2943   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2944   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2945       && ! EQ (charset_list, Viso_2022_charset_list))
2946     {
2947       charset_list = Viso_2022_charset_list;
2948       ASET (attrs, coding_attr_charset_list, charset_list);
2949       ASET (attrs, coding_attr_safe_charsets, Qnil);
2950     }
2951
2952   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2953     return;
2954
2955   max_charset_id = 0;
2956   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2957     {
2958       int id = XINT (XCAR (tail));
2959       if (max_charset_id < id)
2960         max_charset_id = id;
2961     }
2962
2963   safe_charsets = make_uninit_string (max_charset_id + 1);
2964   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2965   request = AREF (attrs, coding_attr_iso_request);
2966   reg_usage = AREF (attrs, coding_attr_iso_usage);
2967   reg94 = XINT (XCAR (reg_usage));
2968   reg96 = XINT (XCDR (reg_usage));
2969
2970   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2971     {
2972       Lisp_Object id;
2973       Lisp_Object reg;
2974       struct charset *charset;
2975
2976       id = XCAR (tail);
2977       charset = CHARSET_FROM_ID (XINT (id));
2978       reg = Fcdr (Fassq (id, request));
2979       if (! NILP (reg))
2980         SSET (safe_charsets, XINT (id), XINT (reg));
2981       else if (charset->iso_chars_96)
2982         {
2983           if (reg96 < 4)
2984             SSET (safe_charsets, XINT (id), reg96);
2985         }
2986       else
2987         {
2988           if (reg94 < 4)
2989             SSET (safe_charsets, XINT (id), reg94);
2990         }
2991     }
2992   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2993 }
2994
2995
2996 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2997    Return true if a text is encoded in one of ISO-2022 based coding
2998    systems.  */
2999
3000 static bool
3001 detect_coding_iso_2022 (struct coding_system *coding,
3002                         struct coding_detection_info *detect_info)
3003 {
3004   const unsigned char *src = coding->source, *src_base = src;
3005   const unsigned char *src_end = coding->source + coding->src_bytes;
3006   bool multibytep = coding->src_multibyte;
3007   bool single_shifting = 0;
3008   int id;
3009   int c, c1;
3010   ptrdiff_t consumed_chars = 0;
3011   int i;
3012   int rejected = 0;
3013   int found = 0;
3014   int composition_count = -1;
3015
3016   detect_info->checked |= CATEGORY_MASK_ISO;
3017
3018   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3019     {
3020       struct coding_system *this = &(coding_categories[i]);
3021       Lisp_Object attrs, val;
3022
3023       if (this->id < 0)
3024         continue;
3025       attrs = CODING_ID_ATTRS (this->id);
3026       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3027           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3028         setup_iso_safe_charsets (attrs);
3029       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3030       this->max_charset_id = SCHARS (val) - 1;
3031       this->safe_charsets = SDATA (val);
3032     }
3033
3034   /* A coding system of this category is always ASCII compatible.  */
3035   src += coding->head_ascii;
3036
3037   while (rejected != CATEGORY_MASK_ISO)
3038     {
3039       src_base = src;
3040       ONE_MORE_BYTE (c);
3041       switch (c)
3042         {
3043         case ISO_CODE_ESC:
3044           if (inhibit_iso_escape_detection)
3045             break;
3046           single_shifting = 0;
3047           ONE_MORE_BYTE (c);
3048           if (c == 'N' || c == 'O')
3049             {
3050               /* ESC <Fe> for SS2 or SS3.  */
3051               single_shifting = 1;
3052               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3053             }
3054           else if (c == '1')
3055             {
3056               /* End of composition.  */
3057               if (composition_count < 0
3058                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3059                 /* Invalid */
3060                 break;
3061               composition_count = -1;
3062               found |= CATEGORY_MASK_ISO;
3063             }
3064           else if (c >= '0' && c <= '4')
3065             {
3066               /* ESC <Fp> for start/end composition.  */
3067               composition_count = 0;
3068             }
3069           else
3070             {
3071               if (c >= '(' && c <= '/')
3072                 {
3073                   /* Designation sequence for a charset of dimension 1.  */
3074                   ONE_MORE_BYTE (c1);
3075                   if (c1 < ' ' || c1 >= 0x80
3076                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3077                     /* Invalid designation sequence.  Just ignore.  */
3078                     break;
3079                 }
3080               else if (c == '$')
3081                 {
3082                   /* Designation sequence for a charset of dimension 2.  */
3083                   ONE_MORE_BYTE (c);
3084                   if (c >= '@' && c <= 'B')
3085                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3086                     id = iso_charset_table[1][0][c];
3087                   else if (c >= '(' && c <= '/')
3088                     {
3089                       ONE_MORE_BYTE (c1);
3090                       if (c1 < ' ' || c1 >= 0x80
3091                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3092                         /* Invalid designation sequence.  Just ignore.  */
3093                         break;
3094                     }
3095                   else
3096                     /* Invalid designation sequence.  Just ignore it.  */
3097                     break;
3098                 }
3099               else
3100                 {
3101                   /* Invalid escape sequence.  Just ignore it.  */
3102                   break;
3103                 }
3104
3105               /* We found a valid designation sequence for CHARSET.  */
3106               rejected |= CATEGORY_MASK_ISO_8BIT;
3107               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3108                                   id))
3109                 found |= CATEGORY_MASK_ISO_7;
3110               else
3111                 rejected |= CATEGORY_MASK_ISO_7;
3112               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3113                                   id))
3114                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3115               else
3116                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3117               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3118                                   id))
3119                 found |= CATEGORY_MASK_ISO_7_ELSE;
3120               else
3121                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3122               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3123                                   id))
3124                 found |= CATEGORY_MASK_ISO_8_ELSE;
3125               else
3126                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3127             }
3128           break;
3129
3130         case ISO_CODE_SO:
3131         case ISO_CODE_SI:
3132           /* Locking shift out/in.  */
3133           if (inhibit_iso_escape_detection)
3134             break;
3135           single_shifting = 0;
3136           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3137           break;
3138
3139         case ISO_CODE_CSI:
3140           /* Control sequence introducer.  */
3141           single_shifting = 0;
3142           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3143           found |= CATEGORY_MASK_ISO_8_ELSE;
3144           goto check_extra_latin;
3145
3146         case ISO_CODE_SS2:
3147         case ISO_CODE_SS3:
3148           /* Single shift.   */
3149           if (inhibit_iso_escape_detection)
3150             break;
3151           single_shifting = 0;
3152           rejected |= CATEGORY_MASK_ISO_7BIT;
3153           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3154               & CODING_ISO_FLAG_SINGLE_SHIFT)
3155             {
3156               found |= CATEGORY_MASK_ISO_8_1;
3157               single_shifting = 1;
3158             }
3159           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3160               & CODING_ISO_FLAG_SINGLE_SHIFT)
3161             {
3162               found |= CATEGORY_MASK_ISO_8_2;
3163               single_shifting = 1;
3164             }
3165           if (single_shifting)
3166             break;
3167           goto check_extra_latin;
3168
3169         default:
3170           if (c < 0)
3171             continue;
3172           if (c < 0x80)
3173             {
3174               if (composition_count >= 0)
3175                 composition_count++;
3176               single_shifting = 0;
3177               break;
3178             }
3179           if (c >= 0xA0)
3180             {
3181               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3182               found |= CATEGORY_MASK_ISO_8_1;
3183               /* Check the length of succeeding codes of the range
3184                  0xA0..0FF.  If the byte length is even, we include
3185                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3186                  only when we are not single shifting.  */
3187               if (! single_shifting
3188                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3189                 {
3190                   ptrdiff_t len = 1;
3191                   while (src < src_end)
3192                     {
3193                       src_base = src;
3194                       ONE_MORE_BYTE (c);
3195                       if (c < 0xA0)
3196                         {
3197                           src = src_base;
3198                           break;
3199                         }
3200                       len++;
3201                     }
3202
3203                   if (len & 1 && src < src_end)
3204                     {
3205                       rejected |= CATEGORY_MASK_ISO_8_2;
3206                       if (composition_count >= 0)
3207                         composition_count += len;
3208                     }
3209                   else
3210                     {
3211                       found |= CATEGORY_MASK_ISO_8_2;
3212                       if (composition_count >= 0)
3213                         composition_count += len / 2;
3214                     }
3215                 }
3216               break;
3217             }
3218         check_extra_latin:
3219           if (! VECTORP (Vlatin_extra_code_table)
3220               || NILP (AREF (Vlatin_extra_code_table, c)))
3221             {
3222               rejected = CATEGORY_MASK_ISO;
3223               break;
3224             }
3225           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3226               & CODING_ISO_FLAG_LATIN_EXTRA)
3227             found |= CATEGORY_MASK_ISO_8_1;
3228           else
3229             rejected |= CATEGORY_MASK_ISO_8_1;
3230           rejected |= CATEGORY_MASK_ISO_8_2;
3231           break;
3232         }
3233     }
3234   detect_info->rejected |= CATEGORY_MASK_ISO;
3235   return 0;
3236
3237  no_more_source:
3238   detect_info->rejected |= rejected;
3239   detect_info->found |= (found & ~rejected);
3240   return 1;
3241 }
3242
3243
/* Record in CODING that graphic register REG is now designated to
   the charset selected by DIM, CHARS_96 and FINAL (the final byte of
   the designation escape sequence).

   If FINAL is outside the range '0'..127, if ISO_CHARSET_TABLE has
   no charset for the combination, or if that charset is not safe for
   CODING, the register is marked with -2 and CHARS_96 is set to -1.
   CHARS_96 is also set to -1 when ASCII is designated to a register
   that was previously marked with -2.  In both cases -1 tells the
   caller that the escape sequence should be kept.

   JISX0201-Roman is replaced by ASCII when CODING_ISO_FLAG_USE_ROMAN
   is set, and JISX0208-1978 by JISX0208 when
   CODING_ISO_FLAG_USE_OLDJIS is set.

   CHARS_96 must be an lvalue; `coding' is taken from the expansion
   context.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int new_id = -1, old_id;                                            \
                                                                        \
    if (final >= '0' && final < 128)                                    \
      new_id = ISO_CHARSET_TABLE (dim, chars_96, final);                \
    if (new_id < 0 || ! SAFE_CHARSET_P (coding, new_id))                \
      {                                                                 \
        /* Unknown or unsafe charset: remember the failure.  */         \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    old_id = CODING_ISO_DESIGNATION (coding, reg);                      \
    if (new_id == charset_jisx0201_roman)                               \
      new_id = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
                ? charset_ascii : new_id);                              \
    else if (new_id == charset_jisx0208_1978)                           \
      new_id = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
                ? charset_jisx0208 : new_id);                           \
    CODING_ISO_DESIGNATION (coding, reg) = new_id;                      \
    /* ASCII designated over an earlier invalid designation: the        \
       sequence itself should be kept.  */                              \
    if (old_id == -2 && new_id == charset_ascii)                        \
      chars_96 = -1;                                                    \
  } while (0)
3276
3277
/* Handle these composition sequences (ALT: alternate char):
3279
3280    (1) relative composition: ESC 0 CHAR ... ESC 1
3281    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3282    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3283    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3284
3285    When the start sequence (ESC 0/2/3/4) is found, this annotation
3286    header is produced.
3287
3288         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3289
3290    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3291    produced until the end sequence (ESC 1) is found:
3292
3293    (1) CHAR ... CHAR
3294    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3295    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3296    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3297
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3300
3301    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3302    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3303    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3304    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3305
   If an error is found while composing, the annotation header is
   changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3310
3311    and the sequence [ -2 DECODED-RULE ] is changed to the original
3312    byte sequence as below:
3313         o the original byte sequence is B: [ B -1 ]
3314         o the original byte sequence is B1 B2: [ B1 B2 ]
3315    and the sequence [ -1 -1 ] is changed to the original byte
3316    sequence:
3317         [ ESC '0' ]
3318 */
3319
/* Decode the composition rule that starts with the byte C1 (taken
   from the expansion context) and store the encoded rule in RULE.

   If C1 - 32 is below 81, the rule is in the one-byte old format
   (before ver.21): the value packs two reference points as
   GREF * 9 + NREF, and a reference point of 4 is mapped to 10 before
   encoding.  Otherwise the rule is in the two-byte new format (after
   ver.21): one more byte is read with ONE_MORE_BYTE, the reference
   points are C1 - 32 - 81 and that byte minus 32, and 0x100 is added
   to the encoded rule to distinguish it from the old format.

   Jump to invalid_code if C1 is below 32 or if
   COMPOSITION_ENCODE_RULE_VALID rejects the new-format reference
   points.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32))   \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32);        \
        /* Distinguish it from the old format.  */                      \
        rule += 0x100;                                                  \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = rule / 9;                                            \
        int nref = rule % 9;                                            \
                                                                        \
        gref = (gref == 4 ? 10 : gref);                                 \
        nref = (nref == 4 ? 10 : nref);                                 \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3348
/* Turn the encoded composition rule RULE back into the byte(s) it
   was decoded from and store them in charbuf[idx] and
   charbuf[idx + 1].  The variables charbuf, idx and new_chars are
   taken from the expansion context.

   RULE % 0x100 holds GREF * 12 + NREF.  If RULE is below 0x100 (old
   format), the single byte 32 + GREF * 9 + NREF is stored, with a
   reference point of 10 mapped back to 4, followed by -1, and
   new_chars grows by one.  Otherwise (new format) the two bytes
   32 + 81 + GREF and 32 + NREF are stored and new_chars grows by
   two.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int low_part = rule % 0x100;                                \
    int gref = low_part / 12;                                   \
    int nref = low_part % 12;                                   \
                                                                \
    if (rule >= 0x100)          /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
    else                        /* old format */                \
      {                                                         \
        gref = (gref == 10 ? 4 : gref);                         \
        nref = (nref == 10 ? 4 : nref);                         \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
  } while (0)
3368
3369 /* Finish the current composition as invalid.  */
3370
3371 static int
3372 finish_composition (int *charbuf, struct composition_status *cmp_status)
3373 {
3374   int idx = - cmp_status->length;
3375   int new_chars;
3376
3377   /* Recover the original ESC sequence */
3378   charbuf[idx++] = ISO_CODE_ESC;
3379   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3380                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3381                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3382                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3383                     : '4');
3384   charbuf[idx++] = -2;
3385   charbuf[idx++] = 0;
3386   charbuf[idx++] = -1;
3387   new_chars = cmp_status->nchars;
3388   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3389     for (; idx < 0; idx++)
3390       {
3391         int elt = charbuf[idx];
3392
3393         if (elt == -2)
3394           {
3395             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3396             idx++;
3397           }
3398         else if (elt == -1)
3399           {
3400             charbuf[idx++] = ISO_CODE_ESC;
3401             charbuf[idx] = '0';
3402             new_chars += 2;
3403           }
3404       }
3405   cmp_status->state = COMPOSING_NO;
3406   return new_chars;
3407 }
3408
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it as invalid with finish_composition and
   add the value it returns to char_offset.  The variables
   cmp_status, charbuf and char_offset are taken from the expansion
   context.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3415
/* Handle a composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   ESC 0 seen in state COMPOSING_COMPONENT_CHAR of method
   COMPOSITION_WITH_ALTCHARS, or in state COMPOSING_COMPONENT_RULE of
   method COMPOSITION_WITH_RULE_ALTCHARS, ends the list of
   components: the marker [ -1 -1 ] is stored in charbuf, two is
   added to cmp_status->length and the state becomes COMPOSING_CHAR.

   Any other start sequence first finishes a composition that is
   still in progress (MAYBE_FINISH_COMPOSITION) and then begins a new
   one: the method and state are set from C1, the annotation header
   is produced by ADD_COMPOSITION_DATA (charbuf, 0, 0, METHOD),
   cmp_status->length is set to MAX_ANNOTATION_LENGTH, the character
   and component counts are cleared, and coding->annotated is set.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (c1 != '0'                                                          \
        || ! ((cmp_status->state == COMPOSING_COMPONENT_CHAR               \
               && cmp_status->method == COMPOSITION_WITH_ALTCHARS)         \
              || (cmp_status->state == COMPOSING_COMPONENT_RULE            \
                  && (cmp_status->method                                   \
                      == COMPOSITION_WITH_RULE_ALTCHARS))))                \
      {                                                                    \
        /* A new composition starts here.  */                              \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if (c1 == '0')                                                     \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if (c1 == '2')                                                \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if (c1 == '3')                                                \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        cmp_status->state                                                  \
          = (c1 > '2' ? COMPOSING_COMPONENT_CHAR : COMPOSING_CHAR);        \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* End of the components; composed chars follow.  */               \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
  } while (0)
3456
3457
/* Handle the composition end sequence ESC 1.

   The sequence is rejected (the composition is finished as invalid
   and control jumps to invalid_code) when no composed character has
   been stored, or when the state does not fit the method: for
   COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR, for
   every other method it must be COMPOSING_CHAR.

   Otherwise the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: its first element is
   decreased by NCOMPS + 2 for COMPOSITION_WITH_ALTCHARS or by
   NCOMPS * 3 for COMPOSITION_WITH_RULE_ALTCHARS, its third element
   is set to the number of composed characters, that number is added
   to char_offset, and the state returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header;                                                        \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->method == COMPOSITION_WITH_RULE)               \
            == (cmp_status->state == COMPOSING_CHAR)))                  \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header = charbuf - cmp_status->length;                              \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header[0] -= cmp_status->ncomps + 2;                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header[0] -= cmp_status->ncomps * 3;                              \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3477
/* Append the marker -2 followed by the composition rule RULE to
   charbuf, add two to cmp_status->length, and step
   cmp_status->state back by one.  The variables charbuf and
   cmp_status are taken from the expansion context.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3487
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: the length grows by one, and the count of
   composed characters (in state COMPOSING_CHAR) or of components (in
   any other state) grows by one.  When a rule is expected next, that
   is for COMPOSITION_WITH_RULE, or for
   COMPOSITION_WITH_RULE_ALTCHARS while in state
   COMPOSING_COMPONENT_CHAR, the state is advanced by one.  The
   variables charbuf and cmp_status are taken from the expansion
   context.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    charbuf[0] = (c);                                                   \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if ((cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS           \
         && cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
        || cmp_status->method == COMPOSITION_WITH_RULE)                 \
      cmp_status->state++;                                              \
  } while (0)
3504
3505
3506 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3507
3508 static void
3509 decode_coding_iso_2022 (struct coding_system *coding)
3510 {
3511   const unsigned char *src = coding->source + coding->consumed;
3512   const unsigned char *src_end = coding->source + coding->src_bytes;
3513   const unsigned char *src_base;
3514   int *charbuf = coding->charbuf + coding->charbuf_used;
3515   /* We may produce two annotations (charset and composition) in one
3516      loop and one more charset annotation at the end.  */
3517   int *charbuf_end
3518     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3519   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3520   bool multibytep = coding->src_multibyte;
3521   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3522   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3523   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3524   int charset_id_2, charset_id_3;
3525   struct charset *charset;
3526   int c;
3527   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3528   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3529   ptrdiff_t char_offset = coding->produced_char;
3530   ptrdiff_t last_offset = char_offset;
3531   int last_id = charset_ascii;
3532   bool eol_dos
3533     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3534   int byte_after_cr = -1;
3535   int i;
3536
3537   setup_iso_safe_charsets (attrs);
3538   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3539
3540   if (cmp_status->state != COMPOSING_NO)
3541     {
3542       if (charbuf_end - charbuf < cmp_status->length)
3543         emacs_abort ();
3544       for (i = 0; i < cmp_status->length; i++)
3545         *charbuf++ = cmp_status->carryover[i];
3546       coding->annotated = 1;
3547     }
3548
3549   while (1)
3550     {
3551       int c1, c2, c3;
3552
3553       src_base = src;
3554       consumed_chars_base = consumed_chars;
3555
3556       if (charbuf >= charbuf_end)
3557         {
3558           if (byte_after_cr >= 0)
3559             src_base--;
3560           break;
3561         }
3562
3563       if (byte_after_cr >= 0)
3564         c1 = byte_after_cr, byte_after_cr = -1;
3565       else
3566         ONE_MORE_BYTE (c1);
3567       if (c1 < 0)
3568         goto invalid_code;
3569
3570       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3571         {
3572           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3573           char_offset++;
3574           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3575           continue;
3576         }
3577
3578       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3579         {
3580           if (c1 == ISO_CODE_ESC)
3581             {
3582               if (src + 1 >= src_end)
3583                 goto no_more_source;
3584               *charbuf++ = ISO_CODE_ESC;
3585               char_offset++;
3586               if (src[0] == '%' && src[1] == '@')
3587                 {
3588                   src += 2;
3589                   consumed_chars += 2;
3590                   char_offset += 2;
3591                   /* We are sure charbuf can contain two more chars. */
3592                   *charbuf++ = '%';
3593                   *charbuf++ = '@';
3594                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3595                 }
3596             }
3597           else
3598             {
3599               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3600               char_offset++;
3601             }
3602           continue;
3603         }
3604
3605       if ((cmp_status->state == COMPOSING_RULE
3606            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3607           && c1 != ISO_CODE_ESC)
3608         {
3609           int rule;
3610
3611           DECODE_COMPOSITION_RULE (rule);
3612           STORE_COMPOSITION_RULE (rule);
3613           continue;
3614         }
3615
3616       /* We produce at most one character.  */
3617       switch (iso_code_class [c1])
3618         {
3619         case ISO_0x20_or_0x7F:
3620           if (charset_id_0 < 0
3621               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3622             /* This is SPACE or DEL.  */
3623             charset = CHARSET_FROM_ID (charset_ascii);
3624           else
3625             charset = CHARSET_FROM_ID (charset_id_0);
3626           break;
3627
3628         case ISO_graphic_plane_0:
3629           if (charset_id_0 < 0)
3630             charset = CHARSET_FROM_ID (charset_ascii);
3631           else
3632             charset = CHARSET_FROM_ID (charset_id_0);
3633           break;
3634
3635         case ISO_0xA0_or_0xFF:
3636           if (charset_id_1 < 0
3637               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3638               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3639             goto invalid_code;
3640           /* This is a graphic character, we fall down ... */
3641
3642         case ISO_graphic_plane_1:
3643           if (charset_id_1 < 0)
3644             goto invalid_code;
3645           charset = CHARSET_FROM_ID (charset_id_1);
3646           break;
3647
3648         case ISO_control_0:
3649           if (eol_dos && c1 == '\r')
3650             ONE_MORE_BYTE (byte_after_cr);
3651           MAYBE_FINISH_COMPOSITION ();
3652           charset = CHARSET_FROM_ID (charset_ascii);
3653           break;
3654
3655         case ISO_control_1:
3656           goto invalid_code;
3657
3658         case ISO_shift_out:
3659           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3660               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3661             goto invalid_code;
3662           CODING_ISO_INVOCATION (coding, 0) = 1;
3663           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3664           continue;
3665
3666         case ISO_shift_in:
3667           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3668             goto invalid_code;
3669           CODING_ISO_INVOCATION (coding, 0) = 0;
3670           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3671           continue;
3672
3673         case ISO_single_shift_2_7:
3674           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3675             goto invalid_code;
3676         case ISO_single_shift_2:
3677           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3678             goto invalid_code;
3679           /* SS2 is handled as an escape sequence of ESC 'N' */
3680           c1 = 'N';
3681           goto label_escape_sequence;
3682
3683         case ISO_single_shift_3:
3684           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3685             goto invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
3687           c1 = 'O';
3688           goto label_escape_sequence;
3689
3690         case ISO_control_sequence_introducer:
3691           /* CSI is handled as an escape sequence of ESC '[' ...  */
3692           c1 = '[';
3693           goto label_escape_sequence;
3694
3695         case ISO_escape:
3696           ONE_MORE_BYTE (c1);
3697         label_escape_sequence:
3698           /* Escape sequences handled here are invocation,
3699              designation, direction specification, and character
3700              composition specification.  */
3701           switch (c1)
3702             {
3703             case '&':           /* revision of following character set */
3704               ONE_MORE_BYTE (c1);
3705               if (!(c1 >= '@' && c1 <= '~'))
3706                 goto invalid_code;
3707               ONE_MORE_BYTE (c1);
3708               if (c1 != ISO_CODE_ESC)
3709                 goto invalid_code;
3710               ONE_MORE_BYTE (c1);
3711               goto label_escape_sequence;
3712
3713             case '$':           /* designation of 2-byte character set */
3714               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3715                 goto invalid_code;
3716               {
3717                 int reg, chars96;
3718
3719                 ONE_MORE_BYTE (c1);
3720                 if (c1 >= '@' && c1 <= 'B')
3721                   {     /* designation of JISX0208.1978, GB2312.1980,
3722                            or JISX0208.1980 */
3723                     reg = 0, chars96 = 0;
3724                   }
3725                 else if (c1 >= 0x28 && c1 <= 0x2B)
3726                   { /* designation of DIMENSION2_CHARS94 character set */
3727                     reg = c1 - 0x28, chars96 = 0;
3728                     ONE_MORE_BYTE (c1);
3729                   }
3730                 else if (c1 >= 0x2C && c1 <= 0x2F)
3731                   { /* designation of DIMENSION2_CHARS96 character set */
3732                     reg = c1 - 0x2C, chars96 = 1;
3733                     ONE_MORE_BYTE (c1);
3734                   }
3735                 else
3736                   goto invalid_code;
3737                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3738                 /* We must update these variables now.  */
3739                 if (reg == 0)
3740                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3741                 else if (reg == 1)
3742                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3743                 if (chars96 < 0)
3744                   goto invalid_code;
3745               }
3746               continue;
3747
3748             case 'n':           /* invocation of locking-shift-2 */
3749               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3750                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3751                 goto invalid_code;
3752               CODING_ISO_INVOCATION (coding, 0) = 2;
3753               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3754               continue;
3755
3756             case 'o':           /* invocation of locking-shift-3 */
3757               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3758                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3759                 goto invalid_code;
3760               CODING_ISO_INVOCATION (coding, 0) = 3;
3761               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3762               continue;
3763
3764             case 'N':           /* invocation of single-shift-2 */
3765               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3766                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3767                 goto invalid_code;
3768               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3769               if (charset_id_2 < 0)
3770                 charset = CHARSET_FROM_ID (charset_ascii);
3771               else
3772                 charset = CHARSET_FROM_ID (charset_id_2);
3773               ONE_MORE_BYTE (c1);
3774               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3775                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3776                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3777                           ? c1 >= 0x80 : c1 < 0x80)))
3778                 goto invalid_code;
3779               break;
3780
3781             case 'O':           /* invocation of single-shift-3 */
3782               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3783                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3784                 goto invalid_code;
3785               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3786               if (charset_id_3 < 0)
3787                 charset = CHARSET_FROM_ID (charset_ascii);
3788               else
3789                 charset = CHARSET_FROM_ID (charset_id_3);
3790               ONE_MORE_BYTE (c1);
3791               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3792                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3793                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3794                           ? c1 >= 0x80 : c1 < 0x80)))
3795                 goto invalid_code;
3796               break;
3797
3798             case '0': case '2': case '3': case '4': /* start composition */
3799               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3800                 goto invalid_code;
3801               if (last_id != charset_ascii)
3802                 {
3803                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3804                   last_id = charset_ascii;
3805                   last_offset = char_offset;
3806                 }
3807               DECODE_COMPOSITION_START (c1);
3808               continue;
3809
3810             case '1':           /* end composition */
3811               if (cmp_status->state == COMPOSING_NO)
3812                 goto invalid_code;
3813               DECODE_COMPOSITION_END ();
3814               continue;
3815
3816             case '[':           /* specification of direction */
3817               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3818                 goto invalid_code;
3819               /* For the moment, nested direction is not supported.
3820                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3821                  left-to-right, and nonzero means right-to-left.  */
3822               ONE_MORE_BYTE (c1);
3823               switch (c1)
3824                 {
3825                 case ']':       /* end of the current direction */
3826                   coding->mode &= ~CODING_MODE_DIRECTION;
3827
3828                 case '0':       /* end of the current direction */
3829                 case '1':       /* start of left-to-right direction */
3830                   ONE_MORE_BYTE (c1);
3831                   if (c1 == ']')
3832                     coding->mode &= ~CODING_MODE_DIRECTION;
3833                   else
3834                     goto invalid_code;
3835                   break;
3836
3837                 case '2':       /* start of right-to-left direction */
3838                   ONE_MORE_BYTE (c1);
3839                   if (c1 == ']')
3840                     coding->mode |= CODING_MODE_DIRECTION;
3841                   else
3842                     goto invalid_code;
3843                   break;
3844
3845                 default:
3846                   goto invalid_code;
3847                 }
3848               continue;
3849
3850             case '%':
3851               ONE_MORE_BYTE (c1);
3852               if (c1 == '/')
3853                 {
3854                   /* CTEXT extended segment:
3855                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3856                      We keep these bytes as is for the moment.
3857                      They may be decoded by post-read-conversion.  */
3858                   int dim, M, L;
3859                   int size;
3860
3861                   ONE_MORE_BYTE (dim);
3862                   if (dim < '0' || dim > '4')
3863                     goto invalid_code;
3864                   ONE_MORE_BYTE (M);
3865                   if (M < 128)
3866                     goto invalid_code;
3867                   ONE_MORE_BYTE (L);
3868                   if (L < 128)
3869                     goto invalid_code;
3870                   size = ((M - 128) * 128) + (L - 128);
3871                   if (charbuf + 6 > charbuf_end)
3872                     goto break_loop;
3873                   *charbuf++ = ISO_CODE_ESC;
3874                   *charbuf++ = '%';
3875                   *charbuf++ = '/';
3876                   *charbuf++ = dim;
3877                   *charbuf++ = BYTE8_TO_CHAR (M);
3878                   *charbuf++ = BYTE8_TO_CHAR (L);
3879                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3880                 }
3881               else if (c1 == 'G')
3882                 {
3883                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3884                      ESC % G --UTF-8-BYTES-- ESC % @
3885                      We keep these bytes as is for the moment.
3886                      They may be decoded by post-read-conversion.  */
3887                   if (charbuf + 3 > charbuf_end)
3888                     goto break_loop;
3889                   *charbuf++ = ISO_CODE_ESC;
3890                   *charbuf++ = '%';
3891                   *charbuf++ = 'G';
3892                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3893                 }
3894               else
3895                 goto invalid_code;
3896               continue;
3897               break;
3898
3899             default:
3900               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3901                 goto invalid_code;
3902               {
3903                 int reg, chars96;
3904
3905                 if (c1 >= 0x28 && c1 <= 0x2B)
3906                   { /* designation of DIMENSION1_CHARS94 character set */
3907                     reg = c1 - 0x28, chars96 = 0;
3908                     ONE_MORE_BYTE (c1);
3909                   }
3910                 else if (c1 >= 0x2C && c1 <= 0x2F)
3911                   { /* designation of DIMENSION1_CHARS96 character set */
3912                     reg = c1 - 0x2C, chars96 = 1;
3913                     ONE_MORE_BYTE (c1);
3914                   }
3915                 else
3916                   goto invalid_code;
3917                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3918                 /* We must update these variables now.  */
3919                 if (reg == 0)
3920                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3921                 else if (reg == 1)
3922                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3923                 if (chars96 < 0)
3924                   goto invalid_code;
3925               }
3926               continue;
3927             }
3928           break;
3929
3930         default:
3931           emacs_abort ();
3932         }
3933
3934       if (cmp_status->state == COMPOSING_NO
3935           && charset->id != charset_ascii
3936           && last_id != charset->id)
3937         {
3938           if (last_id != charset_ascii)
3939             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3940           last_id = charset->id;
3941           last_offset = char_offset;
3942         }
3943
3944       /* Now we know CHARSET and 1st position code C1 of a character.
3945          Produce a decoded character while getting 2nd and 3rd
3946          position codes C2, C3 if necessary.  */
3947       if (CHARSET_DIMENSION (charset) > 1)
3948         {
3949           ONE_MORE_BYTE (c2);
3950           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3951               || ((c1 & 0x80) != (c2 & 0x80)))
3952             /* C2 is not in a valid range.  */
3953             goto invalid_code;
3954           if (CHARSET_DIMENSION (charset) == 2)
3955             c1 = (c1 << 8) | c2;
3956           else
3957             {
3958               ONE_MORE_BYTE (c3);
3959               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3960                   || ((c1 & 0x80) != (c3 & 0x80)))
3961                 /* C3 is not in a valid range.  */
3962                 goto invalid_code;
3963               c1 = (c1 << 16) | (c2 << 8) | c2;
3964             }
3965         }
3966       c1 &= 0x7F7F7F;
3967       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3968       if (c < 0)
3969         {
3970           MAYBE_FINISH_COMPOSITION ();
3971           for (; src_base < src; src_base++, char_offset++)
3972             {
3973               if (ASCII_CHAR_P (*src_base))
3974                 *charbuf++ = *src_base;
3975               else
3976                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3977             }
3978         }
3979       else if (cmp_status->state == COMPOSING_NO)
3980         {
3981           *charbuf++ = c;
3982           char_offset++;
3983         }
3984       else if ((cmp_status->state == COMPOSING_CHAR
3985                 ? cmp_status->nchars
3986                 : cmp_status->ncomps)
3987                >= MAX_COMPOSITION_COMPONENTS)
3988         {
3989           /* Too long composition.  */
3990           MAYBE_FINISH_COMPOSITION ();
3991           *charbuf++ = c;
3992           char_offset++;
3993         }
3994       else
3995         STORE_COMPOSITION_CHAR (c);
3996       continue;
3997
3998     invalid_code:
3999       MAYBE_FINISH_COMPOSITION ();
4000       src = src_base;
4001       consumed_chars = consumed_chars_base;
4002       ONE_MORE_BYTE (c);
4003       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
4004       char_offset++;
4005       /* Reset the invocation and designation status to the safest
4006          one; i.e. designate ASCII to the graphic register 0, and
4007          invoke that register to the graphic plane 0.  This typically
4008          helps the case that a designation sequence for ASCII "ESC (
4009          B" is somehow broken (e.g. broken by a newline).  */
4010       CODING_ISO_INVOCATION (coding, 0) = 0;
4011       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4012       charset_id_0 = charset_ascii;
4013       continue;
4014
4015     break_loop:
4016       break;
4017     }
4018
4019  no_more_source:
4020   if (cmp_status->state != COMPOSING_NO)
4021     {
4022       if (coding->mode & CODING_MODE_LAST_BLOCK)
4023         MAYBE_FINISH_COMPOSITION ();
4024       else
4025         {
4026           charbuf -= cmp_status->length;
4027           for (i = 0; i < cmp_status->length; i++)
4028             cmp_status->carryover[i] = charbuf[i];
4029         }
4030     }
4031   else if (last_id != charset_ascii)
4032     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4033   coding->consumed_char += consumed_chars_base;
4034   coding->consumed = src_base - coding->source;
4035   coding->charbuf_used = charbuf - coding->charbuf;
4036 }
4037
4038
4039 /* ISO2022 encoding stuff.  */
4040
4041 /*
4042    It is not enough to say just "ISO2022" on encoding, we have to
4043    specify more details.  In Emacs, each coding system of ISO2022
4044    variant has the following specifications:
4045         1. Initial designation to G0 thru G3.
4046         2. Allows short-form designation?
4047         3. ASCII should be designated to G0 before control characters?
4048         4. ASCII should be designated to G0 at end of line?
4049         5. 7-bit environment or 8-bit environment?
4050         6. Use locking-shift?
4051         7. Use Single-shift?
4052    And the following two are only for Japanese:
4053         8. Use ASCII in place of JIS0201-1976-Roman?
4054         9. Use JISX0208-1983 in place of JISX0208-1978?
4055    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4056    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4057    details.
4058 */
4059
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  When CODING_ISO_FLAG_REVISION is set and CHARSET has a
   non-negative revision, the sequence is preceded by
   "ESC & <'@' + revision>".  For a 94-character set whose dimension
   is not 1 and whose <final-char> is '@', 'A', or 'B', the
   intermediate byte is left out (short form) provided REG is 0 and
   CODING_ISO_FLAG_LONG_FORM is not set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes for 96- and 94-character sets, indexed by    \
       graphic register number.  */                                     \
    const char *desig_bytes                                             \
      = (CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+");             \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* The intermediate byte may be dropped only for a              \
           94-character set designated to register 0 with a final      \
           byte in '@'..'B', and only when long form is not forced.  */ \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || desig_final < '@' || desig_final > 'B')                  \
          EMIT_ONE_ASCII_BYTE (desig_bytes[reg]);                       \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (desig_bytes[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4107
4108
4109 /* The following two macros produce codes (control character or escape
4110    sequence) for ISO2022 single-shift functions (single-shift-2 and
4111    single-shift-3).  */
4112
/* Emit single-shift-2: the ISO_CODE_SS2 byte, or the two-byte escape
   sequence "ESC N" when CODING_ISO_FLAG_SEVEN_BITS is set.  Then mark
   `coding' as having a single shift in progress.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4121
4122
/* Emit single-shift-3: the ISO_CODE_SS3 byte, or the two-byte escape
   sequence "ESC O" when CODING_ISO_FLAG_SEVEN_BITS is set.  Then mark
   `coding' as having a single shift in progress.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4131
4132
4133 /* The following four macros produce codes (control character or
4134    escape sequence) for ISO2022 locking-shift functions (shift-in,
4135    shift-out, locking-shift-2, and locking-shift-3).  */
4136
/* Shift-in: emit the control byte ISO_CODE_SI and record graphic
   register 0 as the register invoked to graphic plane 0.  Uses the
   variable `coding' of the expansion site.  */
4137 #define ENCODE_SHIFT_IN                                 \
4138   do {                                                  \
4139     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4140     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4141   } while (0)
4142
4143
/* Shift-out: emit the control byte ISO_CODE_SO and record graphic
   register 1 as the register invoked to graphic plane 0.  Uses the
   variable `coding' of the expansion site.  */
4144 #define ENCODE_SHIFT_OUT                                \
4145   do {                                                  \
4146     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4147     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4148   } while (0)
4149
4150
/* Locking-shift-2: emit the escape sequence "ESC n" and record graphic
   register 2 as the register invoked to graphic plane 0.  Uses the
   variable `coding' of the expansion site.  */
4151 #define ENCODE_LOCKING_SHIFT_2                          \
4152   do {                                                  \
4153     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4154     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4155   } while (0)
4156
4157
/* Locking-shift-3: emit the escape sequence "ESC o" and record graphic
   register 3 as the register invoked to graphic plane 0.  Uses the
   variable `coding' of the expansion site.

   The final byte must be 'o' (06/15); 'n' (06/14) is locking-shift-2
   and would make a decoder invoke graphic register 2 instead.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4163
4164
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_ROMAN
   is set and CHARSET is ASCII, it is replaced by JISX0201-Roman.  C1
   may be any integer expression; it is parenthesized wherever it is
   used and is evaluated once, on the iteration that emits the byte.
   The macro also uses the variables `coding', `dst' and
   `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift was just emitted; this byte completes it.  */ \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0: high bit clear.  */   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1: high bit set.  */     \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4207
4208
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are still needed.  CHARSET must be a
   modifiable lvalue: when CODING_ISO_FLAG_USE_OLDJIS is set and
   CHARSET is JISX0208, it is replaced by JISX0208-1978.  The macro
   also uses the variables `coding', `dst' and `produced_chars' of the
   expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                 \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            /* Invoked to graphic plane 0: high bits clear.  */         \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            /* Invoked to graphic plane 1: high bits set.  */           \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to either graphic plane yet.          \
           Designate and/or invoke it, then go around the loop again    \
           to emit the character itself.  */                            \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    /* A single shift was just emitted; these bytes complete it.  */    \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    CODING_ISO_SINGLE_SHIFTING (coding) = 0;                            \
    break;                                                              \
  } while (1)
4251
4252
/* Encode character C of CHARSET: obtain its code with
   CODING_ENCODE_CHAR, then emit it as one byte when CHARSET has
   dimension 1, and otherwise as the two bytes CODE >> 8 and
   CODE & 0xFF.  Uses the variables `coding', `dst' and `dst_end' of
   the expansion site.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    switch (CHARSET_DIMENSION (charset))                                   \
      {                                                                    \
      case 1:                                                              \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
        break;                                                             \
                                                                           \
      default:                                                             \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
        break;                                                             \
      }                                                                    \
  } while (0)
4263
4264
4265 /* Produce designation and invocation codes at a place pointed by DST
4266    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4267    Return new DST.  */
4268
4269 static unsigned char *
4270 encode_invocation_designation (struct charset *charset,
4271                                struct coding_system *coding,
4272                                unsigned char *dst, ptrdiff_t *p_nchars)
4273 {
4274   bool multibytep = coding->dst_multibyte;
4275   ptrdiff_t produced_chars = *p_nchars;
4276   int reg;                      /* graphic register number */
4277   int id = CHARSET_ID (charset);
4278
4279   /* At first, check designations.  */
4280   for (reg = 0; reg < 4; reg++)
4281     if (id == CODING_ISO_DESIGNATION (coding, reg))
4282       break;
4283
4284   if (reg >= 4)
4285     {
4286       /* CHARSET is not yet designated to any graphic registers.  */
4287       /* At first check the requested designation.  */
4288       reg = CODING_ISO_REQUEST (coding, id);
4289       if (reg < 0)
4290         /* Since CHARSET requests no special designation, designate it
4291            to graphic register 0.  */
4292         reg = 0;
4293
4294       ENCODE_DESIGNATION (charset, reg, coding);
4295     }
4296
4297   if (CODING_ISO_INVOCATION (coding, 0) != reg
4298       && CODING_ISO_INVOCATION (coding, 1) != reg)
4299     {
4300       /* Since the graphic register REG is not invoked to any graphic
4301          planes, invoke it to graphic plane 0.  */
4302       switch (reg)
4303         {
4304         case 0:                 /* graphic register 0 */
4305           ENCODE_SHIFT_IN;
4306           break;
4307
4308         case 1:                 /* graphic register 1 */
4309           ENCODE_SHIFT_OUT;
4310           break;
4311
4312         case 2:                 /* graphic register 2 */
4313           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4314             ENCODE_SINGLE_SHIFT_2;
4315           else
4316             ENCODE_LOCKING_SHIFT_2;
4317           break;
4318
4319         case 3:                 /* graphic register 3 */
4320           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4321             ENCODE_SINGLE_SHIFT_3;
4322           else
4323             ENCODE_LOCKING_SHIFT_3;
4324           break;
4325         }
4326     }
4327
4328   *p_nchars = produced_chars;
4329   return dst;
4330 }
4331
4332
/* Emit the codes that bring the graphic planes and registers back to
   their initial state: a shift-in if graphic plane 0 does not have
   register 0 invoked, then a designation for every register whose
   initial charset is non-negative and differs from the charset
   currently designated to it.  Uses the variable `coding' of the
   expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg = 0;                                                  \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reset_reg < 4)                                               \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reset_reg) >= 0                 \
            && (CODING_ISO_DESIGNATION (coding, reset_reg)              \
                != CODING_ISO_INITIAL (coding, reset_reg)))             \
          {                                                             \
            struct charset *reset_charset                               \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reset_reg)); \
                                                                        \
            ENCODE_DESIGNATION (reset_charset, reset_reg, coding);      \
          }                                                             \
        reset_reg++;                                                    \
      }                                                                 \
  } while (0)
4351
4352
4353 /* Produce designation sequences of charsets in the line started from
4354    CHARBUF to a place pointed by DST, and return the number of
4355    produced bytes.  DST should not directly point a buffer text area
4356    which may be relocated by char_charset call.
4357
4358    If the current block ends before any end-of-line, we may fail to
4359    find all the necessary designations.  */
4360
4361 static ptrdiff_t
4362 encode_designation_at_bol (struct coding_system *coding,
4363                            int *charbuf, int *charbuf_end,
4364                            unsigned char *dst)
4365 {
4366   unsigned char *orig = dst;
4367   struct charset *charset;
4368   /* Table of charsets to be designated to each graphic register.  */
4369   int r[4];
4370   int c, found = 0, reg;
4371   ptrdiff_t produced_chars = 0;
4372   bool multibytep = coding->dst_multibyte;
4373   Lisp_Object attrs;
4374   Lisp_Object charset_list;
4375
4376   attrs = CODING_ID_ATTRS (coding->id);
4377   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4378   if (EQ (charset_list, Qiso_2022))
4379     charset_list = Viso_2022_charset_list;
4380
4381   for (reg = 0; reg < 4; reg++)
4382     r[reg] = -1;
4383
4384   while (charbuf < charbuf_end && found < 4)
4385     {
4386       int id;
4387
4388       c = *charbuf++;
4389       if (c == '\n')
4390         break;
4391       charset = char_charset (c, charset_list, NULL);
4392       id = CHARSET_ID (charset);
4393       reg = CODING_ISO_REQUEST (coding, id);
4394       if (reg >= 0 && r[reg] < 0)
4395         {
4396           found++;
4397           r[reg] = id;
4398         }
4399     }
4400
4401   if (found)
4402     {
4403       for (reg = 0; reg < 4; reg++)
4404         if (r[reg] >= 0
4405             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4406           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4407     }
4408
4409   return dst - orig;
4410 }
4411
4412 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf as
        ISO 2022 bytes, writing them from CODING->destination +
        CODING->produced onward.  On return, CODING->produced is the
        number of bytes between CODING->destination and the final write
        position, CODING->produced_char has been increased by the number
        of characters produced here, and CODING_ISO_BOL (CODING) holds
        the final value of the local flag BOL_DESIGNATION.  The
        conversion result is recorded as CODING_RESULT_SUCCESS and the
        function always returns 0.  */
4413
4414 static bool
4415 encode_coding_iso_2022 (struct coding_system *coding)
4416 {
     /* MULTIBYTEP is not named again in this function; presumably the
        EMIT_* macros refer to it by name -- TODO confirm.  */
4417   bool multibytep = coding->dst_multibyte;
4418   int *charbuf = coding->charbuf;
4419   int *charbuf_end = charbuf + coding->charbuf_used;
4420   unsigned char *dst = coding->destination + coding->produced;
4421   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each character.  */
4422   int safe_room = 16;
       /* True when designation sequences must be produced before the
          next character is encoded.  */
4423   bool bol_designation
4424     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4425        && CODING_ISO_BOL (coding));
4426   ptrdiff_t produced_chars = 0;
4427   Lisp_Object attrs, eol_type, charset_list;
4428   bool ascii_compatible;
4429   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4430   int preferred_charset_id = -1;
4431
4432   CODING_GET_INFO (coding, attrs, charset_list);
4433   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4434   if (VECTORP (eol_type))
4435     eol_type = Qunix;
4436
4437   setup_iso_safe_charsets (attrs);
4438   /* Charset list may have been changed.  */
4439   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4440   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4441
       /* ASCII characters are emitted directly only when the coding
          system is marked ASCII compatible and uses neither designation
          nor locking shift.  */
4442   ascii_compatible
4443     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4444        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4445                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4446
4447   while (charbuf < charbuf_end)
4448     {
4449       ASSURE_DESTINATION (safe_room);
4450
4451       if (bol_designation)
4452         {
4453           /* We have to produce designation sequences if any now.  */
               /* NOTE(review): 16 bytes holds four designations only if
                  each is at most four bytes (ESC, '$', intermediate,
                  final).  ENCODE_DESIGNATION emits three more bytes
                  per designation when a charset revision applies --
                  confirm this buffer cannot overflow.  */
4454           unsigned char desig_buf[16];
4455           ptrdiff_t nbytes;
4456           ptrdiff_t offset;
4457
               /* The designations are built in DESIG_BUF and copied
                  afterwards.  If CHARSET_MAP_LOADED was set during the
                  call, DST and DST_END are moved by the nonzero offset
                  that coding_change_destination returns.  */
4458           charset_map_loaded = 0;
4459           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4460                                               desig_buf);
4461           if (charset_map_loaded
4462               && (offset = coding_change_destination (coding)))
4463             {
4464               dst += offset;
4465               dst_end += offset;
4466             }
4467           memcpy (dst, desig_buf, nbytes);
4468           dst += nbytes;
4469           /* We are sure that designation sequences are all ASCII bytes.  */
4470           produced_chars += nbytes;
4471           bol_designation = 0;
4472           ASSURE_DESTINATION (safe_room);
4473         }
4474
4475       c = *charbuf++;
4476
           /* A negative element -N starts an annotation occupying N
              elements of CHARBUF in total; the element after it
              selects the kind of annotation.  */
4477       if (c < 0)
4478         {
4479           /* Handle an annotation.  */
4480           switch (*charbuf)
4481             {
4482             case CODING_ANNOTATE_COMPOSITION_MASK:
4483               /* Not yet implemented.  */
4484               break;
4485             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Remember the annotated charset, but only if it is
                      a member of CHARSET_LIST.  */
4486               preferred_charset_id = charbuf[2];
4487               if (preferred_charset_id >= 0
4488                   && NILP (Fmemq (make_number (preferred_charset_id),
4489                                   charset_list)))
4490                 preferred_charset_id = -1;
4491               break;
4492             default:
4493               emacs_abort ();
4494             }
               /* Skip the remaining -C - 1 elements of the annotation.  */
4495           charbuf += -c - 1;
4496           continue;
4497         }
4498
4499       /* Now encode the character C.  */
4500       if (c < 0x20 || c == 0x7F)
4501         {
               /* Control character.  At end of line (newline, or CR when
                  the EOL type is Qmac) the state may be reset, the
                  designations re-initialized, and beginning-of-line
                  designation re-armed, depending on the coding flags.  */
4502           if (c == '\n'
4503               || (c == '\r' && EQ (eol_type, Qmac)))
4504             {
4505               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4506                 ENCODE_RESET_PLANE_AND_REGISTER ();
4507               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4508                 {
4509                   int i;
4510
4511                   for (i = 0; i < 4; i++)
4512                     CODING_ISO_DESIGNATION (coding, i)
4513                       = CODING_ISO_INITIAL (coding, i);
4514                 }
4515               bol_designation = ((CODING_ISO_FLAGS (coding)
4516                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4517                                  != 0);
4518             }
4519           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4520             ENCODE_RESET_PLANE_AND_REGISTER ();
4521           EMIT_ONE_ASCII_BYTE (c);
4522         }
4523       else if (ASCII_CHAR_P (c))
4524         {
4525           if (ascii_compatible)
4526             EMIT_ONE_ASCII_BYTE (c);
4527           else
4528             {
4529               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4530               ENCODE_ISO_CHARACTER (charset, c);
4531             }
4532         }
4533       else if (CHAR_BYTE8_P (c))
4534         {
               /* Convert with CHAR_TO_BYTE8 and emit the resulting byte.  */
4535           c = CHAR_TO_BYTE8 (c);
4536           EMIT_ONE_BYTE (c);
4537         }
4538       else
4539         {
4540           struct charset *charset;
4541
               /* Try the charset from the latest annotation first, and
                  fall back to searching CHARSET_LIST.  */
4542           if (preferred_charset_id >= 0)
4543             {
4544               bool result;
4545
4546               charset = CHARSET_FROM_ID (preferred_charset_id);
4547               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4548               if (! result)
4549                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4550                                      NULL, charset);
4551             }
4552           else
4553             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4554                                  NULL, charset);
               /* No charset was found for C: substitute
                  CODING_INHIBIT_CHARACTER_SUBSTITUTION (as ASCII) in
                  safe-encoding mode, and the coding system's default
                  character otherwise.  */
4555           if (!charset)
4556             {
4557               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4558                 {
4559                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4560                   charset = CHARSET_FROM_ID (charset_ascii);
4561                 }
4562               else
4563                 {
4564                   c = coding->default_char;
4565                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4566                                        charset_list, NULL, charset);
4567                 }
4568             }
4569           ENCODE_ISO_CHARACTER (charset, c);
4570         }
4571     }
4572
       /* After the last block, also reset the planes and registers if
          the coding system resets them at end of line.  */
4573   if (coding->mode & CODING_MODE_LAST_BLOCK
4574       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4575     {
4576       ASSURE_DESTINATION (safe_room);
4577       ENCODE_RESET_PLANE_AND_REGISTER ();
4578     }
4579   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4580   CODING_ISO_BOL (coding) = bol_designation;
4581   coding->produced_char += produced_chars;
4582   coding->produced = dst - coding->destination;
4583   return 0;
4584 }
4585
4586 \f
4587 /*** 8. SJIS and BIG5 handlers ***/
4588
4589 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4590    quite widely.  So, for the moment, Emacs supports them directly in
4591    C code.  But, in the future, they may be supported only by CCL.  */
4592
4593 /* SJIS is a coding system encoding three character sets: ASCII, the
4594    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4595    encoded as is.  A character of charset katakana-jisx0201 is encoded
4596    by "position-code + 0x80".  A character of charset japanese-jisx0208
4597    is encoded in two bytes, but the two position-codes are divided and
4598    shifted so that they fit in the ranges below.
4599
4600    --- CODE RANGE of SJIS ---
4601    (character set)      (range)
4602    ASCII                0x00 .. 0x7F
4603    KATAKANA-JISX0201    0xA0 .. 0xDF
4604    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4605             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4606    -------------------------------
4607
4608 */
4609
4610 /* BIG5 is a coding system encoding two character sets: ASCII and
4611    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4612    character set, and each of its characters is encoded in two bytes.
4613
4614    --- CODE RANGE of BIG5 ---
4615    (character set)      (range)
4616    ASCII                0x00 .. 0x7F
4617    Big5 (1st byte)      0xA1 .. 0xFE
4618         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4619    --------------------------
4620
4621   */
4622
4623 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4624    Return true if a text is encoded in SJIS.  */
4625
4626 static bool
4627 detect_coding_sjis (struct coding_system *coding,
4628                     struct coding_detection_info *detect_info)
4629 {
4630   const unsigned char *src = coding->source, *src_base;
4631   const unsigned char *src_end = coding->source + coding->src_bytes;
4632   bool multibytep = coding->src_multibyte;
4633   ptrdiff_t consumed_chars = 0;
4634   int found = 0;
4635   int c;
4636   Lisp_Object attrs, charset_list;
4637   int max_first_byte_of_2_byte_code;
4638
4639   CODING_GET_INFO (coding, attrs, charset_list);
4640   max_first_byte_of_2_byte_code
4641     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4642
4643   detect_info->checked |= CATEGORY_MASK_SJIS;
4644   /* A coding system of this category is always ASCII compatible.  */
4645   src += coding->head_ascii;
4646
4647   while (1)
4648     {
4649       src_base = src;
4650       ONE_MORE_BYTE (c);
4651       if (c < 0x80)
4652         continue;
4653       if ((c >= 0x81 && c <= 0x9F)
4654           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4655         {
4656           ONE_MORE_BYTE (c);
4657           if (c < 0x40 || c == 0x7F || c > 0xFC)
4658             break;
4659           found = CATEGORY_MASK_SJIS;
4660         }
4661       else if (c >= 0xA0 && c < 0xE0)
4662         found = CATEGORY_MASK_SJIS;
4663       else
4664         break;
4665     }
4666   detect_info->rejected |= CATEGORY_MASK_SJIS;
4667   return 0;
4668
4669  no_more_source:
4670   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4671     {
4672       detect_info->rejected |= CATEGORY_MASK_SJIS;
4673       return 0;
4674     }
4675   detect_info->found |= found;
4676   return 1;
4677 }
4678
4679 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4680    Return true if a text is encoded in BIG5.  */
4681
4682 static bool
4683 detect_coding_big5 (struct coding_system *coding,
4684                     struct coding_detection_info *detect_info)
4685 {
4686   const unsigned char *src = coding->source, *src_base;
4687   const unsigned char *src_end = coding->source + coding->src_bytes;
4688   bool multibytep = coding->src_multibyte;
4689   ptrdiff_t consumed_chars = 0;
4690   int found = 0;
4691   int c;
4692
4693   detect_info->checked |= CATEGORY_MASK_BIG5;
4694   /* A coding system of this category is always ASCII compatible.  */
4695   src += coding->head_ascii;
4696
4697   while (1)
4698     {
4699       src_base = src;
4700       ONE_MORE_BYTE (c);
4701       if (c < 0x80)
4702         continue;
4703       if (c >= 0xA1)
4704         {
4705           ONE_MORE_BYTE (c);
4706           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4707             return 0;
4708           found = CATEGORY_MASK_BIG5;
4709         }
4710       else
4711         break;
4712     }
4713   detect_info->rejected |= CATEGORY_MASK_BIG5;
4714   return 0;
4715
4716  no_more_source:
4717   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4718     {
4719       detect_info->rejected |= CATEGORY_MASK_BIG5;
4720       return 0;
4721     }
4722   detect_info->found |= found;
4723   return 1;
4724 }
4725
4726 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4727
4728 static void
4729 decode_coding_sjis (struct coding_system *coding)
4730 {
4731   const unsigned char *src = coding->source + coding->consumed;
4732   const unsigned char *src_end = coding->source + coding->src_bytes;
4733   const unsigned char *src_base;
4734   int *charbuf = coding->charbuf + coding->charbuf_used;
4735   /* We may produce one charset annotation in one loop and one more at
4736      the end.  */
4737   int *charbuf_end
4738     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4739   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4740   bool multibytep = coding->src_multibyte;
4741   struct charset *charset_roman, *charset_kanji, *charset_kana;
4742   struct charset *charset_kanji2;
4743   Lisp_Object attrs, charset_list, val;
4744   ptrdiff_t char_offset = coding->produced_char;
4745   ptrdiff_t last_offset = char_offset;
4746   int last_id = charset_ascii;
4747   bool eol_dos
4748     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4749   int byte_after_cr = -1;
4750
4751   CODING_GET_INFO (coding, attrs, charset_list);
4752
4753   val = charset_list;
4754   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4755   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4756   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4757   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4758
4759   while (1)
4760     {
4761       int c, c1;
4762       struct charset *charset;
4763
4764       src_base = src;
4765       consumed_chars_base = consumed_chars;
4766
4767       if (charbuf >= charbuf_end)
4768         {
4769           if (byte_after_cr >= 0)
4770             src_base--;
4771           break;
4772         }
4773
4774       if (byte_after_cr >= 0)
4775         c = byte_after_cr, byte_after_cr = -1;
4776       else
4777         ONE_MORE_BYTE (c);
4778       if (c < 0)
4779         goto invalid_code;
4780       if (c < 0x80)
4781         {
4782           if (eol_dos && c == '\r')
4783             ONE_MORE_BYTE (byte_after_cr);
4784           charset = charset_roman;
4785         }
4786       else if (c == 0x80 || c == 0xA0)
4787         goto invalid_code;
4788       else if (c >= 0xA1 && c <= 0xDF)
4789         {
4790           /* SJIS -> JISX0201-Kana */
4791           c &= 0x7F;
4792           charset = charset_kana;
4793         }
4794       else if (c <= 0xEF)
4795         {
4796           /* SJIS -> JISX0208 */
4797           ONE_MORE_BYTE (c1);
4798           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4799             goto invalid_code;
4800           c = (c << 8) | c1;
4801           SJIS_TO_JIS (c);
4802           charset = charset_kanji;
4803         }
4804       else if (c <= 0xFC && charset_kanji2)
4805         {
4806           /* SJIS -> JISX0213-2 */
4807           ONE_MORE_BYTE (c1);
4808           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4809             goto invalid_code;
4810           c = (c << 8) | c1;
4811           SJIS_TO_JIS2 (c);
4812           charset = charset_kanji2;
4813         }
4814       else
4815         goto invalid_code;
4816       if (charset->id != charset_ascii
4817           && last_id != charset->id)
4818         {
4819           if (last_id != charset_ascii)
4820             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4821           last_id = charset->id;
4822           last_offset = char_offset;
4823         }
4824       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4825       *charbuf++ = c;
4826       char_offset++;
4827       continue;
4828
4829     invalid_code:
4830       src = src_base;
4831       consumed_chars = consumed_chars_base;
4832       ONE_MORE_BYTE (c);
4833       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4834       char_offset++;
4835     }
4836
4837  no_more_source:
4838   if (last_id != charset_ascii)
4839     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4840   coding->consumed_char += consumed_chars_base;
4841   coding->consumed = src_base - coding->source;
4842   coding->charbuf_used = charbuf - coding->charbuf;
4843 }
4844
4845 static void
4846 decode_coding_big5 (struct coding_system *coding)
4847 {
4848   const unsigned char *src = coding->source + coding->consumed;
4849   const unsigned char *src_end = coding->source + coding->src_bytes;
4850   const unsigned char *src_base;
4851   int *charbuf = coding->charbuf + coding->charbuf_used;
4852   /* We may produce one charset annotation in one loop and one more at
4853      the end.  */
4854   int *charbuf_end
4855     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4856   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4857   bool multibytep = coding->src_multibyte;
4858   struct charset *charset_roman, *charset_big5;
4859   Lisp_Object attrs, charset_list, val;
4860   ptrdiff_t char_offset = coding->produced_char;
4861   ptrdiff_t last_offset = char_offset;
4862   int last_id = charset_ascii;
4863   bool eol_dos
4864     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4865   int byte_after_cr = -1;
4866
4867   CODING_GET_INFO (coding, attrs, charset_list);
4868   val = charset_list;
4869   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4870   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4871
4872   while (1)
4873     {
4874       int c, c1;
4875       struct charset *charset;
4876
4877       src_base = src;
4878       consumed_chars_base = consumed_chars;
4879
4880       if (charbuf >= charbuf_end)
4881         {
4882           if (byte_after_cr >= 0)
4883             src_base--;
4884           break;
4885         }
4886
4887       if (byte_after_cr >= 0)
4888         c = byte_after_cr, byte_after_cr = -1;
4889       else
4890         ONE_MORE_BYTE (c);
4891
4892       if (c < 0)
4893         goto invalid_code;
4894       if (c < 0x80)
4895         {
4896           if (eol_dos && c == '\r')
4897             ONE_MORE_BYTE (byte_after_cr);
4898           charset = charset_roman;
4899         }
4900       else
4901         {
4902           /* BIG5 -> Big5 */
4903           if (c < 0xA1 || c > 0xFE)
4904             goto invalid_code;
4905           ONE_MORE_BYTE (c1);
4906           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4907             goto invalid_code;
4908           c = c << 8 | c1;
4909           charset = charset_big5;
4910         }
4911       if (charset->id != charset_ascii
4912           && last_id != charset->id)
4913         {
4914           if (last_id != charset_ascii)
4915             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4916           last_id = charset->id;
4917           last_offset = char_offset;
4918         }
4919       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4920       *charbuf++ = c;
4921       char_offset++;
4922       continue;
4923
4924     invalid_code:
4925       src = src_base;
4926       consumed_chars = consumed_chars_base;
4927       ONE_MORE_BYTE (c);
4928       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4929       char_offset++;
4930     }
4931
4932  no_more_source:
4933   if (last_id != charset_ascii)
4934     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4935   coding->consumed_char += consumed_chars_base;
4936   coding->consumed = src_base - coding->source;
4937   coding->charbuf_used = charbuf - coding->charbuf;
4938 }
4939
4940 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4941    This function can encode charsets `ascii', `katakana-jisx0201',
4942    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4943    are sure that all these charsets are registered as official charset
4944    (i.e. do not have extended leading-codes).  Characters of other
4945    charsets are produced without any encoding.  */
4946
4947 static bool
4948 encode_coding_sjis (struct coding_system *coding)
4949 {
4950   bool multibytep = coding->dst_multibyte;
4951   int *charbuf = coding->charbuf;
4952   int *charbuf_end = charbuf + coding->charbuf_used;
4953   unsigned char *dst = coding->destination + coding->produced;
4954   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4955   int safe_room = 4;
4956   ptrdiff_t produced_chars = 0;
4957   Lisp_Object attrs, charset_list, val;
4958   bool ascii_compatible;
4959   struct charset *charset_kanji, *charset_kana;
4960   struct charset *charset_kanji2;
4961   int c;
4962
4963   CODING_GET_INFO (coding, attrs, charset_list);
4964   val = XCDR (charset_list);
4965   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4966   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4967   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4968
4969   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4970
4971   while (charbuf < charbuf_end)
4972     {
4973       ASSURE_DESTINATION (safe_room);
4974       c = *charbuf++;
4975       /* Now encode the character C.  */
4976       if (ASCII_CHAR_P (c) && ascii_compatible)
4977         EMIT_ONE_ASCII_BYTE (c);
4978       else if (CHAR_BYTE8_P (c))
4979         {
4980           c = CHAR_TO_BYTE8 (c);
4981           EMIT_ONE_BYTE (c);
4982         }
4983       else
4984         {
4985           unsigned code;
4986           struct charset *charset;
4987           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4988                                &code, charset);
4989
4990           if (!charset)
4991             {
4992               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4993                 {
4994                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4995                   charset = CHARSET_FROM_ID (charset_ascii);
4996                 }
4997               else
4998                 {
4999                   c = coding->default_char;
5000                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5001                                        charset_list, &code, charset);
5002                 }
5003             }
5004           if (code == CHARSET_INVALID_CODE (charset))
5005             emacs_abort ();
5006           if (charset == charset_kanji)
5007             {
5008               int c1, c2;
5009               JIS_TO_SJIS (code);
5010               c1 = code >> 8, c2 = code & 0xFF;
5011               EMIT_TWO_BYTES (c1, c2);
5012             }
5013           else if (charset == charset_kana)
5014             EMIT_ONE_BYTE (code | 0x80);
5015           else if (charset_kanji2 && charset == charset_kanji2)
5016             {
5017               int c1, c2;
5018
5019               c1 = code >> 8;
5020               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5021                   || c1 == 0x28
5022                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5023                 {
5024                   JIS_TO_SJIS2 (code);
5025                   c1 = code >> 8, c2 = code & 0xFF;
5026                   EMIT_TWO_BYTES (c1, c2);
5027                 }
5028               else
5029                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5030             }
5031           else
5032             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5033         }
5034     }
5035   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5036   coding->produced_char += produced_chars;
5037   coding->produced = dst - coding->destination;
5038   return 0;
5039 }
5040
5041 static bool
5042 encode_coding_big5 (struct coding_system *coding)
5043 {
5044   bool multibytep = coding->dst_multibyte;
5045   int *charbuf = coding->charbuf;
5046   int *charbuf_end = charbuf + coding->charbuf_used;
5047   unsigned char *dst = coding->destination + coding->produced;
5048   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5049   int safe_room = 4;
5050   ptrdiff_t produced_chars = 0;
5051   Lisp_Object attrs, charset_list, val;
5052   bool ascii_compatible;
5053   struct charset *charset_big5;
5054   int c;
5055
5056   CODING_GET_INFO (coding, attrs, charset_list);
5057   val = XCDR (charset_list);
5058   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5059   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5060
5061   while (charbuf < charbuf_end)
5062     {
5063       ASSURE_DESTINATION (safe_room);
5064       c = *charbuf++;
5065       /* Now encode the character C.  */
5066       if (ASCII_CHAR_P (c) && ascii_compatible)
5067         EMIT_ONE_ASCII_BYTE (c);
5068       else if (CHAR_BYTE8_P (c))
5069         {
5070           c = CHAR_TO_BYTE8 (c);
5071           EMIT_ONE_BYTE (c);
5072         }
5073       else
5074         {
5075           unsigned code;
5076           struct charset *charset;
5077           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5078                                &code, charset);
5079
5080           if (! charset)
5081             {
5082               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5083                 {
5084                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5085                   charset = CHARSET_FROM_ID (charset_ascii);
5086                 }
5087               else
5088                 {
5089                   c = coding->default_char;
5090                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5091                                        charset_list, &code, charset);
5092                 }
5093             }
5094           if (code == CHARSET_INVALID_CODE (charset))
5095             emacs_abort ();
5096           if (charset == charset_big5)
5097             {
5098               int c1, c2;
5099
5100               c1 = code >> 8, c2 = code & 0xFF;
5101               EMIT_TWO_BYTES (c1, c2);
5102             }
5103           else
5104             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5105         }
5106     }
5107   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5108   coding->produced_char += produced_chars;
5109   coding->produced = dst - coding->destination;
5110   return 0;
5111 }
5112
5113 \f
5114 /*** 9. CCL handlers ***/
5115
5116 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5117    Return true if a text is encoded in a coding system of which
5118    encoder/decoder are written in CCL program.  */
5119
5120 static bool
5121 detect_coding_ccl (struct coding_system *coding,
5122                    struct coding_detection_info *detect_info)
5123 {
5124   const unsigned char *src = coding->source, *src_base;
5125   const unsigned char *src_end = coding->source + coding->src_bytes;
5126   bool multibytep = coding->src_multibyte;
5127   ptrdiff_t consumed_chars = 0;
5128   int found = 0;
5129   unsigned char *valids;
5130   ptrdiff_t head_ascii = coding->head_ascii;
5131   Lisp_Object attrs;
5132
5133   detect_info->checked |= CATEGORY_MASK_CCL;
5134
5135   coding = &coding_categories[coding_category_ccl];
5136   valids = CODING_CCL_VALIDS (coding);
5137   attrs = CODING_ID_ATTRS (coding->id);
5138   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5139     src += head_ascii;
5140
5141   while (1)
5142     {
5143       int c;
5144
5145       src_base = src;
5146       ONE_MORE_BYTE (c);
5147       if (c < 0 || ! valids[c])
5148         break;
5149       if ((valids[c] > 1))
5150         found = CATEGORY_MASK_CCL;
5151     }
5152   detect_info->rejected |= CATEGORY_MASK_CCL;
5153   return 0;
5154
5155  no_more_source:
5156   detect_info->found |= found;
5157   return 1;
5158 }
5159
5160 static void
5161 decode_coding_ccl (struct coding_system *coding)
5162 {
5163   const unsigned char *src = coding->source + coding->consumed;
5164   const unsigned char *src_end = coding->source + coding->src_bytes;
5165   int *charbuf = coding->charbuf + coding->charbuf_used;
5166   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5167   ptrdiff_t consumed_chars = 0;
5168   bool multibytep = coding->src_multibyte;
5169   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5170   int source_charbuf[1024];
5171   int source_byteidx[1025];
5172   Lisp_Object attrs, charset_list;
5173
5174   CODING_GET_INFO (coding, attrs, charset_list);
5175
5176   while (1)
5177     {
5178       const unsigned char *p = src;
5179       ptrdiff_t offset;
5180       int i = 0;
5181
5182       if (multibytep)
5183         {
5184           while (i < 1024 && p < src_end)
5185             {
5186               source_byteidx[i] = p - src;
5187               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5188             }
5189           source_byteidx[i] = p - src;
5190         }
5191       else
5192         while (i < 1024 && p < src_end)
5193           source_charbuf[i++] = *p++;
5194
5195       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5196         ccl->last_block = true;
5197       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5198       charset_map_loaded = 0;
5199       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5200                   charset_list);
5201       if (charset_map_loaded
5202           && (offset = coding_change_source (coding)))
5203         {
5204           p += offset;
5205           src += offset;
5206           src_end += offset;
5207         }
5208       charbuf += ccl->produced;
5209       if (multibytep)
5210         src += source_byteidx[ccl->consumed];
5211       else
5212         src += ccl->consumed;
5213       consumed_chars += ccl->consumed;
5214       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5215         break;
5216     }
5217
5218   switch (ccl->status)
5219     {
5220     case CCL_STAT_SUSPEND_BY_SRC:
5221       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5222       break;
5223     case CCL_STAT_SUSPEND_BY_DST:
5224       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5225       break;
5226     case CCL_STAT_QUIT:
5227     case CCL_STAT_INVALID_CMD:
5228       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5229       break;
5230     default:
5231       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5232       break;
5233     }
5234   coding->consumed_char += consumed_chars;
5235   coding->consumed = src - coding->source;
5236   coding->charbuf_used = charbuf - coding->charbuf;
5237 }
5238
5239 static bool
5240 encode_coding_ccl (struct coding_system *coding)
5241 {
5242   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5243   bool multibytep = coding->dst_multibyte;
5244   int *charbuf = coding->charbuf;
5245   int *charbuf_end = charbuf + coding->charbuf_used;
5246   unsigned char *dst = coding->destination + coding->produced;
5247   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5248   int destination_charbuf[1024];
5249   ptrdiff_t produced_chars = 0;
5250   int i;
5251   Lisp_Object attrs, charset_list;
5252
5253   CODING_GET_INFO (coding, attrs, charset_list);
5254   if (coding->consumed_char == coding->src_chars
5255       && coding->mode & CODING_MODE_LAST_BLOCK)
5256     ccl->last_block = true;
5257
5258   do
5259     {
5260       ptrdiff_t offset;
5261
5262       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5263       charset_map_loaded = 0;
5264       ccl_driver (ccl, charbuf, destination_charbuf,
5265                   charbuf_end - charbuf, 1024, charset_list);
5266       if (charset_map_loaded
5267           && (offset = coding_change_destination (coding)))
5268         dst += offset;
5269       if (multibytep)
5270         {
5271           ASSURE_DESTINATION (ccl->produced * 2);
5272           for (i = 0; i < ccl->produced; i++)
5273             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5274         }
5275       else
5276         {
5277           ASSURE_DESTINATION (ccl->produced);
5278           for (i = 0; i < ccl->produced; i++)
5279             *dst++ = destination_charbuf[i] & 0xFF;
5280           produced_chars += ccl->produced;
5281         }
5282       charbuf += ccl->consumed;
5283       if (ccl->status == CCL_STAT_QUIT
5284           || ccl->status == CCL_STAT_INVALID_CMD)
5285         break;
5286     }
5287   while (charbuf < charbuf_end);
5288
5289   switch (ccl->status)
5290     {
5291     case CCL_STAT_SUSPEND_BY_SRC:
5292       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5293       break;
5294     case CCL_STAT_SUSPEND_BY_DST:
5295       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5296       break;
5297     case CCL_STAT_QUIT:
5298     case CCL_STAT_INVALID_CMD:
5299       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5300       break;
5301     default:
5302       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5303       break;
5304     }
5305
5306   coding->produced_char += produced_chars;
5307   coding->produced = dst - coding->destination;
5308   return 0;
5309 }
5310
5311 \f
5312 /*** 10, 11. no-conversion handlers ***/
5313
5314 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5315
5316 static void
5317 decode_coding_raw_text (struct coding_system *coding)
5318 {
5319   bool eol_dos
5320     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5321
5322   coding->chars_at_source = 1;
5323   coding->consumed_char = coding->src_chars;
5324   coding->consumed = coding->src_bytes;
5325   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5326     {
5327       coding->consumed_char--;
5328       coding->consumed--;
5329       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5330     }
5331   else
5332     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5333 }
5334
5335 static bool
5336 encode_coding_raw_text (struct coding_system *coding)
5337 {
5338   bool multibytep = coding->dst_multibyte;
5339   int *charbuf = coding->charbuf;
5340   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5341   unsigned char *dst = coding->destination + coding->produced;
5342   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5343   ptrdiff_t produced_chars = 0;
5344   int c;
5345
5346   if (multibytep)
5347     {
5348       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5349
5350       if (coding->src_multibyte)
5351         while (charbuf < charbuf_end)
5352           {
5353             ASSURE_DESTINATION (safe_room);
5354             c = *charbuf++;
5355             if (ASCII_CHAR_P (c))
5356               EMIT_ONE_ASCII_BYTE (c);
5357             else if (CHAR_BYTE8_P (c))
5358               {
5359                 c = CHAR_TO_BYTE8 (c);
5360                 EMIT_ONE_BYTE (c);
5361               }
5362             else
5363               {
5364                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5365
5366                 CHAR_STRING_ADVANCE (c, p1);
5367                 do
5368                   {
5369                     EMIT_ONE_BYTE (*p0);
5370                     p0++;
5371                   }
5372                 while (p0 < p1);
5373               }
5374           }
5375       else
5376         while (charbuf < charbuf_end)
5377           {
5378             ASSURE_DESTINATION (safe_room);
5379             c = *charbuf++;
5380             EMIT_ONE_BYTE (c);
5381           }
5382     }
5383   else
5384     {
5385       if (coding->src_multibyte)
5386         {
5387           int safe_room = MAX_MULTIBYTE_LENGTH;
5388
5389           while (charbuf < charbuf_end)
5390             {
5391               ASSURE_DESTINATION (safe_room);
5392               c = *charbuf++;
5393               if (ASCII_CHAR_P (c))
5394                 *dst++ = c;
5395               else if (CHAR_BYTE8_P (c))
5396                 *dst++ = CHAR_TO_BYTE8 (c);
5397               else
5398                 CHAR_STRING_ADVANCE (c, dst);
5399             }
5400         }
5401       else
5402         {
5403           ASSURE_DESTINATION (charbuf_end - charbuf);
5404           while (charbuf < charbuf_end && dst < dst_end)
5405             *dst++ = *charbuf++;
5406         }
5407       produced_chars = dst - (coding->destination + coding->produced);
5408     }
5409   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5410   coding->produced_char += produced_chars;
5411   coding->produced = dst - coding->destination;
5412   return 0;
5413 }
5414
5415 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5416    Return true if a text is encoded in a charset-based coding system.

        The bytes examined are those of the CODING passed by the caller
        (source, src_bytes, src_multibyte and head_ascii are read before
        CODING is redirected), but the table of valid bytes comes from
        the coding system registered for coding_category_charset.

        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.
        When the text is rejected, the same mask is added to
        DETECT_INFO->rejected and 0 is returned.  Otherwise 1 is
        returned, and the mask is added to DETECT_INFO->found only if a
        leading byte >= 0x80 was accepted.

        No statement written in this function jumps to the
        no_more_source label; presumably ONE_MORE_BYTE does so when the
        source is exhausted (confirm against the macro definition).  */
5417
5418 static bool
5419 detect_coding_charset (struct coding_system *coding,
5420                        struct coding_detection_info *detect_info)
5421 {
5422   const unsigned char *src = coding->source, *src_base;
5423   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* MULTIBYTEP and CONSUMED_CHARS are not referenced by name below;
          presumably ONE_MORE_BYTE uses them -- confirm.  */
5424   bool multibytep = coding->src_multibyte;
5425   ptrdiff_t consumed_chars = 0;
5426   Lisp_Object attrs, valids, name;
       /* Becomes CATEGORY_MASK_CHARSET once a leading byte >= 0x80 has
          been accepted.  */
5427   int found = 0;
5428   ptrdiff_t head_ascii = coding->head_ascii;
5429   bool check_latin_extra = 0;
5430
5431   detect_info->checked |= CATEGORY_MASK_CHARSET;
5432
       /* From here on CODING is the coding system of the charset
          category, not the caller's object.  */
5433   coding = &coding_categories[coding_category_charset];
5434   attrs = CODING_ID_ATTRS (coding->id);
5435   valids = AREF (attrs, coding_attr_charset_valids);
5436   name = CODING_ID_NAME (coding->id);
       /* For coding systems whose name starts with "iso-8859-" or
          "iso-latin-", bytes in the range 0x80..0x9F are accepted only
          when Vlatin_extra_code_table lists them (see the loop).  */
5437   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5438                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5439       || strncmp (SSDATA (SYMBOL_NAME (name)),
5440                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5441     check_latin_extra = 1;
5442
       /* For an ASCII compatible coding system, skip the first
          HEAD_ASCII bytes of the source.  */
5443   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5444     src += head_ascii;
5445
       /* Each iteration checks one character: a leading byte followed by
          the trailing bytes its charset requires.  Every `break' out of
          this loop falls through to the too_short label, i.e. rejects
          the text.  */
5446   while (1)
5447     {
5448       int c;
5449       Lisp_Object val;
5450       struct charset *charset;
5451       int dim, idx;
5452
5453       src_base = src;
5454       ONE_MORE_BYTE (c);
           /* A negative C is skipped without any check.  NOTE(review):
              presumably ONE_MORE_BYTE yields a negative value for an
              eight-bit character of a multibyte source -- confirm.  */
5455       if (c < 0)
5456         continue;
           /* VALIDS is indexed by the leading byte; a nil entry means
              the byte is not accepted as a leading byte.  */
5457       val = AREF (valids, c);
5458       if (NILP (val))
5459         break;
5460       if (c >= 0x80)
5461         {
5462           if (c < 0xA0
5463               && check_latin_extra
5464               && (!VECTORP (Vlatin_extra_code_table)
5465                   || NILP (AREF (Vlatin_extra_code_table, c))))
5466             break;
5467           found = CATEGORY_MASK_CHARSET;
5468         }
5469       if (INTEGERP (val))
5470         {
               /* VAL is a single charset ID.  Check each of the
                  remaining DIM - 1 bytes against the bounds stored in
                  the charset's code_space.  */
5471           charset = CHARSET_FROM_ID (XFASTINT (val));
5472           dim = CHARSET_DIMENSION (charset);
5473           for (idx = 1; idx < dim; idx++)
5474             {
5475               if (src == src_end)
5476                 goto too_short;
5477               ONE_MORE_BYTE (c);
5478               if (c < charset->code_space[(dim - 1 - idx) * 4]
5479                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5480                 break;
5481             }
               /* A trailing byte was out of range: reject.  */
5482           if (idx < dim)
5483             break;
5484         }
5485       else
5486         {
               /* VAL is treated as a list of charset IDs, which are
                  tried in turn.  IDX is not reset between candidates,
                  and a byte that failed the range check for one
                  candidate has already been consumed when the next
                  candidate is tried.  */
5487           idx = 1;
5488           for (; CONSP (val); val = XCDR (val))
5489             {
5490               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5491               dim = CHARSET_DIMENSION (charset);
5492               while (idx < dim)
5493                 {
5494                   if (src == src_end)
5495                     goto too_short;
5496                   ONE_MORE_BYTE (c);
5497                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5498                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5499                     break;
5500                   idx++;
5501                 }
5502               if (idx == dim)
5503                 {
5504                   val = Qnil;
5505                   break;
5506                 }
5507             }
               /* NOTE(review): the loop above ends with VAL being either
                  Qnil (a candidate matched) or the non-cons tail of the
                  list (no candidate matched), so this test can never be
                  true and a leading byte without a matching candidate
                  is not rejected here.  Confirm whether that is intended
                  before changing it; rejecting here without also fixing
                  the byte carry-over noted above would change which
                  texts are accepted.  */
5508           if (CONSP (val))
5509             break;
5510         }
5511     }
       /* Reached by `goto too_short' and by every `break' out of the
          loop above: the text is rejected.  */
5512  too_short:
5513   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5514   return 0;
5515
       /* The text is accepted; report the category as found only if a
          byte >= 0x80 was seen.  */
5516  no_more_source:
5517   detect_info->found |= found;
5518   return 1;
5519 }
5520
/* Decode the source of CODING with a charset-based coding system and
   append the resulting characters (and charset annotations) to
   CODING->charbuf.

   Reading starts at CODING->source + CODING->consumed and writing at
   CODING->charbuf + CODING->charbuf_used.  The loop ends either when
   the charbuf is nearly full or when control arrives at the
   no_more_source label (no statement written here jumps to it;
   presumably ONE_MORE_BYTE does so when the source is exhausted --
   confirm against the macro definition).

   On exit, CODING->consumed and CODING->consumed_char are updated from
   SRC_BASE and CONSUMED_CHARS_BASE, i.e. from the position recorded at
   the top of the last loop iteration, and CODING->charbuf_used is
   updated.  */
5521 static void
5522 decode_coding_charset (struct coding_system *coding)
5523 {
5524   const unsigned char *src = coding->source + coding->consumed;
5525   const unsigned char *src_end = coding->source + coding->src_bytes;
5526   const unsigned char *src_base;
5527   int *charbuf = coding->charbuf + coding->charbuf_used;
5528   /* We may produce one charset annotation in one loop and one more at
5529      the end.  */
5530   int *charbuf_end
5531     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5532   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5533   bool multibytep = coding->src_multibyte;
5534   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5535   Lisp_Object valids;
5536   ptrdiff_t char_offset = coding->produced_char;
       /* LAST_ID is the charset of the current run of characters and
          LAST_OFFSET the value of CHAR_OFFSET where that run began.  */
5537   ptrdiff_t last_offset = char_offset;
5538   int last_id = charset_ascii;
5539   bool eol_dos
5540     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* A byte read ahead after a CR, or -1 when there is none.  */
5541   int byte_after_cr = -1;
5542
5543   valids = AREF (attrs, coding_attr_charset_valids);
5544
5545   while (1)
5546     {
5547       int c;
5548       Lisp_Object val;
5549       struct charset *charset;
5550       int dim;
           /* Number of bytes gathered into CODE so far.  */
5551       int len = 1;
5552       unsigned code;
5553
           /* Remember where this character starts so that the
              invalid_code path and the exit code can refer to it.  */
5554       src_base = src;
5555       consumed_chars_base = consumed_chars;
5556
5557       if (charbuf >= charbuf_end)
5558         {
               /* No room left.  If a byte read ahead after a CR is still
                  pending, step SRC_BASE back by one so that this byte is
                  reported as not consumed.  */
5559           if (byte_after_cr >= 0)
5560             src_base--;
5561           break;
5562         }
5563
5564       if (byte_after_cr >= 0)
5565         {
               /* Process the byte that was read ahead in the previous
                  iteration.  */
5566           c = byte_after_cr;
5567           byte_after_cr = -1;
5568         }
5569       else
5570         {
5571           ONE_MORE_BYTE (c);
               /* With DOS end-of-line decoding, read one byte ahead
                  after a CR and keep it for the next iteration.  */
5572           if (eol_dos && c == '\r')
5573             ONE_MORE_BYTE (byte_after_cr);
5574         }
5575       if (c < 0)
5576         goto invalid_code;
5577       code = c;
5578
           /* VALIDS is indexed by the leading byte.  Only an integer (one
              charset ID) or a cons (a list of charset IDs) is accepted.  */
5579       val = AREF (valids, c);
5580       if (! INTEGERP (val) && ! CONSP (val))
5581         goto invalid_code;
5582       if (INTEGERP (val))
5583         {
5584           charset = CHARSET_FROM_ID (XFASTINT (val));
5585           dim = CHARSET_DIMENSION (charset);
               /* Gather the remaining DIM - 1 bytes into CODE, most
                  significant byte first.  */
5586           while (len < dim)
5587             {
5588               ONE_MORE_BYTE (c);
5589               code = (code << 8) | c;
5590               len++;
5591             }
               /* Presumably stores the decoded character in C, negative
                  when CODE cannot be decoded with CHARSET (see the
                  `c < 0' test below) -- confirm against the macro.  */
5592           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5593                               charset, code, c);
5594         }
5595       else
5596         {
5597           /* VAL is a list of charset IDs.  It is assured that the
5598              list is sorted by charset dimensions (smaller one
5599              comes first).  */
5600           while (CONSP (val))
5601             {
5602               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5603               dim = CHARSET_DIMENSION (charset);
                   /* LEN and CODE carry over between candidates, so only
                      the bytes a larger charset still lacks are read.  */
5604               while (len < dim)
5605                 {
5606                   ONE_MORE_BYTE (c);
5607                   code = (code << 8) | c;
5608                   len++;
5609                 }
5610               CODING_DECODE_CHAR (coding, src, src_base,
5611                                   src_end, charset, code, c);
                   /* Stop at the first charset that yields a
                      non-negative C.  */
5612               if (c >= 0)
5613                 break;
5614               val = XCDR (val);
5615             }
5616         }
5617       if (c < 0)
5618         goto invalid_code;
           /* When the charset changes to a non-ASCII one, first emit the
              pending data for the previous non-ASCII run, then start a
              new run at the current offset.  */
5619       if (charset->id != charset_ascii
5620           && last_id != charset->id)
5621         {
5622           if (last_id != charset_ascii)
5623             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5624           last_id = charset->id;
5625           last_offset = char_offset;
5626         }
5627
5628       *charbuf++ = c;
5629       char_offset++;
5630       continue;
5631
           /* Rewind to the start of this character and emit only its
              first byte: a negative C is stored negated, an ASCII value
              is stored as is, and any other value goes through
              BYTE8_TO_CHAR.

              NOTE(review): BYTE_AFTER_CR is not reset on this path.  If
              a CR with a pending read-ahead byte ever gets here, that
              byte looks like it would be processed once from
              BYTE_AFTER_CR and again from SRC -- confirm against
              ONE_MORE_BYTE.  */
5632     invalid_code:
5633       src = src_base;
5634       consumed_chars = consumed_chars_base;
5635       ONE_MORE_BYTE (c);
5636       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5637       char_offset++;
5638     }
5639
       /* Also reached by the `break' above.  Emit the data for the last
          non-ASCII run, then publish the progress made.  */
5640  no_more_source:
5641   if (last_id != charset_ascii)
5642     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5643   coding->consumed_char += consumed_chars_base;
5644   coding->consumed = src_base - coding->source;
5645   coding->charbuf_used = charbuf - coding->charbuf;
5646 }
5647
/* Encode the characters stored in CODING->charbuf (the first
   CODING->charbuf_used entries) with a charset-based coding system,
   writing the bytes at CODING->destination + CODING->produced.

   Each character is emitted as follows:
     - an ASCII character, when the coding system is ASCII compatible,
       as one ASCII byte;
     - a character satisfying CHAR_BYTE8_P as the byte given by
       CHAR_TO_BYTE8;
     - any other character as the 1 to 4 bytes of the code found by
       CODING_CHAR_CHARSET, most significant byte first;
     - when no charset was found, as one substitute byte.

   On exit CODING_RESULT_SUCCESS is recorded, CODING->produced_char and
   CODING->produced are updated, and 0 is returned.  */
5648 static bool
5649 encode_coding_charset (struct coding_system *coding)
5650 {
       /* MULTIBYTEP is not referenced by name below; presumably the
          EMIT_* macros use it -- confirm.  */
5651   bool multibytep = coding->dst_multibyte;
5652   int *charbuf = coding->charbuf;
5653   int *charbuf_end = charbuf + coding->charbuf_used;
5654   unsigned char *dst = coding->destination + coding->produced;
5655   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5656   int safe_room = MAX_MULTIBYTE_LENGTH;
5657   ptrdiff_t produced_chars = 0;
5658   Lisp_Object attrs, charset_list;
5659   bool ascii_compatible;
5660   int c;
5661
5662   CODING_GET_INFO (coding, attrs, charset_list);
5663   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5664
5665   while (charbuf < charbuf_end)
5666     {
5667       struct charset *charset;
5668       unsigned code;
5669
           /* Presumably makes sure SAFE_ROOM more bytes fit at DST --
              confirm against the macro definition.  */
5670       ASSURE_DESTINATION (safe_room);
5671       c = *charbuf++;
5672       if (ascii_compatible && ASCII_CHAR_P (c))
5673         EMIT_ONE_ASCII_BYTE (c);
5674       else if (CHAR_BYTE8_P (c))
5675         {
5676           c = CHAR_TO_BYTE8 (c);
5677           EMIT_ONE_BYTE (c);
5678         }
5679       else
5680         {
               /* Presumably looks for a charset in CHARSET_LIST that can
                  encode C, storing it in CHARSET (null when there is
                  none) and the code point in CODE -- confirm.  */
5681           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5682                                &code, charset);
5683
5684           if (charset)
5685             {
                   /* Emit CODE in as many bytes as the charset has
                      dimensions, most significant byte first.  */
5686               if (CHARSET_DIMENSION (charset) == 1)
5687                 EMIT_ONE_BYTE (code);
5688               else if (CHARSET_DIMENSION (charset) == 2)
5689                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5690               else if (CHARSET_DIMENSION (charset) == 3)
5691                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5692               else
5693                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5694                                  (code >> 8) & 0xFF, code & 0xFF);
5695             }
5696           else
5697             {
                   /* No charset: emit one substitute byte, chosen by
                      whether safe encoding mode is set.  */
5698               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5699                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5700               else
5701                 c = coding->default_char;
5702               EMIT_ONE_BYTE (c);
5703             }
5704         }
5705     }
5706
5707   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5708   coding->produced_char += produced_chars;
5709   coding->produced = dst - coding->destination;
5710   return 0;
5711 }
5712
5713 \f
5714 /*** 7. C library functions ***/
5715
5716 /* Setup coding context CODING from information about CODING_SYSTEM.
5717    If CODING_SYSTEM is nil, Qundecided is used in its place.  If
5718    CODING_SYSTEM is invalid, signal an error (presumably done by
        CHECK_CODING_SYSTEM_GET_ID -- confirm against the macro).

        The function fills in CODING->id, mode, common_flags,
        max_charset_id, safe_charsets, default_char, carryover_bytes and
        raw_destination, then selects the detector, decoder and encoder
        functions (plus any type specific state) according to the type
        attribute of the coding system.  */
5719
5720 void
5721 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5722 {
5723   Lisp_Object attrs;
5724   Lisp_Object eol_type;
5725   Lisp_Object coding_type;
5726   Lisp_Object val;
5727
5728   if (NILP (coding_system))
5729     coding_system = Qundecided;
5730
5731   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5732
5733   attrs = CODING_ID_ATTRS (coding->id);
       /* When EOL conversion is inhibited, behave as if the EOL type
          were Qunix.  */
5734   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5735
5736   coding->mode = 0;
       /* Initial flags derived from the EOL type: a vector requires
          decoding and detection, any other value except Qunix requires
          decoding and encoding, and Qunix requires nothing by itself.  */
5737   if (VECTORP (eol_type))
5738     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5739                             | CODING_REQUIRE_DETECTION_MASK);
5740   else if (! EQ (eol_type, Qunix))
5741     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5742                             | CODING_REQUIRE_ENCODING_MASK);
5743   else
5744     coding->common_flags = 0;
       /* A post-read attribute forces decoding, a pre-write attribute
          forces encoding.  */
5745   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5746     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5747   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5748     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5749   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5750     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5751
       /* Default safe-charsets data; the iso-2022 and emacs-mule
          branches below may replace it.  */
5752   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5753   coding->max_charset_id = SCHARS (val) - 1;
5754   coding->safe_charsets = SDATA (val);
5755   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5756   coding->carryover_bytes = 0;
5757   coding->raw_destination = 0;
5758
       /* Type specific setup.  */
5759   coding_type = CODING_ATTR_TYPE (attrs);
5760   if (EQ (coding_type, Qundecided))
5761     {
           /* No detector; the raw-text decoder and encoder are used and
              detection is required.  */
5762       coding->detector = NULL;
5763       coding->decoder = decode_coding_raw_text;
5764       coding->encoder = encode_coding_raw_text;
5765       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5766       coding->spec.undecided.inhibit_nbd
5767         = (encode_inhibit_flag
5768            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5769       coding->spec.undecided.inhibit_ied
5770         = (encode_inhibit_flag
5771            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5772       coding->spec.undecided.prefer_utf_8
5773         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5774     }
5775   else if (EQ (coding_type, Qiso_2022))
5776     {
5777       int i;
5778       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5779
5780       /* Invoke graphic register 0 to plane 0.  */
5781       CODING_ISO_INVOCATION (coding, 0) = 0;
5782       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5783       CODING_ISO_INVOCATION (coding, 1)
5784         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5785       /* Setup the initial status of designation.  */
5786       for (i = 0; i < 4; i++)
5787         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5788       /* Not single shifting initially.  */
5789       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5790       /* Beginning of buffer should also be regarded as bol. */
5791       CODING_ISO_BOL (coding) = 1;
5792       coding->detector = detect_coding_iso_2022;
5793       coding->decoder = decode_coding_iso_2022;
5794       coding->encoder = encode_coding_iso_2022;
5795       if (flags & CODING_ISO_FLAG_SAFE)
5796         coding->mode |= CODING_MODE_SAFE_ENCODING;
5797       coding->common_flags
5798         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5799             | CODING_REQUIRE_FLUSHING_MASK);
5800       if (flags & CODING_ISO_FLAG_COMPOSITION)
5801         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5802       if (flags & CODING_ISO_FLAG_DESIGNATION)
5803         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5804       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5805         {
               /* Re-read the safe-charsets attribute after
                  setup_iso_safe_charsets; presumably that call updates
                  the attribute -- confirm.  */
5806           setup_iso_safe_charsets (attrs);
5807           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5808           coding->max_charset_id = SCHARS (val) - 1;
5809           coding->safe_charsets = SDATA (val);
5810         }
5811       CODING_ISO_FLAGS (coding) = flags;
           /* No composition in progress and no pending extended-segment
              or embedded UTF-8 state.  */
5812       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5813       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5814       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5815       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5816     }
5817   else if (EQ (coding_type, Qcharset))
5818     {
5819       coding->detector = detect_coding_charset;
5820       coding->decoder = decode_coding_charset;
5821       coding->encoder = encode_coding_charset;
5822       coding->common_flags
5823         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5824     }
5825   else if (EQ (coding_type, Qutf_8))
5826     {
           /* BOM attribute: a cons selects BOM detection, t selects
              "with BOM", anything else "without BOM".  */
5827       val = AREF (attrs, coding_attr_utf_bom);
5828       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5829                                    : EQ (val, Qt) ? utf_with_bom
5830                                    : utf_without_bom);
5831       coding->detector = detect_coding_utf_8;
5832       coding->decoder = decode_coding_utf_8;
5833       coding->encoder = encode_coding_utf_8;
5834       coding->common_flags
5835         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5836       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5837         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5838     }
5839   else if (EQ (coding_type, Qutf_16))
5840     {
           /* Same BOM handling as for UTF-8, plus the byte order: Qbig
              selects big endian, anything else little endian.  */
5841       val = AREF (attrs, coding_attr_utf_bom);
5842       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5843                                     : EQ (val, Qt) ? utf_with_bom
5844                                     : utf_without_bom);
5845       val = AREF (attrs, coding_attr_utf_16_endian);
5846       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5847                                        : utf_16_little_endian);
5848       CODING_UTF_16_SURROGATE (coding) = 0;
5849       coding->detector = detect_coding_utf_16;
5850       coding->decoder = decode_coding_utf_16;
5851       coding->encoder = encode_coding_utf_16;
5852       coding->common_flags
5853         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5854       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5855         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5856     }
5857   else if (EQ (coding_type, Qccl))
5858     {
5859       coding->detector = detect_coding_ccl;
5860       coding->decoder = decode_coding_ccl;
5861       coding->encoder = encode_coding_ccl;
5862       coding->common_flags
5863         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5864             | CODING_REQUIRE_FLUSHING_MASK);
5865     }
5866   else if (EQ (coding_type, Qemacs_mule))
5867     {
5868       coding->detector = detect_coding_emacs_mule;
5869       coding->decoder = decode_coding_emacs_mule;
5870       coding->encoder = encode_coding_emacs_mule;
5871       coding->common_flags
5872         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
           /* With the "full" attribute set and a charset list other
              than Vemacs_mule_charset_list, build a fresh safe-charsets
              string: every byte is 255 except those indexed by an ID in
              Vemacs_mule_charset_list, which are 0.  */
5873       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5874           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5875         {
5876           Lisp_Object tail, safe_charsets;
5877           int max_charset_id = 0;
5878
5879           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5880                tail = XCDR (tail))
5881             if (max_charset_id < XFASTINT (XCAR (tail)))
5882               max_charset_id = XFASTINT (XCAR (tail));
5883           safe_charsets = make_uninit_string (max_charset_id + 1);
5884           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5885           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5886                tail = XCDR (tail))
5887             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5888           coding->max_charset_id = max_charset_id;
               /* NOTE(review): only the data pointer of the new string
                  is kept; the string object itself is not stored
                  anywhere visible here.  Confirm that the data stays
                  valid for as long as CODING uses it.  */
5889           coding->safe_charsets = SDATA (safe_charsets);
5890         }
5891       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5892       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5893     }
5894   else if (EQ (coding_type, Qshift_jis))
5895     {
5896       coding->detector = detect_coding_sjis;
5897       coding->decoder = decode_coding_sjis;
5898       coding->encoder = encode_coding_sjis;
5899       coding->common_flags
5900         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5901     }
5902   else if (EQ (coding_type, Qbig5))
5903     {
5904       coding->detector = detect_coding_big5;
5905       coding->decoder = decode_coding_big5;
5906       coding->encoder = encode_coding_big5;
5907       coding->common_flags
5908         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5909     }
5910   else                          /* EQ (coding_type, Qraw_text) */
5911     {
5912       coding->detector = NULL;
5913       coding->decoder = decode_coding_raw_text;
5914       coding->encoder = encode_coding_raw_text;
           /* Conversion is needed only for the sake of end-of-line
              handling: decoding unless the EOL type is Qunix, and
              encoding as well unless the EOL type is a vector.  */
5915       if (! EQ (eol_type, Qunix))
5916         {
5917           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5918           if (! VECTORP (eol_type))
5919             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5920         }
5921
5922     }
5923
5924   return;
5925 }
5926
5927 /* Return a list of charsets supported by CODING.  */
5928
5929 Lisp_Object
5930 coding_charset_list (struct coding_system *coding)
5931 {
5932   Lisp_Object attrs, charset_list;
5933
5934   CODING_GET_INFO (coding, attrs, charset_list);
5935   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5936     {
5937       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5938
5939       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5940         charset_list = Viso_2022_charset_list;
5941     }
5942   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5943     {
5944       charset_list = Vemacs_mule_charset_list;
5945     }
5946   return charset_list;
5947 }
5948
5949
5950 /* Return a list of charsets supported by CODING-SYSTEM.  */
5951
5952 Lisp_Object
5953 coding_system_charset_list (Lisp_Object coding_system)
5954 {
5955   ptrdiff_t id;
5956   Lisp_Object attrs, charset_list;
5957
5958   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5959   attrs = CODING_ID_ATTRS (id);
5960
5961   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5962     {
5963       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5964
5965       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5966         charset_list = Viso_2022_charset_list;
5967       else
5968         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5969     }
5970   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5971     {
5972       charset_list = Vemacs_mule_charset_list;
5973     }
5974   else
5975     {
5976       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5977     }
5978   return charset_list;
5979 }
5980
5981
5982 /* Return raw-text or one of its subsidiaries that has the same
5983    eol_type as CODING-SYSTEM.  */
5984
5985 Lisp_Object
5986 raw_text_coding_system (Lisp_Object coding_system)
5987 {
5988   Lisp_Object spec, attrs;
5989   Lisp_Object eol_type, raw_text_eol_type;
5990
5991   if (NILP (coding_system))
5992     return Qraw_text;
5993   spec = CODING_SYSTEM_SPEC (coding_system);
5994   attrs = AREF (spec, 0);
5995
5996   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5997     return coding_system;
5998
5999   eol_type = AREF (spec, 2);
6000   if (VECTORP (eol_type))
6001     return Qraw_text;
6002   spec = CODING_SYSTEM_SPEC (Qraw_text);
6003   raw_text_eol_type = AREF (spec, 2);
6004   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6005           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6006           : AREF (raw_text_eol_type, 2));
6007 }
6008
6009
6010 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6011    the subsidiary that has the same eol-spec as PARENT (if it is not
6012    nil and specifies end-of-line format) or the system's setting
6013    (system_eol_type).  */
6014
6015 Lisp_Object
6016 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6017 {
6018   Lisp_Object spec, eol_type;
6019
6020   if (NILP (coding_system))
6021     coding_system = Qraw_text;
6022   spec = CODING_SYSTEM_SPEC (coding_system);
6023   eol_type = AREF (spec, 2);
6024   if (VECTORP (eol_type))
6025     {
6026       Lisp_Object parent_eol_type;
6027
6028       if (! NILP (parent))
6029         {
6030           Lisp_Object parent_spec;
6031
6032           parent_spec = CODING_SYSTEM_SPEC (parent);
6033           parent_eol_type = AREF (parent_spec, 2);
6034           if (VECTORP (parent_eol_type))
6035             parent_eol_type = system_eol_type;
6036         }
6037       else
6038         parent_eol_type = system_eol_type;
6039       if (EQ (parent_eol_type, Qunix))
6040         coding_system = AREF (eol_type, 0);
6041       else if (EQ (parent_eol_type, Qdos))
6042         coding_system = AREF (eol_type, 1);
6043       else if (EQ (parent_eol_type, Qmac))
6044         coding_system = AREF (eol_type, 2);
6045     }
6046   return coding_system;
6047 }
6048
6049
6050 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6051    decided for writing to a process.  If not, complement them, and
6052    return a new coding system.  */
6053
6054 Lisp_Object
6055 complement_process_encoding_system (Lisp_Object coding_system)
6056 {
6057   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6058   Lisp_Object spec, attrs;
6059   int i;
6060
6061   for (i = 0; i < 3; i++)
6062     {
6063       if (i == 1)
6064         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6065       else if (i == 2)
6066         coding_system = preferred_coding_system ();
6067       spec = CODING_SYSTEM_SPEC (coding_system);
6068       if (NILP (spec))
6069         continue;
6070       attrs = AREF (spec, 0);
6071       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6072         coding_base = CODING_ATTR_BASE_NAME (attrs);
6073       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6074         eol_base = coding_system;
6075       if (! NILP (coding_base) && ! NILP (eol_base))
6076         break;
6077     }
6078
6079   if (i > 0)
6080     /* The original CODING_SYSTEM didn't specify text-conversion or
6081        eol-conversion.  Be sure that we return a fully complemented
6082        coding system.  */
6083     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6084   return coding_system;
6085 }
6086
6087
6088 /* Emacs has a mechanism to automatically detect a coding system if it
6089    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6090    it's impossible to distinguish some coding systems accurately
6091    because they use the same range of codes.  So, at first, coding
6092    systems are categorized as listed below:
6093
6094    o coding-category-emacs-mule
6095
6096         The category for a coding system which has the same code range
6097         as Emacs' internal format.  Assigned the coding-system (Lisp
6098         symbol) `emacs-mule' by default.
6099
6100    o coding-category-sjis
6101
6102         The category for a coding system which has the same code range
6103         as SJIS.  Assigned the coding-system (Lisp
6104         symbol) `japanese-shift-jis' by default.
6105
6106    o coding-category-iso-7
6107
6108         The category for a coding system which has the same code range
6109         as ISO2022 of 7-bit environment.  This doesn't use any locking
6110         shift and single shift functions.  This can encode/decode all
6111         charsets.  Assigned the coding-system (Lisp symbol)
6112         `iso-2022-7bit' by default.
6113
6114    o coding-category-iso-7-tight
6115
6116         Same as coding-category-iso-7 except that this can
6117         encode/decode only the specified charsets.
6118
6119    o coding-category-iso-8-1
6120
6121         The category for a coding system which has the same code range
6122         as ISO2022 of 8-bit environment and graphic plane 1 used only
6123         for DIMENSION1 charset.  This doesn't use any locking shift
6124         and single shift functions.  Assigned the coding-system (Lisp
6125         symbol) `iso-latin-1' by default.
6126
6127    o coding-category-iso-8-2
6128
6129         The category for a coding system which has the same code range
6130         as ISO2022 of 8-bit environment and graphic plane 1 used only
6131         for DIMENSION2 charset.  This doesn't use any locking shift
6132         and single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `japanese-iso-8bit' by default.
6134
6135    o coding-category-iso-7-else
6136
6137         The category for a coding system which has the same code range
6138         as ISO2022 of 7-bit environment but uses locking shift or
6139         single shift functions.  Assigned the coding-system (Lisp
6140         symbol) `iso-2022-7bit-lock' by default.
6141
6142    o coding-category-iso-8-else
6143
6144         The category for a coding system which has the same code range
6145         as ISO2022 of 8-bit environment but uses locking shift or
6146         single shift functions.  Assigned the coding-system (Lisp
6147         symbol) `iso-2022-8bit-ss2' by default.
6148
6149    o coding-category-big5
6150
6151         The category for a coding system which has the same code range
6152         as BIG5.  Assigned the coding-system (Lisp symbol)
6153         `cn-big5' by default.
6154
6155    o coding-category-utf-8
6156
6157         The category for a coding system which has the same code range
6158         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6159         symbol) `utf-8' by default.
6160
6161    o coding-category-utf-16-be
6162
6163         The category for a coding system in which a text has a
6164         Unicode signature (cf. Unicode Standard) in the order of BIG
6165         endian at the head.  Assigned the coding-system (Lisp symbol)
6166         `utf-16-be' by default.
6167
6168    o coding-category-utf-16-le
6169
6170         The category for a coding system in which a text has a
6171         Unicode signature (cf. Unicode Standard) in the order of
6172         LITTLE endian at the head.  Assigned the coding-system (Lisp
6173         symbol) `utf-16-le' by default.
6174
6175    o coding-category-ccl
6176
6177         The category for a coding system of which encoder/decoder is
6178         written in CCL programs.  The default value is nil, i.e., no
6179         coding system is assigned.
6180
6181    o coding-category-binary
6182
6183         The category for a coding system not categorized in any of the
6184         above.  Assigned the coding-system (Lisp symbol)
6185         `no-conversion' by default.
6186
6187    Each of them is a Lisp symbol and the value is an actual
6188    `coding-system's (this is also a Lisp symbol) assigned by a user.
6189    What Emacs does actually is to detect a category of coding system.
6190    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6191    decide only one possible category, it selects a category of the
6192    highest priority.  Priorities of categories are also specified by a
6193    user in a Lisp variable `coding-category-list'.
6194
6195 */
6196
6197 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6198                                            int eol_seen);
6199
6200
6201 /* Return the number of ASCII characters at the head of the source.
6202    By side effects, set coding->head_ascii and update
6203    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6204    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6205    reliable only when all the source bytes are ASCII.  */
6206
6207 static ptrdiff_t
6208 check_ascii (struct coding_system *coding)
6209 {
6210   const unsigned char *src, *end;
6211   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6212   int eol_seen = coding->eol_seen;
6213
6214   coding_set_source (coding);
6215   src = coding->source;
6216   end = src + coding->src_bytes;
6217
6218   if (inhibit_eol_conversion
6219       || SYMBOLP (eol_type))
6220     {
6221       /* We don't have to check EOL format.  */
6222       while (src < end && !( *src & 0x80))
6223         {
6224           if (*src++ == '\n')
6225             eol_seen |= EOL_SEEN_LF;
6226         }
6227     }
6228   else
6229     {
6230       end--;                /* We look ahead one byte for "CR LF".  */
6231       while (src < end)
6232         {
6233           int c = *src;
6234
6235           if (c & 0x80)
6236             break;
6237           src++;
6238           if (c == '\r')
6239             {
6240               if (*src == '\n')
6241                 {
6242                   eol_seen |= EOL_SEEN_CRLF;
6243                   src++;
6244                 }
6245               else
6246                 eol_seen |= EOL_SEEN_CR;
6247             }
6248           else if (c == '\n')
6249             eol_seen |= EOL_SEEN_LF;
6250         }
6251       if (src == end)
6252         {
6253           int c = *src;
6254
6255           /* All bytes but the last one C are ASCII.  */
6256           if (! (c & 0x80))
6257             {
6258               if (c == '\r')
6259                 eol_seen |= EOL_SEEN_CR;
6260               else if (c  == '\n')
6261                 eol_seen |= EOL_SEEN_LF;
6262               src++;
6263             }
6264         }
6265     }
6266   coding->head_ascii = src - coding->source;
6267   coding->eol_seen = eol_seen;
6268   return (coding->head_ascii);
6269 }
6270
6271
6272 /* Return the number of characters at the source if all the bytes are
6273    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6274    effects, update coding->eol_seen.  The value of coding->eol_seen is
6275    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6276    the value is reliable only when all the source bytes are valid
6277    UTF-8.  */
6278
6279 static ptrdiff_t
6280 check_utf_8 (struct coding_system *coding)
6281 {
6282   const unsigned char *src, *end;
6283   int eol_seen;
6284   ptrdiff_t nchars = coding->head_ascii;
6285
6286   if (coding->head_ascii < 0)
6287     check_ascii (coding);
6288   else
6289     coding_set_source (coding);
6290   src = coding->source + coding->head_ascii;
6291   /* We look ahead one byte for CR LF.  */
6292   end = coding->source + coding->src_bytes - 1;
6293   eol_seen = coding->eol_seen;
6294   while (src < end)
6295     {
6296       int c = *src;
6297
6298       if (UTF_8_1_OCTET_P (*src))
6299         {
6300           src++;
6301           if (c < 0x20)
6302             {
6303               if (c == '\r')
6304                 {
6305                   if (*src == '\n')
6306                     {
6307                       eol_seen |= EOL_SEEN_CRLF;
6308                       src++;
6309                       nchars++;
6310                     }
6311                   else
6312                     eol_seen |= EOL_SEEN_CR;
6313                 }
6314               else if (c == '\n')
6315                 eol_seen |= EOL_SEEN_LF;
6316             }
6317         }
6318       else if (UTF_8_2_OCTET_LEADING_P (c))
6319         {
6320           if (c < 0xC2          /* overlong sequence */
6321               || src + 1 >= end
6322               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6323             return -1;
6324           src += 2;
6325         }
6326       else if (UTF_8_3_OCTET_LEADING_P (c))
6327         {
6328           if (src + 2 >= end
6329               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6330                     && UTF_8_EXTRA_OCTET_P (src[2])))
6331             return -1;
6332           c = (((c & 0xF) << 12)
6333                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6334           if (c < 0x800                       /* overlong sequence */
6335               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6336             return -1;
6337           src += 3;
6338         }
6339       else if (UTF_8_4_OCTET_LEADING_P (c))
6340         {
6341           if (src + 3 >= end
6342               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6343                     && UTF_8_EXTRA_OCTET_P (src[2])
6344                     && UTF_8_EXTRA_OCTET_P (src[3])))
6345             return -1;
6346           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6347                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6348           if (c < 0x10000       /* overlong sequence */
6349               || c >= 0x110000) /* non-Unicode character  */
6350             return -1;
6351           src += 4;
6352         }
6353       else
6354         return -1;
6355       nchars++;
6356     }
6357
6358   if (src == end)
6359     {
6360       if (! UTF_8_1_OCTET_P (*src))
6361         return -1;
6362       nchars++;
6363       if (*src == '\r')
6364         eol_seen |= EOL_SEEN_CR;
6365       else if (*src  == '\n')
6366         eol_seen |= EOL_SEEN_LF;
6367     }
6368   coding->eol_seen = eol_seen;
6369   return nchars;
6370 }
6371
6372
6373 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6374    SOURCE is encoded.  If CATEGORY is one of
6375    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6376    two-byte, else they are encoded by one-byte.
6377
6378    Return one of EOL_SEEN_XXX.  */
6379
6380 #define MAX_EOL_CHECK_COUNT 3
6381
6382 static int
6383 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6384             enum coding_category category)
6385 {
6386   const unsigned char *src = source, *src_end = src + src_bytes;
6387   unsigned char c;
6388   int total  = 0;
6389   int eol_seen = EOL_SEEN_NONE;
6390
6391   if ((1 << category) & CATEGORY_MASK_UTF_16)
6392     {
6393       bool msb = category == (coding_category_utf_16_le
6394                               | coding_category_utf_16_le_nosig);
6395       bool lsb = !msb;
6396
6397       while (src + 1 < src_end)
6398         {
6399           c = src[lsb];
6400           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6401             {
6402               int this_eol;
6403
6404               if (c == '\n')
6405                 this_eol = EOL_SEEN_LF;
6406               else if (src + 3 >= src_end
6407                        || src[msb + 2] != 0
6408                        || src[lsb + 2] != '\n')
6409                 this_eol = EOL_SEEN_CR;
6410               else
6411                 {
6412                   this_eol = EOL_SEEN_CRLF;
6413                   src += 2;
6414                 }
6415
6416               if (eol_seen == EOL_SEEN_NONE)
6417                 /* This is the first end-of-line.  */
6418                 eol_seen = this_eol;
6419               else if (eol_seen != this_eol)
6420                 {
6421                   /* The found type is different from what found before.
6422                      Allow for stray ^M characters in DOS EOL files.  */
6423                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6424                       || (eol_seen == EOL_SEEN_CRLF
6425                           && this_eol == EOL_SEEN_CR))
6426                     eol_seen = EOL_SEEN_CRLF;
6427                   else
6428                     {
6429                       eol_seen = EOL_SEEN_LF;
6430                       break;
6431                     }
6432                 }
6433               if (++total == MAX_EOL_CHECK_COUNT)
6434                 break;
6435             }
6436           src += 2;
6437         }
6438     }
6439   else
6440     while (src < src_end)
6441       {
6442         c = *src++;
6443         if (c == '\n' || c == '\r')
6444           {
6445             int this_eol;
6446
6447             if (c == '\n')
6448               this_eol = EOL_SEEN_LF;
6449             else if (src >= src_end || *src != '\n')
6450               this_eol = EOL_SEEN_CR;
6451             else
6452               this_eol = EOL_SEEN_CRLF, src++;
6453
6454             if (eol_seen == EOL_SEEN_NONE)
6455               /* This is the first end-of-line.  */
6456               eol_seen = this_eol;
6457             else if (eol_seen != this_eol)
6458               {
6459                 /* The found type is different from what found before.
6460                    Allow for stray ^M characters in DOS EOL files.  */
6461                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6462                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6463                   eol_seen = EOL_SEEN_CRLF;
6464                 else
6465                   {
6466                     eol_seen = EOL_SEEN_LF;
6467                     break;
6468                   }
6469               }
6470             if (++total == MAX_EOL_CHECK_COUNT)
6471               break;
6472           }
6473       }
6474   return eol_seen;
6475 }
6476
6477
6478 static Lisp_Object
6479 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6480 {
6481   Lisp_Object eol_type;
6482
6483   eol_type = CODING_ID_EOL_TYPE (coding->id);
6484   if (! VECTORP (eol_type))
6485     /* Already adjusted.  */
6486     return eol_type;
6487   if (eol_seen & EOL_SEEN_LF)
6488     {
6489       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6490       eol_type = Qunix;
6491     }
6492   else if (eol_seen & EOL_SEEN_CRLF)
6493     {
6494       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6495       eol_type = Qdos;
6496     }
6497   else if (eol_seen & EOL_SEEN_CR)
6498     {
6499       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6500       eol_type = Qmac;
6501     }
6502   return eol_type;
6503 }
6504
6505 /* Detect how a text specified in CODING is encoded.  If a coding
6506    system is detected, update fields of CODING by the detected coding
6507    system.  */
6508
6509 static void
6510 detect_coding (struct coding_system *coding)
6511 {
6512   const unsigned char *src, *src_end;
6513   unsigned int saved_mode = coding->mode;
6514   Lisp_Object found = Qnil;
6515   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6516
6517   coding->consumed = coding->consumed_char = 0;
6518   coding->produced = coding->produced_char = 0;
6519   coding_set_source (coding);
6520
6521   src_end = coding->source + coding->src_bytes;
6522
6523   coding->eol_seen = EOL_SEEN_NONE;
6524   /* If we have not yet decided the text encoding type, detect it
6525      now.  */
6526   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6527     {
6528       int c, i;
6529       struct coding_detection_info detect_info;
6530       bool null_byte_found = 0, eight_bit_found = 0;
6531       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6532                                        inhibit_null_byte_detection);
6533       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6534                                        inhibit_iso_escape_detection);
6535       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6536
6537       coding->head_ascii = 0;
6538       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6539       for (src = coding->source; src < src_end; src++)
6540         {
6541           c = *src;
6542           if (c & 0x80)
6543             {
6544               eight_bit_found = 1;
6545               if (null_byte_found)
6546                 break;
6547             }
6548           else if (c < 0x20)
6549             {
6550               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6551                   && ! inhibit_ied
6552                   && ! detect_info.checked)
6553                 {
6554                   if (detect_coding_iso_2022 (coding, &detect_info))
6555                     {
6556                       /* We have scanned the whole data.  */
6557                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6558                         {
6559                           /* We didn't find an 8-bit code.  We may
6560                              have found a null-byte, but it's very
6561                              rare that a binary file conforms to
6562                              ISO-2022.  */
6563                           src = src_end;
6564                           coding->head_ascii = src - coding->source;
6565                         }
6566                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6567                       break;
6568                     }
6569                 }
6570               else if (! c && !inhibit_nbd)
6571                 {
6572                   null_byte_found = 1;
6573                   if (eight_bit_found)
6574                     break;
6575                 }
6576               else if (! disable_ascii_optimization
6577                        && ! inhibit_eol_conversion)
6578                 {
6579                   if (c == '\r')
6580                     {
6581                       if (src < src_end && src[1] == '\n')
6582                         {
6583                           coding->eol_seen |= EOL_SEEN_CRLF;
6584                           src++;
6585                           if (! eight_bit_found)
6586                             coding->head_ascii++;
6587                         }
6588                       else
6589                         coding->eol_seen |= EOL_SEEN_CR;
6590                     }
6591                   else if (c == '\n')
6592                     {
6593                       coding->eol_seen |= EOL_SEEN_LF;
6594                     }
6595                 }
6596
6597               if (! eight_bit_found)
6598                 coding->head_ascii++;
6599             }
6600           else if (! eight_bit_found)
6601             coding->head_ascii++;
6602         }
6603
6604       if (null_byte_found || eight_bit_found
6605           || coding->head_ascii < coding->src_bytes
6606           || detect_info.found)
6607         {
6608           enum coding_category category;
6609           struct coding_system *this;
6610
6611           if (coding->head_ascii == coding->src_bytes)
6612             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6613             for (i = 0; i < coding_category_raw_text; i++)
6614               {
6615                 category = coding_priorities[i];
6616                 this = coding_categories + category;
6617                 if (detect_info.found & (1 << category))
6618                   break;
6619               }
6620           else
6621             {
6622               if (null_byte_found)
6623                 {
6624                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6625                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6626                 }
6627               else if (prefer_utf_8
6628                        && detect_coding_utf_8 (coding, &detect_info))
6629                 {
6630                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6631                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6632                 }
6633               for (i = 0; i < coding_category_raw_text; i++)
6634                 {
6635                   category = coding_priorities[i];
6636                   this = coding_categories + category;
6637                   /* Some of this->detector (e.g. detect_coding_sjis)
6638                      require this information.  */
6639                   coding->id = this->id;
6640                   if (this->id < 0)
6641                     {
6642                       /* No coding system of this category is defined.  */
6643                       detect_info.rejected |= (1 << category);
6644                     }
6645                   else if (category >= coding_category_raw_text)
6646                     continue;
6647                   else if (detect_info.checked & (1 << category))
6648                     {
6649                       if (detect_info.found & (1 << category))
6650                         break;
6651                     }
6652                   else if ((*(this->detector)) (coding, &detect_info)
6653                            && detect_info.found & (1 << category))
6654                     break;
6655                 }
6656             }
6657
6658           if (i < coding_category_raw_text)
6659             {
6660               if (category == coding_category_utf_8_auto)
6661                 {
6662                   Lisp_Object coding_systems;
6663
6664                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6665                                          coding_attr_utf_bom);
6666                   if (CONSP (coding_systems))
6667                     {
6668                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6669                         found = XCAR (coding_systems);
6670                       else
6671                         found = XCDR (coding_systems);
6672                     }
6673                   else
6674                     found = CODING_ID_NAME (this->id);
6675                 }
6676               else if (category == coding_category_utf_16_auto)
6677                 {
6678                   Lisp_Object coding_systems;
6679
6680                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6681                                          coding_attr_utf_bom);
6682                   if (CONSP (coding_systems))
6683                     {
6684                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6685                         found = XCAR (coding_systems);
6686                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6687                         found = XCDR (coding_systems);
6688                     }
6689                   else
6690                     found = CODING_ID_NAME (this->id);
6691                 }
6692               else
6693                 found = CODING_ID_NAME (this->id);
6694             }
6695           else if (null_byte_found)
6696             found = Qno_conversion;
6697           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6698                    == CATEGORY_MASK_ANY)
6699             found = Qraw_text;
6700           else if (detect_info.rejected)
6701             for (i = 0; i < coding_category_raw_text; i++)
6702               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6703                 {
6704                   this = coding_categories + coding_priorities[i];
6705                   found = CODING_ID_NAME (this->id);
6706                   break;
6707                 }
6708         }
6709     }
6710   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6711            == coding_category_utf_8_auto)
6712     {
6713       Lisp_Object coding_systems;
6714       struct coding_detection_info detect_info;
6715
6716       coding_systems
6717         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6718       detect_info.found = detect_info.rejected = 0;
6719       if (check_ascii (coding) == coding->src_bytes)
6720         {
6721           if (CONSP (coding_systems))
6722             found = XCDR (coding_systems);
6723         }
6724       else
6725         {
6726           if (CONSP (coding_systems)
6727               && detect_coding_utf_8 (coding, &detect_info))
6728             {
6729               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6730                 found = XCAR (coding_systems);
6731               else
6732                 found = XCDR (coding_systems);
6733             }
6734         }
6735     }
6736   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6737            == coding_category_utf_16_auto)
6738     {
6739       Lisp_Object coding_systems;
6740       struct coding_detection_info detect_info;
6741
6742       coding_systems
6743         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6744       detect_info.found = detect_info.rejected = 0;
6745       coding->head_ascii = 0;
6746       if (CONSP (coding_systems)
6747           && detect_coding_utf_16 (coding, &detect_info))
6748         {
6749           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6750             found = XCAR (coding_systems);
6751           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6752             found = XCDR (coding_systems);
6753         }
6754     }
6755
6756   if (! NILP (found))
6757     {
6758       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6759                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6760                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6761                            : EOL_SEEN_LF);
6762
6763       setup_coding_system (found, coding);
6764       if (specified_eol != EOL_SEEN_NONE)
6765         adjust_coding_eol_type (coding, specified_eol);
6766     }
6767
6768   coding->mode = saved_mode;
6769 }
6770
6771
6772 static void
6773 decode_eol (struct coding_system *coding)
6774 {
6775   Lisp_Object eol_type;
6776   unsigned char *p, *pbeg, *pend;
6777
6778   eol_type = CODING_ID_EOL_TYPE (coding->id);
6779   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6780     return;
6781
6782   if (NILP (coding->dst_object))
6783     pbeg = coding->destination;
6784   else
6785     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6786   pend = pbeg + coding->produced;
6787
6788   if (VECTORP (eol_type))
6789     {
6790       int eol_seen = EOL_SEEN_NONE;
6791
6792       for (p = pbeg; p < pend; p++)
6793         {
6794           if (*p == '\n')
6795             eol_seen |= EOL_SEEN_LF;
6796           else if (*p == '\r')
6797             {
6798               if (p + 1 < pend && *(p + 1) == '\n')
6799                 {
6800                   eol_seen |= EOL_SEEN_CRLF;
6801                   p++;
6802                 }
6803               else
6804                 eol_seen |= EOL_SEEN_CR;
6805             }
6806         }
6807       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6808       if ((eol_seen & EOL_SEEN_CRLF) != 0
6809           && (eol_seen & EOL_SEEN_CR) != 0
6810           && (eol_seen & EOL_SEEN_LF) == 0)
6811         eol_seen = EOL_SEEN_CRLF;
6812       else if (eol_seen != EOL_SEEN_NONE
6813           && eol_seen != EOL_SEEN_LF
6814           && eol_seen != EOL_SEEN_CRLF
6815           && eol_seen != EOL_SEEN_CR)
6816         eol_seen = EOL_SEEN_LF;
6817       if (eol_seen != EOL_SEEN_NONE)
6818         eol_type = adjust_coding_eol_type (coding, eol_seen);
6819     }
6820
6821   if (EQ (eol_type, Qmac))
6822     {
6823       for (p = pbeg; p < pend; p++)
6824         if (*p == '\r')
6825           *p = '\n';
6826     }
6827   else if (EQ (eol_type, Qdos))
6828     {
6829       ptrdiff_t n = 0;
6830
6831       if (NILP (coding->dst_object))
6832         {
6833           /* Start deleting '\r' from the tail to minimize the memory
6834              movement.  */
6835           for (p = pend - 2; p >= pbeg; p--)
6836             if (*p == '\r')
6837               {
6838                 memmove (p, p + 1, pend-- - p - 1);
6839                 n++;
6840               }
6841         }
6842       else
6843         {
6844           ptrdiff_t pos_byte = coding->dst_pos_byte;
6845           ptrdiff_t pos = coding->dst_pos;
6846           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6847
6848           while (pos < pos_end)
6849             {
6850               p = BYTE_POS_ADDR (pos_byte);
6851               if (*p == '\r' && p[1] == '\n')
6852                 {
6853                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6854                   n++;
6855                   pos_end--;
6856                 }
6857               pos++;
6858               if (coding->dst_multibyte)
6859                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6860               else
6861                 pos_byte++;
6862             }
6863         }
6864       coding->produced -= n;
6865       coding->produced_char -= n;
6866     }
6867 }
6868
6869
6870 /* Return a translation table (or list of them) from coding system
6871    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6872    not ENCODEP). */
6873
6874 static Lisp_Object
6875 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6876 {
6877   Lisp_Object standard, translation_table;
6878   Lisp_Object val;
6879
6880   if (NILP (Venable_character_translation))
6881     {
6882       if (max_lookup)
6883         *max_lookup = 0;
6884       return Qnil;
6885     }
6886   if (encodep)
6887     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6888       standard = Vstandard_translation_table_for_encode;
6889   else
6890     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6891       standard = Vstandard_translation_table_for_decode;
6892   if (NILP (translation_table))
6893     translation_table = standard;
6894   else
6895     {
6896       if (SYMBOLP (translation_table))
6897         translation_table = Fget (translation_table, Qtranslation_table);
6898       else if (CONSP (translation_table))
6899         {
6900           translation_table = Fcopy_sequence (translation_table);
6901           for (val = translation_table; CONSP (val); val = XCDR (val))
6902             if (SYMBOLP (XCAR (val)))
6903               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6904         }
6905       if (CHAR_TABLE_P (standard))
6906         {
6907           if (CONSP (translation_table))
6908             translation_table = nconc2 (translation_table, list1 (standard));
6909           else
6910             translation_table = list2 (translation_table, standard);
6911         }
6912     }
6913
6914   if (max_lookup)
6915     {
6916       *max_lookup = 1;
6917       if (CHAR_TABLE_P (translation_table)
6918           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6919         {
6920           val = XCHAR_TABLE (translation_table)->extras[1];
6921           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6922             *max_lookup = XFASTINT (val);
6923         }
6924       else if (CONSP (translation_table))
6925         {
6926           Lisp_Object tail;
6927
6928           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6929             if (CHAR_TABLE_P (XCAR (tail))
6930                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6931               {
6932                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6933                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6934                   *max_lookup = XFASTINT (tailval);
6935               }
6936         }
6937     }
6938   return translation_table;
6939 }
6940
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.  C and TRANS must be lvalues; the arguments are
   evaluated more than once.

   Whenever a table maps C to a character, C is replaced by that
   character and TRANS is reset to Qnil (with a list, the lookup then
   continues in the following tables using the new C).  Otherwise
   TRANS is left holding the value found: with a list, the first
   non-nil value that is not a character ends the lookup.  Elements of
   a list that are not char-tables are skipped.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail = table;                               \
                                                                \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
6965
6966
6967 /* Return a translation of character(s) at BUF according to TRANS.
6968    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6969    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6970    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6971    translation is found, and Qnil if not found..
6972    If BUF is too short to lookup characters in FROM, return Qt.  */
6973
6974 static Lisp_Object
6975 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6976 {
6977
6978   if (INTEGERP (trans))
6979     return trans;
6980   for (; CONSP (trans); trans = XCDR (trans))
6981     {
6982       Lisp_Object val = XCAR (trans);
6983       Lisp_Object from = XCAR (val);
6984       ptrdiff_t len = ASIZE (from);
6985       ptrdiff_t i;
6986
6987       for (i = 0; i < len; i++)
6988         {
6989           if (buf + i == buf_end)
6990             return Qt;
6991           if (XINT (AREF (from, i)) != buf[i])
6992             break;
6993         }
6994       if (i == len)
6995         return val;
6996     }
6997   return Qnil;
6998 }
6999
7000
7001 static int
7002 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7003                bool last_block)
7004 {
7005   unsigned char *dst = coding->destination + coding->produced;
7006   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7007   ptrdiff_t produced;
7008   ptrdiff_t produced_chars = 0;
7009   int carryover = 0;
7010
7011   if (! coding->chars_at_source)
7012     {
7013       /* Source characters are in coding->charbuf.  */
7014       int *buf = coding->charbuf;
7015       int *buf_end = buf + coding->charbuf_used;
7016
7017       if (EQ (coding->src_object, coding->dst_object))
7018         {
7019           coding_set_source (coding);
7020           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7021         }
7022
7023       while (buf < buf_end)
7024         {
7025           int c = *buf;
7026           ptrdiff_t i;
7027
7028           if (c >= 0)
7029             {
7030               ptrdiff_t from_nchars = 1, to_nchars = 1;
7031               Lisp_Object trans = Qnil;
7032
7033               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7034               if (! NILP (trans))
7035                 {
7036                   trans = get_translation (trans, buf, buf_end);
7037                   if (INTEGERP (trans))
7038                     c = XINT (trans);
7039                   else if (CONSP (trans))
7040                     {
7041                       from_nchars = ASIZE (XCAR (trans));
7042                       trans = XCDR (trans);
7043                       if (INTEGERP (trans))
7044                         c = XINT (trans);
7045                       else
7046                         {
7047                           to_nchars = ASIZE (trans);
7048                           c = XINT (AREF (trans, 0));
7049                         }
7050                     }
7051                   else if (EQ (trans, Qt) && ! last_block)
7052                     break;
7053                 }
7054
7055               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7056                 {
7057                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7058                        / MAX_MULTIBYTE_LENGTH)
7059                       < to_nchars)
7060                     memory_full (SIZE_MAX);
7061                   dst = alloc_destination (coding,
7062                                            buf_end - buf
7063                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7064                                            dst);
7065                   if (EQ (coding->src_object, coding->dst_object))
7066                     {
7067                       coding_set_source (coding);
7068                       dst_end = (((unsigned char *) coding->source)
7069                                  + coding->consumed);
7070                     }
7071                   else
7072                     dst_end = coding->destination + coding->dst_bytes;
7073                 }
7074
7075               for (i = 0; i < to_nchars; i++)
7076                 {
7077                   if (i > 0)
7078                     c = XINT (AREF (trans, i));
7079                   if (coding->dst_multibyte
7080                       || ! CHAR_BYTE8_P (c))
7081                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7082                   else
7083                     *dst++ = CHAR_TO_BYTE8 (c);
7084                 }
7085               produced_chars += to_nchars;
7086               buf += from_nchars;
7087             }
7088           else
7089             /* This is an annotation datum.  (-C) is the length.  */
7090             buf += -c;
7091         }
7092       carryover = buf_end - buf;
7093     }
7094   else
7095     {
7096       /* Source characters are at coding->source.  */
7097       const unsigned char *src = coding->source;
7098       const unsigned char *src_end = src + coding->consumed;
7099
7100       if (EQ (coding->dst_object, coding->src_object))
7101         dst_end = (unsigned char *) src;
7102       if (coding->src_multibyte != coding->dst_multibyte)
7103         {
7104           if (coding->src_multibyte)
7105             {
7106               bool multibytep = 1;
7107               ptrdiff_t consumed_chars = 0;
7108
7109               while (1)
7110                 {
7111                   const unsigned char *src_base = src;
7112                   int c;
7113
7114                   ONE_MORE_BYTE (c);
7115                   if (dst == dst_end)
7116                     {
7117                       if (EQ (coding->src_object, coding->dst_object))
7118                         dst_end = (unsigned char *) src;
7119                       if (dst == dst_end)
7120                         {
7121                           ptrdiff_t offset = src - coding->source;
7122
7123                           dst = alloc_destination (coding, src_end - src + 1,
7124                                                    dst);
7125                           dst_end = coding->destination + coding->dst_bytes;
7126                           coding_set_source (coding);
7127                           src = coding->source + offset;
7128                           src_end = coding->source + coding->consumed;
7129                           if (EQ (coding->src_object, coding->dst_object))
7130                             dst_end = (unsigned char *) src;
7131                         }
7132                     }
7133                   *dst++ = c;
7134                   produced_chars++;
7135                 }
7136             no_more_source:
7137               ;
7138             }
7139           else
7140             while (src < src_end)
7141               {
7142                 bool multibytep = 1;
7143                 int c = *src++;
7144
7145                 if (dst >= dst_end - 1)
7146                   {
7147                     if (EQ (coding->src_object, coding->dst_object))
7148                       dst_end = (unsigned char *) src;
7149                     if (dst >= dst_end - 1)
7150                       {
7151                         ptrdiff_t offset = src - coding->source;
7152                         ptrdiff_t more_bytes;
7153
7154                         if (EQ (coding->src_object, coding->dst_object))
7155                           more_bytes = ((src_end - src) / 2) + 2;
7156                         else
7157                           more_bytes = src_end - src + 2;
7158                         dst = alloc_destination (coding, more_bytes, dst);
7159                         dst_end = coding->destination + coding->dst_bytes;
7160                         coding_set_source (coding);
7161                         src = coding->source + offset;
7162                         src_end = coding->source + coding->consumed;
7163                         if (EQ (coding->src_object, coding->dst_object))
7164                           dst_end = (unsigned char *) src;
7165                       }
7166                   }
7167                 EMIT_ONE_BYTE (c);
7168               }
7169         }
7170       else
7171         {
7172           if (!EQ (coding->src_object, coding->dst_object))
7173             {
7174               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7175
7176               if (require > 0)
7177                 {
7178                   ptrdiff_t offset = src - coding->source;
7179
7180                   dst = alloc_destination (coding, require, dst);
7181                   coding_set_source (coding);
7182                   src = coding->source + offset;
7183                   src_end = coding->source + coding->consumed;
7184                 }
7185             }
7186           produced_chars = coding->consumed_char;
7187           while (src < src_end)
7188             *dst++ = *src++;
7189         }
7190     }
7191
7192   produced = dst - (coding->destination + coding->produced);
7193   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7194     insert_from_gap (produced_chars, produced, 0);
7195   coding->produced += produced;
7196   coding->produced_char += produced_chars;
7197   return carryover;
7198 }
7199
7200 /* Compose text in CODING->object according to the annotation data at
7201    CHARBUF.  CHARBUF is an array:
7202      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7203  */
7204
7205 static void
7206 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7207 {
7208   int len;
7209   ptrdiff_t to;
7210   enum composition_method method;
7211   Lisp_Object components;
7212
7213   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7214   to = pos + charbuf[2];
7215   method = (enum composition_method) (charbuf[4]);
7216
7217   if (method == COMPOSITION_RELATIVE)
7218     components = Qnil;
7219   else
7220     {
7221       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7222       int i, j;
7223
7224       if (method == COMPOSITION_WITH_RULE)
7225         len = charbuf[2] * 3 - 2;
7226       charbuf += MAX_ANNOTATION_LENGTH;
7227       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7228       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7229         {
7230           if (charbuf[i] >= 0)
7231             args[j] = make_number (charbuf[i]);
7232           else
7233             {
7234               i++;
7235               args[j] = make_number (charbuf[i] % 0x100);
7236             }
7237         }
7238       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7239     }
7240   compose_text (pos, to, components, Qnil, coding->dst_object);
7241 }
7242
7243
7244 /* Put `charset' property on text in CODING->object according to
7245    the annotation data at CHARBUF.  CHARBUF is an array:
7246      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7247  */
7248
7249 static void
7250 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7251 {
7252   ptrdiff_t from = pos - charbuf[2];
7253   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7254
7255   Fput_text_property (make_number (from), make_number (pos),
7256                       Qcharset, CHARSET_NAME (charset),
7257                       coding->dst_object);
7258 }
7259
/* Upper bound on the number of `int' units that
   ALLOC_CONVERSION_WORK_AREA ever allocates for coding->charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING (a pointer to a struct
   coding_system): set CODING->charbuf to a SAFE_ALLOCA'd array of
   SIZE + MAX_CHARBUF_EXTRA_SIZE ints, capped at MAX_CHARBUF_SIZE, and
   record the element count in CODING->charbuf_size.

   CODING is parenthesized in the expansion so that any pointer
   expression (e.g. `&cs'), not only a plain identifier, may be
   passed.  CODING is evaluated twice and SIZE once, so neither should
   have side effects.

   Because the storage comes from SAFE_ALLOCA, the macro is meant for
   functions set up for it; the users visible in this file
   (decode_coding and encode_coding) declare USE_SAFE_ALLOCA and call
   SAFE_FREE before returning.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7273
7274 static void
7275 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7276 {
7277   int *charbuf = coding->charbuf;
7278   int *charbuf_end = charbuf + coding->charbuf_used;
7279
7280   if (NILP (coding->dst_object))
7281     return;
7282
7283   while (charbuf < charbuf_end)
7284     {
7285       if (*charbuf >= 0)
7286         pos++, charbuf++;
7287       else
7288         {
7289           int len = -*charbuf;
7290
7291           if (len > 2)
7292             switch (charbuf[1])
7293               {
7294               case CODING_ANNOTATE_COMPOSITION_MASK:
7295                 produce_composition (coding, charbuf, pos);
7296                 break;
7297               case CODING_ANNOTATE_CHARSET_MASK:
7298                 produce_charset (coding, charbuf, pos);
7299                 break;
7300               }
7301           charbuf += len;
7302         }
7303     }
7304 }
7305
7306 /* Decode the data at CODING->src_object into CODING->dst_object.
7307    CODING->src_object is a buffer, a string, or nil.
7308    CODING->dst_object is a buffer.
7309
7310    If CODING->src_object is a buffer, it must be the current buffer.
7311    In this case, if CODING->src_pos is positive, it is a position of
7312    the source text in the buffer, otherwise, the source text is in the
7313    gap area of the buffer, and CODING->src_pos specifies the offset of
7314    the text from GPT (which must be the same as PT).  If this is the
7315    same buffer as CODING->dst_object, CODING->src_pos must be
7316    negative.
7317
7318    If CODING->src_object is a string, CODING->src_pos is an index to
7319    that string.
7320
7321    If CODING->src_object is nil, CODING->source must already point to
7322    the non-relocatable memory area.  In this case, CODING->src_pos is
7323    an offset from CODING->source.
7324
7325    The decoded data is inserted at the current point of the buffer
7326    CODING->dst_object.
        The work area CODING->charbuf is allocated here with SAFE_ALLOCA
        and released before returning.  Source bytes that are still
        undecoded after the main loop are either flushed as raw bytes
        (when CODING_MODE_LAST_BLOCK is set in CODING->mode) or saved in
        CODING->carryover; either way CODING->consumed ends up equal to
        CODING->src_bytes in that case.
7327 */
7328
7329 static void
7330 decode_coding (struct coding_system *coding)
7331 {
7332   Lisp_Object attrs;
7333   Lisp_Object undo_list;
7334   Lisp_Object translation_table;
7335   struct ccl_spec cclspec;
7336   int carryover;
7337   int i;
7338
7339   USE_SAFE_ALLOCA;
7340
       /* If the source text of a buffer straddles the gap, move the gap
          to the start of the source text.  */
7341   if (BUFFERP (coding->src_object)
7342       && coding->src_pos > 0
7343       && coding->src_pos < GPT
7344       && coding->src_pos + coding->src_chars > GPT)
7345     move_gap_both (coding->src_pos, coding->src_pos_byte);
7346
7347   undo_list = Qt;
7348   if (BUFFERP (coding->dst_object))
7349     {
           /* Make the destination buffer current and put the gap at
              point, where the decoded text is going to be inserted.  */
7350       set_buffer_internal (XBUFFER (coding->dst_object));
7351       if (GPT != PT)
7352         move_gap_both (PT, PT_BYTE);
7353
7354       /* We must disable undo_list in order to record the whole insert
7355          transaction via record_insert at the end.  But doing so also
7356          disables the recording of the first change to the undo_list.
7357          Therefore we check for first change here and record it via
7358          record_first_change if needed.  */
7359       if (MODIFF <= SAVE_MODIFF)
7360         record_first_change ();
7361
           /* Save the real undo list; it is restored just before
              record_insert at the end of this function.  */
7362       undo_list = BVAR (current_buffer, undo_list);
7363       bset_undo_list (current_buffer, Qt);
7364     }
7365
       /* Start from a clean conversion state.  */
7366   coding->consumed = coding->consumed_char = 0;
7367   coding->produced = coding->produced_char = 0;
7368   coding->chars_at_source = 0;
7369   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7370
7371   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7372
7373   attrs = CODING_ID_ATTRS (coding->id);
7374   translation_table = get_translation_table (attrs, 0, NULL);
7375
7376   carryover = 0;
       /* The CCL decoder keeps its state in CCLSPEC, which lives on this
          function's stack for the duration of the conversion.  */
7377   if (coding->decoder == decode_coding_ccl)
7378     {
7379       coding->spec.ccl = &cclspec;
7380       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7381     }
       /* Main loop.  Each round the decoder fills CODING->charbuf
          (after the CARRYOVER elements kept from the previous round),
          produce_chars writes the characters to the destination and
          returns how many trailing charbuf elements it left unproduced,
          and those elements are moved to the front of charbuf for the
          next round.  POS is the destination position of the first
          character produced in this round; produce_annotation needs
          it.  */
7382   do
7383     {
7384       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7385
7386       coding_set_source (coding);
7387       coding->annotated = 0;
7388       coding->charbuf_used = carryover;
7389       (*(coding->decoder)) (coding);
7390       coding_set_destination (coding);
7391       carryover = produce_chars (coding, translation_table, 0);
7392       if (coding->annotated)
7393         produce_annotation (coding, pos);
7394       for (i = 0; i < carryover; i++)
7395         coding->charbuf[i]
7396           = coding->charbuf[coding->charbuf_used - carryover + i];
7397     }
       /* Go on while the destination was too small, or while source
          bytes remain and the last round ended with success or with an
          invalid-source result.  */
7398   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7399          || (coding->consumed < coding->src_bytes
7400              && (coding->result == CODING_RESULT_SUCCESS
7401                  || coding->result == CODING_RESULT_INVALID_SRC)));
7402
       /* Produce the elements that were still carried over when the
          loop ended.  NOTE(review): the third argument 1 presumably
          marks this as the final call -- confirm against
          produce_chars.  */
7403   if (carryover > 0)
7404     {
7405       coding_set_destination (coding);
7406       coding->charbuf_used = carryover;
7407       produce_chars (coding, translation_table, 1);
7408     }
7409
       /* Deal with source bytes the decoder did not consume.  */
7410   coding->carryover_bytes = 0;
7411   if (coding->consumed < coding->src_bytes)
7412     {
7413       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7414       const unsigned char *src;
7415
7416       coding_set_source (coding);
7417       coding_set_destination (coding);
7418       src = coding->source + coding->consumed;
7419
7420       if (coding->mode & CODING_MODE_LAST_BLOCK)
7421         {
7422           /* Flush out unprocessed data as binary chars.  We are sure
7423              that the number of data is less than the size of
7424              coding->charbuf.  */
7425           coding->charbuf_used = 0;
7426           coding->chars_at_source = 0;
7427
7428           while (nbytes-- > 0)
7429             {
7430               int c = *src++;
7431
                   /* Bytes with the high bit set become raw-byte
                      characters; ASCII bytes are kept as they are.  */
7432               if (c & 0x80)
7433                 c = BYTE8_TO_CHAR (c);
7434               coding->charbuf[coding->charbuf_used++] = c;
7435             }
7436           produce_chars (coding, Qnil, 1);
7437         }
7438       else
7439         {
7440           /* Record unprocessed bytes in coding->carryover.  We are
7441              sure that the number of data is less than the size of
7442              coding->carryover.  */
7443           unsigned char *p = coding->carryover;
7444
               /* Clamp anyway, so the copy can never overrun
                  coding->carryover.  */
7445           if (nbytes > sizeof coding->carryover)
7446             nbytes = sizeof coding->carryover;
7447           coding->carryover_bytes = nbytes;
7448           while (nbytes-- > 0)
7449             *p++ = *src++;
7450         }
7451       coding->consumed = coding->src_bytes;
7452     }
7453
       /* EOL decoding is a separate pass, skipped for the `unix' EOL
          type and when inhibit_eol_conversion is set.  */
7454   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7455       && !inhibit_eol_conversion)
7456     decode_eol (coding);
7457   if (BUFFERP (coding->dst_object))
7458     {
           /* Re-enable undo and record the whole insertion at once.  */
7459       bset_undo_list (current_buffer, undo_list);
7460       record_insert (coding->dst_pos, coding->produced_char);
7461     }
7462
7463   SAFE_FREE ();
7464 }
7465
7466
7467 /* Extract an annotation datum from a composition starting at POS and
7468    ending before LIMIT of CODING->src_object (buffer or string), store
7469    the data in BUF, set *STOP to a starting position of the next
7470    composition (if any) or to LIMIT, and return the address of the
7471    next element of BUF.
7472
7473    If such an annotation is not found, set *STOP to a starting
7474    position of a composition after POS (if any) or to LIMIT, and
7475    return BUF.  */
7476
7477 static int *
7478 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7479                                struct coding_system *coding, int *buf,
7480                                ptrdiff_t *stop)
7481 {
7482   ptrdiff_t start, end;
7483   Lisp_Object prop;
7484
7485   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7486       || end > limit)
7487     *stop = limit;
7488   else if (start > pos)
7489     *stop = start;
7490   else
7491     {
7492       if (start == pos)
7493         {
7494           /* We found a composition.  Store the corresponding
7495              annotation data in BUF.  */
7496           int *head = buf;
7497           enum composition_method method = composition_method (prop);
7498           int nchars = COMPOSITION_LENGTH (prop);
7499
7500           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7501           if (method != COMPOSITION_RELATIVE)
7502             {
7503               Lisp_Object components;
7504               ptrdiff_t i, len, i_byte;
7505
7506               components = COMPOSITION_COMPONENTS (prop);
7507               if (VECTORP (components))
7508                 {
7509                   len = ASIZE (components);
7510                   for (i = 0; i < len; i++)
7511                     *buf++ = XINT (AREF (components, i));
7512                 }
7513               else if (STRINGP (components))
7514                 {
7515                   len = SCHARS (components);
7516                   i = i_byte = 0;
7517                   while (i < len)
7518                     {
7519                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7520                       buf++;
7521                     }
7522                 }
7523               else if (INTEGERP (components))
7524                 {
7525                   len = 1;
7526                   *buf++ = XINT (components);
7527                 }
7528               else if (CONSP (components))
7529                 {
7530                   for (len = 0; CONSP (components);
7531                        len++, components = XCDR (components))
7532                     *buf++ = XINT (XCAR (components));
7533                 }
7534               else
7535                 emacs_abort ();
7536               *head -= len;
7537             }
7538         }
7539
7540       if (find_composition (end, limit, &start, &end, &prop,
7541                             coding->src_object)
7542           && end <= limit)
7543         *stop = start;
7544       else
7545         *stop = limit;
7546     }
7547   return buf;
7548 }
7549
7550
7551 /* Extract an annotation datum from a text property `charset' at POS of
7552    CODING->src_object (buffer of string), store the data in BUF, set
7553    *STOP to the position where the value of `charset' property changes
7554    (limiting by LIMIT), and return the address of the next element of
7555    BUF.
7556
7557    If the property value is nil, set *STOP to the position where the
7558    property value is non-nil (limiting by LIMIT), and return BUF.  */
7559
7560 static int *
7561 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7562                            struct coding_system *coding, int *buf,
7563                            ptrdiff_t *stop)
7564 {
7565   Lisp_Object val, next;
7566   int id;
7567
7568   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7569   if (! NILP (val) && CHARSETP (val))
7570     id = XINT (CHARSET_SYMBOL_ID (val));
7571   else
7572     id = -1;
7573   ADD_CHARSET_DATA (buf, 0, id);
7574   next = Fnext_single_property_change (make_number (pos), Qcharset,
7575                                        coding->src_object,
7576                                        make_number (limit));
7577   *stop = XINT (next);
7578   return buf;
7579 }
7580
7581
     /* Fill CODING->charbuf with the next run of source characters for
        the encoder, starting CODING->consumed bytes
        (CODING->consumed_char characters) into the source, and update
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        to describe what was read and stored.

        While doing so:
          - insert annotation data taken from the `charset' text
            property of CODING->src_object, when that annotation is
            enabled in CODING->common_flags (composition annotation is
            always switched off here);
          - turn '\r' into '\n' under CODING_MODE_SELECTIVE_DISPLAY;
          - convert '\n' according to the EOL type of the coding system
            unless inhibit_eol_conversion is set;
          - if TRANSLATION_TABLE is non-nil, translate characters
            through it, looking at up to MAX_LOOKUP source characters
            at a time.  */
7582 static void
7583 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7584                int max_lookup)
7585 {
7586   int *buf = coding->charbuf;
7587   int *buf_end = coding->charbuf + coding->charbuf_size;
7588   const unsigned char *src = coding->source + coding->consumed;
7589   const unsigned char *src_end = coding->source + coding->src_bytes;
7590   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7591   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7592   bool multibytep = coding->src_multibyte;
7593   Lisp_Object eol_type;
7594   int c;
7595   ptrdiff_t stop, stop_composition, stop_charset;
7596   int *lookup_buf = NULL;
7597
       /* Scratch array for the translation lookahead; only needed when
          there is a translation table.  */
7598   if (! NILP (translation_table))
7599     lookup_buf = alloca (sizeof (int) * max_lookup);
7600
       /* A vector EOL type is handled like `unix' (no conversion).
          NOTE(review): presumably a vector means the EOL type is still
          undecided -- confirm.  */
7601   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7602   if (VECTORP (eol_type))
7603     eol_type = Qunix;
7604
7605   /* Note: composition handling is not yet implemented.  */
7606   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7607
       /* STOP is the next position at which an annotation handler has to
          be consulted.  Without a source object nothing is annotated, so
          never stop before END_POS.  Because the composition flag was
          just cleared above, the first branch of the composition test
          below is never taken.  */
7608   if (NILP (coding->src_object))
7609     stop = stop_composition = stop_charset = end_pos;
7610   else
7611     {
7612       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7613         stop = stop_composition = pos;
7614       else
7615         stop = stop_composition = end_pos;
7616       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7617         stop = stop_charset = pos;
7618       else
7619         stop_charset = end_pos;
7620     }
7621
7622   /* Compensate for CRLF and conversion.  */
       /* That is, keep 1 + MAX_ANNOTATION_LENGTH elements in reserve so
          that a loop iteration entered with BUF < BUF_END still has room
          for the extra '\r' and for annotation data.  */
7623   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7624   while (buf < buf_end)
7625     {
7626       Lisp_Object trans;
7627
7628       if (pos == stop)
7629         {
7630           if (pos == end_pos)
7631             break;
               /* Let each handler whose stop position was reached store
                  its datum and compute its next stop; the overall STOP
                  is the nearer of the two.  */
7632           if (pos == stop_composition)
7633             buf = handle_composition_annotation (pos, end_pos, coding,
7634                                                  buf, &stop_composition);
7635           if (pos == stop_charset)
7636             buf = handle_charset_annotation (pos, end_pos, coding,
7637                                              buf, &stop_charset);
7638           stop = (stop_composition < stop_charset
7639                   ? stop_composition : stop_charset);
7640         }
7641
           /* Fetch the next character C and advance SRC and POS.  */
7642       if (! multibytep)
7643         {
7644           int bytes;
7645
               /* Unibyte source.  The raw-text and CCL encoders take the
                  bytes as they are.  For the other encoders, a byte
                  sequence that forms a valid multibyte character is read
                  as that character, and any other byte becomes a
                  raw-byte character.  */
7646           if (coding->encoder == encode_coding_raw_text
7647               || coding->encoder == encode_coding_ccl)
7648             c = *src++, pos++;
7649           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7650             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7651           else
7652             c = BYTE8_TO_CHAR (*src), src++, pos++;
7653         }
7654       else
7655         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7656       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7657         c = '\n';
7658       if (! EQ (eol_type, Qunix))
7659         {
7660           if (c == '\n')
7661             {
                   /* For `dos' emit '\r' now and keep C as '\n', giving
                      CRLF; for any other non-unix type replace the
                      newline by '\r'.  */
7662               if (EQ (eol_type, Qdos))
7663                 *buf++ = '\r';
7664               else
7665                 c = '\r';
7666             }
7667         }
7668
           /* TRANS stays nil when C needs no translation.  */
7669       trans = Qnil;
7670       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7671       if (NILP (trans))
7672         *buf++ = c;
7673       else
7674         {
7675           ptrdiff_t from_nchars = 1, to_nchars = 1;
7676           int *lookup_buf_end;
7677           const unsigned char *p = src;
7678           int i;
7679
               /* Collect C and up to MAX_LOOKUP - 1 following source
                  characters (without consuming them) for
                  get_translation.  */
7680           lookup_buf[0] = c;
7681           for (i = 1; i < max_lookup && p < src_end; i++)
7682             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7683           lookup_buf_end = lookup_buf + i;
7684           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* The result is either an integer that replaces C, or a
                  cons (FROM . TO): FROM is a vector whose length is the
                  number of source characters covered, and TO is an
                  integer or a vector of replacement characters.  Any
                  other result ends the loop.  */
7685           if (INTEGERP (trans))
7686             c = XINT (trans);
7687           else if (CONSP (trans))
7688             {
7689               from_nchars = ASIZE (XCAR (trans));
7690               trans = XCDR (trans);
7691               if (INTEGERP (trans))
7692                 c = XINT (trans);
7693               else
7694                 {
7695                   to_nchars = ASIZE (trans);
                       /* Not enough room left for the replacement.  */
7696                   if (buf_end - buf < to_nchars)
7697                     break;
7698                   c = XINT (AREF (trans, 0));
7699                 }
7700             }
7701           else
7702             break;
7703           *buf++ = c;
7704           for (i = 1; i < to_nchars; i++)
7705             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining FROM_NCHARS - 1 source characters
                  covered by the translation.  */
7706           for (i = 1; i < from_nchars; i++, pos++)
7707             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7708         }
7709     }
7710
       /* Record how far we got.  */
7711   coding->consumed = src - coding->source;
7712   coding->consumed_char = pos - coding->src_pos;
7713   coding->charbuf_used = buf - coding->charbuf;
7714   coding->chars_at_source = 0;
7715 }
7716
7717
7718 /* Encode the text at CODING->src_object into CODING->dst_object.
7719    CODING->src_object is a buffer or a string.
7720    CODING->dst_object is a buffer or nil.
7721
7722    If CODING->src_object is a buffer, it must be the current buffer.
7723    In this case, if CODING->src_pos is positive, it is a position of
7724    the source text in the buffer, otherwise. the source text is in the
7725    gap area of the buffer, and coding->src_pos specifies the offset of
7726    the text from GPT (which must be the same as PT).  If this is the
7727    same buffer as CODING->dst_object, CODING->src_pos must be
7728    negative and CODING should not have `pre-write-conversion'.
7729
7730    If CODING->src_object is a string, CODING should not have
7731    `pre-write-conversion'.
7732
7733    If CODING->dst_object is a buffer, the encoded data is inserted at
7734    the current point of that buffer.
7735
7736    If CODING->dst_object is nil, the encoded data is placed at the
7737    memory area specified by CODING->destination.  */
7738
7739 static void
7740 encode_coding (struct coding_system *coding)
7741 {
7742   Lisp_Object attrs;
7743   Lisp_Object translation_table;
7744   int max_lookup;
7745   struct ccl_spec cclspec;
7746
7747   USE_SAFE_ALLOCA;
7748
7749   attrs = CODING_ID_ATTRS (coding->id);
7750   if (coding->encoder == encode_coding_raw_text)
7751     translation_table = Qnil, max_lookup = 0;
7752   else
7753     translation_table = get_translation_table (attrs, 1, &max_lookup);
7754
7755   if (BUFFERP (coding->dst_object))
7756     {
7757       set_buffer_internal (XBUFFER (coding->dst_object));
7758       coding->dst_multibyte
7759         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7760     }
7761
7762   coding->consumed = coding->consumed_char = 0;
7763   coding->produced = coding->produced_char = 0;
7764   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7765
7766   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7767
7768   if (coding->encoder == encode_coding_ccl)
7769     {
7770       coding->spec.ccl = &cclspec;
7771       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7772     }
7773   do {
7774     coding_set_source (coding);
7775     consume_chars (coding, translation_table, max_lookup);
7776     coding_set_destination (coding);
7777     (*(coding->encoder)) (coding);
7778   } while (coding->consumed_char < coding->src_chars);
7779
7780   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7781     insert_from_gap (coding->produced_char, coding->produced, 0);
7782
7783   SAFE_FREE ();
7784 }
7785
7786
7787 /* Name (or base name) of work buffer for code conversion.
        make_conversion_work_buffer creates the reusable work buffer
        under exactly this name, and derives the names of additional
        work buffers from it with Fgenerate_new_buffer_name.  */
7788 static Lisp_Object Vcode_conversion_workbuf_name;
7789
7790 /* A working buffer used by the top level conversion.  Once it is
7791    created, it is never destroyed.  It has the name
7792    Vcode_conversion_workbuf_name.  The other working buffers are
7793    destroyed after the use is finished, and their names are modified
7794    versions of Vcode_conversion_workbuf_name.
        (make_conversion_work_buffer creates it again if it finds that
        the buffer is no longer live.)  */
7795 static Lisp_Object Vcode_conversion_reused_workbuf;
7796
7797 /* True iff Vcode_conversion_reused_workbuf is already in use.
        Set by make_conversion_work_buffer when it hands out that
        buffer, and cleared by code_conversion_restore.  */
7798 static bool reused_workbuf_in_use;
7799
7800
7801 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7802    multibyteness of returning buffer.  */
7803
7804 static Lisp_Object
7805 make_conversion_work_buffer (bool multibyte)
7806 {
7807   Lisp_Object name, workbuf;
7808   struct buffer *current;
7809
7810   if (reused_workbuf_in_use)
7811     {
7812       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7813       workbuf = Fget_buffer_create (name);
7814     }
7815   else
7816     {
7817       reused_workbuf_in_use = 1;
7818       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7819         Vcode_conversion_reused_workbuf
7820           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7821       workbuf = Vcode_conversion_reused_workbuf;
7822     }
7823   current = current_buffer;
7824   set_buffer_internal (XBUFFER (workbuf));
7825   /* We can't allow modification hooks to run in the work buffer.  For
7826      instance, directory_files_internal assumes that file decoding
7827      doesn't compile new regexps.  */
7828   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7829   Ferase_buffer ();
7830   bset_undo_list (current_buffer, Qt);
7831   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7832   set_buffer_internal (current);
7833   return workbuf;
7834 }
7835
7836
7837 static void
7838 code_conversion_restore (Lisp_Object arg)
7839 {
7840   Lisp_Object current, workbuf;
7841   struct gcpro gcpro1;
7842
7843   GCPRO1 (arg);
7844   current = XCAR (arg);
7845   workbuf = XCDR (arg);
7846   if (! NILP (workbuf))
7847     {
7848       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7849         reused_workbuf_in_use = 0;
7850       else
7851         Fkill_buffer (workbuf);
7852     }
7853   set_buffer_internal (XBUFFER (current));
7854   UNGCPRO;
7855 }
7856
7857 Lisp_Object
7858 code_conversion_save (bool with_work_buf, bool multibyte)
7859 {
7860   Lisp_Object workbuf = Qnil;
7861
7862   if (with_work_buf)
7863     workbuf = make_conversion_work_buffer (multibyte);
7864   record_unwind_protect (code_conversion_restore,
7865                          Fcons (Fcurrent_buffer (), workbuf));
7866   return workbuf;
7867 }
7868
     /* Decode, with CODING, source text of CHARS characters and BYTES
        bytes that lies in the gap of the current buffer (the fast path
        below addresses it as the last BYTES bytes before GAP_END_ADDR),
        and insert the result into the same buffer at point.

        The source and destination fields of CODING are set up here.
        On return CODING->produced and CODING->produced_char describe
        the inserted text.

        When the source is unibyte, the coding system is
        ASCII-compatible and has neither a post-read function nor a
        decoding translation table, and the text turns out to be pure
        ASCII or (for a utf-8 type coding system) already known to be
        valid UTF-8, the text is only EOL-converted in place and then
        adopted with insert_from_gap, without running decode_coding.  */
7869 void
7870 decode_coding_gap (struct coding_system *coding,
7871                    ptrdiff_t chars, ptrdiff_t bytes)
7872 {
7873   ptrdiff_t count = SPECPDL_INDEX ();
7874   Lisp_Object attrs;
7875
       /* Source and destination are both the current buffer.  Negative
          source positions tell decode_coding that the source text is in
          the gap.  */
7876   coding->src_object = Fcurrent_buffer ();
7877   coding->src_chars = chars;
7878   coding->src_bytes = bytes;
7879   coding->src_pos = -chars;
7880   coding->src_pos_byte = -bytes;
7881   coding->src_multibyte = chars < bytes;
7882   coding->dst_object = coding->src_object;
7883   coding->dst_pos = PT;
7884   coding->dst_pos_byte = PT_BYTE;
7885   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7886
       /* Reset what detection may record; a negative value means "not
          known yet" in the tests below.  */
7887   coding->head_ascii = -1;
7888   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7889   coding->eol_seen = EOL_SEEN_NONE;
7890   if (CODING_REQUIRE_DETECTION (coding))
7891     detect_coding (coding);
7892   attrs = CODING_ID_ATTRS (coding->id);
7893   if (! disable_ascii_optimization
7894       && ! coding->src_multibyte
7895       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7896       && NILP (CODING_ATTR_POST_READ (attrs))
7897       && NILP (get_translation_table (attrs, 0, NULL)))
7898     {
           /* Fast path.  From here on CHARS is the character count of
              the text if it can be inserted as is, or -1 if real
              decoding is needed.  */
7899       chars = coding->head_ascii;
7900       if (chars < 0)
7901         chars = check_ascii (coding);
7902       if (chars != bytes)
7903         {
7904           /* There exists a non-ASCII byte.  */
7905           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7906               && coding->detected_utf8_bytes == coding->src_bytes)
7907             {
7908               if (coding->detected_utf8_chars >= 0)
7909                 chars = coding->detected_utf8_chars;
7910               else
7911                 chars = check_utf_8 (coding);
                   /* Unless the coding system is explicitly without BOM,
                      drop a leading UTF-8 BOM: one character, three
                      bytes.  */
7912               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7913                   && coding->head_ascii == 0
7914                   && coding->source[0] == UTF_8_BOM_1
7915                   && coding->source[1] == UTF_8_BOM_2
7916                   && coding->source[2] == UTF_8_BOM_3)
7917                 {
7918                   chars--;
7919                   bytes -= 3;
7920                   coding->src_bytes -= 3;
7921                 }
7922             }
7923           else
7924             chars = -1;
7925         }
7926       if (chars >= 0)
7927         {
7928           Lisp_Object eol_type;
7929
               /* If the EOL type is still a vector, settle it from the
                  EOL format seen in the text, if any was seen.  */
7930           eol_type = CODING_ID_EOL_TYPE (coding->id);
7931           if (VECTORP (eol_type))
7932             {
7933               if (coding->eol_seen != EOL_SEEN_NONE)
7934                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7935             }
7936           if (EQ (eol_type, Qmac))
7937             {
                   /* Replace every '\r' by '\n' in place.  */
7938               unsigned char *src_end = GAP_END_ADDR;
7939               unsigned char *src = src_end - coding->src_bytes;
7940
7941               while (src < src_end)
7942                 {
7943                   if (*src++ == '\r')
7944                     src[-1] = '\n';
7945                 }
7946             }
7947           else if (EQ (eol_type, Qdos))
7948             {
                   /* Scan backward from the end of the text, copying
                      bytes toward the end of the region and dropping
                      the '\r' of each "\r\n" pair.  DIFF ends up as the
                      number of bytes dropped.  */
7949               unsigned char *src = GAP_END_ADDR;
7950               unsigned char *src_beg = src - coding->src_bytes;
7951               unsigned char *dst = src;
7952               ptrdiff_t diff;
7953
7954               while (src_beg < src)
7955                 {
7956                   *--dst = *--src;
7957                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7958                     src--;
7959                 }
7960               diff = dst - src;
7961               bytes -= diff;
7962               chars -= diff;
7963             }
7964           coding->produced = bytes;
7965           coding->produced_char = chars;
7966           insert_from_gap (chars, bytes, 1);
7967           return;
7968         }
7969     }
       /* Slow path: run the real decoder.  code_conversion_save
          registers an unwind function that restores the current buffer;
          no work buffer is requested.  */
7970   code_conversion_save (0, 0);
7971
7972   coding->mode |= CODING_MODE_LAST_BLOCK;
7973   current_buffer->text->inhibit_shrinking = 1;
7974   decode_coding (coding);
7975   current_buffer->text->inhibit_shrinking = 0;
7976
7977   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7978     {
7979       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7980       Lisp_Object val;
7981
           /* Call the post-read function with point at the start of the
              decoded text and the number of decoded characters as its
              argument; it must return a non-negative integer.  Then
              adjust the produced counts by however much the function
              changed the size of the buffer.  */
7982       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7983       val = call1 (CODING_ATTR_POST_READ (attrs),
7984                    make_number (coding->produced_char));
7985       CHECK_NATNUM (val);
7986       coding->produced_char += Z - prev_Z;
7987       coding->produced += Z_BYTE - prev_Z_BYTE;
7988     }
7989
7990   unbind_to (count, Qnil);
7991 }
7992
7993
7994 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7995    SRC_OBJECT into DST_OBJECT by coding context CODING.
7996
7997    SRC_OBJECT is a buffer, a string, or Qnil.
7998
7999    If it is a buffer, the text is at point of the buffer.  FROM and TO
8000    are positions in the buffer.
8001
8002    If it is a string, the text is at the beginning of the string.
8003    FROM and TO are indices to the string.
8004
8005    If it is nil, the text is at coding->source.  FROM and TO are
8006    indices to coding->source.
8007
8008    DST_OBJECT is a buffer, Qt, or Qnil.
8009
8010    If it is a buffer, the decoded text is inserted at point of the
8011    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8012    is deleted.
8013
8014    If it is Qt, a string is made from the decoded text, and
8015    set in CODING->dst_object.
8016
8017    If it is Qnil, the decoded text is stored at CODING->destination.
8018    The caller must allocate CODING->dst_bytes bytes at
8019    CODING->destination by xmalloc.  If the decoded text is longer than
8020    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8021  */
8022
8023 void
8024 decode_coding_object (struct coding_system *coding,
8025                       Lisp_Object src_object,
8026                       ptrdiff_t from, ptrdiff_t from_byte,
8027                       ptrdiff_t to, ptrdiff_t to_byte,
8028                       Lisp_Object dst_object)
8029 {
8030   ptrdiff_t count = SPECPDL_INDEX ();
8031   unsigned char *destination IF_LINT (= NULL);
8032   ptrdiff_t dst_bytes IF_LINT (= 0);
8033   ptrdiff_t chars = to - from;
8034   ptrdiff_t bytes = to_byte - from_byte;
8035   Lisp_Object attrs;
8036   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8037   bool need_marker_adjustment = 0;
8038   Lisp_Object old_deactivate_mark;
8039
8040   old_deactivate_mark = Vdeactivate_mark;
8041
8042   if (NILP (dst_object))
8043     {
8044       destination = coding->destination;
8045       dst_bytes = coding->dst_bytes;
8046     }
8047
8048   coding->src_object = src_object;
8049   coding->src_chars = chars;
8050   coding->src_bytes = bytes;
8051   coding->src_multibyte = chars < bytes;
8052
8053   if (STRINGP (src_object))
8054     {
8055       coding->src_pos = from;
8056       coding->src_pos_byte = from_byte;
8057     }
8058   else if (BUFFERP (src_object))
8059     {
8060       set_buffer_internal (XBUFFER (src_object));
8061       if (from != GPT)
8062         move_gap_both (from, from_byte);
8063       if (EQ (src_object, dst_object))
8064         {
8065           struct Lisp_Marker *tail;
8066
8067           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8068             {
8069               tail->need_adjustment
8070                 = tail->charpos == (tail->insertion_type ? from : to);
8071               need_marker_adjustment |= tail->need_adjustment;
8072             }
8073           saved_pt = PT, saved_pt_byte = PT_BYTE;
8074           TEMP_SET_PT_BOTH (from, from_byte);
8075           current_buffer->text->inhibit_shrinking = 1;
8076           del_range_both (from, from_byte, to, to_byte, 1);
8077           coding->src_pos = -chars;
8078           coding->src_pos_byte = -bytes;
8079         }
8080       else
8081         {
8082           coding->src_pos = from;
8083           coding->src_pos_byte = from_byte;
8084         }
8085     }
8086
8087   if (CODING_REQUIRE_DETECTION (coding))
8088     detect_coding (coding);
8089   attrs = CODING_ID_ATTRS (coding->id);
8090
8091   if (EQ (dst_object, Qt)
8092       || (! NILP (CODING_ATTR_POST_READ (attrs))
8093           && NILP (dst_object)))
8094     {
8095       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8096       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8097       coding->dst_pos = BEG;
8098       coding->dst_pos_byte = BEG_BYTE;
8099     }
8100   else if (BUFFERP (dst_object))
8101     {
8102       code_conversion_save (0, 0);
8103       coding->dst_object = dst_object;
8104       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8105       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8106       coding->dst_multibyte
8107         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8108     }
8109   else
8110     {
8111       code_conversion_save (0, 0);
8112       coding->dst_object = Qnil;
8113       /* Most callers presume this will return a multibyte result, and they
8114          won't use `binary' or `raw-text' anyway, so let's not worry about
8115          CODING_FOR_UNIBYTE.  */
8116       coding->dst_multibyte = 1;
8117     }
8118
8119   decode_coding (coding);
8120
8121   if (BUFFERP (coding->dst_object))
8122     set_buffer_internal (XBUFFER (coding->dst_object));
8123
8124   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8125     {
8126       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8127       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8128       Lisp_Object val;
8129
8130       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8131       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8132               old_deactivate_mark);
8133       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8134                         make_number (coding->produced_char));
8135       UNGCPRO;
8136       CHECK_NATNUM (val);
8137       coding->produced_char += Z - prev_Z;
8138       coding->produced += Z_BYTE - prev_Z_BYTE;
8139     }
8140
8141   if (EQ (dst_object, Qt))
8142     {
8143       coding->dst_object = Fbuffer_string ();
8144     }
8145   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8146     {
8147       set_buffer_internal (XBUFFER (coding->dst_object));
8148       if (dst_bytes < coding->produced)
8149         {
8150           eassert (coding->produced > 0);
8151           destination = xrealloc (destination, coding->produced);
8152           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8153             move_gap_both (BEGV, BEGV_BYTE);
8154           memcpy (destination, BEGV_ADDR, coding->produced);
8155           coding->destination = destination;
8156         }
8157     }
8158
8159   if (saved_pt >= 0)
8160     {
8161       /* This is the case of:
8162          (BUFFERP (src_object) && EQ (src_object, dst_object))
8163          As we have moved PT while replacing the original buffer
8164          contents, we must recover it now.  */
8165       set_buffer_internal (XBUFFER (src_object));
8166       current_buffer->text->inhibit_shrinking = 0;
8167       if (saved_pt < from)
8168         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8169       else if (saved_pt < from + chars)
8170         TEMP_SET_PT_BOTH (from, from_byte);
8171       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8172         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8173                           saved_pt_byte + (coding->produced - bytes));
8174       else
8175         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8176                           saved_pt_byte + (coding->produced - bytes));
8177
8178       if (need_marker_adjustment)
8179         {
8180           struct Lisp_Marker *tail;
8181
8182           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8183             if (tail->need_adjustment)
8184               {
8185                 tail->need_adjustment = 0;
8186                 if (tail->insertion_type)
8187                   {
8188                     tail->bytepos = from_byte;
8189                     tail->charpos = from;
8190                   }
8191                 else
8192                   {
8193                     tail->bytepos = from_byte + coding->produced;
8194                     tail->charpos
8195                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8196                          ? tail->bytepos : from + coding->produced_char);
8197                   }
8198               }
8199         }
8200     }
8201
8202   Vdeactivate_mark = old_deactivate_mark;
8203   unbind_to (count, coding->dst_object);
8204 }
8205
8206
8207 void
8208 encode_coding_object (struct coding_system *coding,
8209                       Lisp_Object src_object,
8210                       ptrdiff_t from, ptrdiff_t from_byte,
8211                       ptrdiff_t to, ptrdiff_t to_byte,
8212                       Lisp_Object dst_object)
8213 {
8214   ptrdiff_t count = SPECPDL_INDEX ();
8215   ptrdiff_t chars = to - from;
8216   ptrdiff_t bytes = to_byte - from_byte;
8217   Lisp_Object attrs;
8218   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8219   bool need_marker_adjustment = 0;
8220   bool kill_src_buffer = 0;
8221   Lisp_Object old_deactivate_mark;
8222
8223   old_deactivate_mark = Vdeactivate_mark;
8224
8225   coding->src_object = src_object;
8226   coding->src_chars = chars;
8227   coding->src_bytes = bytes;
8228   coding->src_multibyte = chars < bytes;
8229
8230   attrs = CODING_ID_ATTRS (coding->id);
8231
8232   if (EQ (src_object, dst_object))
8233     {
8234       struct Lisp_Marker *tail;
8235
8236       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8237         {
8238           tail->need_adjustment
8239             = tail->charpos == (tail->insertion_type ? from : to);
8240           need_marker_adjustment |= tail->need_adjustment;
8241         }
8242     }
8243
8244   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8245     {
8246       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8247       set_buffer_internal (XBUFFER (coding->src_object));
8248       if (STRINGP (src_object))
8249         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8250       else if (BUFFERP (src_object))
8251         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8252       else
8253         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8254
8255       if (EQ (src_object, dst_object))
8256         {
8257           set_buffer_internal (XBUFFER (src_object));
8258           saved_pt = PT, saved_pt_byte = PT_BYTE;
8259           del_range_both (from, from_byte, to, to_byte, 1);
8260           set_buffer_internal (XBUFFER (coding->src_object));
8261         }
8262
8263       {
8264         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8265
8266         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8267                 old_deactivate_mark);
8268         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8269                     make_number (BEG), make_number (Z));
8270         UNGCPRO;
8271       }
8272       if (XBUFFER (coding->src_object) != current_buffer)
8273         kill_src_buffer = 1;
8274       coding->src_object = Fcurrent_buffer ();
8275       if (BEG != GPT)
8276         move_gap_both (BEG, BEG_BYTE);
8277       coding->src_chars = Z - BEG;
8278       coding->src_bytes = Z_BYTE - BEG_BYTE;
8279       coding->src_pos = BEG;
8280       coding->src_pos_byte = BEG_BYTE;
8281       coding->src_multibyte = Z < Z_BYTE;
8282     }
8283   else if (STRINGP (src_object))
8284     {
8285       code_conversion_save (0, 0);
8286       coding->src_pos = from;
8287       coding->src_pos_byte = from_byte;
8288     }
8289   else if (BUFFERP (src_object))
8290     {
8291       code_conversion_save (0, 0);
8292       set_buffer_internal (XBUFFER (src_object));
8293       if (EQ (src_object, dst_object))
8294         {
8295           saved_pt = PT, saved_pt_byte = PT_BYTE;
8296           coding->src_object = del_range_1 (from, to, 1, 1);
8297           coding->src_pos = 0;
8298           coding->src_pos_byte = 0;
8299         }
8300       else
8301         {
8302           if (from < GPT && to >= GPT)
8303             move_gap_both (from, from_byte);
8304           coding->src_pos = from;
8305           coding->src_pos_byte = from_byte;
8306         }
8307     }
8308   else
8309     code_conversion_save (0, 0);
8310
8311   if (BUFFERP (dst_object))
8312     {
8313       coding->dst_object = dst_object;
8314       if (EQ (src_object, dst_object))
8315         {
8316           coding->dst_pos = from;
8317           coding->dst_pos_byte = from_byte;
8318         }
8319       else
8320         {
8321           struct buffer *current = current_buffer;
8322
8323           set_buffer_temp (XBUFFER (dst_object));
8324           coding->dst_pos = PT;
8325           coding->dst_pos_byte = PT_BYTE;
8326           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8327           set_buffer_temp (current);
8328         }
8329       coding->dst_multibyte
8330         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8331     }
8332   else if (EQ (dst_object, Qt))
8333     {
8334       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8335       coding->dst_object = Qnil;
8336       coding->destination = xmalloc (dst_bytes);
8337       coding->dst_bytes = dst_bytes;
8338       coding->dst_multibyte = 0;
8339     }
8340   else
8341     {
8342       coding->dst_object = Qnil;
8343       coding->dst_multibyte = 0;
8344     }
8345
8346   encode_coding (coding);
8347
8348   if (EQ (dst_object, Qt))
8349     {
8350       if (BUFFERP (coding->dst_object))
8351         coding->dst_object = Fbuffer_string ();
8352       else if (coding->raw_destination)
8353         /* This is used to avoid creating huge Lisp string.
8354            NOTE: caller who sets `raw_destination' is also
8355            responsible for freeing `destination' buffer.  */
8356         coding->dst_object = Qnil;
8357       else
8358         {
8359           coding->dst_object
8360             = make_unibyte_string ((char *) coding->destination,
8361                                    coding->produced);
8362           xfree (coding->destination);
8363         }
8364     }
8365
8366   if (saved_pt >= 0)
8367     {
8368       /* This is the case of:
8369          (BUFFERP (src_object) && EQ (src_object, dst_object))
8370          As we have moved PT while replacing the original buffer
8371          contents, we must recover it now.  */
8372       set_buffer_internal (XBUFFER (src_object));
8373       if (saved_pt < from)
8374         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8375       else if (saved_pt < from + chars)
8376         TEMP_SET_PT_BOTH (from, from_byte);
8377       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8378         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8379                           saved_pt_byte + (coding->produced - bytes));
8380       else
8381         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8382                           saved_pt_byte + (coding->produced - bytes));
8383
8384       if (need_marker_adjustment)
8385         {
8386           struct Lisp_Marker *tail;
8387
8388           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8389             if (tail->need_adjustment)
8390               {
8391                 tail->need_adjustment = 0;
8392                 if (tail->insertion_type)
8393                   {
8394                     tail->bytepos = from_byte;
8395                     tail->charpos = from;
8396                   }
8397                 else
8398                   {
8399                     tail->bytepos = from_byte + coding->produced;
8400                     tail->charpos
8401                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8402                          ? tail->bytepos : from + coding->produced_char);
8403                   }
8404               }
8405         }
8406     }
8407
8408   if (kill_src_buffer)
8409     Fkill_buffer (coding->src_object);
8410
8411   Vdeactivate_mark = old_deactivate_mark;
8412   unbind_to (count, Qnil);
8413 }
8414
8415
8416 Lisp_Object
8417 preferred_coding_system (void)
8418 {
8419   int id = coding_categories[coding_priorities[0]].id;
8420
8421   return CODING_ID_NAME (id);
8422 }
8423
8424 #if defined (WINDOWSNT) || defined (CYGWIN)
8425
8426 Lisp_Object
8427 from_unicode (Lisp_Object str)
8428 {
8429   CHECK_STRING (str);
8430   if (!STRING_MULTIBYTE (str) &&
8431       SBYTES (str) & 1)
8432     {
8433       str = Fsubstring (str, make_number (0), make_number (-1));
8434     }
8435
8436   return code_convert_string_norecord (str, Qutf_16le, 0);
8437 }
8438
8439 Lisp_Object
8440 from_unicode_buffer (const wchar_t *wstr)
8441 {
8442     return from_unicode (
8443         make_unibyte_string (
8444             (char *) wstr,
8445             /* we get one of the two final 0 bytes for free. */
8446             1 + sizeof (wchar_t) * wcslen (wstr)));
8447 }
8448
8449 wchar_t *
8450 to_unicode (Lisp_Object str, Lisp_Object *buf)
8451 {
8452   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8453   /* We need to make another copy (in addition to the one made by
8454      code_convert_string_norecord) to ensure that the final string is
8455      _doubly_ zero terminated --- that is, that the string is
8456      terminated by two zero bytes and one utf-16le null character.
8457      Because strings are already terminated with a single zero byte,
8458      we just add one additional zero. */
8459   str = make_uninit_string (SBYTES (*buf) + 1);
8460   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8461   SDATA (str) [SBYTES (*buf)] = '\0';
8462   *buf = str;
8463   return WCSDATA (*buf);
8464 }
8465
8466 #endif /* WINDOWSNT || CYGWIN */
8467
8468 \f
8469 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8471
8472 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8473        doc: /* Return t if OBJECT is nil or a coding-system.
8474 See the documentation of `define-coding-system' for information
8475 about coding-system objects.  */)
8476   (Lisp_Object object)
8477 {
8478   if (NILP (object)
8479       || CODING_SYSTEM_ID (object) >= 0)
8480     return Qt;
8481   if (! SYMBOLP (object)
8482       || NILP (Fget (object, Qcoding_system_define_form)))
8483     return Qnil;
8484   return Qt;
8485 }
8486
8487 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8488        Sread_non_nil_coding_system, 1, 1, 0,
8489        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8490   (Lisp_Object prompt)
8491 {
8492   Lisp_Object val;
8493   do
8494     {
8495       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8496                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8497     }
8498   while (SCHARS (val) == 0);
8499   return (Fintern (val, Qnil));
8500 }
8501
8502 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8503        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8504 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8505 Ignores case when completing coding systems (all Emacs coding systems
8506 are lower-case).  */)
8507   (Lisp_Object prompt, Lisp_Object default_coding_system)
8508 {
8509   Lisp_Object val;
8510   ptrdiff_t count = SPECPDL_INDEX ();
8511
8512   if (SYMBOLP (default_coding_system))
8513     default_coding_system = SYMBOL_NAME (default_coding_system);
8514   specbind (Qcompletion_ignore_case, Qt);
8515   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8516                           Qt, Qnil, Qcoding_system_history,
8517                           default_coding_system, Qnil);
8518   unbind_to (count, Qnil);
8519   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8520 }
8521
8522 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8523        1, 1, 0,
8524        doc: /* Check validity of CODING-SYSTEM.
8525 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8526 It is valid if it is nil or a symbol defined as a coding system by the
8527 function `define-coding-system'.  */)
8528   (Lisp_Object coding_system)
8529 {
8530   Lisp_Object define_form;
8531
8532   define_form = Fget (coding_system, Qcoding_system_define_form);
8533   if (! NILP (define_form))
8534     {
8535       Fput (coding_system, Qcoding_system_define_form, Qnil);
8536       safe_eval (define_form);
8537     }
8538   if (!NILP (Fcoding_system_p (coding_system)))
8539     return coding_system;
8540   xsignal1 (Qcoding_system_error, coding_system);
8541 }
8542
8543 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP, it is assumed that the bytes are in correct
   multibyte form but contains only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   SRC_CHARS is recorded as the number of source characters in the
   coding context used for the detection.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.

   If HIGHEST and no candidate was produced, return Qnil.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      bool highest, bool multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* List of candidates.  It holds coding system IDs until the final
     loop replaces each of them by a coding system.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  /* nil means `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  /* Nothing has been checked, found, or rejected yet.  */
  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category IF_LINT (= 0);
      struct coding_system *this IF_LINT (= NULL);
      int c, i;
      bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
                                       inhibit_null_byte_detection);
      bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
                                       inhibit_iso_escape_detection);
      bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the bytes passed before the first
         byte with the high bit set.  The scan stops early once both a
         null byte and an eight-bit byte have been seen.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* Run the ISO-2022 detector only at the first ESC, SI,
                 or SO seen while no detector has run yet.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_ied
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conform to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_nbd)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte, leave only the UTF-16
                     categories to be examined.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              else if (prefer_utf_8
                       && detect_coding_utf_8 (&coding, &detect_info))
                {
                  /* UTF-8 is preferred and matched; rule out all the
                     other categories.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_8;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
                }
              /* Try the categories in priority order, running the
                 detector of each one not checked yet.  If HIGHEST,
                 stop at the first category that is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* Narrow the automatic UTF-16 category down to
                         the byte order that was actually found.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      /* Turn the detection masks into the candidate list VAL.  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: answer
             `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = list1 (make_number (id));
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing was learned at all: answer with the coding system
             of the `undecided' category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = list1 (make_number (id));
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS are as the priority loop above left
                 them.  */
              detect_info.found = 1 << category;
              val = list1 (make_number (this->id));
            }
          else
            /* Nothing was positively found; take the first category,
               in priority order, that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = list1 (make_number (id));
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* First the categories neither rejected nor found, walked
             from the lowest priority up.  Note that VAL is replaced,
             not extended, each time such a category has a coding
             system defined.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = list1 (make_number (id));
                }
            }
          /* Then push the found categories in front, also from the
             lowest priority up, so that the highest priority ends up
             first in VAL.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide whether the UTF-8 text has a signature or not.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = list1 (make_number (this->id));
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide which UTF-16 variant the text is in.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = list1 (make_number (this->id));
        }
    }
  else
    {
      /* The text-format is already specified; take it as is.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = list1 (make_number (coding.id));
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* End-of-line format seen under each interpretation of the bytes;
       -1 while not determined.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format is not specified yet; detect it for each
           kind of candidate that was found.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format is already specified; use it for every
           candidate.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each ID in VAL: by the element of the candidate's
       eol_type vector selected by the eol-format seen (index 0 for
       LF, 1 for CRLF, 2 for CR), or else by the name of the coding
       system.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  /* With HIGHEST, hand back only the first candidate (or Qnil if
     there is none).  */
  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8875
8876
8877 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8878        2, 3, 0,
8879        doc: /* Detect coding system of the text in the region between START and END.
8880 Return a list of possible coding systems ordered by priority.
8881 The coding systems to try and their priorities follows what
8882 the function `coding-system-priority-list' (which see) returns.
8883
8884 If only ASCII characters are found (except for such ISO-2022 control
8885 characters as ESC), it returns a list of single element `undecided'
8886 or its subsidiary coding system according to a detected end-of-line
8887 format.
8888
8889 If optional argument HIGHEST is non-nil, return the coding system of
8890 highest priority.  */)
8891   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8892 {
8893   ptrdiff_t from, to;
8894   ptrdiff_t from_byte, to_byte;
8895
8896   validate_region (&start, &end);
8897   from = XINT (start), to = XINT (end);
8898   from_byte = CHAR_TO_BYTE (from);
8899   to_byte = CHAR_TO_BYTE (to);
8900
8901   if (from < GPT && to >= GPT)
8902     move_gap_both (to, to_byte);
8903
8904   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8905                                to - from, to_byte - from_byte,
8906                                !NILP (highest),
8907                                !NILP (BVAR (current_buffer
8908                                       , enable_multibyte_characters)),
8909                                Qnil);
8910 }
8911
8912 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8913        1, 2, 0,
8914        doc: /* Detect coding system of the text in STRING.
8915 Return a list of possible coding systems ordered by priority.
8916 The coding systems to try and their priorities follows what
8917 the function `coding-system-priority-list' (which see) returns.
8918
8919 If only ASCII characters are found (except for such ISO-2022 control
8920 characters as ESC), it returns a list of single element `undecided'
8921 or its subsidiary coding system according to a detected end-of-line
8922 format.
8923
8924 If optional argument HIGHEST is non-nil, return the coding system of
8925 highest priority.  */)
8926   (Lisp_Object string, Lisp_Object highest)
8927 {
8928   CHECK_STRING (string);
8929
8930   return detect_coding_system (SDATA (string),
8931                                SCHARS (string), SBYTES (string),
8932                                !NILP (highest), STRING_MULTIBYTE (string),
8933                                Qnil);
8934 }
8935
8936
8937 static bool
8938 char_encodable_p (int c, Lisp_Object attrs)
8939 {
8940   Lisp_Object tail;
8941   struct charset *charset;
8942   Lisp_Object translation_table;
8943
8944   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8945   if (! NILP (translation_table))
8946     c = translate_char (translation_table, c);
8947   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8948        CONSP (tail); tail = XCDR (tail))
8949     {
8950       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8951       if (CHAR_CHARSET_P (c, charset))
8952         break;
8953     }
8954   return (! NILP (tail));
8955 }
8956
8957
/* Return a list of coding systems that safely encode the text between
   START and END.  If EXCLUDE is non-nil, it is a list of coding
   systems not to check.  The returned list doesn't contain any such
   coding systems.  In any case, if the text contains only ASCII or is
   unibyte, return t.

   START may also be a string; in that case the whole string is
   examined and END is not consulted.  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
{
  Lisp_Object coding_attrs_list, safe_codings;
  ptrdiff_t start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte one whose character count
         equals its byte count, needs no checking.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* Same shortcut as for strings: as many bytes as characters.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* The gap lies strictly inside the region: move it to whichever
         end of the region is nearer, presumably so that the region
         can be scanned below as one contiguous run of bytes.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of the candidate coding systems:
     every entry of Vcoding_system_list that is not in EXCLUDE and that
     is its own base name.  Each vector also gets its translation-table
     slot filled in, which char_encodable_p reads later.  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
          {
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim leading and trailing ASCII bytes; they are never tested.  */
  while (p < pend && ASCII_CHAR_P (*p)) p++;
  while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;

  /* WORK_TABLE records the characters already tested, so that each
     distinct character is checked against the candidates only once.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_CHAR_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Drop, in place, every candidate that cannot encode C.  TAIL
             is advanced only when the current cell is kept or is the
             last one.  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  /* Delete this cell by copying the next cell over it;
                     the copied element is examined on the next
                     iteration.  */
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  /* Last cell: it cannot be unlinked from here, so mark
                     it with nil; nil entries are skipped above and when
                     building the result.  */
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          /* charset_map_loaded was cleared before the checks; if it is
             now set, re-derive P and PEND from their offsets against a
             freshly fetched base address (presumably the text may have
             been relocated while a charset map was loaded).  */
          if (charset_map_loaded)
            {
              ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* raw-text and no-conversion are always part of the answer; prepend
     the base name of every candidate that survived.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
9084
9085
DEFUN ("unencodable-char-position", Funencodable_char_position,
       Sunencodable_char_position, 3, 5, 0,
       doc: /* Return position of first un-encodable character in a region.
START and END specify the region and CODING-SYSTEM specifies the
encoding to check.  Return nil if CODING-SYSTEM does encode the region.

If optional 4th argument COUNT is non-nil, it specifies at most how
many un-encodable characters to search.  In this case, the value is a
list of positions.

If optional 5th argument STRING is non-nil, it is a string to search
for un-encodable characters.  In that case, START and END are indexes
to the string and treated as in `substring'.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
   Lisp_Object count, Lisp_Object string)
{
  EMACS_INT n;
  struct coding_system coding;
  Lisp_Object attrs, charset_list, translation_table;
  Lisp_Object positions;
  ptrdiff_t from, to;
  const unsigned char *p, *stop, *pend;
  bool ascii_compatible;

  setup_coding_system (Fcheck_coding_system (coding_system), &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  /* For a coding system of type raw-text, nothing is ever reported.  */
  if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
    return Qnil;
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  translation_table = get_translation_table (attrs, 1, NULL);

  if (NILP (string))
    {
      /* Searching the current buffer.  */
      validate_region (&start, &end);
      from = XINT (start);
      to = XINT (end);
      /* Nothing to report for a unibyte buffer, or when the coding
         system is ASCII compatible and the region has as many bytes
         as characters.  */
      if (NILP (BVAR (current_buffer, enable_multibyte_characters))
          || (ascii_compatible
              && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
        return Qnil;
      p = CHAR_POS_ADDR (from);
      pend = CHAR_POS_ADDR (to);
      /* If the region starts before the gap and reaches it, scan up to
         the gap first; the loop below then continues after the gap.  */
      if (from < GPT && to >= GPT)
        stop = GPT_ADDR;
      else
        stop = pend;
    }
  else
    {
      /* Searching STRING; START and END are validated as indexes into
         it and converted to FROM and TO.  */
      CHECK_STRING (string);
      validate_subarray (string, start, end, SCHARS (string), &from, &to);
      if (! STRING_MULTIBYTE (string))
        return Qnil;
      p = SDATA (string) + string_char_to_byte (string, from);
      stop = pend = SDATA (string) + string_char_to_byte (string, to);
      if (ascii_compatible && (to - from) == (pend - p))
        return Qnil;
    }

  /* N is the number of positions still to be collected; without COUNT
     only the first one is wanted.
     NOTE(review): with COUNT equal to 0, N drops to -1 on the first
     hit and never equals 0, so every un-encodable position is
     collected -- confirm this is intended.  */
  if (NILP (count))
    n = 1;
  else
    {
      CHECK_NATNUM (count);
      n = XINT (count);
    }

  positions = Qnil;
  charset_map_loaded = 0;
  while (1)
    {
      int c;

      /* Skip ASCII bytes quickly; FROM tracks the character position
         that corresponds to P.  */
      if (ascii_compatible)
        while (p < stop && ASCII_CHAR_P (*p))
          p++, from++;
      if (p >= stop)
        {
          if (p >= pend)
            break;
          /* STOP was the gap: resume at the end of the gap and scan up
             to the real end of the region.  */
          stop = pend;
          p = GAP_END_ADDR;
        }

      c = STRING_CHAR_ADVANCE (p);
      if (! (ASCII_CHAR_P (c) && ascii_compatible)
          && ! char_charset (translate_char (translation_table, c),
                             charset_list, NULL))
        {
          /* No charset in CHARSET_LIST accepts the translated C.  */
          positions = Fcons (make_number (from), positions);
          n--;
          if (n == 0)
            break;
        }

      from++;
      /* In the buffer case, if charset_map_loaded became set, recompute
         all pointers from the character positions (presumably the
         buffer text may have been relocated meanwhile).  */
      if (charset_map_loaded && NILP (string))
        {
          p = CHAR_POS_ADDR (from);
          pend = CHAR_POS_ADDR (to);
          if (from < GPT && to >= GPT)
            stop = GPT_ADDR;
          else
            stop = pend;
          charset_map_loaded = 0;
        }
    }

  /* POSITIONS was accumulated in reverse.  Without COUNT, return the
     single position found (or nil); otherwise return the list in
     ascending order.  */
  return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
}
9197
9198
9199 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9200        Scheck_coding_systems_region, 3, 3, 0,
9201        doc: /* Check if the region is encodable by coding systems.
9202
9203 START and END are buffer positions specifying the region.
9204 CODING-SYSTEM-LIST is a list of coding systems to check.
9205
9206 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9207 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9208 whole region, POS0, POS1, ... are buffer positions where non-encodable
9209 characters are found.
9210
9211 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9212 value is nil.
9213
9214 START may be a string.  In that case, check if the string is
9215 encodable, and the value contains indices to the string instead of
9216 buffer positions.  END is ignored.
9217
9218 If the current buffer (or START if it is a string) is unibyte, the value
9219 is nil.  */)
9220   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9221 {
9222   Lisp_Object list;
9223   ptrdiff_t start_byte, end_byte;
9224   ptrdiff_t pos;
9225   const unsigned char *p, *pbeg, *pend;
9226   int c;
9227   Lisp_Object tail, elt, attrs;
9228
9229   if (STRINGP (start))
9230     {
9231       if (!STRING_MULTIBYTE (start)
9232           || SCHARS (start) == SBYTES (start))
9233         return Qnil;
9234       start_byte = 0;
9235       end_byte = SBYTES (start);
9236       pos = 0;
9237     }
9238   else
9239     {
9240       CHECK_NUMBER_COERCE_MARKER (start);
9241       CHECK_NUMBER_COERCE_MARKER (end);
9242       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9243         args_out_of_range (start, end);
9244       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9245         return Qnil;
9246       start_byte = CHAR_TO_BYTE (XINT (start));
9247       end_byte = CHAR_TO_BYTE (XINT (end));
9248       if (XINT (end) - XINT (start) == end_byte - start_byte)
9249         return Qnil;
9250
9251       if (XINT (start) < GPT && XINT (end) > GPT)
9252         {
9253           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9254             move_gap_both (XINT (start), start_byte);
9255           else
9256             move_gap_both (XINT (end), end_byte);
9257         }
9258       pos = XINT (start);
9259     }
9260
9261   list = Qnil;
9262   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9263     {
9264       elt = XCAR (tail);
9265       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9266       ASET (attrs, coding_attr_trans_tbl,
9267             get_translation_table (attrs, 1, NULL));
9268       list = Fcons (list2 (elt, attrs), list);
9269     }
9270
9271   if (STRINGP (start))
9272     p = pbeg = SDATA (start);
9273   else
9274     p = pbeg = BYTE_POS_ADDR (start_byte);
9275   pend = p + (end_byte - start_byte);
9276
9277   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9278   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9279
9280   while (p < pend)
9281     {
9282       if (ASCII_CHAR_P (*p))
9283         p++;
9284       else
9285         {
9286           c = STRING_CHAR_ADVANCE (p);
9287
9288           charset_map_loaded = 0;
9289           for (tail = list; CONSP (tail); tail = XCDR (tail))
9290             {
9291               elt = XCDR (XCAR (tail));
9292               if (! char_encodable_p (c, XCAR (elt)))
9293                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9294             }
9295           if (charset_map_loaded)
9296             {
9297               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9298
9299               if (STRINGP (start))
9300                 pbeg = SDATA (start);
9301               else
9302                 pbeg = BYTE_POS_ADDR (start_byte);
9303               p = pbeg + p_offset;
9304               pend = pbeg + pend_offset;
9305             }
9306         }
9307       pos++;
9308     }
9309
9310   tail = list;
9311   list = Qnil;
9312   for (; CONSP (tail); tail = XCDR (tail))
9313     {
9314       elt = XCAR (tail);
9315       if (CONSP (XCDR (XCDR (elt))))
9316         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9317                       list);
9318     }
9319
9320   return list;
9321 }
9322
9323
9324 static Lisp_Object
9325 code_convert_region (Lisp_Object start, Lisp_Object end,
9326                      Lisp_Object coding_system, Lisp_Object dst_object,
9327                      bool encodep, bool norecord)
9328 {
9329   struct coding_system coding;
9330   ptrdiff_t from, from_byte, to, to_byte;
9331   Lisp_Object src_object;
9332
9333   if (NILP (coding_system))
9334     coding_system = Qno_conversion;
9335   else
9336     CHECK_CODING_SYSTEM (coding_system);
9337   src_object = Fcurrent_buffer ();
9338   if (NILP (dst_object))
9339     dst_object = src_object;
9340   else if (! EQ (dst_object, Qt))
9341     CHECK_BUFFER (dst_object);
9342
9343   validate_region (&start, &end);
9344   from = XFASTINT (start);
9345   from_byte = CHAR_TO_BYTE (from);
9346   to = XFASTINT (end);
9347   to_byte = CHAR_TO_BYTE (to);
9348
9349   setup_coding_system (coding_system, &coding);
9350   coding.mode |= CODING_MODE_LAST_BLOCK;
9351
9352   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9353     {
9354       struct buffer *buf = XBUFFER (dst_object);
9355       ptrdiff_t buf_pt = BUF_PT (buf);
9356
9357       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9358     }
9359
9360   if (encodep)
9361     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9362                           dst_object);
9363   else
9364     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9365                           dst_object);
9366   if (! norecord)
9367     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9368
9369   return (BUFFERP (dst_object)
9370           ? make_number (coding.produced_char)
9371           : coding.dst_object);
9372 }
9373
9374
9375 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9376        3, 4, "r\nzCoding system: ",
9377        doc: /* Decode the current region from the specified coding system.
9378 When called from a program, takes four arguments:
9379         START, END, CODING-SYSTEM, and DESTINATION.
9380 START and END are buffer positions.
9381
9382 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9383 If nil, the region between START and END is replaced by the decoded text.
9384 If buffer, the decoded text is inserted in that buffer after point (point
9385 does not move).
9386 In those cases, the length of the decoded text is returned.
9387 If DESTINATION is t, the decoded text is returned.
9388
9389 This function sets `last-coding-system-used' to the precise coding system
9390 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9391 not fully specified.)  */)
9392   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9393 {
9394   return code_convert_region (start, end, coding_system, destination, 0, 0);
9395 }
9396
9397 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9398        3, 4, "r\nzCoding system: ",
9399        doc: /* Encode the current region by specified coding system.
9400 When called from a program, takes four arguments:
9401         START, END, CODING-SYSTEM and DESTINATION.
9402 START and END are buffer positions.
9403
9404 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9405 If nil, the region between START and END is replace by the encoded text.
9406 If buffer, the encoded text is inserted in that buffer after point (point
9407 does not move).
9408 In those cases, the length of the encoded text is returned.
9409 If DESTINATION is t, the encoded text is returned.
9410
9411 This function sets `last-coding-system-used' to the precise coding system
9412 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9413 not fully specified.)  */)
9414   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9415 {
9416   return code_convert_region (start, end, coding_system, destination, 1, 0);
9417 }
9418
9419 Lisp_Object
9420 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9421                      Lisp_Object dst_object, bool encodep, bool nocopy,
9422                      bool norecord)
9423 {
9424   struct coding_system coding;
9425   ptrdiff_t chars, bytes;
9426
9427   CHECK_STRING (string);
9428   if (NILP (coding_system))
9429     {
9430       if (! norecord)
9431         Vlast_coding_system_used = Qno_conversion;
9432       if (NILP (dst_object))
9433         return (nocopy ? Fcopy_sequence (string) : string);
9434     }
9435
9436   if (NILP (coding_system))
9437     coding_system = Qno_conversion;
9438   else
9439     CHECK_CODING_SYSTEM (coding_system);
9440   if (NILP (dst_object))
9441     dst_object = Qt;
9442   else if (! EQ (dst_object, Qt))
9443     CHECK_BUFFER (dst_object);
9444
9445   setup_coding_system (coding_system, &coding);
9446   coding.mode |= CODING_MODE_LAST_BLOCK;
9447   chars = SCHARS (string);
9448   bytes = SBYTES (string);
9449
9450   if (BUFFERP (dst_object))
9451     {
9452       struct buffer *buf = XBUFFER (dst_object);
9453       ptrdiff_t buf_pt = BUF_PT (buf);
9454
9455       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9456     }
9457
9458   if (encodep)
9459     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9460   else
9461     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9462   if (! norecord)
9463     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9464
9465   return (BUFFERP (dst_object)
9466           ? make_number (coding.produced_char)
9467           : coding.dst_object);
9468 }
9469
9470
9471 /* Encode or decode STRING according to CODING_SYSTEM.
9472    Do not set Vlast_coding_system_used.
9473
9474    This function is called only from macros DECODE_FILE and
9475    ENCODE_FILE, thus we ignore character composition.  */
9476
9477 Lisp_Object
9478 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9479                               bool encodep)
9480 {
9481   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9482 }
9483
9484 /* Encode or decode a file name, to or from a unibyte string suitable
9485    for passing to C library functions.  */
9486 Lisp_Object
9487 decode_file_name (Lisp_Object fname)
9488 {
9489 #ifdef WINDOWSNT
9490   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9491      converts the file names either to UTF-16LE or to the system ANSI
9492      codepage internally, depending on the underlying OS; see w32.c.  */
9493   if (! NILP (Fcoding_system_p (Qutf_8)))
9494     return code_convert_string_norecord (fname, Qutf_8, 0);
9495   return fname;
9496 #else  /* !WINDOWSNT */
9497   if (! NILP (Vfile_name_coding_system))
9498     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9499   else if (! NILP (Vdefault_file_name_coding_system))
9500     return code_convert_string_norecord (fname,
9501                                          Vdefault_file_name_coding_system, 0);
9502   else
9503     return fname;
9504 #endif
9505 }
9506
9507 Lisp_Object
9508 encode_file_name (Lisp_Object fname)
9509 {
9510   /* This is especially important during bootstrap and dumping, when
9511      file-name encoding is not yet known, and therefore any non-ASCII
9512      file names are unibyte strings, and could only be thrashed if we
9513      try to encode them.  */
9514   if (!STRING_MULTIBYTE (fname))
9515     return fname;
9516 #ifdef WINDOWSNT
9517   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9518      converts the file names either to UTF-16LE or to the system ANSI
9519      codepage internally, depending on the underlying OS; see w32.c.  */
9520   if (! NILP (Fcoding_system_p (Qutf_8)))
9521     return code_convert_string_norecord (fname, Qutf_8, 1);
9522   return fname;
9523 #else  /* !WINDOWSNT */
9524   if (! NILP (Vfile_name_coding_system))
9525     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9526   else if (! NILP (Vdefault_file_name_coding_system))
9527     return code_convert_string_norecord (fname,
9528                                          Vdefault_file_name_coding_system, 1);
9529   else
9530     return fname;
9531 #endif
9532 }
9533
9534 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9535        2, 4, 0,
9536        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9537
9538 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9539 if the decoding operation is trivial.
9540
9541 Optional fourth arg BUFFER non-nil means that the decoded text is
9542 inserted in that buffer after point (point does not move).  In this
9543 case, the return value is the length of the decoded text.
9544
9545 This function sets `last-coding-system-used' to the precise coding system
9546 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9547 not fully specified.)  */)
9548   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9549 {
9550   return code_convert_string (string, coding_system, buffer,
9551                               0, ! NILP (nocopy), 0);
9552 }
9553
9554 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9555        2, 4, 0,
9556        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9557
9558 Optional third arg NOCOPY non-nil means it is OK to return STRING
9559 itself if the encoding operation is trivial.
9560
9561 Optional fourth arg BUFFER non-nil means that the encoded text is
9562 inserted in that buffer after point (point does not move).  In this
9563 case, the return value is the length of the encoded text.
9564
9565 This function sets `last-coding-system-used' to the precise coding system
9566 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9567 not fully specified.)  */)
9568   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9569 {
9570   return code_convert_string (string, coding_system, buffer,
9571                               1, ! NILP (nocopy), 0);
9572 }
9573
9574 \f
9575 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9576        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9577 Return the corresponding character.  */)
9578   (Lisp_Object code)
9579 {
9580   Lisp_Object spec, attrs, val;
9581   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9582   EMACS_INT ch;
9583   int c;
9584
9585   CHECK_NATNUM (code);
9586   ch = XFASTINT (code);
9587   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9588   attrs = AREF (spec, 0);
9589
9590   if (ASCII_CHAR_P (ch)
9591       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9592     return code;
9593
9594   val = CODING_ATTR_CHARSET_LIST (attrs);
9595   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9596   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9597   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9598
9599   if (ch <= 0x7F)
9600     {
9601       c = ch;
9602       charset = charset_roman;
9603     }
9604   else if (ch >= 0xA0 && ch < 0xDF)
9605     {
9606       c = ch - 0x80;
9607       charset = charset_kana;
9608     }
9609   else
9610     {
9611       EMACS_INT c1 = ch >> 8;
9612       int c2 = ch & 0xFF;
9613
9614       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9615           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9616         error ("Invalid code: %"pI"d", ch);
9617       c = ch;
9618       SJIS_TO_JIS (c);
9619       charset = charset_kanji;
9620     }
9621   c = DECODE_CHAR (charset, c);
9622   if (c < 0)
9623     error ("Invalid code: %"pI"d", ch);
9624   return make_number (c);
9625 }
9626
9627
9628 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9629        doc: /* Encode a Japanese character CH to shift_jis encoding.
9630 Return the corresponding code in SJIS.  */)
9631   (Lisp_Object ch)
9632 {
9633   Lisp_Object spec, attrs, charset_list;
9634   int c;
9635   struct charset *charset;
9636   unsigned code;
9637
9638   CHECK_CHARACTER (ch);
9639   c = XFASTINT (ch);
9640   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9641   attrs = AREF (spec, 0);
9642
9643   if (ASCII_CHAR_P (c)
9644       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9645     return ch;
9646
9647   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9648   charset = char_charset (c, charset_list, &code);
9649   if (code == CHARSET_INVALID_CODE (charset))
9650     error ("Can't encode by shift_jis encoding: %c", c);
9651   JIS_TO_SJIS (code);
9652
9653   return make_number (code);
9654 }
9655
9656 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9657        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9658 Return the corresponding character.  */)
9659   (Lisp_Object code)
9660 {
9661   Lisp_Object spec, attrs, val;
9662   struct charset *charset_roman, *charset_big5, *charset;
9663   EMACS_INT ch;
9664   int c;
9665
9666   CHECK_NATNUM (code);
9667   ch = XFASTINT (code);
9668   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9669   attrs = AREF (spec, 0);
9670
9671   if (ASCII_CHAR_P (ch)
9672       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9673     return code;
9674
9675   val = CODING_ATTR_CHARSET_LIST (attrs);
9676   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9677   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9678
9679   if (ch <= 0x7F)
9680     {
9681       c = ch;
9682       charset = charset_roman;
9683     }
9684   else
9685     {
9686       EMACS_INT b1 = ch >> 8;
9687       int b2 = ch & 0x7F;
9688       if (b1 < 0xA1 || b1 > 0xFE
9689           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9690         error ("Invalid code: %"pI"d", ch);
9691       c = ch;
9692       charset = charset_big5;
9693     }
9694   c = DECODE_CHAR (charset, c);
9695   if (c < 0)
9696     error ("Invalid code: %"pI"d", ch);
9697   return make_number (c);
9698 }
9699
9700 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9701        doc: /* Encode the Big5 character CH to BIG5 coding system.
9702 Return the corresponding character code in Big5.  */)
9703   (Lisp_Object ch)
9704 {
9705   Lisp_Object spec, attrs, charset_list;
9706   struct charset *charset;
9707   int c;
9708   unsigned code;
9709
9710   CHECK_CHARACTER (ch);
9711   c = XFASTINT (ch);
9712   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9713   attrs = AREF (spec, 0);
9714   if (ASCII_CHAR_P (c)
9715       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9716     return ch;
9717
9718   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9719   charset = char_charset (c, charset_list, &code);
9720   if (code == CHARSET_INVALID_CODE (charset))
9721     error ("Can't encode by Big5 encoding: %c", c);
9722
9723   return make_number (code);
9724 }
9725
9726 \f
9727 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9728        Sset_terminal_coding_system_internal, 1, 2, 0,
9729        doc: /* Internal use only.  */)
9730   (Lisp_Object coding_system, Lisp_Object terminal)
9731 {
9732   struct terminal *term = get_terminal (terminal, 1);
9733   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9734   CHECK_SYMBOL (coding_system);
9735   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9736   /* We had better not send unsafe characters to terminal.  */
9737   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9738   /* Character composition should be disabled.  */
9739   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9740   terminal_coding->src_multibyte = 1;
9741   terminal_coding->dst_multibyte = 0;
9742   tset_charset_list
9743     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9744             ? coding_charset_list (terminal_coding)
9745             : list1 (make_number (charset_ascii))));
9746   return Qnil;
9747 }
9748
9749 DEFUN ("set-safe-terminal-coding-system-internal",
9750        Fset_safe_terminal_coding_system_internal,
9751        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9752        doc: /* Internal use only.  */)
9753   (Lisp_Object coding_system)
9754 {
9755   CHECK_SYMBOL (coding_system);
9756   setup_coding_system (Fcheck_coding_system (coding_system),
9757                        &safe_terminal_coding);
9758   /* Character composition should be disabled.  */
9759   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9760   safe_terminal_coding.src_multibyte = 1;
9761   safe_terminal_coding.dst_multibyte = 0;
9762   return Qnil;
9763 }
9764
9765 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9766        Sterminal_coding_system, 0, 1, 0,
9767        doc: /* Return coding system specified for terminal output on the given terminal.
9768 TERMINAL may be a terminal object, a frame, or nil for the selected
9769 frame's terminal device.  */)
9770   (Lisp_Object terminal)
9771 {
9772   struct coding_system *terminal_coding
9773     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9774   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9775
9776   /* For backward compatibility, return nil if it is `undecided'.  */
9777   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9778 }
9779
9780 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9781        Sset_keyboard_coding_system_internal, 1, 2, 0,
9782        doc: /* Internal use only.  */)
9783   (Lisp_Object coding_system, Lisp_Object terminal)
9784 {
9785   struct terminal *t = get_terminal (terminal, 1);
9786   CHECK_SYMBOL (coding_system);
9787   if (NILP (coding_system))
9788     coding_system = Qno_conversion;
9789   else
9790     Fcheck_coding_system (coding_system);
9791   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9792   /* Character composition should be disabled.  */
9793   TERMINAL_KEYBOARD_CODING (t)->common_flags
9794     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9795   return Qnil;
9796 }
9797
9798 DEFUN ("keyboard-coding-system",
9799        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9800        doc: /* Return coding system specified for decoding keyboard input.  */)
9801   (Lisp_Object terminal)
9802 {
9803   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9804                          (get_terminal (terminal, 1))->id);
9805 }
9806
9807 \f
9808 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9809        Sfind_operation_coding_system,  1, MANY, 0,
9810        doc: /* Choose a coding system for an operation based on the target name.
9811 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9812 DECODING-SYSTEM is the coding system to use for decoding
9813 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9814 for encoding (in case OPERATION does encoding).
9815
9816 The first argument OPERATION specifies an I/O primitive:
9817   For file I/O, `insert-file-contents' or `write-region'.
9818   For process I/O, `call-process', `call-process-region', or `start-process'.
9819   For network I/O, `open-network-stream'.
9820
9821 The remaining arguments should be the same arguments that were passed
9822 to the primitive.  Depending on which primitive, one of those arguments
9823 is selected as the TARGET.  For example, if OPERATION does file I/O,
9824 whichever argument specifies the file name is TARGET.
9825
9826 TARGET has a meaning which depends on OPERATION:
9827   For file I/O, TARGET is a file name (except for the special case below).
9828   For process I/O, TARGET is a process name.
9829   For network I/O, TARGET is a service name or a port number.
9830
9831 This function looks up what is specified for TARGET in
9832 `file-coding-system-alist', `process-coding-system-alist',
9833 or `network-coding-system-alist' depending on OPERATION.
9834 They may specify a coding system, a cons of coding systems,
9835 or a function symbol to call.
9836 In the last case, we call the function with one argument,
9837 which is a list of all the arguments given to this function.
9838 If the function can't decide a coding system, it can return
9839 `undecided' so that the normal code-detection is performed.
9840
9841 If OPERATION is `insert-file-contents', the argument corresponding to
9842 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9843 file name to look up, and BUFFER is a buffer that contains the file's
9844 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9845 function to call for FILENAME, that function should examine the
9846 contents of BUFFER instead of reading the file.
9847
9848 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9849   (ptrdiff_t nargs, Lisp_Object *args)
9850 {
9851   Lisp_Object operation, target_idx, target, val;
9852   register Lisp_Object chain;
9853
9854   if (nargs < 2)
9855     error ("Too few arguments");
9856   operation = args[0];
9857   if (!SYMBOLP (operation)
9858       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9859     error ("Invalid first argument");
9860   if (nargs <= 1 + XFASTINT (target_idx))
9861     error ("Too few arguments for operation `%s'",
9862            SDATA (SYMBOL_NAME (operation)));
9863   target = args[XFASTINT (target_idx) + 1];
9864   if (!(STRINGP (target)
9865         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9866             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9867         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9868     error ("Invalid argument %"pI"d of operation `%s'",
9869            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9870   if (CONSP (target))
9871     target = XCAR (target);
9872
9873   chain = ((EQ (operation, Qinsert_file_contents)
9874             || EQ (operation, Qwrite_region))
9875            ? Vfile_coding_system_alist
9876            : (EQ (operation, Qopen_network_stream)
9877               ? Vnetwork_coding_system_alist
9878               : Vprocess_coding_system_alist));
9879   if (NILP (chain))
9880     return Qnil;
9881
9882   for (; CONSP (chain); chain = XCDR (chain))
9883     {
9884       Lisp_Object elt;
9885
9886       elt = XCAR (chain);
9887       if (CONSP (elt)
9888           && ((STRINGP (target)
9889                && STRINGP (XCAR (elt))
9890                && fast_string_match (XCAR (elt), target) >= 0)
9891               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9892         {
9893           val = XCDR (elt);
9894           /* Here, if VAL is both a valid coding system and a valid
9895              function symbol, we return VAL as a coding system.  */
9896           if (CONSP (val))
9897             return val;
9898           if (! SYMBOLP (val))
9899             return Qnil;
9900           if (! NILP (Fcoding_system_p (val)))
9901             return Fcons (val, val);
9902           if (! NILP (Ffboundp (val)))
9903             {
9904               /* We use call1 rather than safe_call1
9905                  so as to get bug reports about functions called here
9906                  which don't handle the current interface.  */
9907               val = call1 (val, Flist (nargs, args));
9908               if (CONSP (val))
9909                 return val;
9910               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9911                 return Fcons (val, val);
9912             }
9913           return Qnil;
9914         }
9915     }
9916   return Qnil;
9917 }
9918
9919 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9920        Sset_coding_system_priority, 0, MANY, 0,
9921        doc: /* Assign higher priority to the coding systems given as arguments.
9922 If multiple coding systems belong to the same category,
9923 all but the first one are ignored.
9924
9925 usage: (set-coding-system-priority &rest coding-systems)  */)
9926   (ptrdiff_t nargs, Lisp_Object *args)
9927 {
9928   ptrdiff_t i, j;
9929   bool changed[coding_category_max];
9930   enum coding_category priorities[coding_category_max];
9931
9932   memset (changed, 0, sizeof changed);
9933
9934   for (i = j = 0; i < nargs; i++)
9935     {
9936       enum coding_category category;
9937       Lisp_Object spec, attrs;
9938
9939       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9940       attrs = AREF (spec, 0);
9941       category = XINT (CODING_ATTR_CATEGORY (attrs));
9942       if (changed[category])
9943         /* Ignore this coding system because a coding system of the
9944            same category already had a higher priority.  */
9945         continue;
9946       changed[category] = 1;
9947       priorities[j++] = category;
9948       if (coding_categories[category].id >= 0
9949           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9950         setup_coding_system (args[i], &coding_categories[category]);
9951       Fset (AREF (Vcoding_category_table, category), args[i]);
9952     }
9953
9954   /* Now we have decided top J priorities.  Reflect the order of the
9955      original priorities to the remaining priorities.  */
9956
9957   for (i = j, j = 0; i < coding_category_max; i++, j++)
9958     {
9959       while (j < coding_category_max
9960              && changed[coding_priorities[j]])
9961         j++;
9962       if (j == coding_category_max)
9963         emacs_abort ();
9964       priorities[i] = coding_priorities[j];
9965     }
9966
9967   memcpy (coding_priorities, priorities, sizeof priorities);
9968
9969   /* Update `coding-category-list'.  */
9970   Vcoding_category_list = Qnil;
9971   for (i = coding_category_max; i-- > 0; )
9972     Vcoding_category_list
9973       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9974                Vcoding_category_list);
9975
9976   return Qnil;
9977 }
9978
9979 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9980        Scoding_system_priority_list, 0, 1, 0,
9981        doc: /* Return a list of coding systems ordered by their priorities.
9982 The list contains a subset of coding systems; i.e. coding systems
9983 assigned to each coding category (see `coding-category-list').
9984
9985 HIGHESTP non-nil means just return the highest priority one.  */)
9986   (Lisp_Object highestp)
9987 {
9988   int i;
9989   Lisp_Object val;
9990
9991   for (i = 0, val = Qnil; i < coding_category_max; i++)
9992     {
9993       enum coding_category category = coding_priorities[i];
9994       int id = coding_categories[category].id;
9995       Lisp_Object attrs;
9996
9997       if (id < 0)
9998         continue;
9999       attrs = CODING_ID_ATTRS (id);
10000       if (! NILP (highestp))
10001         return CODING_ATTR_BASE_NAME (attrs);
10002       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10003     }
10004   return Fnreverse (val);
10005 }
10006
10007 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10008
10009 static Lisp_Object
10010 make_subsidiaries (Lisp_Object base)
10011 {
10012   Lisp_Object subsidiaries;
10013   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10014   char *buf = alloca (base_name_len + 6);
10015   int i;
10016
10017   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10018   subsidiaries = make_uninit_vector (3);
10019   for (i = 0; i < 3; i++)
10020     {
10021       strcpy (buf + base_name_len, suffixes[i]);
10022       ASET (subsidiaries, i, intern (buf));
10023     }
10024   return subsidiaries;
10025 }
10026
10027
10028 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10029        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10030        doc: /* For internal use only.
10031 usage: (define-coding-system-internal ...)  */)
10032   (ptrdiff_t nargs, Lisp_Object *args)
10033 {
10034   Lisp_Object name;
10035   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10036   Lisp_Object attrs;            /* Vector of attributes.  */
10037   Lisp_Object eol_type;
10038   Lisp_Object aliases;
10039   Lisp_Object coding_type, charset_list, safe_charsets;
10040   enum coding_category category;
10041   Lisp_Object tail, val;
10042   int max_charset_id = 0;
10043   int i;
10044
10045   if (nargs < coding_arg_max)
10046     goto short_args;
10047
10048   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10049
10050   name = args[coding_arg_name];
10051   CHECK_SYMBOL (name);
10052   ASET (attrs, coding_attr_base_name, name);
10053
10054   val = args[coding_arg_mnemonic];
10055   if (! STRINGP (val))
10056     CHECK_CHARACTER (val);
10057   ASET (attrs, coding_attr_mnemonic, val);
10058
10059   coding_type = args[coding_arg_coding_type];
10060   CHECK_SYMBOL (coding_type);
10061   ASET (attrs, coding_attr_type, coding_type);
10062
10063   charset_list = args[coding_arg_charset_list];
10064   if (SYMBOLP (charset_list))
10065     {
10066       if (EQ (charset_list, Qiso_2022))
10067         {
10068           if (! EQ (coding_type, Qiso_2022))
10069             error ("Invalid charset-list");
10070           charset_list = Viso_2022_charset_list;
10071         }
10072       else if (EQ (charset_list, Qemacs_mule))
10073         {
10074           if (! EQ (coding_type, Qemacs_mule))
10075             error ("Invalid charset-list");
10076           charset_list = Vemacs_mule_charset_list;
10077         }
10078       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10079         {
10080           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10081             error ("Invalid charset-list");
10082           if (max_charset_id < XFASTINT (XCAR (tail)))
10083             max_charset_id = XFASTINT (XCAR (tail));
10084         }
10085     }
10086   else
10087     {
10088       charset_list = Fcopy_sequence (charset_list);
10089       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10090         {
10091           struct charset *charset;
10092
10093           val = XCAR (tail);
10094           CHECK_CHARSET_GET_CHARSET (val, charset);
10095           if (EQ (coding_type, Qiso_2022)
10096               ? CHARSET_ISO_FINAL (charset) < 0
10097               : EQ (coding_type, Qemacs_mule)
10098               ? CHARSET_EMACS_MULE_ID (charset) < 0
10099               : 0)
10100             error ("Can't handle charset `%s'",
10101                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10102
10103           XSETCAR (tail, make_number (charset->id));
10104           if (max_charset_id < charset->id)
10105             max_charset_id = charset->id;
10106         }
10107     }
10108   ASET (attrs, coding_attr_charset_list, charset_list);
10109
10110   safe_charsets = make_uninit_string (max_charset_id + 1);
10111   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10112   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10113     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10114   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10115
10116   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10117
10118   val = args[coding_arg_decode_translation_table];
10119   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10120     CHECK_SYMBOL (val);
10121   ASET (attrs, coding_attr_decode_tbl, val);
10122
10123   val = args[coding_arg_encode_translation_table];
10124   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10125     CHECK_SYMBOL (val);
10126   ASET (attrs, coding_attr_encode_tbl, val);
10127
10128   val = args[coding_arg_post_read_conversion];
10129   CHECK_SYMBOL (val);
10130   ASET (attrs, coding_attr_post_read, val);
10131
10132   val = args[coding_arg_pre_write_conversion];
10133   CHECK_SYMBOL (val);
10134   ASET (attrs, coding_attr_pre_write, val);
10135
10136   val = args[coding_arg_default_char];
10137   if (NILP (val))
10138     ASET (attrs, coding_attr_default_char, make_number (' '));
10139   else
10140     {
10141       CHECK_CHARACTER (val);
10142       ASET (attrs, coding_attr_default_char, val);
10143     }
10144
10145   val = args[coding_arg_for_unibyte];
10146   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10147
10148   val = args[coding_arg_plist];
10149   CHECK_LIST (val);
10150   ASET (attrs, coding_attr_plist, val);
10151
10152   if (EQ (coding_type, Qcharset))
10153     {
10154       /* Generate a lisp vector of 256 elements.  Each element is nil,
10155          integer, or a list of charset IDs.
10156
10157          If Nth element is nil, the byte code N is invalid in this
10158          coding system.
10159
10160          If Nth element is a number NUM, N is the first byte of a
10161          charset whose ID is NUM.
10162
10163          If Nth element is a list of charset IDs, N is the first byte
10164          of one of them.  The list is sorted by dimensions of the
10165          charsets.  A charset of smaller dimension comes first. */
10166       val = Fmake_vector (make_number (256), Qnil);
10167
10168       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10169         {
10170           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10171           int dim = CHARSET_DIMENSION (charset);
10172           int idx = (dim - 1) * 4;
10173
10174           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10175             ASET (attrs, coding_attr_ascii_compat, Qt);
10176
10177           for (i = charset->code_space[idx];
10178                i <= charset->code_space[idx + 1]; i++)
10179             {
10180               Lisp_Object tmp, tmp2;
10181               int dim2;
10182
10183               tmp = AREF (val, i);
10184               if (NILP (tmp))
10185                 tmp = XCAR (tail);
10186               else if (NUMBERP (tmp))
10187                 {
10188                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10189                   if (dim < dim2)
10190                     tmp = list2 (XCAR (tail), tmp);
10191                   else
10192                     tmp = list2 (tmp, XCAR (tail));
10193                 }
10194               else
10195                 {
10196                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10197                     {
10198                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10199                       if (dim < dim2)
10200                         break;
10201                     }
10202                   if (NILP (tmp2))
10203                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10204                   else
10205                     {
10206                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10207                       XSETCAR (tmp2, XCAR (tail));
10208                     }
10209                 }
10210               ASET (val, i, tmp);
10211             }
10212         }
10213       ASET (attrs, coding_attr_charset_valids, val);
10214       category = coding_category_charset;
10215     }
10216   else if (EQ (coding_type, Qccl))
10217     {
10218       Lisp_Object valids;
10219
10220       if (nargs < coding_arg_ccl_max)
10221         goto short_args;
10222
10223       val = args[coding_arg_ccl_decoder];
10224       CHECK_CCL_PROGRAM (val);
10225       if (VECTORP (val))
10226         val = Fcopy_sequence (val);
10227       ASET (attrs, coding_attr_ccl_decoder, val);
10228
10229       val = args[coding_arg_ccl_encoder];
10230       CHECK_CCL_PROGRAM (val);
10231       if (VECTORP (val))
10232         val = Fcopy_sequence (val);
10233       ASET (attrs, coding_attr_ccl_encoder, val);
10234
10235       val = args[coding_arg_ccl_valids];
10236       valids = Fmake_string (make_number (256), make_number (0));
10237       for (tail = val; CONSP (tail); tail = XCDR (tail))
10238         {
10239           int from, to;
10240
10241           val = XCAR (tail);
10242           if (INTEGERP (val))
10243             {
10244               if (! (0 <= XINT (val) && XINT (val) <= 255))
10245                 args_out_of_range_3 (val, make_number (0), make_number (255));
10246               from = to = XINT (val);
10247             }
10248           else
10249             {
10250               CHECK_CONS (val);
10251               CHECK_NATNUM_CAR (val);
10252               CHECK_NUMBER_CDR (val);
10253               if (XINT (XCAR (val)) > 255)
10254                 args_out_of_range_3 (XCAR (val),
10255                                      make_number (0), make_number (255));
10256               from = XINT (XCAR (val));
10257               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10258                 args_out_of_range_3 (XCDR (val),
10259                                      XCAR (val), make_number (255));
10260               to = XINT (XCDR (val));
10261             }
10262           for (i = from; i <= to; i++)
10263             SSET (valids, i, 1);
10264         }
10265       ASET (attrs, coding_attr_ccl_valids, valids);
10266
10267       category = coding_category_ccl;
10268     }
10269   else if (EQ (coding_type, Qutf_16))
10270     {
10271       Lisp_Object bom, endian;
10272
10273       ASET (attrs, coding_attr_ascii_compat, Qnil);
10274
10275       if (nargs < coding_arg_utf16_max)
10276         goto short_args;
10277
10278       bom = args[coding_arg_utf16_bom];
10279       if (! NILP (bom) && ! EQ (bom, Qt))
10280         {
10281           CHECK_CONS (bom);
10282           val = XCAR (bom);
10283           CHECK_CODING_SYSTEM (val);
10284           val = XCDR (bom);
10285           CHECK_CODING_SYSTEM (val);
10286         }
10287       ASET (attrs, coding_attr_utf_bom, bom);
10288
10289       endian = args[coding_arg_utf16_endian];
10290       CHECK_SYMBOL (endian);
10291       if (NILP (endian))
10292         endian = Qbig;
10293       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10294         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10295       ASET (attrs, coding_attr_utf_16_endian, endian);
10296
10297       category = (CONSP (bom)
10298                   ? coding_category_utf_16_auto
10299                   : NILP (bom)
10300                   ? (EQ (endian, Qbig)
10301                      ? coding_category_utf_16_be_nosig
10302                      : coding_category_utf_16_le_nosig)
10303                   : (EQ (endian, Qbig)
10304                      ? coding_category_utf_16_be
10305                      : coding_category_utf_16_le));
10306     }
10307   else if (EQ (coding_type, Qiso_2022))
10308     {
10309       Lisp_Object initial, reg_usage, request, flags;
10310
10311       if (nargs < coding_arg_iso2022_max)
10312         goto short_args;
10313
10314       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10315       CHECK_VECTOR (initial);
10316       for (i = 0; i < 4; i++)
10317         {
10318           val = AREF (initial, i);
10319           if (! NILP (val))
10320             {
10321               struct charset *charset;
10322
10323               CHECK_CHARSET_GET_CHARSET (val, charset);
10324               ASET (initial, i, make_number (CHARSET_ID (charset)));
10325               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10326                 ASET (attrs, coding_attr_ascii_compat, Qt);
10327             }
10328           else
10329             ASET (initial, i, make_number (-1));
10330         }
10331
10332       reg_usage = args[coding_arg_iso2022_reg_usage];
10333       CHECK_CONS (reg_usage);
10334       CHECK_NUMBER_CAR (reg_usage);
10335       CHECK_NUMBER_CDR (reg_usage);
10336
10337       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10338       for (tail = request; CONSP (tail); tail = XCDR (tail))
10339         {
10340           int id;
10341           Lisp_Object tmp1;
10342
10343           val = XCAR (tail);
10344           CHECK_CONS (val);
10345           tmp1 = XCAR (val);
10346           CHECK_CHARSET_GET_ID (tmp1, id);
10347           CHECK_NATNUM_CDR (val);
10348           if (XINT (XCDR (val)) >= 4)
10349             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10350           XSETCAR (val, make_number (id));
10351         }
10352
10353       flags = args[coding_arg_iso2022_flags];
10354       CHECK_NATNUM (flags);
10355       i = XINT (flags) & INT_MAX;
10356       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10357         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10358       flags = make_number (i);
10359
10360       ASET (attrs, coding_attr_iso_initial, initial);
10361       ASET (attrs, coding_attr_iso_usage, reg_usage);
10362       ASET (attrs, coding_attr_iso_request, request);
10363       ASET (attrs, coding_attr_iso_flags, flags);
10364       setup_iso_safe_charsets (attrs);
10365
10366       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10367         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10368                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10369                     ? coding_category_iso_7_else
10370                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10371                     ? coding_category_iso_7
10372                     : coding_category_iso_7_tight);
10373       else
10374         {
10375           int id = XINT (AREF (initial, 1));
10376
10377           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10378                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10379                        || id < 0)
10380                       ? coding_category_iso_8_else
10381                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10382                       ? coding_category_iso_8_1
10383                       : coding_category_iso_8_2);
10384         }
10385       if (category != coding_category_iso_8_1
10386           && category != coding_category_iso_8_2)
10387         ASET (attrs, coding_attr_ascii_compat, Qnil);
10388     }
10389   else if (EQ (coding_type, Qemacs_mule))
10390     {
10391       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10392         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10393       ASET (attrs, coding_attr_ascii_compat, Qt);
10394       category = coding_category_emacs_mule;
10395     }
10396   else if (EQ (coding_type, Qshift_jis))
10397     {
10398
10399       struct charset *charset;
10400
10401       if (XINT (Flength (charset_list)) != 3
10402           && XINT (Flength (charset_list)) != 4)
10403         error ("There should be three or four charsets");
10404
10405       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10406       if (CHARSET_DIMENSION (charset) != 1)
10407         error ("Dimension of charset %s is not one",
10408                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10409       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10410         ASET (attrs, coding_attr_ascii_compat, Qt);
10411
10412       charset_list = XCDR (charset_list);
10413       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10414       if (CHARSET_DIMENSION (charset) != 1)
10415         error ("Dimension of charset %s is not one",
10416                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10417
10418       charset_list = XCDR (charset_list);
10419       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10420       if (CHARSET_DIMENSION (charset) != 2)
10421         error ("Dimension of charset %s is not two",
10422                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10423
10424       charset_list = XCDR (charset_list);
10425       if (! NILP (charset_list))
10426         {
10427           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10428           if (CHARSET_DIMENSION (charset) != 2)
10429             error ("Dimension of charset %s is not two",
10430                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10431         }
10432
10433       category = coding_category_sjis;
10434       Vsjis_coding_system = name;
10435     }
10436   else if (EQ (coding_type, Qbig5))
10437     {
10438       struct charset *charset;
10439
10440       if (XINT (Flength (charset_list)) != 2)
10441         error ("There should be just two charsets");
10442
10443       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10444       if (CHARSET_DIMENSION (charset) != 1)
10445         error ("Dimension of charset %s is not one",
10446                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10447       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10448         ASET (attrs, coding_attr_ascii_compat, Qt);
10449
10450       charset_list = XCDR (charset_list);
10451       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10452       if (CHARSET_DIMENSION (charset) != 2)
10453         error ("Dimension of charset %s is not two",
10454                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10455
10456       category = coding_category_big5;
10457       Vbig5_coding_system = name;
10458     }
10459   else if (EQ (coding_type, Qraw_text))
10460     {
10461       category = coding_category_raw_text;
10462       ASET (attrs, coding_attr_ascii_compat, Qt);
10463     }
10464   else if (EQ (coding_type, Qutf_8))
10465     {
10466       Lisp_Object bom;
10467
10468       if (nargs < coding_arg_utf8_max)
10469         goto short_args;
10470
10471       bom = args[coding_arg_utf8_bom];
10472       if (! NILP (bom) && ! EQ (bom, Qt))
10473         {
10474           CHECK_CONS (bom);
10475           val = XCAR (bom);
10476           CHECK_CODING_SYSTEM (val);
10477           val = XCDR (bom);
10478           CHECK_CODING_SYSTEM (val);
10479         }
10480       ASET (attrs, coding_attr_utf_bom, bom);
10481       if (NILP (bom))
10482         ASET (attrs, coding_attr_ascii_compat, Qt);
10483
10484       category = (CONSP (bom) ? coding_category_utf_8_auto
10485                   : NILP (bom) ? coding_category_utf_8_nosig
10486                   : coding_category_utf_8_sig);
10487     }
10488   else if (EQ (coding_type, Qundecided))
10489     {
10490       if (nargs < coding_arg_undecided_max)
10491         goto short_args;
10492       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10493             args[coding_arg_undecided_inhibit_null_byte_detection]);
10494       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10495             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10496       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10497             args[coding_arg_undecided_prefer_utf_8]);
10498       category = coding_category_undecided;
10499     }
10500   else
10501     error ("Invalid coding system type: %s",
10502            SDATA (SYMBOL_NAME (coding_type)));
10503
10504   ASET (attrs, coding_attr_category, make_number (category));
10505   ASET (attrs, coding_attr_plist,
10506         Fcons (QCcategory,
10507                Fcons (AREF (Vcoding_category_table, category),
10508                       CODING_ATTR_PLIST (attrs))));
10509   ASET (attrs, coding_attr_plist,
10510         Fcons (QCascii_compatible_p,
10511                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10512                       CODING_ATTR_PLIST (attrs))));
10513
10514   eol_type = args[coding_arg_eol_type];
10515   if (! NILP (eol_type)
10516       && ! EQ (eol_type, Qunix)
10517       && ! EQ (eol_type, Qdos)
10518       && ! EQ (eol_type, Qmac))
10519     error ("Invalid eol-type");
10520
10521   aliases = list1 (name);
10522
10523   if (NILP (eol_type))
10524     {
10525       eol_type = make_subsidiaries (name);
10526       for (i = 0; i < 3; i++)
10527         {
10528           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10529
10530           this_name = AREF (eol_type, i);
10531           this_aliases = list1 (this_name);
10532           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10533           this_spec = make_uninit_vector (3);
10534           ASET (this_spec, 0, attrs);
10535           ASET (this_spec, 1, this_aliases);
10536           ASET (this_spec, 2, this_eol_type);
10537           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10538           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10539           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10540           if (NILP (val))
10541             Vcoding_system_alist
10542               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10543                        Vcoding_system_alist);
10544         }
10545     }
10546
10547   spec_vec = make_uninit_vector (3);
10548   ASET (spec_vec, 0, attrs);
10549   ASET (spec_vec, 1, aliases);
10550   ASET (spec_vec, 2, eol_type);
10551
10552   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10553   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10554   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10555   if (NILP (val))
10556     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10557                                   Vcoding_system_alist);
10558
10559   {
10560     int id = coding_categories[category].id;
10561
10562     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10563       setup_coding_system (name, &coding_categories[category]);
10564   }
10565
10566   return Qnil;
10567
10568  short_args:
10569   return Fsignal (Qwrong_number_of_arguments,
10570                   Fcons (intern ("define-coding-system-internal"),
10571                          make_number (nargs)));
10572 }
10573
10574
10575 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10576        3, 3, 0,
10577        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10578   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10579 {
10580   Lisp_Object spec, attrs;
10581
10582   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10583   attrs = AREF (spec, 0);
10584   if (EQ (prop, QCmnemonic))
10585     {
10586       if (! STRINGP (val))
10587         CHECK_CHARACTER (val);
10588       ASET (attrs, coding_attr_mnemonic, val);
10589     }
10590   else if (EQ (prop, QCdefault_char))
10591     {
10592       if (NILP (val))
10593         val = make_number (' ');
10594       else
10595         CHECK_CHARACTER (val);
10596       ASET (attrs, coding_attr_default_char, val);
10597     }
10598   else if (EQ (prop, QCdecode_translation_table))
10599     {
10600       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10601         CHECK_SYMBOL (val);
10602       ASET (attrs, coding_attr_decode_tbl, val);
10603     }
10604   else if (EQ (prop, QCencode_translation_table))
10605     {
10606       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10607         CHECK_SYMBOL (val);
10608       ASET (attrs, coding_attr_encode_tbl, val);
10609     }
10610   else if (EQ (prop, QCpost_read_conversion))
10611     {
10612       CHECK_SYMBOL (val);
10613       ASET (attrs, coding_attr_post_read, val);
10614     }
10615   else if (EQ (prop, QCpre_write_conversion))
10616     {
10617       CHECK_SYMBOL (val);
10618       ASET (attrs, coding_attr_pre_write, val);
10619     }
10620   else if (EQ (prop, QCascii_compatible_p))
10621     {
10622       ASET (attrs, coding_attr_ascii_compat, val);
10623     }
10624
10625   ASET (attrs, coding_attr_plist,
10626         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10627   return val;
10628 }
10629
10630
10631 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10632        Sdefine_coding_system_alias, 2, 2, 0,
10633        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10634   (Lisp_Object alias, Lisp_Object coding_system)
10635 {
10636   Lisp_Object spec, aliases, eol_type, val;
10637
10638   CHECK_SYMBOL (alias);
10639   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10640   aliases = AREF (spec, 1);
10641   /* ALIASES should be a list of length more than zero, and the first
10642      element is a base coding system.  Append ALIAS at the tail of the
10643      list.  */
10644   while (!NILP (XCDR (aliases)))
10645     aliases = XCDR (aliases);
10646   XSETCDR (aliases, list1 (alias));
10647
10648   eol_type = AREF (spec, 2);
10649   if (VECTORP (eol_type))
10650     {
10651       Lisp_Object subsidiaries;
10652       int i;
10653
10654       subsidiaries = make_subsidiaries (alias);
10655       for (i = 0; i < 3; i++)
10656         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10657                                      AREF (eol_type, i));
10658     }
10659
10660   Fputhash (alias, spec, Vcoding_system_hash_table);
10661   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10662   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10663   if (NILP (val))
10664     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10665                                   Vcoding_system_alist);
10666
10667   return Qnil;
10668 }
10669
10670 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10671        1, 1, 0,
10672        doc: /* Return the base of CODING-SYSTEM.
10673 Any alias or subsidiary coding system is not a base coding system.  */)
10674   (Lisp_Object coding_system)
10675 {
10676   Lisp_Object spec, attrs;
10677
10678   if (NILP (coding_system))
10679     return (Qno_conversion);
10680   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10681   attrs = AREF (spec, 0);
10682   return CODING_ATTR_BASE_NAME (attrs);
10683 }
10684
10685 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10686        1, 1, 0,
10687        doc: "Return the property list of CODING-SYSTEM.")
10688   (Lisp_Object coding_system)
10689 {
10690   Lisp_Object spec, attrs;
10691
10692   if (NILP (coding_system))
10693     coding_system = Qno_conversion;
10694   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10695   attrs = AREF (spec, 0);
10696   return CODING_ATTR_PLIST (attrs);
10697 }
10698
10699
10700 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10701        1, 1, 0,
10702        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10703   (Lisp_Object coding_system)
10704 {
10705   Lisp_Object spec;
10706
10707   if (NILP (coding_system))
10708     coding_system = Qno_conversion;
10709   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10710   return AREF (spec, 1);
10711 }
10712
10713 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10714        Scoding_system_eol_type, 1, 1, 0,
10715        doc: /* Return eol-type of CODING-SYSTEM.
10716 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10717
10718 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10719 and CR respectively.
10720
10721 A vector value indicates that a format of end-of-line should be
10722 detected automatically.  Nth element of the vector is the subsidiary
10723 coding system whose eol-type is N.  */)
10724   (Lisp_Object coding_system)
10725 {
10726   Lisp_Object spec, eol_type;
10727   int n;
10728
10729   if (NILP (coding_system))
10730     coding_system = Qno_conversion;
10731   if (! CODING_SYSTEM_P (coding_system))
10732     return Qnil;
10733   spec = CODING_SYSTEM_SPEC (coding_system);
10734   eol_type = AREF (spec, 2);
10735   if (VECTORP (eol_type))
10736     return Fcopy_sequence (eol_type);
10737   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10738   return make_number (n);
10739 }
10740
10741 #endif /* emacs */
10742
10743 \f
/*** 12. Post-amble ***/
10745
10746 void
10747 init_coding_once (void)
10748 {
10749   int i;
10750
10751   for (i = 0; i < coding_category_max; i++)
10752     {
10753       coding_categories[i].id = -1;
10754       coding_priorities[i] = i;
10755     }
10756
10757   /* ISO2022 specific initialize routine.  */
10758   for (i = 0; i < 0x20; i++)
10759     iso_code_class[i] = ISO_control_0;
10760   for (i = 0x21; i < 0x7F; i++)
10761     iso_code_class[i] = ISO_graphic_plane_0;
10762   for (i = 0x80; i < 0xA0; i++)
10763     iso_code_class[i] = ISO_control_1;
10764   for (i = 0xA1; i < 0xFF; i++)
10765     iso_code_class[i] = ISO_graphic_plane_1;
10766   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10767   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10768   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10769   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10770   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10771   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10772   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10773   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10774   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10775
10776   for (i = 0; i < 256; i++)
10777     {
10778       emacs_mule_bytes[i] = 1;
10779     }
10780   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10781   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10782   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10783   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10784 }
10785
10786 #ifdef emacs
10787
10788 void
10789 syms_of_coding (void)
10790 {
10791   staticpro (&Vcoding_system_hash_table);
10792   {
10793     Lisp_Object args[2];
10794     args[0] = QCtest;
10795     args[1] = Qeq;
10796     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10797   }
10798
10799   staticpro (&Vsjis_coding_system);
10800   Vsjis_coding_system = Qnil;
10801
10802   staticpro (&Vbig5_coding_system);
10803   Vbig5_coding_system = Qnil;
10804
10805   staticpro (&Vcode_conversion_reused_workbuf);
10806   Vcode_conversion_reused_workbuf = Qnil;
10807
10808   staticpro (&Vcode_conversion_workbuf_name);
10809   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10810
10811   reused_workbuf_in_use = 0;
10812
10813   DEFSYM (Qcharset, "charset");
10814   DEFSYM (Qtarget_idx, "target-idx");
10815   DEFSYM (Qcoding_system_history, "coding-system-history");
10816   Fset (Qcoding_system_history, Qnil);
10817
10818   /* Target FILENAME is the first argument.  */
10819   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10820   /* Target FILENAME is the third argument.  */
10821   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10822
10823   DEFSYM (Qcall_process, "call-process");
10824   /* Target PROGRAM is the first argument.  */
10825   Fput (Qcall_process, Qtarget_idx, make_number (0));
10826
10827   DEFSYM (Qcall_process_region, "call-process-region");
10828   /* Target PROGRAM is the third argument.  */
10829   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10830
10831   DEFSYM (Qstart_process, "start-process");
10832   /* Target PROGRAM is the third argument.  */
10833   Fput (Qstart_process, Qtarget_idx, make_number (2));
10834
10835   DEFSYM (Qopen_network_stream, "open-network-stream");
10836   /* Target SERVICE is the fourth argument.  */
10837   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10838
10839   DEFSYM (Qcoding_system, "coding-system");
10840   DEFSYM (Qcoding_aliases, "coding-aliases");
10841
10842   DEFSYM (Qeol_type, "eol-type");
10843   DEFSYM (Qunix, "unix");
10844   DEFSYM (Qdos, "dos");
10845   DEFSYM (Qmac, "mac");
10846
10847   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10848   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10849   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10850   DEFSYM (Qdefault_char, "default-char");
10851   DEFSYM (Qundecided, "undecided");
10852   DEFSYM (Qno_conversion, "no-conversion");
10853   DEFSYM (Qraw_text, "raw-text");
10854
10855   DEFSYM (Qiso_2022, "iso-2022");
10856
10857   DEFSYM (Qutf_8, "utf-8");
10858   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10859
10860 #if defined (WINDOWSNT) || defined (CYGWIN)
10861   /* No, not utf-16-le: that one has a BOM.  */
10862   DEFSYM (Qutf_16le, "utf-16le");
10863 #endif
10864
10865   DEFSYM (Qutf_16, "utf-16");
10866   DEFSYM (Qbig, "big");
10867   DEFSYM (Qlittle, "little");
10868
10869   DEFSYM (Qshift_jis, "shift-jis");
10870   DEFSYM (Qbig5, "big5");
10871
10872   DEFSYM (Qcoding_system_p, "coding-system-p");
10873
10874   DEFSYM (Qcoding_system_error, "coding-system-error");
10875   Fput (Qcoding_system_error, Qerror_conditions,
10876         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10877   Fput (Qcoding_system_error, Qerror_message,
10878         build_pure_c_string ("Invalid coding system"));
10879
10880   DEFSYM (Qtranslation_table, "translation-table");
10881   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10882   DEFSYM (Qtranslation_table_id, "translation-table-id");
10883   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10884   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10885
10886   DEFSYM (Qvalid_codes, "valid-codes");
10887
10888   DEFSYM (Qemacs_mule, "emacs-mule");
10889
10890   DEFSYM (QCcategory, ":category");
10891   DEFSYM (QCmnemonic, ":mnemonic");
10892   DEFSYM (QCdefault_char, ":default-char");
10893   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10894   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10895   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10896   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10897   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10898
10899   Vcoding_category_table
10900     = Fmake_vector (make_number (coding_category_max), Qnil);
10901   staticpro (&Vcoding_category_table);
10902   /* Followings are target of code detection.  */
10903   ASET (Vcoding_category_table, coding_category_iso_7,
10904         intern_c_string ("coding-category-iso-7"));
10905   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10906         intern_c_string ("coding-category-iso-7-tight"));
10907   ASET (Vcoding_category_table, coding_category_iso_8_1,
10908         intern_c_string ("coding-category-iso-8-1"));
10909   ASET (Vcoding_category_table, coding_category_iso_8_2,
10910         intern_c_string ("coding-category-iso-8-2"));
10911   ASET (Vcoding_category_table, coding_category_iso_7_else,
10912         intern_c_string ("coding-category-iso-7-else"));
10913   ASET (Vcoding_category_table, coding_category_iso_8_else,
10914         intern_c_string ("coding-category-iso-8-else"));
10915   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10916         intern_c_string ("coding-category-utf-8-auto"));
10917   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10918         intern_c_string ("coding-category-utf-8"));
10919   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10920         intern_c_string ("coding-category-utf-8-sig"));
10921   ASET (Vcoding_category_table, coding_category_utf_16_be,
10922         intern_c_string ("coding-category-utf-16-be"));
10923   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10924         intern_c_string ("coding-category-utf-16-auto"));
10925   ASET (Vcoding_category_table, coding_category_utf_16_le,
10926         intern_c_string ("coding-category-utf-16-le"));
10927   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10928         intern_c_string ("coding-category-utf-16-be-nosig"));
10929   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10930         intern_c_string ("coding-category-utf-16-le-nosig"));
10931   ASET (Vcoding_category_table, coding_category_charset,
10932         intern_c_string ("coding-category-charset"));
10933   ASET (Vcoding_category_table, coding_category_sjis,
10934         intern_c_string ("coding-category-sjis"));
10935   ASET (Vcoding_category_table, coding_category_big5,
10936         intern_c_string ("coding-category-big5"));
10937   ASET (Vcoding_category_table, coding_category_ccl,
10938         intern_c_string ("coding-category-ccl"));
10939   ASET (Vcoding_category_table, coding_category_emacs_mule,
10940         intern_c_string ("coding-category-emacs-mule"));
10941   /* Followings are NOT target of code detection.  */
10942   ASET (Vcoding_category_table, coding_category_raw_text,
10943         intern_c_string ("coding-category-raw-text"));
10944   ASET (Vcoding_category_table, coding_category_undecided,
10945         intern_c_string ("coding-category-undecided"));
10946
10947   DEFSYM (Qinsufficient_source, "insufficient-source");
10948   DEFSYM (Qinvalid_source, "invalid-source");
10949   DEFSYM (Qinterrupted, "interrupted");
10950   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10951
10952   defsubr (&Scoding_system_p);
10953   defsubr (&Sread_coding_system);
10954   defsubr (&Sread_non_nil_coding_system);
10955   defsubr (&Scheck_coding_system);
10956   defsubr (&Sdetect_coding_region);
10957   defsubr (&Sdetect_coding_string);
10958   defsubr (&Sfind_coding_systems_region_internal);
10959   defsubr (&Sunencodable_char_position);
10960   defsubr (&Scheck_coding_systems_region);
10961   defsubr (&Sdecode_coding_region);
10962   defsubr (&Sencode_coding_region);
10963   defsubr (&Sdecode_coding_string);
10964   defsubr (&Sencode_coding_string);
10965   defsubr (&Sdecode_sjis_char);
10966   defsubr (&Sencode_sjis_char);
10967   defsubr (&Sdecode_big5_char);
10968   defsubr (&Sencode_big5_char);
10969   defsubr (&Sset_terminal_coding_system_internal);
10970   defsubr (&Sset_safe_terminal_coding_system_internal);
10971   defsubr (&Sterminal_coding_system);
10972   defsubr (&Sset_keyboard_coding_system_internal);
10973   defsubr (&Skeyboard_coding_system);
10974   defsubr (&Sfind_operation_coding_system);
10975   defsubr (&Sset_coding_system_priority);
10976   defsubr (&Sdefine_coding_system_internal);
10977   defsubr (&Sdefine_coding_system_alias);
10978   defsubr (&Scoding_system_put);
10979   defsubr (&Scoding_system_base);
10980   defsubr (&Scoding_system_plist);
10981   defsubr (&Scoding_system_aliases);
10982   defsubr (&Scoding_system_eol_type);
10983   defsubr (&Scoding_system_priority_list);
10984
10985   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10986                doc: /* List of coding systems.
10987
10988 Do not alter the value of this variable manually.  This variable should be
10989 updated by the functions `define-coding-system' and
10990 `define-coding-system-alias'.  */);
10991   Vcoding_system_list = Qnil;
10992
10993   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10994                doc: /* Alist of coding system names.
10995 Each element is one element list of coding system name.
10996 This variable is given to `completing-read' as COLLECTION argument.
10997
10998 Do not alter the value of this variable manually.  This variable should be
10999 updated by the functions `make-coding-system' and
11000 `define-coding-system-alias'.  */);
11001   Vcoding_system_alist = Qnil;
11002
11003   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11004                doc: /* List of coding-categories (symbols) ordered by priority.
11005
11006 On detecting a coding system, Emacs tries code detection algorithms
11007 associated with each coding-category one by one in this order.  When
11008 one algorithm agrees with a byte sequence of source text, the coding
11009 system bound to the corresponding coding-category is selected.
11010
11011 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11012   {
11013     int i;
11014
11015     Vcoding_category_list = Qnil;
11016     for (i = coding_category_max - 1; i >= 0; i--)
11017       Vcoding_category_list
11018         = Fcons (AREF (Vcoding_category_table, i),
11019                  Vcoding_category_list);
11020   }
11021
11022   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11023                doc: /* Specify the coding system for read operations.
11024 It is useful to bind this variable with `let', but do not set it globally.
11025 If the value is a coding system, it is used for decoding on read operation.
11026 If not, an appropriate element is used from one of the coding system alists.
11027 There are three such tables: `file-coding-system-alist',
11028 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11029   Vcoding_system_for_read = Qnil;
11030
11031   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11032                doc: /* Specify the coding system for write operations.
11033 Programs bind this variable with `let', but you should not set it globally.
11034 If the value is a coding system, it is used for encoding of output,
11035 when writing it to a file and when sending it to a file or subprocess.
11036
11037 If this does not specify a coding system, an appropriate element
11038 is used from one of the coding system alists.
11039 There are three such tables: `file-coding-system-alist',
11040 `process-coding-system-alist', and `network-coding-system-alist'.
11041 For output to files, if the above procedure does not specify a coding system,
11042 the value of `buffer-file-coding-system' is used.  */);
11043   Vcoding_system_for_write = Qnil;
11044
11045   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11046                doc: /*
11047 Coding system used in the latest file or process I/O.  */);
11048   Vlast_coding_system_used = Qnil;
11049
11050   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11051                doc: /*
11052 Error status of the last code conversion.
11053
11054 When an error was detected in the last code conversion, this variable
11055 is set to one of the following symbols.
11056   `insufficient-source'
11057   `inconsistent-eol'
11058   `invalid-source'
11059   `interrupted'
11060   `insufficient-memory'
11061 When no error was detected, the value doesn't change.  So, to check
11062 the error status of a code conversion by this variable, you must
11063 explicitly set this variable to nil before performing code
11064 conversion.  */);
11065   Vlast_code_conversion_error = Qnil;
11066
11067   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11068                doc: /*
11069 *Non-nil means always inhibit code conversion of end-of-line format.
11070 See info node `Coding Systems' and info node `Text and Binary' concerning
11071 such conversion.  */);
11072   inhibit_eol_conversion = 0;
11073
11074   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11075                doc: /*
11076 Non-nil means process buffer inherits coding system of process output.
11077 Bind it to t if the process output is to be treated as if it were a file
11078 read from some filesystem.  */);
11079   inherit_process_coding_system = 0;
11080
11081   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11082                doc: /*
11083 Alist to decide a coding system to use for a file I/O operation.
11084 The format is ((PATTERN . VAL) ...),
11085 where PATTERN is a regular expression matching a file name,
11086 VAL is a coding system, a cons of coding systems, or a function symbol.
11087 If VAL is a coding system, it is used for both decoding and encoding
11088 the file contents.
11089 If VAL is a cons of coding systems, the car part is used for decoding,
11090 and the cdr part is used for encoding.
11091 If VAL is a function symbol, the function must return a coding system
11092 or a cons of coding systems which are used as above.  The function is
11093 called with an argument that is a list of the arguments with which
11094 `find-operation-coding-system' was called.  If the function can't decide
11095 a coding system, it can return `undecided' so that the normal
11096 code-detection is performed.
11097
11098 See also the function `find-operation-coding-system'
11099 and the variable `auto-coding-alist'.  */);
11100   Vfile_coding_system_alist = Qnil;
11101
11102   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11103                doc: /*
11104 Alist to decide a coding system to use for a process I/O operation.
11105 The format is ((PATTERN . VAL) ...),
11106 where PATTERN is a regular expression matching a program name,
11107 VAL is a coding system, a cons of coding systems, or a function symbol.
11108 If VAL is a coding system, it is used for both decoding what received
11109 from the program and encoding what sent to the program.
11110 If VAL is a cons of coding systems, the car part is used for decoding,
11111 and the cdr part is used for encoding.
11112 If VAL is a function symbol, the function must return a coding system
11113 or a cons of coding systems which are used as above.
11114
11115 See also the function `find-operation-coding-system'.  */);
11116   Vprocess_coding_system_alist = Qnil;
11117
11118   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11119                doc: /*
11120 Alist to decide a coding system to use for a network I/O operation.
11121 The format is ((PATTERN . VAL) ...),
11122 where PATTERN is a regular expression matching a network service name
11123 or is a port number to connect to,
11124 VAL is a coding system, a cons of coding systems, or a function symbol.
11125 If VAL is a coding system, it is used for both decoding what received
11126 from the network stream and encoding what sent to the network stream.
11127 If VAL is a cons of coding systems, the car part is used for decoding,
11128 and the cdr part is used for encoding.
11129 If VAL is a function symbol, the function must return a coding system
11130 or a cons of coding systems which are used as above.
11131
11132 See also the function `find-operation-coding-system'.  */);
11133   Vnetwork_coding_system_alist = Qnil;
11134
11135   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11136                doc: /* Coding system to use with system messages.
11137 Also used for decoding keyboard input on X Window system.  */);
11138   Vlocale_coding_system = Qnil;
11139
11140   /* The eol mnemonics are reset in startup.el system-dependently.  */
11141   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11142                doc: /*
11143 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11144   eol_mnemonic_unix = build_pure_c_string (":");
11145
11146   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11147                doc: /*
11148 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11149   eol_mnemonic_dos = build_pure_c_string ("\\");
11150
11151   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11152                doc: /*
11153 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11154   eol_mnemonic_mac = build_pure_c_string ("/");
11155
11156   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11157                doc: /*
11158 *String displayed in mode line when end-of-line format is not yet determined.  */);
11159   eol_mnemonic_undecided = build_pure_c_string (":");
11160
11161   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11162                doc: /*
11163 *Non-nil enables character translation while encoding and decoding.  */);
11164   Venable_character_translation = Qt;
11165
11166   DEFVAR_LISP ("standard-translation-table-for-decode",
11167                Vstandard_translation_table_for_decode,
11168                doc: /* Table for translating characters while decoding.  */);
11169   Vstandard_translation_table_for_decode = Qnil;
11170
11171   DEFVAR_LISP ("standard-translation-table-for-encode",
11172                Vstandard_translation_table_for_encode,
11173                doc: /* Table for translating characters while encoding.  */);
11174   Vstandard_translation_table_for_encode = Qnil;
11175
11176   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11177                doc: /* Alist of charsets vs revision numbers.
11178 While encoding, if a charset (car part of an element) is found,
11179 designate it with the escape sequence identifying revision (cdr part
11180 of the element).  */);
11181   Vcharset_revision_table = Qnil;
11182
11183   DEFVAR_LISP ("default-process-coding-system",
11184                Vdefault_process_coding_system,
11185                doc: /* Cons of coding systems used for process I/O by default.
11186 The car part is used for decoding a process output,
11187 the cdr part is used for encoding a text to be sent to a process.  */);
11188   Vdefault_process_coding_system = Qnil;
11189
11190   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11191                doc: /*
11192 Table of extra Latin codes in the range 128..159 (inclusive).
11193 This is a vector of length 256.
11194 If Nth element is non-nil, the existence of code N in a file
11195 \(or output of subprocess) doesn't prevent it to be detected as
11196 a coding system of ISO 2022 variant which has a flag
11197 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11198 or reading output of a subprocess.
11199 Only 128th through 159th elements have a meaning.  */);
11200   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11201
11202   DEFVAR_LISP ("select-safe-coding-system-function",
11203                Vselect_safe_coding_system_function,
11204                doc: /*
11205 Function to call to select safe coding system for encoding a text.
11206
11207 If set, this function is called to force a user to select a proper
11208 coding system which can encode the text in the case that a default
11209 coding system used in each operation can't encode the text.  The
11210 function should take care that the buffer is not modified while
11211 the coding system is being selected.
11212
11213 The default value is `select-safe-coding-system' (which see).  */);
11214   Vselect_safe_coding_system_function = Qnil;
11215
11216   DEFVAR_BOOL ("coding-system-require-warning",
11217                coding_system_require_warning,
11218                doc: /* Internal use only.
11219 If non-nil, on writing a file, `select-safe-coding-system-function' is
11220 called even if `coding-system-for-write' is non-nil.  The command
11221 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11222   coding_system_require_warning = 0;
11223
11224
11225   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11226                inhibit_iso_escape_detection,
11227                doc: /*
11228 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11229
11230 When Emacs reads text, it tries to detect how the text is encoded.
11231 This code detection is sensitive to escape sequences.  If Emacs sees
11232 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11233 of the ISO2022 encodings, and decodes text by the corresponding coding
11234 system (e.g. `iso-2022-7bit').
11235
11236 However, there may be a case that you want to read escape sequences in
11237 a file as is.  In such a case, you can set this variable to non-nil.
11238 Then the code detection will ignore any escape sequences, and no text is
11239 detected as encoded in some ISO-2022 encoding.  The result is that all
11240 escape sequences become visible in a buffer.
11241
11242 The default value is nil, and it is strongly recommended not to change
11243 it.  That is because many Emacs Lisp source files that contain
11244 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11245 in Emacs's distribution, and they won't be decoded correctly on
11246 reading if you suppress escape sequence detection.
11247
11248 The other way to read escape sequences in a file without decoding is
11249 to explicitly specify some coding system that doesn't use ISO-2022
11250 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11251   inhibit_iso_escape_detection = 0;
11252
11253   DEFVAR_BOOL ("inhibit-null-byte-detection",
11254                inhibit_null_byte_detection,
11255                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11256 By default, Emacs treats it as binary data, and does not attempt to
11257 decode it.  The effect is as if you specified `no-conversion' for
11258 reading that text.
11259
11260 Set this to non-nil when a regular text happens to include null bytes.
11261 Examples are Index nodes of Info files and null-byte delimited output
11262 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11263 decode text as usual.  */);
11264   inhibit_null_byte_detection = 0;
11265
11266   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11267                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11268 Internal use only.  Removed after the experimental optimizer gets stable. */);
11269   disable_ascii_optimization = 0;
11270
11271   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11272                doc: /* Char table for translating self-inserting characters.
11273 This is applied to the result of input methods, not their input.
11274 See also `keyboard-translate-table'.
11275
11276 Use of this variable for character code unification was rendered
11277 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11278 internal character representation.  */);
11279     Vtranslation_table_for_input = Qnil;
11280
11281   {
11282     Lisp_Object args[coding_arg_undecided_max];
11283     Lisp_Object plist[16];
11284     int i;
11285
11286     for (i = 0; i < coding_arg_undecided_max; i++)
11287       args[i] = Qnil;
11288
11289     plist[0] = intern_c_string (":name");
11290     plist[1] = args[coding_arg_name] = Qno_conversion;
11291     plist[2] = intern_c_string (":mnemonic");
11292     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11293     plist[4] = intern_c_string (":coding-type");
11294     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11295     plist[6] = intern_c_string (":ascii-compatible-p");
11296     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11297     plist[8] = intern_c_string (":default-char");
11298     plist[9] = args[coding_arg_default_char] = make_number (0);
11299     plist[10] = intern_c_string (":for-unibyte");
11300     plist[11] = args[coding_arg_for_unibyte] = Qt;
11301     plist[12] = intern_c_string (":docstring");
11302     plist[13] = build_pure_c_string ("Do no conversion.\n\
11303 \n\
11304 When you visit a file with this coding, the file is read into a\n\
11305 unibyte buffer as is, thus each byte of a file is treated as a\n\
11306 character.");
11307     plist[14] = intern_c_string (":eol-type");
11308     plist[15] = args[coding_arg_eol_type] = Qunix;
11309     args[coding_arg_plist] = Flist (16, plist);
11310     Fdefine_coding_system_internal (coding_arg_max, args);
11311
11312     plist[1] = args[coding_arg_name] = Qundecided;
11313     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11314     plist[5] = args[coding_arg_coding_type] = Qundecided;
11315     /* This is already set.
11316        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11317     plist[8] = intern_c_string (":charset-list");
11318     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11319     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11320     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11321     plist[15] = args[coding_arg_eol_type] = Qnil;
11322     args[coding_arg_plist] = Flist (16, plist);
11323     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11324     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11325     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11326   }
11327
11328   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11329
11330   {
11331     int i;
11332
11333     for (i = 0; i < coding_category_max; i++)
11334       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11335   }
11336 #if defined (DOS_NT)
11337   system_eol_type = Qdos;
11338 #else
11339   system_eol_type = Qunix;
11340 #endif
11341   staticpro (&system_eol_type);
11342 }
11343
11344 char *
11345 emacs_strerror (int error_number)
11346 {
11347   char *str;
11348
11349   synchronize_system_messages_locale ();
11350   str = strerror (error_number);
11351
11352   if (! NILP (Vlocale_coding_system))
11353     {
11354       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11355                                                       Vlocale_coding_system,
11356                                                       0);
11357       str = SSDATA (dec);
11358     }
11359
11360   return str;
11361 }
11362
11363 #endif /* emacs */